From 277c158c2e1820c0738ff90e2013493211140a25 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 24 Sep 2025 20:03:02 +0000 Subject: [PATCH 1/3] Initial plan From d5ac527eb77122c674b9de1e89208a34132f9d5a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 24 Sep 2025 20:09:53 +0000 Subject: [PATCH 2/3] Fix major README discrepancies with codebase Co-authored-by: KSemenenko <4385716+KSemenenko@users.noreply.github.com> --- README.md | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 01d7bb5ff..229eab78e 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ Install-Package ManagedCode.MarkItDown dotnet add package ManagedCode.MarkItDown # PackageReference (add to your .csproj) - + ``` ### Prerequisites @@ -219,21 +219,17 @@ Console.WriteLine(urlResult.Title); ### Customise the pipeline with options ```csharp -using Azure; using MarkItDown; var options = new MarkItDownOptions { - // Plug in your own services (Azure AI, OpenAI, etc.) + // Plug in your own services (custom image captioning, audio transcription, etc.) 
ImageCaptioner = async (bytes, info, token) => await myCaptionService.DescribeAsync(bytes, info, token), AudioTranscriber = async (bytes, info, token) => await speechClient.TranscribeAsync(bytes, info, token), - DocumentIntelligence = new DocumentIntelligenceOptions - { - Endpoint = "https://.cognitiveservices.azure.com/", - Credential = new AzureKeyCredential("") - } + // Note: Azure Document Intelligence integration is planned but not yet implemented + ExifToolPath = "/usr/local/bin/exiftool" }; var markItDown = new MarkItDown(options); @@ -309,20 +305,20 @@ markItDown.RegisterConverter(new MyCustomConverter()); git clone https://github.com/managedcode/markitdown.git cd markitdown -# Build the solution -dotnet build +# Build the core library project (requires the .NET 9 SDK; the project targets net9.0) +dotnet build src/MarkItDown/MarkItDown.csproj # Run tests -dotnet test +dotnet test tests/MarkItDown.Tests/MarkItDown.Tests.csproj # Create NuGet package -dotnet pack --configuration Release +dotnet pack src/MarkItDown/MarkItDown.csproj --configuration Release ``` ### Tests & Coverage ```bash -dotnet test --collect:"XPlat Code Coverage" +dotnet test tests/MarkItDown.Tests/MarkItDown.Tests.csproj --collect:"XPlat Code Coverage" ``` The command emits standard test results plus a Cobertura coverage report at @@ -334,13 +330,12 @@ HTML or Markdown dashboards. ``` ├── src/ -│ ├── MarkItDown/ # Core library -│ │ ├── Converters/ # Format-specific converters (HTML, PDF, audio, etc.) -│ │ ├── MarkItDown.cs # Main conversion engine -│ │ ├── StreamInfoGuesser.cs # MIME/charset/extension detection helpers -│ │ ├── MarkItDownOptions.cs # Runtime configuration flags -│ │ └── ... # Shared utilities (UriUtilities, MimeMapping, etc.) -│ └── MarkItDown.Cli/ # CLI host (under active development) +│ └── MarkItDown/ # Core library +│ ├── Converters/ # Format-specific converters (HTML, PDF, audio, etc.) 
+│ ├── MarkItDown.cs # Main conversion engine +│ ├── StreamInfoGuesser.cs # MIME/charset/extension detection helpers +│ ├── MarkItDownOptions.cs # Runtime configuration flags +│ └── ... # Shared utilities (UriUtilities, MimeMapping, etc.) ├── tests/ │ └── MarkItDown.Tests/ # xUnit + Shouldly tests, Python parity vectors (WIP) ├── Directory.Build.props # Shared build + packaging settings @@ -359,9 +354,9 @@ HTML or Markdown dashboards. ## 🗺️ Roadmap ### 🎯 Near-Term -- Azure Document Intelligence converter (options already scaffolded) +- Azure Document Intelligence converter (options already scaffolded, implementation pending) +- CLI tool for command-line usage - Outlook `.msg` ingestion via MIT-friendly dependencies -- Expanded CLI commands (batch mode, globbing, JSON output) - Richer regression suite mirroring Python test vectors ### 🎯 Future Ideas From 43b98fff5b639071d1ed7257281d2348a5ede49c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 24 Sep 2025 20:12:24 +0000 Subject: [PATCH 3/3] Complete README update to match codebase Co-authored-by: KSemenenko <4385716+KSemenenko@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 229eab78e..55ae71515 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,7 @@ dotnet add package ManagedCode.MarkItDown ``` ### Prerequisites -- .NET 9.0 SDK or later +- .NET 9.0 SDK or later (project targets net9.0) - Compatible with .NET 9 apps and libraries ### Optional Dependencies for Advanced Features