From 92df59ce9ab236afdf2d76f379ab0c1c8c831546 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 28 Mar 2026 10:15:26 +0000 Subject: [PATCH 1/3] Initial plan From e6036e1ebe72539106a594253add8325991c7072 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 28 Mar 2026 10:27:19 +0000 Subject: [PATCH 2/3] Remove Document property from IngestionChunk, add document parameter to WriteAsync - Remove Document property and constructor parameter from IngestionChunk - Add IngestionDocument document parameter to IngestionChunkWriter.WriteAsync - Update VectorStoreWriter to use the new document parameter - Update IngestionPipeline to pass document to WriteAsync - Update all chunkers (DocumentTokenChunker, ElementsChunker, HeaderChunker, SectionChunker, SemanticSimilarityChunker) to not pass document to chunks - Update all tests to match the new API Agent-Logs-Url: https://github.com/dotnet/extensions/sessions/d041591e-b70e-45f7-9302-c04e4787e92e Co-authored-by: adamsitnik <6011991+adamsitnik@users.noreply.github.com> --- .../IngestionChunk.cs | 11 ++---- .../IngestionChunkWriter.cs | 5 +-- .../Chunkers/DocumentTokenChunker.cs | 1 - .../Chunkers/ElementsChunker.cs | 6 ++-- .../Chunkers/HeaderChunker.cs | 8 ++--- .../Chunkers/SectionChunker.cs | 8 ++--- .../Chunkers/SemanticSimilarityChunker.cs | 6 ++-- .../IngestionPipeline.cs | 2 +- .../Writers/VectorStoreWriter.cs | 7 ++-- .../SemanticSimilarityChunkerTests.cs | 1 - .../IngestionChunkTests.cs | 11 ++---- .../IngestionPipelineTests.cs | 1 - .../Processors/ClassificationEnricherTests.cs | 8 ++--- .../Processors/KeywordEnricherTests.cs | 6 ++-- .../Processors/SentimentEnricherTests.cs | 10 +++--- .../Processors/SummaryEnricherTests.cs | 6 ++-- .../Utils/TestChunkFactory.cs | 4 +-- .../Writers/VectorStoreWriterTests.cs | 36 +++++++++---------- 18 files changed, 58 insertions(+), 79 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunk.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunk.cs index 55bc6cb4255..2cd22d862c2 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunk.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunk.cs @@ -21,11 +21,10 @@ public sealed class IngestionChunk /// Initializes a new instance of the class. /// /// The content of the chunk. - /// The document from which this chunk was extracted. /// The number of tokens used to represent the chunk. /// Additional context for the chunk. /// - /// or is . + /// is . /// /// /// is a string that is empty or contains only white-space characters. @@ -33,7 +32,7 @@ public sealed class IngestionChunk /// /// is negative. /// - public IngestionChunk(T content, IngestionDocument document, int tokenCount, string? context = null) + public IngestionChunk(T content, int tokenCount, string? context = null) { if (typeof(T) == typeof(string)) { @@ -44,7 +43,6 @@ public IngestionChunk(T content, IngestionDocument document, int tokenCount, str Content = Throw.IfNull(content); } - Document = Throw.IfNull(document); Context = context; TokenCount = Throw.IfLessThanOrEqual(tokenCount, 0); } @@ -54,11 +52,6 @@ public IngestionChunk(T content, IngestionDocument document, int tokenCount, str /// public T Content { get; } - /// - /// Gets the document from which this chunk was extracted. 
- /// - public IngestionDocument Document { get; } - /// /// Gets additional context for the chunk. /// diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunkWriter.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunkWriter.cs index 119265caf6e..000dec6c6b9 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunkWriter.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunkWriter.cs @@ -15,12 +15,13 @@ namespace Microsoft.Extensions.DataIngestion; public abstract class IngestionChunkWriter : IDisposable { /// - /// Writes chunks asynchronously. + /// Writes the chunks of a single document asynchronously. /// /// The chunks to write. + /// The document from which the chunks were extracted. /// The token to monitor for cancellation requests. /// A task representing the asynchronous write operation. - public abstract Task WriteAsync(IAsyncEnumerable> chunks, CancellationToken cancellationToken = default); + public abstract Task WriteAsync(IAsyncEnumerable> chunks, IngestionDocument document, CancellationToken cancellationToken = default); /// /// Disposes the writer and releases all associated resources. diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs index 14858dc6f1a..ae1dd405e4a 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs @@ -94,7 +94,6 @@ IngestionChunk FinalizeChunk() { IngestionChunk chunk = new IngestionChunk( content: stringBuilder.ToString(), - document: document, tokenCount: stringBuilderTokenCount, context: string.Empty); _ = stringBuilder.Clear(); diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs index 6383af3387a..83764b9d53a 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs @@ -32,7 +32,7 @@ internal ElementsChunker(IngestionChunkerOptions options) // 1. Create chunks that do not exceed _maxTokensPerChunk when tokenized. // 2. Maintain context in each chunk. // 3. If a single IngestionDocumentElement exceeds _maxTokensPerChunk, it should be split intelligently (e.g., paragraphs can be split into sentences, tables into rows). - internal IEnumerable> Process(IngestionDocument document, string context, List elements) + internal IEnumerable> Process(string context, List elements) { // Not using yield return here as we use ref structs. List> chunks = []; @@ -198,7 +198,7 @@ internal IEnumerable> Process(IngestionDocument document, { string chunkContent = _currentChunk.ToString(); int chunkTokenCount = CountTokens(chunkContent.AsSpan()); - chunks.Add(new(chunkContent, document, chunkTokenCount, context)); + chunks.Add(new(chunkContent, chunkTokenCount, context)); } _currentChunk.Clear(); @@ -209,7 +209,7 @@ void Commit() { string chunkContent = _currentChunk.ToString(); int chunkTokenCount = CountTokens(chunkContent.AsSpan()); - chunks.Add(new(chunkContent, document, chunkTokenCount, context)); + chunks.Add(new(chunkContent, chunkTokenCount, context)); // We keep the context in the current chunk as it's the same for all elements. 
_currentChunk.Remove( diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/HeaderChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/HeaderChunker.cs index 8f3039c7b2f..6c46641309e 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/HeaderChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/HeaderChunker.cs @@ -43,7 +43,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge if (element is IngestionDocumentHeader header) { - foreach (var chunk in SplitIntoChunks(document, headers, elements)) + foreach (var chunk in SplitIntoChunks(headers, elements)) { yield return chunk; } @@ -59,19 +59,19 @@ public override async IAsyncEnumerable> ProcessAsync(Inge } // take care of any remaining paragraphs - foreach (var chunk in SplitIntoChunks(document, headers, elements)) + foreach (var chunk in SplitIntoChunks(headers, elements)) { yield return chunk; } } - private IEnumerable> SplitIntoChunks(IngestionDocument document, string?[] headers, List elements) + private IEnumerable> SplitIntoChunks(string?[] headers, List elements) { if (elements.Count > 0) { string chunkHeader = string.Join(" ", headers.Where(h => !string.IsNullOrEmpty(h))); - foreach (var chunk in _elementsChunker.Process(document, chunkHeader, elements)) + foreach (var chunk in _elementsChunker.Process(chunkHeader, elements)) { yield return chunk; } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SectionChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SectionChunker.cs index c584ece12c4..2998cb5fba8 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SectionChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SectionChunker.cs @@ -35,7 +35,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge { cancellationToken.ThrowIfCancellationRequested(); - Process(document, section, chunks); + Process(section, chunks); foreach (var chunk in chunks) { yield return chunk; @@ -44,7 +44,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge } } - private void Process(IngestionDocument document, IngestionDocumentSection section, List> chunks, string? parentContext = null) + private void Process(IngestionDocumentSection section, List> chunks, string? parentContext = null) { List elements = new(section.Elements.Count); string context = parentContext ?? 
string.Empty; @@ -62,7 +62,7 @@ private void Process(IngestionDocument document, IngestionDocumentSection sectio break; case IngestionDocumentSection nestedSection: Commit(); - Process(document, nestedSection, chunks, context); + Process(nestedSection, chunks, context); break; default: elements.Add(section.Elements[i]); @@ -76,7 +76,7 @@ void Commit() { if (elements.Count > 0) { - foreach (var chunk in _elementsChunker.Process(document, context, elements)) + foreach (var chunk in _elementsChunker.Process(context, elements)) { chunks.Add(chunk); } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs index 78971cfe920..177b6ff90d8 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs @@ -50,7 +50,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge _ = Throw.IfNull(document); List<(IngestionDocumentElement, float)> distances = await CalculateDistancesAsync(document, cancellationToken).ConfigureAwait(false); - foreach (var chunk in MakeChunks(document, distances)) + foreach (var chunk in MakeChunks(distances)) { yield return chunk; } @@ -93,7 +93,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge return elementDistances; } - private IEnumerable> MakeChunks(IngestionDocument document, List<(IngestionDocumentElement element, float distance)> elementDistances) + private IEnumerable> MakeChunks(List<(IngestionDocumentElement element, float distance)> elementDistances) { float distanceThreshold = Percentile(elementDistances); @@ -106,7 +106,7 @@ private IEnumerable> MakeChunks(IngestionDocument documen elementAccumulator.Add(element); if (distance > distanceThreshold || i == elementDistances.Count - 1) { - foreach (var chunk in _elementsChunker.Process(document, context, elementAccumulator)) + foreach (var chunk in _elementsChunker.Process(context, elementAccumulator)) { yield return chunk; } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs index 1eeb94058ee..bfb014afc38 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs @@ -187,7 +187,7 @@ private async Task IngestAsync(IngestionDocument document, Ac } _logger?.WritingChunks(GetShortName(_writer)); - await _writer.WriteAsync(chunks, cancellationToken).ConfigureAwait(false); + await _writer.WriteAsync(chunks, document, cancellationToken).ConfigureAwait(false); _logger?.WroteChunks(document.Identifier); return document; diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Writers/VectorStoreWriter.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Writers/VectorStoreWriter.cs index 124c33ab644..53c346f4b36 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Writers/VectorStoreWriter.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Writers/VectorStoreWriter.cs @@ -43,9 +43,10 @@ public VectorStoreWriter(VectorStoreCollection collection, Vector public VectorStoreCollection VectorStoreCollection { get; } /// - public override async Task WriteAsync(IAsyncEnumerable> chunks, CancellationToken cancellationToken = default) + public override async Task WriteAsync(IAsyncEnumerable> chunks, IngestionDocument document, 
CancellationToken cancellationToken = default) { _ = Throw.IfNull(chunks); + _ = Throw.IfNull(document); IReadOnlyList? preExistingKeys = null; List? batch = null; @@ -62,13 +63,13 @@ public override async Task WriteAsync(IAsyncEnumerable> c // We obtain the IDs of the pre-existing chunks for given document, // and delete them after we finish inserting the new chunks, // to avoid a situation where we delete the chunks and then fail to insert the new ones. - preExistingKeys ??= await GetPreExistingChunksIdsAsync(chunk.Document, cancellationToken).ConfigureAwait(false); + preExistingKeys ??= await GetPreExistingChunksIdsAsync(document, cancellationToken).ConfigureAwait(false); TRecord record = new() { Content = chunk.Content, Context = chunk.Context, - DocumentId = chunk.Document.Identifier, + DocumentId = document.Identifier, }; if (chunk.HasMetadata) diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/SemanticSimilarityChunkerTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/SemanticSimilarityChunkerTests.cs index 354cebf1565..680e5a12d6d 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/SemanticSimilarityChunkerTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/SemanticSimilarityChunkerTests.cs @@ -176,7 +176,6 @@ public async Task TwoSeparateTopicsWithAllKindsOfElements() IReadOnlyList> chunks = await chunker.ProcessAsync(doc).ToListAsync(); Assert.Equal(3, chunks.Count); - Assert.All(chunks, chunk => Assert.Same(doc, chunk.Document)); Assert.Equal($@"# .NET Supported Languages The .NET platform supports multiple programming languages: {dotNetTableMarkdown} diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionChunkTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionChunkTests.cs index 384321de854..f9b49c5268e 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionChunkTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionChunkTests.cs @@ -11,8 +11,7 @@ public class IngestionChunkTests [Fact] public void Constructor_SetsTokenCountProperty() { - IngestionDocument document = new("test"); - IngestionChunk chunk = new("test content", document, 42); + IngestionChunk chunk = new("test content", 42); Assert.Equal(42, chunk.TokenCount); } @@ -20,10 +19,8 @@ public void Constructor_SetsTokenCountProperty() [Fact] public void Constructor_ThrowsWhenTokenCountIsNegative() { - IngestionDocument document = new("test"); - ArgumentOutOfRangeException exception = Assert.Throws( - () => new IngestionChunk("test content", document, -1)); + () => new IngestionChunk("test content", -1)); Assert.Equal("tokenCount", exception.ParamName); } @@ -31,10 +28,8 @@ public void Constructor_ThrowsWhenTokenCountIsNegative() [Fact] public void Constructor_ThrowsWhenTokenCountIsZero() { - IngestionDocument document = new("test"); - ArgumentOutOfRangeException exception = Assert.Throws( - () => new IngestionChunk("test content", document, 0)); + () => new IngestionChunk("test content", 0)); Assert.Equal("tokenCount", exception.ParamName); } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs index e865ff39d9b..bdd05d153fe 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs +++ 
b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs @@ -190,7 +190,6 @@ public override IAsyncEnumerable> ProcessAsync(Inges .OfType() .Select(image => new IngestionChunk( content: new(image.Content.GetValueOrDefault(), image.MediaType!), - document: document, tokenCount: 123)) // made up number as we currently don't have the ability to easily count exact tokens .ToAsyncEnumerable(); } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/ClassificationEnricherTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/ClassificationEnricherTests.cs index 66426a42b4d..b52247d8447 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/ClassificationEnricherTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/ClassificationEnricherTests.cs @@ -16,8 +16,6 @@ namespace Microsoft.Extensions.DataIngestion.Processors.Tests; public class ClassificationEnricherTests { - private static readonly IngestionDocument _document = new("test"); - [Fact] public void ThrowsOnNullOptions() { @@ -122,9 +120,9 @@ private static List> CreateChunks() => [ TestChunkFactory.CreateChunk(".NET developers need to integrate and interact with a growing variety of artificial intelligence (AI) services in their apps. " + "The Microsoft.Extensions.AI libraries provide a unified approach for representing generative AI components, and enable seamless" + - " integration and interoperability with various AI services.", _document), + " integration and interoperability with various AI services."), TestChunkFactory.CreateChunk("Rabbits are small mammals in the family Leporidae of the order Lagomorpha (along with the hare and the pika)." + - "They are herbivorous animals and are known for their long ears, large hind legs, and short fluffy tails.", _document), - TestChunkFactory.CreateChunk("This text does not belong to any category.", _document), + "They are herbivorous animals and are known for their long ears, large hind legs, and short fluffy tails."), + TestChunkFactory.CreateChunk("This text does not belong to any category."), ]; } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/KeywordEnricherTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/KeywordEnricherTests.cs index e26d9948da2..d44b624b655 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/KeywordEnricherTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/KeywordEnricherTests.cs @@ -16,8 +16,6 @@ namespace Microsoft.Extensions.DataIngestion.Processors.Tests; public class KeywordEnricherTests { - private static readonly IngestionDocument _document = new("test"); - [Fact] public void ThrowsOnNullOptions() { @@ -120,7 +118,7 @@ public async Task FailureDoesNotStopTheProcessing() private static List> CreateChunks() => [ - TestChunkFactory.CreateChunk("The Microsoft.Extensions.AI libraries provide a unified approach for representing generative AI components", _document), - TestChunkFactory.CreateChunk("Rabbits are great pets. They are friendly and make excellent companions.", _document) + TestChunkFactory.CreateChunk("The Microsoft.Extensions.AI libraries provide a unified approach for representing generative AI components"), + TestChunkFactory.CreateChunk("Rabbits are great pets. 
They are friendly and make excellent companions.") ]; } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SentimentEnricherTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SentimentEnricherTests.cs index 4ac188b684a..bd1fd3346c5 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SentimentEnricherTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SentimentEnricherTests.cs @@ -16,8 +16,6 @@ namespace Microsoft.Extensions.DataIngestion.Processors.Tests; public class SentimentEnricherTests { - private static readonly IngestionDocument _document = new("test"); - [Fact] public void ThrowsOnNullOptions() { @@ -107,9 +105,9 @@ public async Task FailureDoesNotStopTheProcessing() private static List> CreateChunks() => [ - TestChunkFactory.CreateChunk("I love programming! It's so much fun and rewarding.", _document), - TestChunkFactory.CreateChunk("I hate bugs. They are so frustrating and time-consuming.", _document), - TestChunkFactory.CreateChunk("The weather is okay, not too bad but not great either.", _document), - TestChunkFactory.CreateChunk("I hate you. I am sorry, I actually don't. I am not sure myself what my feelings are.", _document) + TestChunkFactory.CreateChunk("I love programming! It's so much fun and rewarding."), + TestChunkFactory.CreateChunk("I hate bugs. They are so frustrating and time-consuming."), + TestChunkFactory.CreateChunk("The weather is okay, not too bad but not great either."), + TestChunkFactory.CreateChunk("I hate you. I am sorry, I actually don't. I am not sure myself what my feelings are.") ]; } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SummaryEnricherTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SummaryEnricherTests.cs index 937036afe83..4e0401f257d 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SummaryEnricherTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SummaryEnricherTests.cs @@ -16,8 +16,6 @@ namespace Microsoft.Extensions.DataIngestion.Processors.Tests; public class SummaryEnricherTests { - private static readonly IngestionDocument _document = new("test"); - [Fact] public void ThrowsOnNullOptions() { @@ -104,7 +102,7 @@ public async Task FailureDoesNotStopTheProcessing() private static List> CreateChunks() => [ - TestChunkFactory.CreateChunk("I love programming! It's so much fun and rewarding.", _document), - TestChunkFactory.CreateChunk("I hate bugs. They are so frustrating and time-consuming.", _document) + TestChunkFactory.CreateChunk("I love programming! It's so much fun and rewarding."), + TestChunkFactory.CreateChunk("I hate bugs. 
They are so frustrating and time-consuming.") ]; } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestChunkFactory.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestChunkFactory.cs index e443f9fc656..a5fdbf9b4b3 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestChunkFactory.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestChunkFactory.cs @@ -9,9 +9,9 @@ public static class TestChunkFactory { private static readonly Tokenizer _tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); - public static IngestionChunk CreateChunk(string content, IngestionDocument document) + public static IngestionChunk CreateChunk(string content) { int tokenCount = _tokenizer.CountTokens(content, considerNormalization: false); - return new IngestionChunk(content, document, tokenCount); + return new IngestionChunk(content, tokenCount); } } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Writers/VectorStoreWriterTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Writers/VectorStoreWriterTests.cs index e8ab6ab6d73..b00160c012c 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Writers/VectorStoreWriterTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Writers/VectorStoreWriterTests.cs @@ -47,11 +47,11 @@ public async Task CanWriteChunksWithCustomDefinition() using VectorStoreWriter> writer = new(collection); IngestionDocument document = new(documentId); - IngestionChunk chunk = TestChunkFactory.CreateChunk("custom schema content", document); + IngestionChunk chunk = TestChunkFactory.CreateChunk("custom schema content"); List> chunks = [chunk]; - await writer.WriteAsync(chunks.ToAsyncEnumerable()); + await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); IngestionChunkVectorRecord record = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 1) @@ -77,12 +77,12 @@ public async Task CanWriteChunks() using VectorStoreWriter> writer = new(collection); IngestionDocument document = new(documentId); - IngestionChunk chunk = TestChunkFactory.CreateChunk("some content", document); + IngestionChunk chunk = TestChunkFactory.CreateChunk("some content"); List> chunks = [chunk]; Assert.False(testEmbeddingGenerator.WasCalled); - await writer.WriteAsync(chunks.ToAsyncEnumerable()); + await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); IngestionChunkVectorRecord record = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 1) @@ -107,12 +107,12 @@ public async Task CanWriteChunksWithMetadata() using TestVectorStoreWriterWithMetadata writer = new(collection); IngestionDocument document = new(documentId); - IngestionChunk chunk = TestChunkFactory.CreateChunk("some content", document); + IngestionChunk chunk = TestChunkFactory.CreateChunk("some content"); chunk.Metadata["Classification"] = "important"; List> chunks = [chunk]; - await writer.WriteAsync(chunks.ToAsyncEnumerable()); + await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); TestChunkRecordWithMetadata record = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 1) @@ -143,12 +143,12 @@ public async Task DoesSupportIncrementalIngestion() }); IngestionDocument document = new(documentId); - IngestionChunk chunk1 = TestChunkFactory.CreateChunk("first chunk", document); - IngestionChunk chunk2 = TestChunkFactory.CreateChunk("second 
chunk", document); + IngestionChunk chunk1 = TestChunkFactory.CreateChunk("first chunk"); + IngestionChunk chunk2 = TestChunkFactory.CreateChunk("second chunk"); List> chunks = [chunk1, chunk2]; - await writer.WriteAsync(chunks.ToAsyncEnumerable()); + await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); int recordCount = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 100) @@ -156,11 +156,11 @@ public async Task DoesSupportIncrementalIngestion() Assert.Equal(chunks.Count, recordCount); // Now we will do an incremental ingestion that updates the chunk(s). - IngestionChunk updatedChunk = TestChunkFactory.CreateChunk("different content", document); + IngestionChunk updatedChunk = TestChunkFactory.CreateChunk("different content"); List> updatedChunks = [updatedChunk]; - await writer.WriteAsync(updatedChunks.ToAsyncEnumerable()); + await writer.WriteAsync(updatedChunks.ToAsyncEnumerable(), document); // We ask for 100 records, but we expect only 1 as the previous 2 should have been deleted. IngestionChunkVectorRecord record = await writer.VectorStoreCollection @@ -213,10 +213,10 @@ public async Task BatchesChunks(int? batchTokenCount, int[] chunkTokenCounts) List> chunks = []; for (int i = 0; i < chunkTokenCounts.Length; i++) { - chunks.Add(new($"chunk {i + 1}", document, context: null, tokenCount: chunkTokenCounts[i])); + chunks.Add(new($"chunk {i + 1}", tokenCount: chunkTokenCounts[i])); } - await writer.WriteAsync(chunks.ToAsyncEnumerable()); + await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); int recordCount = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 100) @@ -249,10 +249,10 @@ public async Task IncrementalIngestion_WithManyRecords_DeletesAllPreExistingChun List> chunks = []; for (int i = 0; i < 50; i++) { - chunks.Add(TestChunkFactory.CreateChunk($"chunk {i}", document)); + chunks.Add(TestChunkFactory.CreateChunk($"chunk {i}")); } - await writer.WriteAsync(chunks.ToAsyncEnumerable()); + await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); int recordCount = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 10000) @@ -262,11 +262,11 @@ public async Task IncrementalIngestion_WithManyRecords_DeletesAllPreExistingChun // Now we will do an incremental ingestion that should delete all pre-existing chunks List> updatedChunks = [ - TestChunkFactory.CreateChunk("updated chunk 1", document), - TestChunkFactory.CreateChunk("updated chunk 2", document) + TestChunkFactory.CreateChunk("updated chunk 1"), + TestChunkFactory.CreateChunk("updated chunk 2") ]; - await writer.WriteAsync(updatedChunks.ToAsyncEnumerable()); + await writer.WriteAsync(updatedChunks.ToAsyncEnumerable(), document); // Verify that all old records were deleted and only the new ones remain List> records = await writer.VectorStoreCollection From fbc1730f3b327dc9e3545a5b49c14259c06193dc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 2 Apr 2026 13:47:18 +0000 Subject: [PATCH 3/3] Restore Document on IngestionChunk; make document first arg of WriteAsync Addresses review feedback: - Restore Document property on IngestionChunk (revert removal) - Keep IngestionDocument document as explicit parameter on WriteAsync - Make document the first argument of WriteAsync - Only 4 files changed vs base: IngestionChunkWriter, VectorStoreWriter, IngestionPipeline, VectorStoreWriterTests 
Agent-Logs-Url: https://github.com/dotnet/extensions/sessions/1e9e1aa4-84c0-4e3c-a6b1-b1f90cafeda3 Co-authored-by: adamsitnik <6011991+adamsitnik@users.noreply.github.com> --- .../IngestionChunk.cs | 11 ++++-- .../IngestionChunkWriter.cs | 4 +-- .../Chunkers/DocumentTokenChunker.cs | 1 + .../Chunkers/ElementsChunker.cs | 6 ++-- .../Chunkers/HeaderChunker.cs | 8 ++--- .../Chunkers/SectionChunker.cs | 8 ++--- .../Chunkers/SemanticSimilarityChunker.cs | 6 ++-- .../IngestionPipeline.cs | 2 +- .../Writers/VectorStoreWriter.cs | 4 +-- .../SemanticSimilarityChunkerTests.cs | 1 + .../IngestionChunkTests.cs | 11 ++++-- .../IngestionPipelineTests.cs | 1 + .../Processors/ClassificationEnricherTests.cs | 8 +++-- .../Processors/KeywordEnricherTests.cs | 6 ++-- .../Processors/SentimentEnricherTests.cs | 10 +++--- .../Processors/SummaryEnricherTests.cs | 6 ++-- .../Utils/TestChunkFactory.cs | 4 +-- .../Writers/VectorStoreWriterTests.cs | 36 +++++++++---------- 18 files changed, 78 insertions(+), 55 deletions(-) diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunk.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunk.cs index 2cd22d862c2..55bc6cb4255 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunk.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunk.cs @@ -21,10 +21,11 @@ public sealed class IngestionChunk /// Initializes a new instance of the class. /// /// The content of the chunk. + /// The document from which this chunk was extracted. /// The number of tokens used to represent the chunk. /// Additional context for the chunk. /// - /// is . + /// or is . /// /// /// is a string that is empty or contains only white-space characters. @@ -32,7 +33,7 @@ public sealed class IngestionChunk /// /// is negative. /// - public IngestionChunk(T content, int tokenCount, string? context = null) + public IngestionChunk(T content, IngestionDocument document, int tokenCount, string? context = null) { if (typeof(T) == typeof(string)) { @@ -43,6 +44,7 @@ public IngestionChunk(T content, int tokenCount, string? context = null) Content = Throw.IfNull(content); } + Document = Throw.IfNull(document); Context = context; TokenCount = Throw.IfLessThanOrEqual(tokenCount, 0); } @@ -52,6 +54,11 @@ public IngestionChunk(T content, int tokenCount, string? context = null) /// public T Content { get; } + /// + /// Gets the document from which this chunk was extracted. + /// + public IngestionDocument Document { get; } + /// /// Gets additional context for the chunk. /// diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunkWriter.cs b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunkWriter.cs index 000dec6c6b9..7eb4292a582 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunkWriter.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Abstractions/IngestionChunkWriter.cs @@ -17,11 +17,11 @@ public abstract class IngestionChunkWriter : IDisposable /// /// Writes the chunks of a single document asynchronously. /// - /// The chunks to write. /// The document from which the chunks were extracted. + /// The chunks to write. /// The token to monitor for cancellation requests. /// A task representing the asynchronous write operation. 
- public abstract Task WriteAsync(IAsyncEnumerable> chunks, IngestionDocument document, CancellationToken cancellationToken = default); + public abstract Task WriteAsync(IngestionDocument document, IAsyncEnumerable> chunks, CancellationToken cancellationToken = default); /// /// Disposes the writer and releases all associated resources. diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs index ae1dd405e4a..14858dc6f1a 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/DocumentTokenChunker.cs @@ -94,6 +94,7 @@ IngestionChunk FinalizeChunk() { IngestionChunk chunk = new IngestionChunk( content: stringBuilder.ToString(), + document: document, tokenCount: stringBuilderTokenCount, context: string.Empty); _ = stringBuilder.Clear(); diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs index 83764b9d53a..6383af3387a 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs @@ -32,7 +32,7 @@ internal ElementsChunker(IngestionChunkerOptions options) // 1. Create chunks that do not exceed _maxTokensPerChunk when tokenized. // 2. Maintain context in each chunk. // 3. If a single IngestionDocumentElement exceeds _maxTokensPerChunk, it should be split intelligently (e.g., paragraphs can be split into sentences, tables into rows). - internal IEnumerable> Process(string context, List elements) + internal IEnumerable> Process(IngestionDocument document, string context, List elements) { // Not using yield return here as we use ref structs. 
List> chunks = []; @@ -198,7 +198,7 @@ internal IEnumerable> Process(string context, List> ProcessAsync(Inge if (element is IngestionDocumentHeader header) { - foreach (var chunk in SplitIntoChunks(headers, elements)) + foreach (var chunk in SplitIntoChunks(document, headers, elements)) { yield return chunk; } @@ -59,19 +59,19 @@ public override async IAsyncEnumerable> ProcessAsync(Inge } // take care of any remaining paragraphs - foreach (var chunk in SplitIntoChunks(headers, elements)) + foreach (var chunk in SplitIntoChunks(document, headers, elements)) { yield return chunk; } } - private IEnumerable> SplitIntoChunks(string?[] headers, List elements) + private IEnumerable> SplitIntoChunks(IngestionDocument document, string?[] headers, List elements) { if (elements.Count > 0) { string chunkHeader = string.Join(" ", headers.Where(h => !string.IsNullOrEmpty(h))); - foreach (var chunk in _elementsChunker.Process(chunkHeader, elements)) + foreach (var chunk in _elementsChunker.Process(document, chunkHeader, elements)) { yield return chunk; } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SectionChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SectionChunker.cs index 2998cb5fba8..c584ece12c4 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SectionChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SectionChunker.cs @@ -35,7 +35,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge { cancellationToken.ThrowIfCancellationRequested(); - Process(section, chunks); + Process(document, section, chunks); foreach (var chunk in chunks) { yield return chunk; @@ -44,7 +44,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge } } - private void Process(IngestionDocumentSection section, List> chunks, string? parentContext = null) + private void Process(IngestionDocument document, IngestionDocumentSection section, List> chunks, string? parentContext = null) { List elements = new(section.Elements.Count); string context = parentContext ?? 
string.Empty; @@ -62,7 +62,7 @@ private void Process(IngestionDocumentSection section, List 0) { - foreach (var chunk in _elementsChunker.Process(context, elements)) + foreach (var chunk in _elementsChunker.Process(document, context, elements)) { chunks.Add(chunk); } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs index 177b6ff90d8..78971cfe920 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/SemanticSimilarityChunker.cs @@ -50,7 +50,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge _ = Throw.IfNull(document); List<(IngestionDocumentElement, float)> distances = await CalculateDistancesAsync(document, cancellationToken).ConfigureAwait(false); - foreach (var chunk in MakeChunks(distances)) + foreach (var chunk in MakeChunks(document, distances)) { yield return chunk; } @@ -93,7 +93,7 @@ public override async IAsyncEnumerable> ProcessAsync(Inge return elementDistances; } - private IEnumerable> MakeChunks(List<(IngestionDocumentElement element, float distance)> elementDistances) + private IEnumerable> MakeChunks(IngestionDocument document, List<(IngestionDocumentElement element, float distance)> elementDistances) { float distanceThreshold = Percentile(elementDistances); @@ -106,7 +106,7 @@ private IEnumerable> MakeChunks(List<(IngestionDocumentEl elementAccumulator.Add(element); if (distance > distanceThreshold || i == elementDistances.Count - 1) { - foreach (var chunk in _elementsChunker.Process(context, elementAccumulator)) + foreach (var chunk in _elementsChunker.Process(document, context, elementAccumulator)) { yield return chunk; } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs index bfb014afc38..35ba3d38823 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs @@ -187,7 +187,7 @@ private async Task IngestAsync(IngestionDocument document, Ac } _logger?.WritingChunks(GetShortName(_writer)); - await _writer.WriteAsync(chunks, document, cancellationToken).ConfigureAwait(false); + await _writer.WriteAsync(document, chunks, cancellationToken).ConfigureAwait(false); _logger?.WroteChunks(document.Identifier); return document; diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/Writers/VectorStoreWriter.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/Writers/VectorStoreWriter.cs index 53c346f4b36..967e2e91929 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/Writers/VectorStoreWriter.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/Writers/VectorStoreWriter.cs @@ -43,10 +43,10 @@ public VectorStoreWriter(VectorStoreCollection collection, Vector public VectorStoreCollection VectorStoreCollection { get; } /// - public override async Task WriteAsync(IAsyncEnumerable> chunks, IngestionDocument document, CancellationToken cancellationToken = default) + public override async Task WriteAsync(IngestionDocument document, IAsyncEnumerable> chunks, CancellationToken cancellationToken = default) { - _ = Throw.IfNull(chunks); _ = Throw.IfNull(document); + _ = Throw.IfNull(chunks); IReadOnlyList? preExistingKeys = null; List? 
batch = null; diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/SemanticSimilarityChunkerTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/SemanticSimilarityChunkerTests.cs index 680e5a12d6d..354cebf1565 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/SemanticSimilarityChunkerTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Chunkers/SemanticSimilarityChunkerTests.cs @@ -176,6 +176,7 @@ public async Task TwoSeparateTopicsWithAllKindsOfElements() IReadOnlyList> chunks = await chunker.ProcessAsync(doc).ToListAsync(); Assert.Equal(3, chunks.Count); + Assert.All(chunks, chunk => Assert.Same(doc, chunk.Document)); Assert.Equal($@"# .NET Supported Languages The .NET platform supports multiple programming languages: {dotNetTableMarkdown} diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionChunkTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionChunkTests.cs index f9b49c5268e..384321de854 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionChunkTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionChunkTests.cs @@ -11,7 +11,8 @@ public class IngestionChunkTests [Fact] public void Constructor_SetsTokenCountProperty() { - IngestionChunk chunk = new("test content", 42); + IngestionDocument document = new("test"); + IngestionChunk chunk = new("test content", document, 42); Assert.Equal(42, chunk.TokenCount); } @@ -19,8 +20,10 @@ public void Constructor_SetsTokenCountProperty() [Fact] public void Constructor_ThrowsWhenTokenCountIsNegative() { + IngestionDocument document = new("test"); + ArgumentOutOfRangeException exception = Assert.Throws( - () => new IngestionChunk("test content", -1)); + () => new IngestionChunk("test content", document, -1)); Assert.Equal("tokenCount", exception.ParamName); } @@ -28,8 +31,10 @@ public void Constructor_ThrowsWhenTokenCountIsNegative() [Fact] public void Constructor_ThrowsWhenTokenCountIsZero() { + IngestionDocument document = new("test"); + ArgumentOutOfRangeException exception = Assert.Throws( - () => new IngestionChunk("test content", 0)); + () => new IngestionChunk("test content", document, 0)); Assert.Equal("tokenCount", exception.ParamName); } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs index bdd05d153fe..e865ff39d9b 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs @@ -190,6 +190,7 @@ public override IAsyncEnumerable> ProcessAsync(Inges .OfType() .Select(image => new IngestionChunk( content: new(image.Content.GetValueOrDefault(), image.MediaType!), + document: document, tokenCount: 123)) // made up number as we currently don't have the ability to easily count exact tokens .ToAsyncEnumerable(); } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/ClassificationEnricherTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/ClassificationEnricherTests.cs index b52247d8447..66426a42b4d 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/ClassificationEnricherTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/ClassificationEnricherTests.cs @@ -16,6 +16,8 @@ namespace 
Microsoft.Extensions.DataIngestion.Processors.Tests; public class ClassificationEnricherTests { + private static readonly IngestionDocument _document = new("test"); + [Fact] public void ThrowsOnNullOptions() { @@ -120,9 +122,9 @@ private static List> CreateChunks() => [ TestChunkFactory.CreateChunk(".NET developers need to integrate and interact with a growing variety of artificial intelligence (AI) services in their apps. " + "The Microsoft.Extensions.AI libraries provide a unified approach for representing generative AI components, and enable seamless" + - " integration and interoperability with various AI services."), + " integration and interoperability with various AI services.", _document), TestChunkFactory.CreateChunk("Rabbits are small mammals in the family Leporidae of the order Lagomorpha (along with the hare and the pika)." + - "They are herbivorous animals and are known for their long ears, large hind legs, and short fluffy tails."), - TestChunkFactory.CreateChunk("This text does not belong to any category."), + "They are herbivorous animals and are known for their long ears, large hind legs, and short fluffy tails.", _document), + TestChunkFactory.CreateChunk("This text does not belong to any category.", _document), ]; } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/KeywordEnricherTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/KeywordEnricherTests.cs index d44b624b655..e26d9948da2 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/KeywordEnricherTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/KeywordEnricherTests.cs @@ -16,6 +16,8 @@ namespace Microsoft.Extensions.DataIngestion.Processors.Tests; public class KeywordEnricherTests { + private static readonly IngestionDocument _document = new("test"); + [Fact] public void ThrowsOnNullOptions() { @@ -118,7 +120,7 @@ public async Task FailureDoesNotStopTheProcessing() private static List> CreateChunks() => [ - TestChunkFactory.CreateChunk("The Microsoft.Extensions.AI libraries provide a unified approach for representing generative AI components"), - TestChunkFactory.CreateChunk("Rabbits are great pets. They are friendly and make excellent companions.") + TestChunkFactory.CreateChunk("The Microsoft.Extensions.AI libraries provide a unified approach for representing generative AI components", _document), + TestChunkFactory.CreateChunk("Rabbits are great pets. They are friendly and make excellent companions.", _document) ]; } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SentimentEnricherTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SentimentEnricherTests.cs index bd1fd3346c5..4ac188b684a 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SentimentEnricherTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SentimentEnricherTests.cs @@ -16,6 +16,8 @@ namespace Microsoft.Extensions.DataIngestion.Processors.Tests; public class SentimentEnricherTests { + private static readonly IngestionDocument _document = new("test"); + [Fact] public void ThrowsOnNullOptions() { @@ -105,9 +107,9 @@ public async Task FailureDoesNotStopTheProcessing() private static List> CreateChunks() => [ - TestChunkFactory.CreateChunk("I love programming! It's so much fun and rewarding."), - TestChunkFactory.CreateChunk("I hate bugs. 
They are so frustrating and time-consuming."), - TestChunkFactory.CreateChunk("The weather is okay, not too bad but not great either."), - TestChunkFactory.CreateChunk("I hate you. I am sorry, I actually don't. I am not sure myself what my feelings are.") + TestChunkFactory.CreateChunk("I love programming! It's so much fun and rewarding.", _document), + TestChunkFactory.CreateChunk("I hate bugs. They are so frustrating and time-consuming.", _document), + TestChunkFactory.CreateChunk("The weather is okay, not too bad but not great either.", _document), + TestChunkFactory.CreateChunk("I hate you. I am sorry, I actually don't. I am not sure myself what my feelings are.", _document) ]; } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SummaryEnricherTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SummaryEnricherTests.cs index 4e0401f257d..937036afe83 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SummaryEnricherTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Processors/SummaryEnricherTests.cs @@ -16,6 +16,8 @@ namespace Microsoft.Extensions.DataIngestion.Processors.Tests; public class SummaryEnricherTests { + private static readonly IngestionDocument _document = new("test"); + [Fact] public void ThrowsOnNullOptions() { @@ -102,7 +104,7 @@ public async Task FailureDoesNotStopTheProcessing() private static List> CreateChunks() => [ - TestChunkFactory.CreateChunk("I love programming! It's so much fun and rewarding."), - TestChunkFactory.CreateChunk("I hate bugs. They are so frustrating and time-consuming.") + TestChunkFactory.CreateChunk("I love programming! It's so much fun and rewarding.", _document), + TestChunkFactory.CreateChunk("I hate bugs. 
They are so frustrating and time-consuming.", _document) ]; } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestChunkFactory.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestChunkFactory.cs index a5fdbf9b4b3..e443f9fc656 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestChunkFactory.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Utils/TestChunkFactory.cs @@ -9,9 +9,9 @@ public static class TestChunkFactory { private static readonly Tokenizer _tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o"); - public static IngestionChunk CreateChunk(string content) + public static IngestionChunk CreateChunk(string content, IngestionDocument document) { int tokenCount = _tokenizer.CountTokens(content, considerNormalization: false); - return new IngestionChunk(content, tokenCount); + return new IngestionChunk(content, document, tokenCount); } } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Writers/VectorStoreWriterTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Writers/VectorStoreWriterTests.cs index b00160c012c..8cb7b0d9768 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Writers/VectorStoreWriterTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Writers/VectorStoreWriterTests.cs @@ -47,11 +47,11 @@ public async Task CanWriteChunksWithCustomDefinition() using VectorStoreWriter> writer = new(collection); IngestionDocument document = new(documentId); - IngestionChunk chunk = TestChunkFactory.CreateChunk("custom schema content"); + IngestionChunk chunk = TestChunkFactory.CreateChunk("custom schema content", document); List> chunks = [chunk]; - await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); + await writer.WriteAsync(document, chunks.ToAsyncEnumerable()); IngestionChunkVectorRecord record = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 1) @@ -77,12 +77,12 @@ public async Task CanWriteChunks() using VectorStoreWriter> writer = new(collection); IngestionDocument document = new(documentId); - IngestionChunk chunk = TestChunkFactory.CreateChunk("some content"); + IngestionChunk chunk = TestChunkFactory.CreateChunk("some content", document); List> chunks = [chunk]; Assert.False(testEmbeddingGenerator.WasCalled); - await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); + await writer.WriteAsync(document, chunks.ToAsyncEnumerable()); IngestionChunkVectorRecord record = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 1) @@ -107,12 +107,12 @@ public async Task CanWriteChunksWithMetadata() using TestVectorStoreWriterWithMetadata writer = new(collection); IngestionDocument document = new(documentId); - IngestionChunk chunk = TestChunkFactory.CreateChunk("some content"); + IngestionChunk chunk = TestChunkFactory.CreateChunk("some content", document); chunk.Metadata["Classification"] = "important"; List> chunks = [chunk]; - await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); + await writer.WriteAsync(document, chunks.ToAsyncEnumerable()); TestChunkRecordWithMetadata record = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 1) @@ -143,12 +143,12 @@ public async Task DoesSupportIncrementalIngestion() }); IngestionDocument document = new(documentId); - IngestionChunk chunk1 = TestChunkFactory.CreateChunk("first chunk"); - IngestionChunk chunk2 = 
TestChunkFactory.CreateChunk("second chunk"); + IngestionChunk chunk1 = TestChunkFactory.CreateChunk("first chunk", document); + IngestionChunk chunk2 = TestChunkFactory.CreateChunk("second chunk", document); List> chunks = [chunk1, chunk2]; - await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); + await writer.WriteAsync(document, chunks.ToAsyncEnumerable()); int recordCount = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 100) @@ -156,11 +156,11 @@ public async Task DoesSupportIncrementalIngestion() Assert.Equal(chunks.Count, recordCount); // Now we will do an incremental ingestion that updates the chunk(s). - IngestionChunk updatedChunk = TestChunkFactory.CreateChunk("different content"); + IngestionChunk updatedChunk = TestChunkFactory.CreateChunk("different content", document); List> updatedChunks = [updatedChunk]; - await writer.WriteAsync(updatedChunks.ToAsyncEnumerable(), document); + await writer.WriteAsync(document, updatedChunks.ToAsyncEnumerable()); // We ask for 100 records, but we expect only 1 as the previous 2 should have been deleted. IngestionChunkVectorRecord record = await writer.VectorStoreCollection @@ -213,10 +213,10 @@ public async Task BatchesChunks(int? batchTokenCount, int[] chunkTokenCounts) List> chunks = []; for (int i = 0; i < chunkTokenCounts.Length; i++) { - chunks.Add(new($"chunk {i + 1}", tokenCount: chunkTokenCounts[i])); + chunks.Add(new($"chunk {i + 1}", document, context: null, tokenCount: chunkTokenCounts[i])); } - await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); + await writer.WriteAsync(document, chunks.ToAsyncEnumerable()); int recordCount = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 100) @@ -249,10 +249,10 @@ public async Task IncrementalIngestion_WithManyRecords_DeletesAllPreExistingChun List> chunks = []; for (int i = 0; i < 50; i++) { - chunks.Add(TestChunkFactory.CreateChunk($"chunk {i}")); + chunks.Add(TestChunkFactory.CreateChunk($"chunk {i}", document)); } - await writer.WriteAsync(chunks.ToAsyncEnumerable(), document); + await writer.WriteAsync(document, chunks.ToAsyncEnumerable()); int recordCount = await writer.VectorStoreCollection .GetAsync(filter: record => record.DocumentId == documentId, top: 10000) @@ -262,11 +262,11 @@ public async Task IncrementalIngestion_WithManyRecords_DeletesAllPreExistingChun // Now we will do an incremental ingestion that should delete all pre-existing chunks List> updatedChunks = [ - TestChunkFactory.CreateChunk("updated chunk 1"), - TestChunkFactory.CreateChunk("updated chunk 2") + TestChunkFactory.CreateChunk("updated chunk 1", document), + TestChunkFactory.CreateChunk("updated chunk 2", document) ]; - await writer.WriteAsync(updatedChunks.ToAsyncEnumerable(), document); + await writer.WriteAsync(document, updatedChunks.ToAsyncEnumerable()); // Verify that all old records were deleted and only the new ones remain List> records = await writer.VectorStoreCollection