From 6c3c6e9fe9fb85911cef799b4fbe940e20391981 Mon Sep 17 00:00:00 2001
From: Joshua Zierhut
Date: Tue, 27 Jan 2026 17:06:40 -0600
Subject: [PATCH 1/2] Fix bug that causes Merge to fail with empty DataFrames

---
NOTE(review): this series was recovered from a copy in which line breaks were
collapsed and angle-bracketed text (generic type arguments, the author e-mail)
was stripped. Line structure and indentation are rebuilt to match the hunk
headers. Generic arguments are reconstructed from usage: the Clone signature
is confirmed by the 80-column section heading in patch 2/2; ReadOnlySpan<T>,
ReadOnlySpan<byte>, DataFrameBuffer<T> and IEnumerable<object[]> are assumed
and must be confirmed against the target tree before applying. The first
"@@" section heading and the From: lines are left as recovered.

 .../PrimitiveColumnContainer.cs               |   5 +
 .../DataFrameAssert.cs                        |  42 +++
 .../DataFrameTests.Merge.cs                   | 250 ++++++++++++++++++
 3 files changed, 297 insertions(+)
 create mode 100644 test/Microsoft.Data.Analysis.Tests/DataFrameAssert.cs

diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
index be63ae2156..062375c77b 100644
--- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
+++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
@@ -460,6 +460,11 @@ private List> CloneNullBitMapBuffers()
         public PrimitiveColumnContainer<T> Clone<U>(PrimitiveColumnContainer<U> mapIndices, Type type, bool invertMapIndices = false)
             where U : unmanaged
         {
+            if (Buffers.Count == 0)
+            {
+                return new PrimitiveColumnContainer<T>(mapIndices.Length);
+            }
+
             ReadOnlySpan<T> thisSpan = Buffers[0].ReadOnlySpan;
             ReadOnlySpan<byte> thisNullBitMapSpan = NullBitMapBuffers[0].ReadOnlySpan;
             long minRange = 0;
diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameAssert.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameAssert.cs
new file mode 100644
index 0000000000..14f51ae043
--- /dev/null
+++ b/test/Microsoft.Data.Analysis.Tests/DataFrameAssert.cs
@@ -0,0 +1,42 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Xunit;
+
+namespace Microsoft.Data.Analysis.Tests
+{
+    public static class DataFrameAssert
+    {
+        public static void Equal(DataFrame expected, DataFrame actual)
+        {
+            Assert.Equal(expected.Columns.Count, actual.Columns.Count);
+            Assert.Equal(expected.Rows.Count, actual.Rows.Count);
+
+            for (int c = 0; c < expected.Columns.Count; c++)
+            {
+                var expectedColumn = expected.Columns[c];
+                var actualColumn = actual.Columns[c];
+
+                Assert.Equal(expectedColumn.Name, actualColumn.Name);
+                Assert.Equal(expectedColumn.GetType(), actualColumn.GetType());
+
+                for (int r = 0; r < expected.Rows.Count; r++)
+                {
+                    var expectedValue = expectedColumn[r];
+                    var actualValue = actualColumn[r];
+
+                    if (expectedValue == null || actualValue == null)
+                    {
+                        Assert.Null(expectedValue);
+                        Assert.Null(actualValue);
+                    }
+                    else
+                    {
+                        Assert.Equal(expectedValue, actualValue);
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Merge.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Merge.cs
index b507e846e8..dbd689ff3d 100644
--- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Merge.cs
+++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Merge.cs
@@ -714,6 +714,256 @@ public void TestMerge_Issue5778()
             MatchRowsOnMergedDataFrame(merge, left, right, 1, 1, 0);
         }
 
+        public static IEnumerable<object[]> GenerateData_TestMerge_EmptyDataFrames()
+        {
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3")
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("Index", new[] { 0, 1, 2 }),
+                    new Int32DataFrameColumn("R1", new[] { 0, 1, 1 }),
+                    new Int32DataFrameColumn("R2", new[] { 1, 1, 2 }),
+                    new StringDataFrameColumn("R3", new[] { "Z", "Y", "B" })
+                ),
+                new string[]{ "L1" },
+                new string[]{ "R1" },
+                JoinAlgorithm.Left,
+                new DataFrame(
+                    new Int32DataFrameColumn("Index_left"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3"),
+                    new Int32DataFrameColumn("Index_right"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3")
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+                new string[]{ "L1" },
+                new string[]{ "R1" },
+                JoinAlgorithm.Inner,
+                new DataFrame(
+                    new Int32DataFrameColumn("Index_left"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3"),
+                    new Int32DataFrameColumn("Index_right"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3")
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+                new string[]{ "L1" },
+                new string[]{ "R1" },
+                JoinAlgorithm.Left,
+                new DataFrame(
+                    new Int32DataFrameColumn("Index_left"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3"),
+                    new Int32DataFrameColumn("Index_right"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3")
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+                new string[]{ "L1" },
+                new string[]{ "R1" },
+                JoinAlgorithm.Right,
+                new DataFrame(
+                    new Int32DataFrameColumn("Index_left"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3"),
+                    new Int32DataFrameColumn("Index_right"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3")
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+                new string[]{ "L1" },
+                new string[]{ "R1" },
+                JoinAlgorithm.FullOuter,
+                new DataFrame(
+                    new Int32DataFrameColumn("Index_left"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3"),
+                    new Int32DataFrameColumn("Index_right"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+            };
+        }
+
+        [Theory]
+        [MemberData(nameof(GenerateData_TestMerge_EmptyDataFrames))]
+        public void TestMerge_EmptyDataFrames(DataFrame left, DataFrame right, string[] leftColumns, string[] rightColumns, JoinAlgorithm joinAlgorithm, DataFrame expectedOutput)
+        {
+            DataFrame actualOutput = left.Merge(right, leftColumns, rightColumns, joinAlgorithm: joinAlgorithm);
+
+            DataFrameAssert.Equal(expectedOutput, actualOutput);
+        }
+
+        public static IEnumerable<object[]> GenerateData_TestMerge_OuterJoinsPreserveUnmatched()
+        {
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("Index", new[] { 0, 1, 2 }),
+                    new Int32DataFrameColumn("L1", new[] { 1, 2, 3 }),
+                    new Int32DataFrameColumn("L2", new[] { 1, 2, 1 }),
+                    new StringDataFrameColumn("L3", new[] { "A", "B", "C" })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("Index", new[] { 0, 1, 2 }),
+                    new Int32DataFrameColumn("R1", new[] { 10, 11, 11 }),
+                    new Int32DataFrameColumn("R2", new[] { 1, 1, 2 }),
+                    new StringDataFrameColumn("R3", new[] { "Z", "Y", "B" })
+                ),
+                new string[]{ "L1" },
+                new string[]{ "R1" },
+                JoinAlgorithm.Left,
+                new DataFrame(
+                    new Int32DataFrameColumn("Index_left", new[] { 0, 1, 2 }),
+                    new Int32DataFrameColumn("L1", new[] { 1, 2, 3 }),
+                    new Int32DataFrameColumn("L2", new[] { 1, 2, 1 }),
+                    new StringDataFrameColumn("L3", new[] { "A", "B", "C" }),
+                    new Int32DataFrameColumn("Index_right", new int?[] { null, null, null }),
+                    new Int32DataFrameColumn("R1", new int?[] { null, null, null }),
+                    new Int32DataFrameColumn("R2", new int?[] { null, null, null }),
+                    new StringDataFrameColumn("R3", new string[] { null, null, null })
+                ),
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("Index", new[] { 0, 1, 2 }),
+                    new Int32DataFrameColumn("L1", new[] { 1, 2, 3 }),
+                    new Int32DataFrameColumn("L2", new[] { 1, 2, 1 }),
+                    new StringDataFrameColumn("L3", new[] { "A", "B", "C" })
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("R1"),
+                    new Int32DataFrameColumn("R2"),
+                    new StringDataFrameColumn("R3")
+                ),
+                new string[]{ "L1" },
+                new string[]{ "R1" },
+                JoinAlgorithm.Left,
+                new DataFrame(
+                    new Int32DataFrameColumn("Index_left", new[] { 0, 1, 2 }),
+                    new Int32DataFrameColumn("L1", new[] { 1, 2, 3 }),
+                    new Int32DataFrameColumn("L2", new[] { 1, 2, 1 }),
+                    new StringDataFrameColumn("L3", new[] { "A", "B", "C" }),
+                    new Int32DataFrameColumn("Index_right", new int?[] { null, null, null }),
+                    new Int32DataFrameColumn("R1", new int?[] { null, null, null }),
+                    new Int32DataFrameColumn("R2", new int?[] { null, null, null }),
+                    new StringDataFrameColumn("R3", new string[] { null, null, null })
+                ),
+            };
+            yield return new object[]
+            {
+                new DataFrame(
+                    new Int32DataFrameColumn("Index"),
+                    new Int32DataFrameColumn("L1"),
+                    new Int32DataFrameColumn("L2"),
+                    new StringDataFrameColumn("L3")
+                ),
+                new DataFrame(
+                    new Int32DataFrameColumn("Index", new[] { 0, 1, 2 }),
+                    new Int32DataFrameColumn("R1", new[] { 1, 2, 3 }),
+                    new Int32DataFrameColumn("R2", new[] { 1, 2, 1 }),
+                    new StringDataFrameColumn("R3", new[] { "A", "B", "C" })
+                ),
+                new string[]{ "L1" },
+                new string[]{ "R1" },
+                JoinAlgorithm.Right,
+                new DataFrame(
+                    new Int32DataFrameColumn("Index_left", new int?[] { null, null, null }),
+                    new Int32DataFrameColumn("L1", new int?[] { null, null, null }),
+                    new Int32DataFrameColumn("L2", new int?[] { null, null, null }),
+                    new StringDataFrameColumn("L3", new string[] { null, null, null }),
+                    new Int32DataFrameColumn("Index_right", new[] { 0, 1, 2 }),
+                    new Int32DataFrameColumn("R1", new[] { 1, 2, 3 }),
+                    new Int32DataFrameColumn("R2", new[] { 1, 2, 1 }),
+                    new StringDataFrameColumn("R3", new[] { "A", "B", "C" })
+                ),
+            };
+        }
+
+        [Theory]
+        [MemberData(nameof(GenerateData_TestMerge_OuterJoinsPreserveUnmatched))]
+        public void TestMerge_OuterJoinsPreserveUnmatched(DataFrame left, DataFrame right, string[] leftColumns, string[] rightColumns, JoinAlgorithm joinAlgorithm, DataFrame expectedOutput)
+        {
+            DataFrame actualOutput = left.Merge(right, leftColumns, rightColumns, joinAlgorithm: joinAlgorithm);
+
+            DataFrameAssert.Equal(expectedOutput, actualOutput);
+        }
+
         [Fact]
         //Issue 6127
         public void TestMerge_CorrectColumnTypes()

From 17caca40101e0dc8eae0a13ba2cd5181969e9374 Mon Sep 17 00:00:00 2001
From: Joshua Zierhut
Date: Fri, 30 Jan 2026 13:56:24 -0600
Subject: [PATCH 2/2] Improve readability of Clone method

---
 src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
index 062375c77b..7c52032785 100644
--- a/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
+++ b/src/Microsoft.Data.Analysis/PrimitiveColumnContainer.cs
@@ -460,9 +460,11 @@ private List> CloneNullBitMapBuffers()
         public PrimitiveColumnContainer<T> Clone<U>(PrimitiveColumnContainer<U> mapIndices, Type type, bool invertMapIndices = false)
             where U : unmanaged
         {
+            PrimitiveColumnContainer<T> ret = new PrimitiveColumnContainer<T>(mapIndices.Length);
+
             if (Buffers.Count == 0)
             {
-                return new PrimitiveColumnContainer<T>(mapIndices.Length);
+                return ret;
             }
 
             ReadOnlySpan<T> thisSpan = Buffers[0].ReadOnlySpan;
@@ -470,7 +472,6 @@ public PrimitiveColumnContainer<T> Clone<U>(PrimitiveColumnContainer<U> mapIndic
             long minRange = 0;
             long maxRange = DataFrameBuffer<T>.MaxCapacity;
             long maxCapacity = maxRange;
-            PrimitiveColumnContainer<T> ret = new PrimitiveColumnContainer<T>(mapIndices.Length);
             for (int b = 0; b < mapIndices.Buffers.Count; b++)
             {
                 int index = b;