From 8ae0c0473e79b009d4036969393588e664537bcf Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Tue, 16 Dec 2025 20:15:41 -0500 Subject: [PATCH 1/6] fix: map view types to avoid to_substrait ArrowNotImplementedError --- .../python/vortex/arrow/expression.py | 19 +++- vortex-python/test/test_expression.py | 89 +++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 vortex-python/test/test_expression.py diff --git a/vortex-python/python/vortex/arrow/expression.py b/vortex-python/python/vortex/arrow/expression.py index b306acd1874..931b2222049 100644 --- a/vortex-python/python/vortex/arrow/expression.py +++ b/vortex-python/python/vortex/arrow/expression.py @@ -28,9 +28,26 @@ def ensure_vortex_expression(expression: pc.Expression | Expr | None, *, schema: return expression +def _schema_for_substrait(schema: pa.Schema) -> pa.Schema: + # PyArrow's to_substrait doesn't support view types; map to string/binary. + # This is safe because Vortex handles both equivalently. + # If/When PyArrow to_substrait supports view types, revert. + # Workaround for: https://github.com/vortex-data/vortex/issues/5759 + fields = [] + for field in schema: + if field.type == pa.string_view(): + fields.append(field.with_type(pa.string())) + elif field.type == pa.binary_view(): + fields.append(field.with_type(pa.binary())) + else: + fields.append(field) + return pa.schema(fields) + + def arrow_to_vortex(arrow_expression: pc.Expression, schema: pa.Schema) -> Expr: + compat_schema = _schema_for_substrait(schema) substrait_object = ExtendedExpression() # pyright: ignore[reportUnknownVariableType] - substrait_object.ParseFromString(arrow_expression.to_substrait(schema)) # pyright: ignore[reportUnknownMemberType] + substrait_object.ParseFromString(arrow_expression.to_substrait(compat_schema)) # pyright: ignore[reportUnknownMemberType] expressions = extended_expression(substrait_object) # pyright: ignore[reportUnknownArgumentType] diff --git a/vortex-python/test/test_expression.py b/vortex-python/test/test_expression.py new file mode 100644 index 00000000000..3231c428b91 --- /dev/null +++ b/vortex-python/test/test_expression.py @@ -0,0 +1,89 @@ +# Tests the _schema_for_substrait workaround in vortex/arrow/expression.py + +import pyarrow as pa +import pyarrow.compute as pc +import pytest + +from vortex.arrow.expression import arrow_to_vortex, _schema_for_substrait + + +class TestSchemaForSubstrait: + """Verifies mapping: string_view=>string, binary_view=>binary, else unchanged""" + + def test_string_view_mapped_to_string(self): + schema = pa.schema([("col", pa.string_view())]) + result = _schema_for_substrait(schema) + assert result.field("col").type == pa.string() + + def test_binary_view_mapped_to_binary(self): + schema = pa.schema([("col", pa.binary_view())]) + result = _schema_for_substrait(schema) + assert result.field("col").type == pa.binary() + + def test_other_types_unchanged(self): + schema = pa.schema([ + ("int_col", pa.int64()), + ("str_col", pa.string()), + ("bin_col", pa.binary()), + ("float_col", pa.float64()), + ]) + result = _schema_for_substrait(schema) + assert result == schema + + def test_mixed_schema(self): + schema = pa.schema([ + ("sv", pa.string_view()), + ("bv", pa.binary_view()), + ("s", pa.string()), + ("i", pa.int64()), + ]) + result = _schema_for_substrait(schema) + expected = pa.schema([ + ("sv", pa.string()), + ("bv", pa.binary()), + ("s", pa.string()), + ("i", pa.int64()), + ]) + assert result == expected + + +class TestArrowToVortexWithViews: + """Tests comparisons over string_views and binary_views""" + + def test_string_view_equality_expression(self): + schema = pa.schema([("name", pa.string_view())]) + expr = pc.field("name") == "alice" + vortex_expr = arrow_to_vortex(expr, schema) + assert vortex_expr is not None + + def test_binary_view_equality_expression(self): + schema = pa.schema([("data", pa.binary_view())]) + expr = pc.field("data") == b"hello" + vortex_expr = arrow_to_vortex(expr, schema) + assert vortex_expr is not None + + def test_string_view_comparison_expression(self): + schema = pa.schema([("name", pa.string_view())]) + expr = pc.field("name") > "bob" + vortex_expr = arrow_to_vortex(expr, schema) + assert vortex_expr is not None + + def test_mixed_view_and_regular_types(self): + schema = pa.schema([ + ("id", pa.int64()), + ("name", pa.string_view()), + ("data", pa.binary_view()), + ]) + expr = (pc.field("id") > 10) & (pc.field("name") == "test") + vortex_expr = arrow_to_vortex(expr, schema) + assert vortex_expr is not None + + @pytest.mark.parametrize("view_type,value", [ + (pa.string_view(), "test"), + (pa.binary_view(), b"test"), + ]) + def test_view_types_parametrized(self, view_type, value): + schema = pa.schema([("col", view_type)]) + expr = pc.field("col") == value + vortex_expr = arrow_to_vortex(expr, schema) + assert vortex_expr is not None From c6cfed3a4d3a23cbc69e51f13216488fc8411765 Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Tue, 16 Dec 2025 20:21:06 -0500 Subject: [PATCH 2/6] DCO Remediation Commit for Paul Timmins I, Paul Timmins , hereby add my Signed-off-by to this commit: 8ae0c0473e79b009d4036969393588e664537bcf Signed-off-by: Paul Timmins From de97c5b526d9d13de4e0efa60330cbb746312d26 Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Thu, 18 Dec 2025 17:20:51 -0500 Subject: [PATCH 3/6] lint: fix lint error on test_expression (ruff check ---fix'd) --- vortex-python/test/test_expression.py | 68 +++++++++++++++------------ 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/vortex-python/test/test_expression.py b/vortex-python/test/test_expression.py index 3231c428b91..b8e92aaf612 100644 --- a/vortex-python/test/test_expression.py +++ b/vortex-python/test/test_expression.py @@ -3,8 +3,7 @@ import pyarrow as pa import pyarrow.compute as pc import pytest - -from vortex.arrow.expression import arrow_to_vortex, _schema_for_substrait +from vortex.arrow.expression import _schema_for_substrait, arrow_to_vortex class TestSchemaForSubstrait: @@ -21,29 +20,35 @@ def test_binary_view_mapped_to_binary(self): assert result.field("col").type == pa.binary() def test_other_types_unchanged(self): - schema = pa.schema([ - ("int_col", pa.int64()), - ("str_col", pa.string()), - ("bin_col", pa.binary()), - ("float_col", pa.float64()), - ]) + schema = pa.schema( + [ + ("int_col", pa.int64()), + ("str_col", pa.string()), + ("bin_col", pa.binary()), + ("float_col", pa.float64()), + ] + ) result = _schema_for_substrait(schema) assert result == schema def test_mixed_schema(self): - schema = pa.schema([ - ("sv", pa.string_view()), - ("bv", pa.binary_view()), - ("s", pa.string()), - ("i", pa.int64()), - ]) + schema = pa.schema( + [ + ("sv", pa.string_view()), + ("bv", pa.binary_view()), + ("s", pa.string()), + ("i", pa.int64()), + ] + ) result = _schema_for_substrait(schema) - expected = pa.schema([ - ("sv", pa.string()), - ("bv", pa.binary()), - ("s", pa.string()), - ("i", pa.int64()), - ]) + expected = pa.schema( + [ + ("sv", pa.string()), + ("bv", pa.binary()), + ("s", pa.string()), + ("i", pa.int64()), + ] + ) assert result == expected @@ -69,19 +74,24 @@ def test_string_view_comparison_expression(self): assert vortex_expr is not None def test_mixed_view_and_regular_types(self): - schema = pa.schema([ - ("id", pa.int64()), - ("name", pa.string_view()), - ("data", pa.binary_view()), - ]) + schema = pa.schema( + [ + ("id", pa.int64()), + ("name", pa.string_view()), + ("data", pa.binary_view()), + ] + ) expr = (pc.field("id") > 10) & (pc.field("name") == "test") vortex_expr = arrow_to_vortex(expr, schema) assert vortex_expr is not None - @pytest.mark.parametrize("view_type,value", [ - (pa.string_view(), "test"), - (pa.binary_view(), b"test"), - ]) + @pytest.mark.parametrize( + "view_type,value", + [ + (pa.string_view(), "test"), + (pa.binary_view(), b"test"), + ], + ) def test_view_types_parametrized(self, view_type, value): schema = pa.schema([("col", view_type)]) expr = pc.field("col") == value From 521482460cabb9c123d5480fae62e1f9c938ce7a Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Thu, 18 Dec 2025 17:24:54 -0500 Subject: [PATCH 4/6] DCO Remediation Commit for Paul Timmins I, Paul Timmins , hereby add my Signed-off-by to this commit: de97c5b526d9d13de4e0efa60330cbb746312d26 Signed-off-by: Paul Timmins From 36a64add954dc5944c1f2e5276a5428622e650ae Mon Sep 17 00:00:00 2001 From: Paul Timmins Date: Thu, 18 Dec 2025 17:35:52 -0500 Subject: [PATCH 5/6] DCO Remediation Commit for Paul Timmins I, Paul Timmins , hereby add my Signed-off-by to this commit: de97c5b526d9d13de4e0efa60330cbb746312d26 I, Paul Timmins , hereby add my Signed-off-by to this commit: 521482460cabb9c123d5480fae62e1f9c938ce7a Signed-off-by: Paul Timmins From 3107e20814df357f36e08722eb88372eb6934550 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 18 Dec 2025 19:44:14 -0500 Subject: [PATCH 6/6] Update test_expression.py SPDX --- vortex-python/test/test_expression.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vortex-python/test/test_expression.py b/vortex-python/test/test_expression.py index b8e92aaf612..21a6c77eb89 100644 --- a/vortex-python/test/test_expression.py +++ b/vortex-python/test/test_expression.py @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + # Tests the _schema_for_substrait workaround in vortex/arrow/expression.py import pyarrow as pa