From bdb860dda8fc5f5557ac26c76b1178343d646cb5 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Thu, 19 Mar 2026 14:04:04 +0100 Subject: [PATCH 1/3] chore: Add dedicated PrimaryKeyError --- diffly/__init__.py | 3 ++- diffly/_exceptions.py | 6 ++++++ diffly/comparison.py | 15 +++++++++------ tests/test_dataframe_comparison.py | 12 ++++++------ tests/test_fraction_same.py | 4 ++-- tests/test_joined.py | 4 ++-- 6 files changed, 27 insertions(+), 17 deletions(-) create mode 100644 diffly/_exceptions.py diff --git a/diffly/__init__.py b/diffly/__init__.py index 325702d..27b195b 100644 --- a/diffly/__init__.py +++ b/diffly/__init__.py @@ -11,6 +11,7 @@ __version__ = "unknown" +from ._exceptions import PrimaryKeyError from .comparison import compare_frames -__all__ = ["compare_frames"] +__all__ = ["PrimaryKeyError", "compare_frames"] diff --git a/diffly/_exceptions.py b/diffly/_exceptions.py new file mode 100644 index 0000000..81ddff4 --- /dev/null +++ b/diffly/_exceptions.py @@ -0,0 +1,6 @@ +# Copyright (c) QuantCo 2025-2026 +# SPDX-License-Identifier: BSD-3-Clause + + +class PrimaryKeyError(ValueError): + """Raised when there is an issue with the primary key.""" diff --git a/diffly/comparison.py b/diffly/comparison.py index b580a6a..afdf1a1 100644 --- a/diffly/comparison.py +++ b/diffly/comparison.py @@ -14,6 +14,7 @@ from ._cache import cached_method from ._conditions import condition_equal_columns, condition_equal_rows +from ._exceptions import PrimaryKeyError from ._utils import ( ABS_TOL_DEFAULT, ABS_TOL_TEMPORAL_DEFAULT, @@ -131,23 +132,25 @@ def _init_with_validation( ) if primary_key is not None: if len(primary_key) == 0: - raise ValueError("The primary key columns must not be an empty list.") + raise PrimaryKeyError( + "The primary key columns must not be an empty list." + ) if missing := (set(primary_key) - set(left_schema.names())): - raise ValueError( + raise PrimaryKeyError( f"The primary key columns must be present in the left data frame, " f"but the following are missing: {', '.join(missing)}." ) if missing := (set(primary_key) - set(right_schema.names())): - raise ValueError( + raise PrimaryKeyError( f"The primary key columns must be present in the right data frame, " f"but the following are missing: {', '.join(missing)}." ) if not is_primary_key(left, primary_key): - raise ValueError( + raise PrimaryKeyError( "The columns are not a primary key for the left data frame." ) if not is_primary_key(right, primary_key): - raise ValueError( + raise PrimaryKeyError( "The columns are not a primary key for the right data frame." ) @@ -693,7 +696,7 @@ def summary( def _check_primary_key(self) -> list[str]: if self.primary_key is None: - raise ValueError( + raise PrimaryKeyError( "`primary_key` must be provided to join `left` and `right`." ) return self.primary_key diff --git a/tests/test_dataframe_comparison.py b/tests/test_dataframe_comparison.py index becdba9..5518cc0 100644 --- a/tests/test_dataframe_comparison.py +++ b/tests/test_dataframe_comparison.py @@ -6,7 +6,7 @@ import polars as pl import pytest -from diffly import compare_frames +from diffly import PrimaryKeyError, compare_frames @pytest.mark.parametrize("primary_key", ["name", ["name"], ("name")]) @@ -20,7 +20,7 @@ def test_primary_key_sequence_types(primary_key: str | Sequence[str]) -> None: def test_empty_primary_key() -> None: left = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]}) right = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]}) - with pytest.raises(ValueError, match="empty"): + with pytest.raises(PrimaryKeyError, match="empty"): compare_frames(left, right, primary_key=[]) @@ -28,19 +28,19 @@ def test_missing_primary_key() -> None: left = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]}) right = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]}) # Primary key that neither frame has - with pytest.raises(ValueError, match="left.*missing.*co2_emissions"): + with pytest.raises(PrimaryKeyError, match="left.*missing.*co2_emissions"): compare_frames(left, right, primary_key=["co2_emissions"]) # Primary key that the right frame does not have - with pytest.raises(ValueError, match="right.*missing.*value"): + with pytest.raises(PrimaryKeyError, match="right.*missing.*value"): compare_frames(left, right, primary_key=["value"]) def test_pk_violation() -> None: df_valid = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]}) df_duplicates = pl.DataFrame({"id": ["a", "a"], "value": [1, 2]}) - with pytest.raises(ValueError, match="primary key.*left"): + with pytest.raises(PrimaryKeyError, match="primary key.*left"): compare_frames(df_duplicates, df_valid, primary_key=["id"]) - with pytest.raises(ValueError, match="primary key.*right"): + with pytest.raises(PrimaryKeyError, match="primary key.*right"): compare_frames(df_valid, df_duplicates, primary_key=["id"]) diff --git a/tests/test_fraction_same.py b/tests/test_fraction_same.py index 696df0e..e578aa6 100644 --- a/tests/test_fraction_same.py +++ b/tests/test_fraction_same.py @@ -14,7 +14,7 @@ UNSIGNED_INTEGER_DTYPES, ) -from diffly import compare_frames +from diffly import PrimaryKeyError, compare_frames from .utils import FRAME_TYPES, TYPING_FRAME_TYPES @@ -23,7 +23,7 @@ def test_missing_primary_key_fraction_same() -> None: left = pl.DataFrame({"id": ["a", "b", "c"], "value": [1, 2, 3]}) right = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]}) comparison = compare_frames(left, right) - with pytest.raises(ValueError): + with pytest.raises(PrimaryKeyError): _ = comparison.fraction_same("value") diff --git a/tests/test_joined.py b/tests/test_joined.py index d0901b6..76f56e5 100644 --- a/tests/test_joined.py +++ b/tests/test_joined.py @@ -5,7 +5,7 @@ import pytest from polars.testing import assert_frame_equal -from diffly import compare_frames +from diffly import PrimaryKeyError, compare_frames def test_joined() -> None: @@ -31,7 +31,7 @@ def test_joined_missing_primary_key() -> None: left = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]}) right = pl.DataFrame({"id": ["a"], "value": [1]}) comparison = compare_frames(left, right) - with pytest.raises(ValueError): + with pytest.raises(PrimaryKeyError): _ = comparison.joined() From 3603a40efdc12f9275204e34c3083e1ba40ad528 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Thu, 19 Mar 2026 14:14:18 +0100 Subject: [PATCH 2/3] feedback copilot --- diffly/comparison.py | 4 ++-- tests/test_dataframe_comparison.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/diffly/comparison.py b/diffly/comparison.py index afdf1a1..f74d342 100644 --- a/diffly/comparison.py +++ b/diffly/comparison.py @@ -138,12 +138,12 @@ def _init_with_validation( if missing := (set(primary_key) - set(left_schema.names())): raise PrimaryKeyError( f"The primary key columns must be present in the left data frame, " - f"but the following are missing: {', '.join(missing)}." + f"but the following are missing: {', '.join(sorted(missing))}." ) if missing := (set(primary_key) - set(right_schema.names())): raise PrimaryKeyError( f"The primary key columns must be present in the right data frame, " - f"but the following are missing: {', '.join(missing)}." + f"but the following are missing: {', '.join(sorted(missing))}." ) if not is_primary_key(left, primary_key): raise PrimaryKeyError( diff --git a/tests/test_dataframe_comparison.py b/tests/test_dataframe_comparison.py index 5518cc0..bcc99c7 100644 --- a/tests/test_dataframe_comparison.py +++ b/tests/test_dataframe_comparison.py @@ -9,7 +9,7 @@ from diffly import PrimaryKeyError, compare_frames -@pytest.mark.parametrize("primary_key", ["name", ["name"], ("name")]) +@pytest.mark.parametrize("primary_key", ["name", ["name"], ("name",)]) def test_primary_key_sequence_types(primary_key: str | Sequence[str]) -> None: left = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]}) right = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]}) From ee3a10c9214b897733076a3d37b79ab05b417c1b Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Thu, 19 Mar 2026 16:19:49 +0100 Subject: [PATCH 3/3] feedback EK/OB --- diffly/comparison.py | 4 ++-- tests/test_dataframe_comparison.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/diffly/comparison.py b/diffly/comparison.py index f74d342..ca126df 100644 --- a/diffly/comparison.py +++ b/diffly/comparison.py @@ -136,12 +136,12 @@ def _init_with_validation( "The primary key columns must not be an empty list." ) if missing := (set(primary_key) - set(left_schema.names())): - raise PrimaryKeyError( + raise ValueError( f"The primary key columns must be present in the left data frame, " f"but the following are missing: {', '.join(sorted(missing))}." ) if missing := (set(primary_key) - set(right_schema.names())): - raise PrimaryKeyError( + raise ValueError( f"The primary key columns must be present in the right data frame, " f"but the following are missing: {', '.join(sorted(missing))}." ) diff --git a/tests/test_dataframe_comparison.py b/tests/test_dataframe_comparison.py index bcc99c7..5f712e3 100644 --- a/tests/test_dataframe_comparison.py +++ b/tests/test_dataframe_comparison.py @@ -28,10 +28,10 @@ def test_missing_primary_key() -> None: left = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]}) right = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]}) # Primary key that neither frame has - with pytest.raises(PrimaryKeyError, match="left.*missing.*co2_emissions"): + with pytest.raises(ValueError, match="left.*missing.*co2_emissions"): compare_frames(left, right, primary_key=["co2_emissions"]) # Primary key that the right frame does not have - with pytest.raises(PrimaryKeyError, match="right.*missing.*value"): + with pytest.raises(ValueError, match="right.*missing.*value"): compare_frames(left, right, primary_key=["value"])