Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion diffly/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
__version__ = "unknown"


from ._exceptions import PrimaryKeyError
from .comparison import compare_frames

__all__ = ["compare_frames"]
__all__ = ["PrimaryKeyError", "compare_frames"]
6 changes: 6 additions & 0 deletions diffly/_exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) QuantCo 2025-2026
# SPDX-License-Identifier: BSD-3-Clause


class PrimaryKeyError(ValueError):
"""Raised when there is an issue with the primary key."""
15 changes: 9 additions & 6 deletions diffly/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from ._cache import cached_method
from ._conditions import condition_equal_columns, condition_equal_rows
from ._exceptions import PrimaryKeyError
from ._utils import (
ABS_TOL_DEFAULT,
ABS_TOL_TEMPORAL_DEFAULT,
Expand Down Expand Up @@ -131,23 +132,25 @@ def _init_with_validation(
)
if primary_key is not None:
if len(primary_key) == 0:
raise ValueError("The primary key columns must not be an empty list.")
raise PrimaryKeyError(
"The primary key columns must not be an empty list."
)
if missing := (set(primary_key) - set(left_schema.names())):
raise ValueError(
f"The primary key columns must be present in the left data frame, "
f"but the following are missing: {', '.join(missing)}."
f"but the following are missing: {', '.join(sorted(missing))}."
)
if missing := (set(primary_key) - set(right_schema.names())):
raise ValueError(
f"The primary key columns must be present in the right data frame, "
f"but the following are missing: {', '.join(missing)}."
f"but the following are missing: {', '.join(sorted(missing))}."
)
if not is_primary_key(left, primary_key):
raise ValueError(
raise PrimaryKeyError(
"The columns are not a primary key for the left data frame."
)
if not is_primary_key(right, primary_key):
raise ValueError(
raise PrimaryKeyError(
"The columns are not a primary key for the right data frame."
)

Expand Down Expand Up @@ -693,7 +696,7 @@ def summary(

def _check_primary_key(self) -> list[str]:
if self.primary_key is None:
raise ValueError(
raise PrimaryKeyError(
"`primary_key` must be provided to join `left` and `right`."
)
return self.primary_key
Expand Down
10 changes: 5 additions & 5 deletions tests/test_dataframe_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
import polars as pl
import pytest

from diffly import compare_frames
from diffly import PrimaryKeyError, compare_frames


@pytest.mark.parametrize("primary_key", ["name", ["name"], ("name")])
@pytest.mark.parametrize("primary_key", ["name", ["name"], ("name",)])
def test_primary_key_sequence_types(primary_key: str | Sequence[str]) -> None:
left = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]})
right = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]})
Expand All @@ -20,7 +20,7 @@ def test_primary_key_sequence_types(primary_key: str | Sequence[str]) -> None:
def test_empty_primary_key() -> None:
left = pl.DataFrame({"name": ["a", "b"], "value": [1, 2]})
right = pl.DataFrame({"name": ["a", "b"], "other": [3, 4]})
with pytest.raises(ValueError, match="empty"):
with pytest.raises(PrimaryKeyError, match="empty"):
compare_frames(left, right, primary_key=[])


Expand All @@ -38,9 +38,9 @@ def test_missing_primary_key() -> None:
def test_pk_violation() -> None:
df_valid = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]})
df_duplicates = pl.DataFrame({"id": ["a", "a"], "value": [1, 2]})
with pytest.raises(ValueError, match="primary key.*left"):
with pytest.raises(PrimaryKeyError, match="primary key.*left"):
compare_frames(df_duplicates, df_valid, primary_key=["id"])
with pytest.raises(ValueError, match="primary key.*right"):
with pytest.raises(PrimaryKeyError, match="primary key.*right"):
compare_frames(df_valid, df_duplicates, primary_key=["id"])


Expand Down
4 changes: 2 additions & 2 deletions tests/test_fraction_same.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
UNSIGNED_INTEGER_DTYPES,
)

from diffly import compare_frames
from diffly import PrimaryKeyError, compare_frames

from .utils import FRAME_TYPES, TYPING_FRAME_TYPES

Expand All @@ -23,7 +23,7 @@ def test_missing_primary_key_fraction_same() -> None:
left = pl.DataFrame({"id": ["a", "b", "c"], "value": [1, 2, 3]})
right = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]})
comparison = compare_frames(left, right)
with pytest.raises(ValueError):
with pytest.raises(PrimaryKeyError):
_ = comparison.fraction_same("value")


Expand Down
4 changes: 2 additions & 2 deletions tests/test_joined.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
from polars.testing import assert_frame_equal

from diffly import compare_frames
from diffly import PrimaryKeyError, compare_frames


def test_joined() -> None:
Expand All @@ -31,7 +31,7 @@ def test_joined_missing_primary_key() -> None:
left = pl.DataFrame({"id": ["a", "b"], "value": [1, 2]})
right = pl.DataFrame({"id": ["a"], "value": [1]})
comparison = compare_frames(left, right)
with pytest.raises(ValueError):
with pytest.raises(PrimaryKeyError):
_ = comparison.joined()


Expand Down
Loading