Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 54 additions & 19 deletions data_diff/abcs/database_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,23 @@
N = TypeVar("N")


@attrs.frozen(kw_only=True, eq=False, order=False, unsafe_hash=True)
@attrs.frozen(kw_only=True, eq=False, order=False, hash=False)
class Collation:
"""
A pre-parsed or pre-known record about db collation, per column.

The "greater" collation should be used as a target collation for textual PKs
on both sides of the diff — by coverting the "lesser" collation to self.
on both sides of the diff — by converting the "lesser" collation to self.

Snowflake easily absorbs the performance losses, so it has a boost to always
be greater than any other collation in non-Snowflake databases.
Snowflake easily absorbs the performance losses, so it is always the "lesser"
(preferred target) collation, ensuring the non-Snowflake side is "greater".
Other databases need to negotiate which side absorbs the performance impact.
"""

# A boost for special databases that are known to absorb the performance dmaage well.
# A boost for special databases that are known to absorb the performance damage well.
absorbs_damage: bool = False

# Ordinal soring by ASCII/UTF8 (True), or alphabetic as per locale/country/etc (False).
# Ordinal sorting by ASCII/UTF8 (True), or alphabetic as per locale/country/etc (False).
ordinal: bool | None = None

# Lowercase first (aAbBcC or abcABC). Otherwise, uppercase first (AaBbCc or ABCabc).
Expand All @@ -49,19 +49,50 @@ class Collation:
# Purely informational, for debugging:
_source: None | str | Collection[str] = None

def _comparison_key(self) -> tuple:
"""Key for equality and hashing — keeps __eq__/__hash__ consistent."""
if self.ordinal:
# Ordinal sorting is by code point; sensitivity flags are irrelevant.
return (self.absorbs_damage, True, self.language)
return (
self.absorbs_damage,
self.ordinal, # None vs False are semantically distinct
self.language,
self.country,
self.case_sensitive,
self.accent_sensitive,
self.lower_first,
)

def _ordering_key(self) -> tuple:
"""Key for deterministic total ordering. Only use via __gt__, not as a standalone sort key.

Unlike _comparison_key, this does not collapse fields for ordinals, so two
ordinals that are equal by __eq__ may have different ordering keys. The __gt__
method guards against this by checking equality first.
"""

# (0,) for None sorts before (1, value) for any real value.
def _wrap(v: object) -> tuple:
return (0,) if v is None else (1, v)

return (
self.absorbs_damage,
_wrap(self.ordinal),
_wrap(self.language),
_wrap(self.country),
_wrap(self.case_sensitive),
_wrap(self.accent_sensitive),
_wrap(self.lower_first),
)

def __eq__(self, other: object) -> bool:
if not isinstance(other, Collation):
return NotImplemented
if self.ordinal and other.ordinal:
# TODO: does it depend on language? what does Albanic_BIN mean in MS SQL?
return True
return (
self.language == other.language
and (self.country is None or other.country is None or self.country == other.country)
and self.case_sensitive == other.case_sensitive
and self.accent_sensitive == other.accent_sensitive
and self.lower_first == other.lower_first
)
return self._comparison_key() == other._comparison_key()

def __hash__(self) -> int:
return hash(self._comparison_key())

def __ne__(self, other: object) -> bool:
if not isinstance(other, Collation):
Expand All @@ -73,16 +104,20 @@ def __gt__(self, other: object) -> bool:
return NotImplemented
if self == other:
return False
# absorbs_damage=True means this db absorbs conversion cost — it should be the target
# (i.e. "lesser"), so the non-absorbing side is "greater".
if self.absorbs_damage and not other.absorbs_damage:
return False
if other.absorbs_damage and not self.absorbs_damage:
return True # this one is preferred if it cannot absorb damage as its counterpart can
return True
if self.ordinal and not other.ordinal:
return True
if other.ordinal and not self.ordinal:
return False
# TODO: try to align the languages & countries?
return False
# Deterministic tuple ordering; by this point absorbs_damage and ordinal
# are resolved, so language, country, and sensitivity flags decide.
# None sorts distinctly from "" / False via (0,) vs (1, value) wrapping.
return self._ordering_key() > other._ordering_key()

def __ge__(self, other: object) -> bool:
if not isinstance(other, Collation):
Expand Down
Loading
Loading