From 9da6796883b2684df5ab6c33cd5defa66fd41239 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 09:57:01 +0100 Subject: [PATCH 01/20] refactor: motives are now instances of own classes --- src/ms_blocking/ms_blocking.py | 240 ++++++++++++++++++++++-- src/ms_blocking/utils.py | 326 +++++++++++---------------------- 2 files changed, 332 insertions(+), 234 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index fffbcc8..ac33759 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -1,5 +1,7 @@ from ms_blocking.utils import * # noqa: F403 +import networkx as nx + class BlockerNode: """Abstract class from which derive all classes in the module""" @@ -76,7 +78,6 @@ def __init__(self, left, right): def __repr__(self): return f"OrNode{{{self.left}, {self.right}}}" - def block(self, df, motives=False): # Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations coords_left = self.left.block(df, motives=motives) @@ -185,9 +186,7 @@ def block(self, data, motives=False): } if motives: - explanations = { - f"Same '{column_name}'" for column_name in self.blocking_columns - } + explanations = [EquivalenceMotive(self.blocking_columns)] return add_motives_to_coords(coords, explanations) else: return set(coords) # set is unnnecessary @@ -276,10 +275,9 @@ def block(self, data, motives=False): coords = block_overlap(groups=groups, overlap=self.overlap) if motives: - explanations = { - f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'" - for column_name in self.blocking_columns - } + explanations = [ + OverlapMotive(self.blocking_columns, self.overlap, self.word_level) + ] return add_motives_to_coords(coords, explanations) else: return set(coords) @@ -287,7 +285,7 @@ def block(self, data, motives=False): class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM 
"""Represent the intersection of an AttributeEquivalenceBlocker and an OverlapBlocker. - Designed for performance and RAM efficiency. + Used for performance and RAM efficiency. """ def __init__( @@ -426,17 +424,229 @@ def block(self, data, motives=False): coords = coords_equivalence.intersection(coords_overlap) if motives: - explanations = { - f"Same '{column_name}'" for column_name in self.equivalence_columns - } | { - f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'" - for column_name in self.overlap_columns - } + explanations = [ + EquivalenceMotive(self.equivalence_columns), + OverlapMotive(self.overlap_columns, self.overlap, self.word_level), + ] + return add_motives_to_coords(coords, explanations) else: return set(coords) +def add_blocks_to_dataset( + data: pd.DataFrame, + coords: Coords, + sort: bool = True, + keep_ungrouped_rows: bool = False, + merge_blocks: bool = True, + motives: bool = False, + show_as_pairs: bool = False, + output_columns: Columns = None, +) -> pd.DataFrame: + """Returns the intersection of an array of links + + Takes two lists of paired elements, with or without motives, returns their intersection + + Parameters + ---------- + data : DataFrame + DataFrame for blocking + coords : Array + Blocked coordinates + sort : bool + Whether to sort the result by block, thereby regrouping rows of the same block + keep_ungrouped_rows : bool + Whether to display rows that do not belong to any block + merge_blocks : bool + Whether to merge transitively merge blocks + motives : bool + Whether to display the reason behind each block + show_as_pairs : bool + Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame + output_columns : list + Columns to show. 
Useful in combination with show_as_pairs as column names are altered + + Returns + ------- + DataFrame + Blocked DataFrame + + Examples + -------- + >>> add_blocks_to_dataset(data=pd.DataFrame( + [ + [0, 'first', 4], + [1, 'second', 6], + [2, 'first', 2], + [3, 'third', 5] + ], + columns=['id', 'rank', 'score']), + coords=np.array([{0, 2}]), + show_as_pairs=True, + output_columns=['id', 'rank']) + id_l rank_l id_r rank_r block + 0 0 first 2 first 0 + """ + + if show_as_pairs and keep_ungrouped_rows: + raise ValueError("Cannot both return pairs and keep ungrouped rows") + + if motives: + if type(coords) is not dict: + raise TypeError("Cannot specify motives=True without passing motives") + + # Ensure the index is a unique identifier + if not data.index.is_unique: + raise ValueError("DataFrame index must be unique to be used as an identifier.") + + if "_motive" in data.columns: + if motives: + raise ValueError( + "Please rename existing '_motive' column OR do not pass 'motives=True'" + ) + + if "_block" in data.columns: + raise ValueError("Please rename existing '_block' column") + + if output_columns is None: + output_columns = data.columns + data = data[output_columns].copy() + + if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph + if show_as_pairs: + columns = [col + "_l" for col in data.columns] + [ + col + "_r" for col in data.columns + ] + output_data = pd.DataFrame(columns=columns) + else: + output_data = pd.DataFrame(columns=data.columns) + else: + output_data = data + # Map coords to connected component labels + if merge_blocks: # We solve the connected components problem + cc_labels = solve_connected_components_from_coords(coords) + # Match original index to new block ID + matcher = { + idx: label + for idx, label in enumerate(cc_labels) + if label != -1 and idx in data.index + } + else: # We solve the cliques problem + g = nx.Graph() + # noinspection PyTypeChecker + g.add_edges_from(coords) + complete_subgraphs = list(nx.find_cliques(g)) + 
complete_subgraphs = sorted(complete_subgraphs) + # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))} + matcher = dict() + for i, clique in enumerate(complete_subgraphs): + for node_idx in clique: + if node_idx in matcher.keys(): + matcher[node_idx].append(i) + else: + matcher[node_idx] = [i] + + if show_as_pairs: + output_data = pd.DataFrame() + for pair in coords: + left_row = data.loc[[tuple(pair)[0]]].copy() + current_index = left_row.index + right_row = data.loc[[tuple(pair)[1]]].copy() + left_row.columns = [col + "_l" for col in left_row.columns] + right_row.columns = [col + "_r" for col in right_row.columns] + current_row = pd.concat( + [left_row.reset_index(drop=True), right_row.reset_index(drop=True)], + axis=1, + ) + current_row.index = current_index + if motives: + current_row["_motive"] = str(solve_motives(coords[pair])) + output_data = pd.concat([output_data, current_row]) + + # Assign blocks to rows based on their original index + output_data["_block"] = output_data.index.map(matcher) + if not merge_blocks: + output_data = output_data.explode("_block") + + if keep_ungrouped_rows: + output_data["_block"] = output_data["_block"].fillna(-1) + matcher_ungrouped_rows = {} + block_temp = [] + i = 0 # Track # of blocks processed + for b in output_data["_block"]: + if b == -1: + block_temp.append(i) + i += 1 + elif b not in matcher_ungrouped_rows: + matcher_ungrouped_rows[b] = i + block_temp.append(i) + i += 1 + else: + block_temp.append(matcher_ungrouped_rows[b]) + output_data["_block"] = block_temp + else: + if not show_as_pairs: + output_data = output_data[ + output_data["_block"].duplicated(keep=False) + & output_data["_block"].notna() + ] + + output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"]) + + if sort: + # Sort by block, then by original index + sort_cols = ["_block"] + if output_data.index.name: + output_data = 
output_data.sort_values( + sort_cols + [output_data.index.name] + ) + else: + # If no named index, use the first column of the DataFrame + output_data = output_data.reset_index() + output_data = output_data.sort_values( + sort_cols + [output_data.columns[0]] + ) + output_data = output_data.set_index(output_data.columns[0]) + + if not show_as_pairs and motives: + id_list = flatten(coords.keys()) + motive_matcher = { + row_id: frozenset( + str(solve_motives(coords[pair])) + for pair in coords.keys() + if row_id in pair + ) + for row_id in id_list + } + output_data["_motive"] = output_data.index.map(motive_matcher) + + if "_block" not in output_data.columns: # Empty coords + output_data["_block"] = -1 + + output_data = output_data.reset_index(drop=True) + output_data["_block"] = output_data["_block"].astype(int) + + return output_data + + +def generate_blocking_report( + data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None +) -> pd.DataFrame: + """ + Shorthand for add_blocks_to_dataset with below arguments + """ + return add_blocks_to_dataset( + data, + coords, + sort=True, + merge_blocks=False, + motives=True, + show_as_pairs=True, + output_columns=output_columns, + ) + + def merge_blockers( left: BlockerNode, right: BlockerNode ) -> AttributeEquivalenceBlocker | OverlapBlocker | MixedBlocker | AndNode: diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 837645f..3ee8ead 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -4,17 +4,53 @@ from scipy.sparse import coo_matrix from scipy.sparse.csgraph import connected_components import pandas as pd -import networkx as nx import random from collections import Counter from itertools import combinations from typing import List, Set, Iterable, Dict, Collection, Any + +class EquivalenceMotive: + def __init__(self, blocking_columns): + self.blocking_columns = blocking_columns + + def __eq__(self, other): + return self.blocking_columns == other.blocking_columns + + 
def __repr__(self): + return ", ".join( + [f"Same '{column_name}'" for column_name in self.blocking_columns] + ) + + +class OverlapMotive: + def __init__(self, blocking_columns, overlap=1, word_level=False): + self.blocking_columns = blocking_columns + self.overlap = overlap + self.word_level = word_level + + def __eq__(self, other): + return ( + self.blocking_columns == other.blocking_columns + and self.overlap == other.overlap + and self.word_level == other.word_level + ) + + def __repr__(self): + return ", ".join( + [ + f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'" + for column_name in self.blocking_columns + ] + ) + + Columns = List[str] Pair = Collection[int] +Motive = EquivalenceMotive | OverlapMotive CoordsBasic = Set[Pair] -CoordsMotives = Dict[Pair, Set[str]] +CoordsMotives = Dict[Pair, List[Motive]] Coords = CoordsBasic | CoordsMotives _PUNCT_RE = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\\\]^_`{|}~]') @@ -240,7 +276,7 @@ def merge_blocks_or(coords_1: Coords, coords_2: Coords) -> Coords: if type(coords_1) is type(coords_2) is dict: # We have motives return { pair: ( - (coords_1[pair] | coords_2[pair]) + (coords_1[pair] + coords_2[pair]) if (pair in coords_1 and pair in coords_2) else coords_1[pair] if (pair in coords_1) @@ -278,7 +314,7 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords: """ if type(coords_1) is type(coords_2) is dict: # We have motives return { - pair: (coords_1[pair] | coords_2[pair]) + pair: (coords_1[pair] + coords_2[pair]) for y in (coords_1, coords_2) for pair in y.keys() if (pair in coords_1 and pair in coords_2) @@ -287,219 +323,6 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords: return coords_1.intersection(coords_2) -def add_blocks_to_dataset( - data: pd.DataFrame, - coords: Coords, - sort: bool = True, - keep_ungrouped_rows: bool = False, - merge_blocks: bool = True, - motives: bool = False, - show_as_pairs: bool = False, - output_columns: Columns = 
None, -) -> pd.DataFrame: - """Returns the intersection of an array of links - - Takes two lists of paired elements, with or without motives, returns their intersection - - Parameters - ---------- - data : DataFrame - DataFrame for blocking - coords : Array - Blocked coordinates - sort : bool - Whether to sort the result by block, thereby regrouping rows of the same block - keep_ungrouped_rows : bool - Whether to display rows that do not belong to any block - merge_blocks : bool - Whether to merge transitively merge blocks - motives : bool - Whether to display the reason behind each block - show_as_pairs : bool - Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame - output_columns : list - Columns to show. Useful in combination with show_as_pairs as column names are altered - - Returns - ------- - DataFrame - Blocked DataFrame - - Examples - -------- - >>> add_blocks_to_dataset(data=pd.DataFrame( - [ - [0, 'first', 4], - [1, 'second', 6], - [2, 'first', 2], - [3, 'third', 5] - ], - columns=['id', 'rank', 'score']), - coords=np.array([{0, 2}]), - show_as_pairs=True, - output_columns=['id', 'rank']) - id_l rank_l id_r rank_r block - 0 0 first 2 first 0 - """ - - if show_as_pairs and keep_ungrouped_rows: - raise ValueError("Cannot both return pairs and keep ungrouped rows") - - if motives: - if type(coords) is not dict: - raise TypeError("Cannot specify motives=True without passing motives") - - # Ensure the index is a unique identifier - if not data.index.is_unique: - raise ValueError("DataFrame index must be unique to be used as an identifier.") - - if "_motive" in data.columns: - if motives: - raise ValueError( - "Please rename existing '_motive' column OR do not pass 'motives=True'" - ) - - if "_block" in data.columns: - raise ValueError("Please rename existing '_block' column") - - if output_columns is None: - output_columns = data.columns - data = data[output_columns].copy() - - if len(coords) == 0 and not 
keep_ungrouped_rows: # Empty graph - if show_as_pairs: - columns = [col + "_l" for col in data.columns] + [ - col + "_r" for col in data.columns - ] - output_data = pd.DataFrame(columns=columns) - else: - output_data = pd.DataFrame(columns=data.columns) - else: - output_data = data - # Map coords to connected component labels - if merge_blocks: # We solve the connected components problem - cc_labels = solve_connected_components_from_coords(coords) - # Match original index to new block ID - matcher = { - idx: label - for idx, label in enumerate(cc_labels) - if label != -1 and idx in data.index - } - else: # We solve the cliques problem - g = nx.Graph() - # noinspection PyTypeChecker - g.add_edges_from(coords) - complete_subgraphs = list(nx.find_cliques(g)) - complete_subgraphs = sorted(complete_subgraphs) - # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))} - matcher = dict() - for i, clique in enumerate(complete_subgraphs): - for node_idx in clique: - if node_idx in matcher.keys(): - matcher[node_idx].append(i) - else: - matcher[node_idx] = [i] - - if show_as_pairs: - output_data = pd.DataFrame() - for pair in coords: - left_row = data.loc[[tuple(pair)[0]]].copy() - current_index = left_row.index - right_row = data.loc[[tuple(pair)[1]]].copy() - left_row.columns = [col + "_l" for col in left_row.columns] - right_row.columns = [col + "_r" for col in right_row.columns] - current_row = pd.concat( - [left_row.reset_index(drop=True), right_row.reset_index(drop=True)], - axis=1, - ) - current_row.index = current_index - output_data = pd.concat([output_data, current_row]) - - # Assign blocks to rows based on their original index - output_data["_block"] = output_data.index.map(matcher) - if not merge_blocks: - output_data = output_data.explode("_block") - - if keep_ungrouped_rows: - output_data["_block"] = output_data["_block"].fillna(-1) - matcher_ungrouped_rows = {} - 
block_temp = [] - i = 0 # Track # of blocks processed - for b in output_data["_block"]: - if b == -1: - block_temp.append(i) - i += 1 - elif b not in matcher_ungrouped_rows: - matcher_ungrouped_rows[b] = i - block_temp.append(i) - i += 1 - else: - block_temp.append(matcher_ungrouped_rows[b]) - output_data["_block"] = block_temp - else: - if not show_as_pairs: - output_data = output_data[ - output_data["_block"].duplicated(keep=False) - & output_data["_block"].notna() - ] - - output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"]) - - if sort: - # Sort by block, then by original index - sort_cols = ["_block"] - if output_data.index.name: - output_data = output_data.sort_values( - sort_cols + [output_data.index.name] - ) - else: - # If no named index, use the first column of the DataFrame - output_data = output_data.reset_index() - output_data = output_data.sort_values( - sort_cols + [output_data.columns[0]] - ) - output_data = output_data.set_index(output_data.columns[0]) - - if motives: - output_data["_motive"] = "" - id_list = flatten(coords.keys()) - motive_matcher = { - row_id: frozenset( - reason - for pair in coords.keys() - if row_id in pair - for reason in coords[pair] - ) - for row_id in id_list - } - output_data["_motive"] = output_data.index.map(motive_matcher) - - if "_block" not in output_data.columns: # Empty coords - output_data["_block"] = -1 - - output_data = output_data.reset_index(drop=True) - output_data["_block"] = output_data["_block"].astype(int) - - return output_data - - -def generate_blocking_report( - data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None -) -> pd.DataFrame: - """ - Shorthand for add_blocks_to_dataset with below arguments - """ - return add_blocks_to_dataset( - data, - coords, - sort=True, - merge_blocks=False, - motives=True, - show_as_pairs=True, - output_columns=output_columns, - ) - - def parse_list(s: str | List, word_level: bool = False) -> List[str]: """Turns a stringified list 
into an actual python list, taking extra inner quotes into account @@ -682,7 +505,9 @@ def block_overlap(groups: Iterable, overlap: int = 1) -> Coords: return coords -def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotives: +def add_motives_to_coords( + coords: Coords, explanations: List[Motive] +) -> Dict[Pair, List[Motive]]: """Block a DataFrame based on overlap accross columns Parameters @@ -690,7 +515,7 @@ def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotiv coords : Coords Coords obtained by blocking - explanations : Set[str] + explanations : Set[EquivalenceMotive|OverlapMotive] Set of explanations Returns @@ -718,3 +543,66 @@ def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotiv } """ return {pair: explanations for pair in coords} + + +def solve_motives(motives: List[Motive]) -> List[Motive]: + """Remove duplicated and redundant motives from a list of motives + + Redundant motives refer to OverlapMotives on the same column(s) but with different overlap or word-level condition + + Parameters + ---------- + motives : List[Motive] + Coords obtained by blocking + + Returns + ------- + List[Motive] + Pairs obtained by blocking + + Examples + -------- + >>> solve_motives([OverlapMotive(['websites'], 1), OverlapMotive(['websites'], 2), OverlapMotive(['websites'], 2, word_level=False)]) + OverlapMotive(['websites'], 2, word_level=False) + """ + if not motives: + raise ValueError("Motives must not be empty") + + final_motives = [motives[0]] + for motive in motives[1:]: + if motive not in final_motives: + final_motives.append(motive) + if type(motive) is OverlapMotive: + # Look for redundant motives + for motive_to_compare in final_motives[:-1]: + if ( + type(motive_to_compare) is OverlapMotive + ): # With EquivalenceMotive, equality check suffices + if ( + motive.blocking_columns + == motive_to_compare.blocking_columns + ): + if motive.word_level == motive_to_compare.word_level: + # 
Replace Blocker with the one with bigger overlap + if motive.overlap < motive_to_compare.overlap: + final_motives.remove(motive) + final_motives.append(motive_to_compare) + elif motive.overlap > motive.overlap: + final_motives.remove(motive_to_compare) + final_motives.append(motive) + elif motive.overlap == motive_to_compare.overlap: + # Replace Blocker with the one with stricter word/element-level condition + if ( + motive.word_level + and not motive_to_compare.word_level + ): + final_motives.remove(motive) + final_motives.append(motive_to_compare) + elif ( + not motive.word_level + and motive_to_compare.word_level + ): + final_motives.remove(motive_to_compare) + final_motives.append(motive) + + return final_motives From 60a3cc9b2b68750c0180cced574864053ed18874 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 09:57:32 +0100 Subject: [PATCH 02/20] docs: update example notebook with new motive system --- docs/example.ipynb | 685 +++++++++++++++++---------------------------- 1 file changed, 252 insertions(+), 433 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 6b82165..7c44012 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -32,8 +32,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.010997600Z", - "start_time": "2026-01-30T14:21:13.420790Z" + "end_time": "2026-02-03T08:45:42.897197100Z", + "start_time": "2026-02-03T08:45:42.069366700Z" } }, "source": [ @@ -60,8 +60,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.049404600Z", - "start_time": "2026-01-30T14:21:14.010997600Z" + "end_time": "2026-02-03T08:45:42.936219800Z", + "start_time": "2026-02-03T08:45:42.901218100Z" } }, "source": [ @@ -282,8 +282,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.190107400Z", - "start_time": "2026-01-30T14:21:14.089762400Z" + "end_time": "2026-02-03T08:45:43.089459800Z", + "start_time": 
"2026-02-03T08:45:42.974568800Z" } }, "source": [ @@ -310,8 +310,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.309413300Z", - "start_time": "2026-01-30T14:21:14.278545600Z" + "end_time": "2026-02-03T08:45:43.241858Z", + "start_time": "2026-02-03T08:45:43.164364500Z" } }, "source": [ @@ -339,8 +339,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.378808Z", - "start_time": "2026-01-30T14:21:14.349508200Z" + "end_time": "2026-02-03T08:45:43.293312300Z", + "start_time": "2026-02-03T08:45:43.279951300Z" } }, "source": [ @@ -369,8 +369,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.558644200Z", - "start_time": "2026-01-30T14:21:14.459573100Z" + "end_time": "2026-02-03T08:45:43.491120800Z", + "start_time": "2026-02-03T08:45:43.387967900Z" } }, "source": [ @@ -409,8 +409,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.635514Z", - "start_time": "2026-01-30T14:21:14.598913Z" + "end_time": "2026-02-03T08:45:43.564017300Z", + "start_time": "2026-02-03T08:45:43.543375900Z" } }, "source": [ @@ -574,8 +574,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.829719100Z", - "start_time": "2026-01-30T14:21:14.676157200Z" + "end_time": "2026-02-03T08:45:43.730577200Z", + "start_time": "2026-02-03T08:45:43.602849600Z" } }, "source": [ @@ -622,8 +622,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.027923700Z", - "start_time": "2026-01-30T14:21:14.926401Z" + "end_time": "2026-02-03T08:45:43.963649700Z", + "start_time": "2026-02-03T08:45:43.857183700Z" } }, "source": [ @@ -759,8 +759,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.403596500Z", - "start_time": "2026-01-30T14:21:15.279120300Z" + "end_time": "2026-02-03T08:45:44.258242200Z", + "start_time": "2026-02-03T08:45:44.158668200Z" 
} }, "source": [ @@ -796,8 +796,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.686136800Z", - "start_time": "2026-01-30T14:21:15.608444400Z" + "end_time": "2026-02-03T08:45:44.439022100Z", + "start_time": "2026-02-03T08:45:44.392038500Z" } }, "source": [ @@ -971,8 +971,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.998425200Z", - "start_time": "2026-01-30T14:21:15.931370100Z" + "end_time": "2026-02-03T08:45:44.704919900Z", + "start_time": "2026-02-03T08:45:44.604905100Z" } }, "source": [ @@ -1075,8 +1075,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:16.305679100Z", - "start_time": "2026-01-30T14:21:16.212470400Z" + "end_time": "2026-02-03T08:45:45.167225900Z", + "start_time": "2026-02-03T08:45:45.142061100Z" } }, "source": [ @@ -1223,8 +1223,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:16.678653800Z", - "start_time": "2026-01-30T14:21:16.558976200Z" + "end_time": "2026-02-03T08:45:45.497760900Z", + "start_time": "2026-02-03T08:45:45.335278600Z" } }, "source": [ @@ -1342,8 +1342,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.354294400Z", - "start_time": "2026-01-30T14:21:17.316050200Z" + "end_time": "2026-02-03T08:45:45.879254300Z", + "start_time": "2026-02-03T08:45:45.779256400Z" } }, "source": [ @@ -1440,8 +1440,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.537043700Z", - "start_time": "2026-01-30T14:21:17.392490700Z" + "end_time": "2026-02-03T08:45:46.232628900Z", + "start_time": "2026-02-03T08:45:46.186246600Z" } }, "source": [ @@ -1464,8 +1464,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.655177300Z", - "start_time": "2026-01-30T14:21:17.573776300Z" + "end_time": "2026-02-03T08:45:46.440480Z", + "start_time": "2026-02-03T08:45:46.391161200Z" } }, 
"source": [ @@ -1589,8 +1589,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.910335600Z", - "start_time": "2026-01-30T14:21:17.821453400Z" + "end_time": "2026-02-03T08:45:46.754986800Z", + "start_time": "2026-02-03T08:45:46.666968100Z" } }, "source": [ @@ -1804,8 +1804,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.279899900Z", - "start_time": "2026-01-30T14:21:18.250988900Z" + "end_time": "2026-02-03T08:45:47.079529400Z", + "start_time": "2026-02-03T08:45:47.029011300Z" } }, "source": [ @@ -1828,8 +1828,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.481263300Z", - "start_time": "2026-01-30T14:21:18.466284300Z" + "end_time": "2026-02-03T08:45:47.289177100Z", + "start_time": "2026-02-03T08:45:47.270625400Z" } }, "source": [ @@ -1849,8 +1849,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.562779600Z", - "start_time": "2026-01-30T14:21:18.520368200Z" + "end_time": "2026-02-03T08:45:47.381218700Z", + "start_time": "2026-02-03T08:45:47.334125300Z" } }, "source": [ @@ -1990,8 +1990,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.843568700Z", - "start_time": "2026-01-30T14:21:18.686911500Z" + "end_time": "2026-02-03T08:45:47.689050500Z", + "start_time": "2026-02-03T08:45:47.511174200Z" } }, "source": [ @@ -2034,8 +2034,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.967168700Z", - "start_time": "2026-01-30T14:21:18.928864500Z" + "end_time": "2026-02-03T08:45:47.818974600Z", + "start_time": "2026-02-03T08:45:47.771680100Z" } }, "source": [ @@ -2213,8 +2213,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:19.276047300Z", - "start_time": "2026-01-30T14:21:19.146886900Z" + "end_time": "2026-02-03T08:45:48.096706900Z", + "start_time": "2026-02-03T08:45:48.012725300Z" } }, 
"source": [ @@ -2443,8 +2443,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:19.820247800Z", - "start_time": "2026-01-30T14:21:19.653280100Z" + "end_time": "2026-02-03T08:45:48.598207900Z", + "start_time": "2026-02-03T08:45:48.541276800Z" } }, "source": [ @@ -2593,8 +2593,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.335572Z", - "start_time": "2026-01-30T14:21:20.302358700Z" + "end_time": "2026-02-03T08:45:49.188205Z", + "start_time": "2026-02-03T08:45:49.122589100Z" } }, "source": [ @@ -2613,12 +2613,12 @@ { "data": { "text/plain": [ - "{frozenset({1, 4}): {\"Same 'City'\"},\n", - " frozenset({8, 11}): {\"Same 'City'\"},\n", - " frozenset({2, 5}): {\"Same 'City'\"},\n", - " frozenset({10, 13}): {\"Same 'City'\"},\n", - " frozenset({3, 8}): {\"Same 'City'\"},\n", - " frozenset({3, 11}): {\"Same 'City'\"}}" + "{frozenset({1, 4}): [Same 'City'],\n", + " frozenset({8, 11}): [Same 'City'],\n", + " frozenset({2, 5}): [Same 'City'],\n", + " frozenset({10, 13}): [Same 'City'],\n", + " frozenset({3, 8}): [Same 'City'],\n", + " frozenset({3, 11}): [Same 'City']}" ] }, "execution_count": 26, @@ -2631,9 +2631,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "Of course, this will induce some overhead." - ] + "source": "This will induce some overhead." }, { "cell_type": "markdown", @@ -2646,8 +2644,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.409405100Z", - "start_time": "2026-01-30T14:21:20.374573700Z" + "end_time": "2026-02-03T08:45:49.300573800Z", + "start_time": "2026-02-03T08:45:49.260624100Z" } }, "source": [ @@ -2668,16 +2666,16 @@ "7 10 Caroline Dufour Lens 45 \n", "8 13 Benoît Benoît Lens 15 \n", "\n", - " websites _block _motive \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 
0 (Same 'City') \n", - "1 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "2 ['somewebsite.com/users/rpz59'] 1 (Same 'City') \n", - "3 [] 1 (Same 'City') \n", - "4 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 [] 2 (Same 'City') \n", - "6 [] 2 (Same 'City') \n", - "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 (Same 'City') \n", - "8 ['lensfans.fr'] 3 (Same 'City') " + " websites _block _motive \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 ([Same 'City']) \n", + "1 ['jacquesdupond.fr'] 0 ([Same 'City']) \n", + "2 ['somewebsite.com/users/rpz59'] 1 ([Same 'City']) \n", + "3 [] 1 ([Same 'City']) \n", + "4 ['roubaixlove.fr'] 2 ([Same 'City']) \n", + "5 [] 2 ([Same 'City']) \n", + "6 [] 2 ([Same 'City']) \n", + "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 ([Same 'City']) \n", + "8 ['lensfans.fr'] 3 ([Same 'City']) " ], "text/html": [ "
\n", @@ -2716,7 +2714,7 @@ " 37\n", " ['somewebsite.com/users/jacquesdupond', 'jacqu...\n", " 0\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", " 1\n", @@ -2726,7 +2724,7 @@ " 37\n", " ['jacquesdupond.fr']\n", " 0\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", " 2\n", @@ -2736,7 +2734,7 @@ " 24\n", " ['somewebsite.com/users/rpz59']\n", " 1\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", " 3\n", @@ -2746,7 +2744,7 @@ " 24\n", " []\n", " 1\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", " 4\n", @@ -2756,7 +2754,7 @@ " 32\n", " ['roubaixlove.fr']\n", " 2\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", " 5\n", @@ -2766,7 +2764,7 @@ " 33\n", " []\n", " 2\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", " 6\n", @@ -2776,7 +2774,7 @@ " 33\n", " []\n", " 2\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", " 7\n", @@ -2786,7 +2784,7 @@ " 45\n", " ['pythonensamusant.fr', 'lensfans.fr']\n", " 3\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", " 8\n", @@ -2796,7 +2794,7 @@ " 15\n", " ['lensfans.fr']\n", " 3\n", - " (Same 'City')\n", + " ([Same 'City'])\n", " \n", " \n", "\n", @@ -2813,23 +2811,19 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "... Though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..." - ] + "source": "... though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..." }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "... Which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:" - ] + "source": "... 
which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output as a list of pairs:" }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.612990700Z", - "start_time": "2026-01-30T14:21:20.483928200Z" + "end_time": "2026-02-03T08:45:49.556914900Z", + "start_time": "2026-02-03T08:45:49.481507100Z" } }, "source": [ @@ -2855,13 +2849,13 @@ "4 [] 3 Paul Delarue \n", "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", "\n", - " City_r Age_r websites_r _block _motive \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "1 Phalempin 24 [] 1 (Same 'City') \n", - "2 Roubaix 33 [] 2 (Same 'City') \n", - "3 Roubaix 33 [] 2 (Same 'City') \n", - "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " + " City_r Age_r websites_r _motive _block \n", + "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] [Same 'City'] 0 \n", + "1 Phalempin 24 [] [Same 'City'] 1 \n", + "2 Roubaix 33 [] [Same 'City'] 2 \n", + "3 Roubaix 33 [] [Same 'City'] 2 \n", + "4 Roubaix 32 ['roubaixlove.fr'] [Same 'City'] 2 \n", + "5 Lens 15 ['lensfans.fr'] [Same 'City'] 3 " ], "text/html": [ "
\n", @@ -2892,8 +2886,8 @@ " City_r\n", " Age_r\n", " websites_r\n", - " _block\n", " _motive\n", + " _block\n", " \n", " \n", " \n", @@ -2909,8 +2903,8 @@ " Villeneuve d'Ascq\n", " 37\n", " ['jacquesdupond.fr']\n", + " [Same 'City']\n", " 0\n", - " (Same 'City')\n", " \n", " \n", " 1\n", @@ -2924,8 +2918,8 @@ " Phalempin\n", " 24\n", " []\n", + " [Same 'City']\n", " 1\n", - " (Same 'City')\n", " \n", " \n", " 2\n", @@ -2939,8 +2933,8 @@ " Roubaix\n", " 33\n", " []\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 3\n", @@ -2954,8 +2948,8 @@ " Roubaix\n", " 33\n", " []\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 4\n", @@ -2969,8 +2963,8 @@ " Roubaix\n", " 32\n", " ['roubaixlove.fr']\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 5\n", @@ -2984,8 +2978,8 @@ " Lens\n", " 15\n", " ['lensfans.fr']\n", + " [Same 'City']\n", " 3\n", - " (Same 'City')\n", " \n", " \n", "\n", @@ -3010,8 +3004,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.944670700Z", - "start_time": "2026-01-30T14:21:20.834495500Z" + "end_time": "2026-02-03T08:45:50.016104Z", + "start_time": "2026-02-03T08:45:49.965660200Z" } }, "source": [ @@ -3023,13 +3017,13 @@ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r _block _motive\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 (Same 'City')\n", - "1 2 Pierre Dusquesnes 5 pierre dusquesnes 1 (Same 'City')\n", - "2 3 Paul Delarue 11 sophie_delarue 2 (Same 'City')\n", - "3 8 Sophie Delarue 11 sophie_delarue 2 (Same 'City')\n", - "4 8 Sophie Delarue 3 Paul Delarue 2 (Same 'City')\n", - "5 10 Caroline Dufour 13 Benoît Benoît 3 (Same 'City')" + " id_l Name_l id_r Name_r _motive _block\n", + "0 1 Jacques Dupond 4 Jacques Dupont [Same 'City'] 0\n", + "1 2 Pierre Dusquesnes 5 pierre dusquesnes [Same 'City'] 1\n", + "2 3 Paul Delarue 11 sophie_delarue [Same 'City'] 2\n", + "3 8 Sophie Delarue 11 sophie_delarue [Same 'City'] 2\n", + "4 8 Sophie 
Delarue 3 Paul Delarue [Same 'City'] 2\n", + "5 10 Caroline Dufour 13 Benoît Benoît [Same 'City'] 3" ], "text/html": [ "
\n", @@ -3054,8 +3048,8 @@ " Name_l\n", " id_r\n", " Name_r\n", - " _block\n", " _motive\n", + " _block\n", " \n", " \n", " \n", @@ -3065,8 +3059,8 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", + " [Same 'City']\n", " 0\n", - " (Same 'City')\n", " \n", " \n", " 1\n", @@ -3074,8 +3068,8 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", + " [Same 'City']\n", " 1\n", - " (Same 'City')\n", " \n", " \n", " 2\n", @@ -3083,8 +3077,8 @@ " Paul Delarue\n", " 11\n", " sophie_delarue\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 3\n", @@ -3092,8 +3086,8 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 4\n", @@ -3101,8 +3095,8 @@ " Sophie Delarue\n", " 3\n", " Paul Delarue\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 5\n", @@ -3110,8 +3104,8 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", + " [Same 'City']\n", " 3\n", - " (Same 'City')\n", " \n", " \n", "\n", @@ -3132,188 +3126,13 @@ "Motives are dynamic:" ] }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:21.591044600Z", - "start_time": "2026-01-30T14:21:21.517777200Z" - } - }, - "source": [ - "msb.generate_blocking_report(df, links)" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " id_l Name_l City_l Age_l \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 2 Pierre Dusquesnes Phalempin 24 \n", - "2 3 Paul Delarue Roubaix 32 \n", - "3 8 Sophie Delarue Roubaix 33 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 10 Caroline Dufour Lens 45 \n", - "\n", - " websites_l id_r Name_r \\\n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 
4 Jacques Dupont \n", - "1 ['somewebsite.com/users/rpz59'] 5 pierre dusquesnes \n", - "2 ['roubaixlove.fr'] 11 sophie_delarue \n", - "3 [] 11 sophie_delarue \n", - "4 [] 3 Paul Delarue \n", - "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", - "\n", - " City_r Age_r websites_r _block _motive \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "1 Phalempin 24 [] 1 (Same 'City') \n", - "2 Roubaix 33 [] 2 (Same 'City') \n", - "3 Roubaix 33 [] 2 (Same 'City') \n", - "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lCity_lAge_lwebsites_lid_rName_rCity_rAge_rwebsites_r_block_motive
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...4Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0(Same 'City')
12Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']5pierre dusquesnesPhalempin24[]1(Same 'City')
23Paul DelarueRoubaix32['roubaixlove.fr']11sophie_delarueRoubaix33[]2(Same 'City')
38Sophie DelarueRoubaix33[]11sophie_delarueRoubaix33[]2(Same 'City')
48Sophie DelarueRoubaix33[]3Paul DelarueRoubaix32['roubaixlove.fr']2(Same 'City')
510Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']13Benoît BenoîtLens15['lensfans.fr']3(Same 'City')
\n", - "
" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 30 - }, { "cell_type": "code", "metadata": { "scrolled": true, "ExecuteTime": { - "end_time": "2026-01-30T14:21:21.867809800Z", - "start_time": "2026-01-30T14:21:21.674986800Z" + "end_time": "2026-02-03T08:45:50.383366500Z", + "start_time": "2026-02-03T08:45:50.220420100Z" } }, "source": [ @@ -3344,35 +3163,35 @@ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r _block \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", - "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", - "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", - "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", - "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", - "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", - "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", - "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", - "9 8 Sophie Delarue 11 sophie_delarue 3 \n", - "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", - "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", - "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", + " id_l Name_l id_r Name_r \\\n", + "0 1 Jacques Dupond 4 Jacques Dupont \n", + "1 1 Jacques Dupond 6 Jean-Michel Python \n", + "2 1 Jacques Dupond 10 Caroline Dufour \n", + "3 1 Jacques Dupond 4 Jacques Dupont \n", + "4 1 Jacques Dupond 6 Jean-Michel Python \n", + "5 1 Jacques Dupond 10 Caroline Dufour \n", + "6 10 Caroline Dufour 6 Jean-Michel Python \n", + "7 10 Caroline Dufour 13 Benoît Benoît \n", + "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n", + "9 8 Sophie Delarue 11 sophie_delarue \n", + "10 10 Caroline Dufour 6 Jean-Michel Python \n", + "11 10 Caroline Dufour 13 Benoît Benoît \n", + "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", - " _motive \n", - "0 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "1 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "2 (>=1 overlap in 'websites', Same 'City', Same ... 
\n", - "3 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "4 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "5 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "6 (>=1 overlap in 'websites') \n", - "7 (>=1 overlap in 'websites') \n", - "8 (Same 'City', Same 'Age') \n", - "9 (Same 'City', Same 'Age') \n", - "10 (>=1 overlap in 'websites') \n", - "11 (>=1 overlap in 'websites') \n", - "12 (>=1 overlap in 'websites') " + " _motive _block \n", + "0 [Same 'Age', Same 'City', >=1 overlap in 'webs... 0 \n", + "1 [>=1 overlap in 'websites'] 0 \n", + "2 [>=1 overlap in 'websites'] 0 \n", + "3 [Same 'Age', Same 'City', >=1 overlap in 'webs... 1 \n", + "4 [>=1 overlap in 'websites'] 1 \n", + "5 [>=1 overlap in 'websites'] 1 \n", + "6 [>=1 overlap in 'websites'] 1 \n", + "7 [>=1 overlap in 'websites'] 1 \n", + "8 [Same 'Age', Same 'City'] 2 \n", + "9 [Same 'Age', Same 'City'] 3 \n", + "10 [>=1 overlap in 'websites'] 4 \n", + "11 [>=1 overlap in 'websites'] 4 \n", + "12 [>=1 overlap in 'websites'] 4 " ], "text/html": [ "
\n", @@ -3397,8 +3216,8 @@ " Name_l\n", " id_r\n", " Name_r\n", - " _block\n", " _motive\n", + " _block\n", " \n", " \n", " \n", @@ -3408,8 +3227,8 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", + " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 1\n", @@ -3417,8 +3236,8 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 2\n", @@ -3426,8 +3245,8 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", + " [>=1 overlap in 'websites']\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 3\n", @@ -3435,8 +3254,8 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", + " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 4\n", @@ -3444,8 +3263,8 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 5\n", @@ -3453,8 +3272,8 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 6\n", @@ -3462,8 +3281,8 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", " 7\n", @@ -3471,8 +3290,8 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", " 8\n", @@ -3480,8 +3299,8 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", + " [Same 'Age', Same 'City']\n", " 2\n", - " (Same 'City', Same 'Age')\n", " \n", " \n", " 9\n", @@ -3489,8 +3308,8 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", + " [Same 'Age', Same 'City']\n", " 
3\n", - " (Same 'City', Same 'Age')\n", " \n", " \n", " 10\n", @@ -3498,8 +3317,8 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", " 11\n", @@ -3507,8 +3326,8 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", " 12\n", @@ -3516,20 +3335,20 @@ " Benoît Benoît\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", "\n", "
" ] }, - "execution_count": 31, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 31 + "execution_count": 30 }, { "cell_type": "markdown", @@ -3545,47 +3364,47 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:22.186415700Z", - "start_time": "2026-01-30T14:21:22.127304600Z" + "end_time": "2026-02-03T08:45:50.717126Z", + "start_time": "2026-02-03T08:45:50.677002100Z" } }, "source": [ - "report[\"score\"] = msb.scoring(report)\n", - "report.sort_values(\"score\", ascending=False)" + "report[\"_score\"] = msb.scoring(report)\n", + "report.sort_values(\"_score\", ascending=False)" ], "outputs": [ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r _block \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", - "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", - "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", - "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", - "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", - "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", - "9 8 Sophie Delarue 11 sophie_delarue 3 \n", - "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", - "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", - "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", - "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", - "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", + " id_l Name_l id_r Name_r \\\n", + "0 1 Jacques Dupond 4 Jacques Dupont \n", + "3 1 Jacques Dupond 4 Jacques Dupont \n", + "1 1 Jacques Dupond 6 Jean-Michel Python \n", + "2 1 Jacques Dupond 10 Caroline Dufour \n", + "4 1 Jacques Dupond 6 Jean-Michel Python \n", + "5 1 Jacques Dupond 10 Caroline Dufour \n", + "6 10 Caroline Dufour 6 Jean-Michel Python \n", + "7 10 Caroline Dufour 13 Benoît Benoît \n", + "10 10 Caroline Dufour 6 Jean-Michel Python \n", + "12 13 Benoît Benoît 6 Jean-Michel Python \n", + "11 10 Caroline Dufour 13 Benoît Benoît \n", + "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n", 
+ "9 8 Sophie Delarue 11 sophie_delarue \n", "\n", - " _motive score \n", - "0 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "1 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "2 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "3 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "4 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "5 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "8 (Same 'City', Same 'Age') 2 \n", - "9 (Same 'City', Same 'Age') 2 \n", - "6 (>=1 overlap in 'websites') 1 \n", - "7 (>=1 overlap in 'websites') 1 \n", - "10 (>=1 overlap in 'websites') 1 \n", - "11 (>=1 overlap in 'websites') 1 \n", - "12 (>=1 overlap in 'websites') 1 " + " _motive _block _score \n", + "0 [Same 'Age', Same 'City', >=1 overlap in 'webs... 0 52 \n", + "3 [Same 'Age', Same 'City', >=1 overlap in 'webs... 1 52 \n", + "1 [>=1 overlap in 'websites'] 0 27 \n", + "2 [>=1 overlap in 'websites'] 0 27 \n", + "4 [>=1 overlap in 'websites'] 1 27 \n", + "5 [>=1 overlap in 'websites'] 1 27 \n", + "6 [>=1 overlap in 'websites'] 1 27 \n", + "7 [>=1 overlap in 'websites'] 1 27 \n", + "10 [>=1 overlap in 'websites'] 4 27 \n", + "12 [>=1 overlap in 'websites'] 4 27 \n", + "11 [>=1 overlap in 'websites'] 4 27 \n", + "8 [Same 'Age', Same 'City'] 2 25 \n", + "9 [Same 'Age', Same 'City'] 3 25 " ], "text/html": [ "
\n", @@ -3610,9 +3429,9 @@ " Name_l\n", " id_r\n", " Name_r\n", - " _block\n", " _motive\n", - " score\n", + " _block\n", + " _score\n", " \n", " \n", " \n", @@ -3622,9 +3441,19 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", + " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", + " 52\n", + " \n", + " \n", + " 3\n", + " 1\n", + " Jacques Dupond\n", + " 4\n", + " Jacques Dupont\n", + " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " 1\n", + " 52\n", " \n", " \n", " 1\n", @@ -3632,9 +3461,9 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", + " 27\n", " \n", " \n", " 2\n", @@ -3642,19 +3471,9 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", + " [>=1 overlap in 'websites']\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", - " \n", - " \n", - " 3\n", - " 1\n", - " Jacques Dupond\n", - " 4\n", - " Jacques Dupont\n", - " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", + " 27\n", " \n", " \n", " 4\n", @@ -3662,9 +3481,9 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", + " 27\n", " \n", " \n", " 5\n", @@ -3672,29 +3491,9 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", - " \n", - " \n", - " 8\n", - " 2\n", - " Pierre Dusquesnes\n", - " 5\n", - " pierre dusquesnes\n", - " 2\n", - " (Same 'City', Same 'Age')\n", - " 2\n", - " \n", - " \n", - " 9\n", - " 8\n", - " Sophie Delarue\n", - " 11\n", - " sophie_delarue\n", - " 3\n", - " (Same 'City', Same 'Age')\n", - " 2\n", + " 27\n", " \n", " \n", " 6\n", @@ -3702,9 +3501,9 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel 
Python\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites')\n", - " 1\n", + " 27\n", " \n", " \n", " 7\n", @@ -3712,9 +3511,9 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites')\n", - " 1\n", + " 27\n", " \n", " \n", " 10\n", @@ -3722,41 +3521,61 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", - " 1\n", + " 27\n", " \n", " \n", - " 11\n", - " 10\n", - " Caroline Dufour\n", + " 12\n", " 13\n", " Benoît Benoît\n", + " 6\n", + " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", - " 1\n", + " 27\n", " \n", " \n", - " 12\n", + " 11\n", + " 10\n", + " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " 6\n", - " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", - " 1\n", + " 27\n", + " \n", + " \n", + " 8\n", + " 2\n", + " Pierre Dusquesnes\n", + " 5\n", + " pierre dusquesnes\n", + " [Same 'Age', Same 'City']\n", + " 2\n", + " 25\n", + " \n", + " \n", + " 9\n", + " 8\n", + " Sophie Delarue\n", + " 11\n", + " sophie_delarue\n", + " [Same 'Age', Same 'City']\n", + " 3\n", + " 25\n", " \n", " \n", "\n", "
" ] }, - "execution_count": 32, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 32 + "execution_count": 31 } ], "metadata": { From d08173dc432d05ffec1621497f6861e0aee24d93 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 15:54:20 +0100 Subject: [PATCH 03/20] feat: BREAKING CHANGES various improvements and bugfixes --- src/ms_blocking/ms_blocking.py | 64 +++++++++--- src/ms_blocking/utils.py | 174 ++++++++++++++++----------------- 2 files changed, 134 insertions(+), 104 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index ac33759..9d61832 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -186,7 +186,7 @@ def block(self, data, motives=False): } if motives: - explanations = [EquivalenceMotive(self.blocking_columns)] + explanations = [EquivalenceMotive(col) for col in self.blocking_columns] return add_motives_to_coords(coords, explanations) else: return set(coords) # set is unnnecessary @@ -276,7 +276,8 @@ def block(self, data, motives=False): if motives: explanations = [ - OverlapMotive(self.blocking_columns, self.overlap, self.word_level) + OverlapMotive(col, self.overlap, self.word_level) + for col in self.blocking_columns ] return add_motives_to_coords(coords, explanations) else: @@ -425,8 +426,10 @@ def block(self, data, motives=False): if motives: explanations = [ - EquivalenceMotive(self.equivalence_columns), - OverlapMotive(self.overlap_columns, self.overlap, self.word_level), + EquivalenceMotive(col) for col in self.equivalence_columns + ] + [ + OverlapMotive(col, self.overlap, self.word_level) + for col in self.overlap_columns ] return add_motives_to_coords(coords, explanations) @@ -443,6 +446,7 @@ def add_blocks_to_dataset( motives: bool = False, show_as_pairs: bool = False, output_columns: Columns = None, + score: bool = False, ) -> pd.DataFrame: """Returns the intersection of an array of links @@ -466,6 +470,8 @@ def 
add_blocks_to_dataset( Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame output_columns : list Columns to show. Useful in combination with show_as_pairs as column names are altered + score : bool + Whether to show a score (computed from the number of motives) Returns ------- @@ -494,23 +500,33 @@ def add_blocks_to_dataset( if motives: if type(coords) is not dict: - raise TypeError("Cannot specify motives=True without passing motives") + raise TypeError("Cannot specify 'motives=True' without passing motives") # Ensure the index is a unique identifier if not data.index.is_unique: raise ValueError("DataFrame index must be unique to be used as an identifier.") + if score and not motives: + raise ValueError("Cannot specify 'score=True' without passing motives") + if "_motive" in data.columns: if motives: raise ValueError( "Please rename existing '_motive' column OR do not pass 'motives=True'" ) + if "score" in data.columns: + if score: + raise ValueError( + "Please rename existing '_score' column OR do not pass 'score=True'" + ) + if "_block" in data.columns: raise ValueError("Please rename existing '_block' column") if output_columns is None: output_columns = data.columns + data = data[output_columns].copy() if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph @@ -521,6 +537,13 @@ def add_blocks_to_dataset( output_data = pd.DataFrame(columns=columns) else: output_data = pd.DataFrame(columns=data.columns) + + if motives: + output_data["_motive"] = "" + if score: + output_data["_score"] = 0 + output_data["_block"] = -1 + else: output_data = data # Map coords to connected component labels @@ -561,7 +584,12 @@ def add_blocks_to_dataset( ) current_row.index = current_index if motives: - current_row["_motive"] = str(solve_motives(coords[pair])) + motives_solved = solve_motives(coords[pair]) + current_row["_motive"] = str(list(map(str, motives_solved))) + if score: + current_row["_score"] = len( + motives_solved + ) # 
Score is simply the number of non-redundant motives output_data = pd.concat([output_data, current_row]) # Assign blocks to rows based on their original index @@ -612,17 +640,24 @@ def add_blocks_to_dataset( if not show_as_pairs and motives: id_list = flatten(coords.keys()) motive_matcher = { - row_id: frozenset( - str(solve_motives(coords[pair])) - for pair in coords.keys() - if row_id in pair - ) + row_id: str(list(map(str, solve_motives(coords[pair])))) + for pair in coords.keys() for row_id in id_list + if row_id in pair } output_data["_motive"] = output_data.index.map(motive_matcher) + if score: + output_data["_score"] = 0 + score_matcher = { # Horribly repetitive + row_id: len(solve_motives(coords[pair])) + for pair in coords.keys() + for row_id in id_list + if row_id in pair + } + output_data["_score"] = output_data.index.map(score_matcher) - if "_block" not in output_data.columns: # Empty coords - output_data["_block"] = -1 + # if "_block" not in output_data.columns: # Empty coords + # output_data["_block"] = -1 output_data = output_data.reset_index(drop=True) output_data["_block"] = output_data["_block"].astype(int) @@ -802,3 +837,6 @@ def merge_blockers( ) else: return AndNode(left, right) + + +# TODO: deport logic in a way that enables .progress_apply diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 3ee8ead..aaa5e08 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -12,38 +12,45 @@ class EquivalenceMotive: - def __init__(self, blocking_columns): - self.blocking_columns = blocking_columns + def __init__(self, blocking_column): + if not isinstance(blocking_column, str): + raise TypeError("blocking_column for Motive must be a string") + self.blocking_column = blocking_column def __eq__(self, other): - return self.blocking_columns == other.blocking_columns + return self.blocking_column == other.blocking_column + + def __str__(self): + return f"Same '{self.blocking_column}'" def __repr__(self): - return ", ".join( - 
[f"Same '{column_name}'" for column_name in self.blocking_columns] - ) + return f"EquivalenceMotive(['{self.blocking_column}'])" class OverlapMotive: - def __init__(self, blocking_columns, overlap=1, word_level=False): - self.blocking_columns = blocking_columns + def __init__(self, blocking_column, overlap=1, word_level=False): + if not isinstance(blocking_column, str): + raise TypeError("blocking_column for Motive must be a string") + if not isinstance(overlap, int): + raise TypeError("overlap must be an int") + if not isinstance(word_level, bool): + raise TypeError("word_level must be a boolean") + self.blocking_column = blocking_column self.overlap = overlap self.word_level = word_level def __eq__(self, other): return ( - self.blocking_columns == other.blocking_columns + self.blocking_column == other.blocking_column and self.overlap == other.overlap and self.word_level == other.word_level ) + def __str__(self): + return f">={self.overlap}{' word-level' if self.word_level else ''} overlap in '{self.blocking_column}'" + def __repr__(self): - return ", ".join( - [ - f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'" - for column_name in self.blocking_columns - ] - ) + return f"OverlapMotive(['{self.blocking_column}'], {self.overlap}{', word_level=True' if self.word_level else ''})" Columns = List[str] @@ -276,7 +283,7 @@ def merge_blocks_or(coords_1: Coords, coords_2: Coords) -> Coords: if type(coords_1) is type(coords_2) is dict: # We have motives return { pair: ( - (coords_1[pair] + coords_2[pair]) + coords_1[pair] + coords_2[pair] if (pair in coords_1 and pair in coords_2) else coords_1[pair] if (pair in coords_1) @@ -287,6 +294,7 @@ def merge_blocks_or(coords_1: Coords, coords_2: Coords) -> Coords: } else: return coords_1.union(coords_2) + # TODO: check for merging one with motive and one w/o def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords: @@ -314,7 +322,7 @@ def merge_blocks_and(coords_1: Coords, 
coords_2: Coords) -> Coords: """ if type(coords_1) is type(coords_2) is dict: # We have motives return { - pair: (coords_1[pair] + coords_2[pair]) + pair: coords_1[pair] + coords_2[pair] for y in (coords_1, coords_2) for pair in y.keys() if (pair in coords_1 and pair in coords_2) @@ -376,40 +384,6 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: return [s for s in cleaned_items if len(s) > 0] -def scoring(data: pd.DataFrame, motives_column: str = "_motive") -> pd.Series: - """Add a score to a blocked DataFrame based on the number of motives - - Parameters - ---------- - data : DataFrame - DataFrame with motives - - motives_column : str - Name of the column containing the motives - - Returns - ------- - Series[int] - A column of scores - """ - - # Check that we do have motives - if motives_column not in data.columns: - if motives_column == "_motive": - raise ValueError("No motives in DataFrame") - else: - raise ValueError( - f'Specified motives column "{motives_column}" does not exist' - ) - - if "score" in data.columns: - print("Renaming 'score' column to 'score_old'") - data = data.rename(columns={"score": "score_old"}) - - scores = data[motives_column].apply(len) - return scores - - def must_not_be_different_apply( # WIP temp_data: pd.DataFrame, blocking_columns: List[str], @@ -558,51 +532,69 @@ def solve_motives(motives: List[Motive]) -> List[Motive]: Returns ------- List[Motive] - Pairs obtained by blocking + A list of Motives whose length should be smaller or equal to the original list of motives Examples -------- >>> solve_motives([OverlapMotive(['websites'], 1), OverlapMotive(['websites'], 2), OverlapMotive(['websites'], 2, word_level=False)]) - OverlapMotive(['websites'], 2, word_level=False) + [OverlapMotive(['websites'], 2, word_level=False)] """ if not motives: raise ValueError("Motives must not be empty") - final_motives = [motives[0]] - for motive in motives[1:]: - if motive not in final_motives: - final_motives.append(motive) - 
if type(motive) is OverlapMotive: - # Look for redundant motives - for motive_to_compare in final_motives[:-1]: - if ( - type(motive_to_compare) is OverlapMotive - ): # With EquivalenceMotive, equality check suffices - if ( - motive.blocking_columns - == motive_to_compare.blocking_columns - ): - if motive.word_level == motive_to_compare.word_level: - # Replace Blocker with the one with bigger overlap - if motive.overlap < motive_to_compare.overlap: - final_motives.remove(motive) - final_motives.append(motive_to_compare) - elif motive.overlap > motive.overlap: - final_motives.remove(motive_to_compare) - final_motives.append(motive) - elif motive.overlap == motive_to_compare.overlap: - # Replace Blocker with the one with stricter word/element-level condition - if ( - motive.word_level - and not motive_to_compare.word_level - ): - final_motives.remove(motive) - final_motives.append(motive_to_compare) - elif ( - not motive.word_level - and motive_to_compare.word_level - ): - final_motives.remove(motive_to_compare) - final_motives.append(motive) - - return final_motives + # split_motives = [] + # for motive in motives: + # split_motives += split_motive(motive) + + final_motives = [ + motive for motive in motives if type(motive) is EquivalenceMotive + ] # With EquivalenceMotive, equality check suffices + overlap_motives = [motive for motive in motives if type(motive) is OverlapMotive] + overlap_columns = [motive.blocking_column for motive in overlap_motives] + + for column in overlap_columns: + overlap_motives_for_column = [ + motive for motive in overlap_motives if motive.blocking_column == column + ] + + # Select Blocker with stricter word/element-level condition + word_level_motives_for_column = [ + motive for motive in overlap_motives_for_column if motive.word_level + ] + not_word_level_motives_for_column = [ + motive for motive in overlap_motives_for_column if not motive.word_level + ] + + # Find biggest overlap among the non-word_level ones + if 
not_word_level_motives_for_column: + max_overlap_not_word_level_for_column = max(not_word_level_motives_for_column, key=lambda m: m.overlap) + max_overlap_not_word_level_for_column_overlap = max_overlap_not_word_level_for_column.overlap + else: + max_overlap_not_word_level_for_column = [] + max_overlap_not_word_level_for_column_overlap = 0 # Will never be used, left for linter + + # Now find biggest overlap among the word_level ones + if word_level_motives_for_column: + max_overlap_word_level_for_column = max(word_level_motives_for_column, key=lambda m: m.overlap) + max_overlap_word_level_for_column_overlap = max_overlap_word_level_for_column.overlap + if not_word_level_motives_for_column: + # If there is already an OverlapMotive on same column with equal or greater overlap but not word_level, discard it + if max_overlap_word_level_for_column_overlap <= max_overlap_not_word_level_for_column_overlap: + max_overlap_word_level_for_column = [] + else: + max_overlap_word_level_for_column = [] + + if max_overlap_not_word_level_for_column: + max_overlap_not_word_level_for_column = [max_overlap_not_word_level_for_column] + if max_overlap_word_level_for_column: + max_overlap_word_level_for_column = [max_overlap_word_level_for_column] + final_motives += ( + max_overlap_word_level_for_column + max_overlap_not_word_level_for_column + ) + + # Remove duplicates + final_motives_no_duplicates = [] + for motive in final_motives: + if motive not in final_motives_no_duplicates: + final_motives_no_duplicates.append(motive) + return final_motives_no_duplicates From add6ae54fed5e847452fcf55034cfab59f6f8510 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 15:54:54 +0100 Subject: [PATCH 04/20] docs: fix rendering issue in notebook --- docs/example.ipynb | 440 +++++++++++++++++++++++---------------------- 1 file changed, 224 insertions(+), 216 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 7c44012..45699af 100644 --- a/docs/example.ipynb +++ 
b/docs/example.ipynb @@ -32,8 +32,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:42.897197100Z", - "start_time": "2026-02-03T08:45:42.069366700Z" + "end_time": "2026-02-03T14:40:28.508876500Z", + "start_time": "2026-02-03T14:40:27.761433800Z" } }, "source": [ @@ -60,8 +60,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:42.936219800Z", - "start_time": "2026-02-03T08:45:42.901218100Z" + "end_time": "2026-02-03T14:40:28.563486200Z", + "start_time": "2026-02-03T14:40:28.512916Z" } }, "source": [ @@ -282,8 +282,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:43.089459800Z", - "start_time": "2026-02-03T08:45:42.974568800Z" + "end_time": "2026-02-03T14:40:28.741020400Z", + "start_time": "2026-02-03T14:40:28.615799300Z" } }, "source": [ @@ -310,8 +310,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:43.241858Z", - "start_time": "2026-02-03T08:45:43.164364500Z" + "end_time": "2026-02-03T14:40:29.068322900Z", + "start_time": "2026-02-03T14:40:28.915502800Z" } }, "source": [ @@ -339,8 +339,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:43.293312300Z", - "start_time": "2026-02-03T08:45:43.279951300Z" + "end_time": "2026-02-03T14:40:29.328166900Z", + "start_time": "2026-02-03T14:40:29.309785500Z" } }, "source": [ @@ -369,8 +369,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:43.491120800Z", - "start_time": "2026-02-03T08:45:43.387967900Z" + "end_time": "2026-02-03T14:40:29.547537Z", + "start_time": "2026-02-03T14:40:29.397273800Z" } }, "source": [ @@ -409,8 +409,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:43.564017300Z", - "start_time": "2026-02-03T08:45:43.543375900Z" + "end_time": "2026-02-03T14:40:29.599221700Z", + "start_time": "2026-02-03T14:40:29.572788900Z" } }, "source": [ @@ -574,8 
+574,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:43.730577200Z", - "start_time": "2026-02-03T08:45:43.602849600Z" + "end_time": "2026-02-03T14:40:29.841412500Z", + "start_time": "2026-02-03T14:40:29.660471200Z" } }, "source": [ @@ -622,8 +622,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:43.963649700Z", - "start_time": "2026-02-03T08:45:43.857183700Z" + "end_time": "2026-02-03T14:40:30.138487100Z", + "start_time": "2026-02-03T14:40:30.060590900Z" } }, "source": [ @@ -759,8 +759,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:44.258242200Z", - "start_time": "2026-02-03T08:45:44.158668200Z" + "end_time": "2026-02-03T14:40:30.619777700Z", + "start_time": "2026-02-03T14:40:30.422768900Z" } }, "source": [ @@ -796,8 +796,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:44.439022100Z", - "start_time": "2026-02-03T08:45:44.392038500Z" + "end_time": "2026-02-03T14:40:30.988164600Z", + "start_time": "2026-02-03T14:40:30.834419400Z" } }, "source": [ @@ -971,8 +971,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:44.704919900Z", - "start_time": "2026-02-03T08:45:44.604905100Z" + "end_time": "2026-02-03T14:40:31.490353400Z", + "start_time": "2026-02-03T14:40:31.385134Z" } }, "source": [ @@ -1075,8 +1075,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:45.167225900Z", - "start_time": "2026-02-03T08:45:45.142061100Z" + "end_time": "2026-02-03T14:40:32.118722700Z", + "start_time": "2026-02-03T14:40:31.995086900Z" } }, "source": [ @@ -1223,8 +1223,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:45.497760900Z", - "start_time": "2026-02-03T08:45:45.335278600Z" + "end_time": "2026-02-03T14:40:32.898993200Z", + "start_time": "2026-02-03T14:40:32.771388400Z" } }, "source": [ @@ -1342,8 +1342,8 @@ 
"cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:45.879254300Z", - "start_time": "2026-02-03T08:45:45.779256400Z" + "end_time": "2026-02-03T14:40:33.431455700Z", + "start_time": "2026-02-03T14:40:33.206324Z" } }, "source": [ @@ -1440,8 +1440,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:46.232628900Z", - "start_time": "2026-02-03T08:45:46.186246600Z" + "end_time": "2026-02-03T14:40:34.177679600Z", + "start_time": "2026-02-03T14:40:34.059417200Z" } }, "source": [ @@ -1464,8 +1464,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:46.440480Z", - "start_time": "2026-02-03T08:45:46.391161200Z" + "end_time": "2026-02-03T14:40:34.323788200Z", + "start_time": "2026-02-03T14:40:34.232749100Z" } }, "source": [ @@ -1589,8 +1589,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:46.754986800Z", - "start_time": "2026-02-03T08:45:46.666968100Z" + "end_time": "2026-02-03T14:40:35.068638700Z", + "start_time": "2026-02-03T14:40:34.966880900Z" } }, "source": [ @@ -1804,8 +1804,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:47.079529400Z", - "start_time": "2026-02-03T08:45:47.029011300Z" + "end_time": "2026-02-03T14:40:35.421514400Z", + "start_time": "2026-02-03T14:40:35.348243100Z" } }, "source": [ @@ -1828,8 +1828,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:47.289177100Z", - "start_time": "2026-02-03T08:45:47.270625400Z" + "end_time": "2026-02-03T14:40:35.616427400Z", + "start_time": "2026-02-03T14:40:35.568154600Z" } }, "source": [ @@ -1849,8 +1849,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:47.381218700Z", - "start_time": "2026-02-03T08:45:47.334125300Z" + "end_time": "2026-02-03T14:40:35.856681600Z", + "start_time": "2026-02-03T14:40:35.755378800Z" } }, "source": [ @@ -1990,8 +1990,8 @@ 
"cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:47.689050500Z", - "start_time": "2026-02-03T08:45:47.511174200Z" + "end_time": "2026-02-03T14:40:36.225054300Z", + "start_time": "2026-02-03T14:40:36.088658200Z" } }, "source": [ @@ -2034,8 +2034,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:47.818974600Z", - "start_time": "2026-02-03T08:45:47.771680100Z" + "end_time": "2026-02-03T14:40:36.546944200Z", + "start_time": "2026-02-03T14:40:36.506897100Z" } }, "source": [ @@ -2213,8 +2213,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:48.096706900Z", - "start_time": "2026-02-03T08:45:48.012725300Z" + "end_time": "2026-02-03T14:40:36.899709500Z", + "start_time": "2026-02-03T14:40:36.769059100Z" } }, "source": [ @@ -2443,8 +2443,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:48.598207900Z", - "start_time": "2026-02-03T08:45:48.541276800Z" + "end_time": "2026-02-03T14:40:37.743563200Z", + "start_time": "2026-02-03T14:40:37.558867900Z" } }, "source": [ @@ -2593,8 +2593,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:49.188205Z", - "start_time": "2026-02-03T08:45:49.122589100Z" + "end_time": "2026-02-03T14:40:38.350268900Z", + "start_time": "2026-02-03T14:40:38.156431100Z" } }, "source": [ @@ -2613,12 +2613,12 @@ { "data": { "text/plain": [ - "{frozenset({1, 4}): [Same 'City'],\n", - " frozenset({8, 11}): [Same 'City'],\n", - " frozenset({2, 5}): [Same 'City'],\n", - " frozenset({10, 13}): [Same 'City'],\n", - " frozenset({3, 8}): [Same 'City'],\n", - " frozenset({3, 11}): [Same 'City']}" + "{frozenset({1, 4}): [EquivalenceMotive(['City'])],\n", + " frozenset({8, 11}): [EquivalenceMotive(['City'])],\n", + " frozenset({2, 5}): [EquivalenceMotive(['City'])],\n", + " frozenset({10, 13}): [EquivalenceMotive(['City'])],\n", + " frozenset({3, 8}): [EquivalenceMotive(['City'])],\n", + 
" frozenset({3, 11}): [EquivalenceMotive(['City'])]}" ] }, "execution_count": 26, @@ -2644,8 +2644,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:49.300573800Z", - "start_time": "2026-02-03T08:45:49.260624100Z" + "end_time": "2026-02-03T14:40:38.712869Z", + "start_time": "2026-02-03T14:40:38.617699300Z" } }, "source": [ @@ -2667,15 +2667,15 @@ "8 13 Benoît Benoît Lens 15 \n", "\n", " websites _block _motive \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 ([Same 'City']) \n", - "1 ['jacquesdupond.fr'] 0 ([Same 'City']) \n", - "2 ['somewebsite.com/users/rpz59'] 1 ([Same 'City']) \n", - "3 [] 1 ([Same 'City']) \n", - "4 ['roubaixlove.fr'] 2 ([Same 'City']) \n", - "5 [] 2 ([Same 'City']) \n", - "6 [] 2 ([Same 'City']) \n", - "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 ([Same 'City']) \n", - "8 ['lensfans.fr'] 3 ([Same 'City']) " + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 [\"Same 'City'\"] \n", + "1 ['jacquesdupond.fr'] 0 [\"Same 'City'\"] \n", + "2 ['somewebsite.com/users/rpz59'] 1 [\"Same 'City'\"] \n", + "3 [] 1 [\"Same 'City'\"] \n", + "4 ['roubaixlove.fr'] 2 [\"Same 'City'\"] \n", + "5 [] 2 [\"Same 'City'\"] \n", + "6 [] 2 [\"Same 'City'\"] \n", + "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 [\"Same 'City'\"] \n", + "8 ['lensfans.fr'] 3 [\"Same 'City'\"] " ], "text/html": [ "
\n", @@ -2714,7 +2714,7 @@ " 37\n", " ['somewebsite.com/users/jacquesdupond', 'jacqu...\n", " 0\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", " 1\n", @@ -2724,7 +2724,7 @@ " 37\n", " ['jacquesdupond.fr']\n", " 0\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", " 2\n", @@ -2734,7 +2734,7 @@ " 24\n", " ['somewebsite.com/users/rpz59']\n", " 1\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", " 3\n", @@ -2744,7 +2744,7 @@ " 24\n", " []\n", " 1\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", " 4\n", @@ -2754,7 +2754,7 @@ " 32\n", " ['roubaixlove.fr']\n", " 2\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", " 5\n", @@ -2764,7 +2764,7 @@ " 33\n", " []\n", " 2\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", " 6\n", @@ -2774,7 +2774,7 @@ " 33\n", " []\n", " 2\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", " 7\n", @@ -2784,7 +2784,7 @@ " 45\n", " ['pythonensamusant.fr', 'lensfans.fr']\n", " 3\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", " 8\n", @@ -2794,7 +2794,7 @@ " 15\n", " ['lensfans.fr']\n", " 3\n", - " ([Same 'City'])\n", + " [\"Same 'City'\"]\n", " \n", " \n", "\n", @@ -2822,8 +2822,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:49.556914900Z", - "start_time": "2026-02-03T08:45:49.481507100Z" + "end_time": "2026-02-03T14:40:39.016063100Z", + "start_time": "2026-02-03T14:40:38.850306600Z" } }, "source": [ @@ -2849,13 +2849,13 @@ "4 [] 3 Paul Delarue \n", "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", "\n", - " City_r Age_r websites_r _motive _block \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] [Same 'City'] 0 \n", - "1 Phalempin 24 [] [Same 'City'] 1 \n", - "2 Roubaix 33 [] [Same 'City'] 2 \n", - "3 Roubaix 33 [] [Same 'City'] 2 \n", - "4 Roubaix 32 ['roubaixlove.fr'] [Same 'City'] 2 \n", - "5 Lens 15 ['lensfans.fr'] [Same 'City'] 3 " + " City_r 
Age_r websites_r _motive _block \n", + "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] [\"Same 'City'\"] 0 \n", + "1 Phalempin 24 [] [\"Same 'City'\"] 1 \n", + "2 Roubaix 33 [] [\"Same 'City'\"] 2 \n", + "3 Roubaix 33 [] [\"Same 'City'\"] 2 \n", + "4 Roubaix 32 ['roubaixlove.fr'] [\"Same 'City'\"] 2 \n", + "5 Lens 15 ['lensfans.fr'] [\"Same 'City'\"] 3 " ], "text/html": [ "
\n", @@ -2903,7 +2903,7 @@ " Villeneuve d'Ascq\n", " 37\n", " ['jacquesdupond.fr']\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 0\n", " \n", " \n", @@ -2918,7 +2918,7 @@ " Phalempin\n", " 24\n", " []\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 1\n", " \n", " \n", @@ -2933,7 +2933,7 @@ " Roubaix\n", " 33\n", " []\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 2\n", " \n", " \n", @@ -2948,7 +2948,7 @@ " Roubaix\n", " 33\n", " []\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 2\n", " \n", " \n", @@ -2963,7 +2963,7 @@ " Roubaix\n", " 32\n", " ['roubaixlove.fr']\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 2\n", " \n", " \n", @@ -2978,7 +2978,7 @@ " Lens\n", " 15\n", " ['lensfans.fr']\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 3\n", " \n", " \n", @@ -3004,8 +3004,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:50.016104Z", - "start_time": "2026-02-03T08:45:49.965660200Z" + "end_time": "2026-02-03T14:40:39.771226400Z", + "start_time": "2026-02-03T14:40:39.536276Z" } }, "source": [ @@ -3017,13 +3017,13 @@ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r _motive _block\n", - "0 1 Jacques Dupond 4 Jacques Dupont [Same 'City'] 0\n", - "1 2 Pierre Dusquesnes 5 pierre dusquesnes [Same 'City'] 1\n", - "2 3 Paul Delarue 11 sophie_delarue [Same 'City'] 2\n", - "3 8 Sophie Delarue 11 sophie_delarue [Same 'City'] 2\n", - "4 8 Sophie Delarue 3 Paul Delarue [Same 'City'] 2\n", - "5 10 Caroline Dufour 13 Benoît Benoît [Same 'City'] 3" + " id_l Name_l id_r Name_r _motive _block\n", + "0 1 Jacques Dupond 4 Jacques Dupont [\"Same 'City'\"] 0\n", + "1 2 Pierre Dusquesnes 5 pierre dusquesnes [\"Same 'City'\"] 1\n", + "2 3 Paul Delarue 11 sophie_delarue [\"Same 'City'\"] 2\n", + "3 8 Sophie Delarue 11 sophie_delarue [\"Same 'City'\"] 2\n", + "4 8 Sophie Delarue 3 Paul Delarue [\"Same 'City'\"] 2\n", + "5 10 Caroline Dufour 13 Benoît Benoît [\"Same 'City'\"] 3" ], "text/html": [ "
\n", @@ -3059,7 +3059,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 0\n", " \n", " \n", @@ -3068,7 +3068,7 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 1\n", " \n", " \n", @@ -3077,7 +3077,7 @@ " Paul Delarue\n", " 11\n", " sophie_delarue\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 2\n", " \n", " \n", @@ -3086,7 +3086,7 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 2\n", " \n", " \n", @@ -3095,7 +3095,7 @@ " Sophie Delarue\n", " 3\n", " Paul Delarue\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 2\n", " \n", " \n", @@ -3104,7 +3104,7 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [Same 'City']\n", + " [\"Same 'City'\"]\n", " 3\n", " \n", " \n", @@ -3131,8 +3131,8 @@ "metadata": { "scrolled": true, "ExecuteTime": { - "end_time": "2026-02-03T08:45:50.383366500Z", - "start_time": "2026-02-03T08:45:50.220420100Z" + "end_time": "2026-02-03T14:40:41.596196Z", + "start_time": "2026-02-03T14:40:41.287210400Z" } }, "source": [ @@ -3179,19 +3179,19 @@ "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", " _motive _block \n", - "0 [Same 'Age', Same 'City', >=1 overlap in 'webs... 0 \n", - "1 [>=1 overlap in 'websites'] 0 \n", - "2 [>=1 overlap in 'websites'] 0 \n", - "3 [Same 'Age', Same 'City', >=1 overlap in 'webs... 1 \n", - "4 [>=1 overlap in 'websites'] 1 \n", - "5 [>=1 overlap in 'websites'] 1 \n", - "6 [>=1 overlap in 'websites'] 1 \n", - "7 [>=1 overlap in 'websites'] 1 \n", - "8 [Same 'Age', Same 'City'] 2 \n", - "9 [Same 'Age', Same 'City'] 3 \n", - "10 [>=1 overlap in 'websites'] 4 \n", - "11 [>=1 overlap in 'websites'] 4 \n", - "12 [>=1 overlap in 'websites'] 4 " + "0 [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ... 
0 \n", + "1 [\">=1 overlap in 'websites'\"] 0 \n", + "2 [\">=1 overlap in 'websites'\"] 0 \n", + "3 [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ... 1 \n", + "4 [\">=1 overlap in 'websites'\"] 1 \n", + "5 [\">=1 overlap in 'websites'\"] 1 \n", + "6 [\">=1 overlap in 'websites'\"] 1 \n", + "7 [\">=1 overlap in 'websites'\"] 1 \n", + "8 [\"Same 'Age'\", \"Same 'City'\"] 2 \n", + "9 [\"Same 'Age'\", \"Same 'City'\"] 3 \n", + "10 [\">=1 overlap in 'websites'\"] 4 \n", + "11 [\">=1 overlap in 'websites'\"] 4 \n", + "12 [\">=1 overlap in 'websites'\"] 4 " ], "text/html": [ "
\n", @@ -3227,7 +3227,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ...\n", " 0\n", " \n", " \n", @@ -3236,7 +3236,7 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 0\n", " \n", " \n", @@ -3245,7 +3245,7 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 0\n", " \n", " \n", @@ -3254,7 +3254,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ...\n", " 1\n", " \n", " \n", @@ -3263,7 +3263,7 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 1\n", " \n", " \n", @@ -3272,7 +3272,7 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 1\n", " \n", " \n", @@ -3281,7 +3281,7 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 1\n", " \n", " \n", @@ -3290,7 +3290,7 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 1\n", " \n", " \n", @@ -3299,7 +3299,7 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", - " [Same 'Age', Same 'City']\n", + " [\"Same 'Age'\", \"Same 'City'\"]\n", " 2\n", " \n", " \n", @@ -3308,7 +3308,7 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", - " [Same 'Age', Same 'City']\n", + " [\"Same 'Age'\", \"Same 'City'\"]\n", " 3\n", " \n", " \n", @@ -3317,7 +3317,7 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 4\n", " \n", " \n", @@ 
-3326,7 +3326,7 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 4\n", " \n", " \n", @@ -3335,7 +3335,7 @@ " Benoît Benoît\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 4\n", " \n", " \n", @@ -3358,18 +3358,26 @@ { "cell_type": "markdown", "metadata": {}, - "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `scoring` gives you an indicator of the likelihood of rows behing duplicates based on the number of motives." + "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `scoring` gives you an indicator of the likelihood of rows behing duplicates based on the number of distinct motives." }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T08:45:50.717126Z", - "start_time": "2026-02-03T08:45:50.677002100Z" + "end_time": "2026-02-03T14:40:42.388195200Z", + "start_time": "2026-02-03T14:40:42.261220800Z" } }, "source": [ - "report[\"_score\"] = msb.scoring(report)\n", + "report = msb.add_blocks_to_dataset(\n", + " df,\n", + " links,\n", + " motives=True,\n", + " show_as_pairs=True,\n", + " output_columns=[\"id\", \"Name\"],\n", + " merge_blocks=False,\n", + " score=True,\n", + ")\n", "report.sort_values(\"_score\", ascending=False)" ], "outputs": [ @@ -3379,32 +3387,32 @@ " id_l Name_l id_r Name_r \\\n", "0 1 Jacques Dupond 4 Jacques Dupont \n", "3 1 Jacques Dupond 4 Jacques Dupont \n", + "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n", + "9 8 Sophie Delarue 11 sophie_delarue \n", "1 1 Jacques Dupond 6 Jean-Michel Python \n", - "2 1 Jacques Dupond 10 Caroline Dufour \n", "4 1 Jacques Dupond 6 Jean-Michel Python \n", - "5 1 Jacques Dupond 10 Caroline Dufour \n", + "2 1 Jacques Dupond 10 Caroline Dufour \n", "6 10 Caroline Dufour 6 Jean-Michel Python \n", + "5 1 Jacques Dupond 10 Caroline Dufour \n", "7 10 
Caroline Dufour 13 Benoît Benoît \n", "10 10 Caroline Dufour 6 Jean-Michel Python \n", - "12 13 Benoît Benoît 6 Jean-Michel Python \n", "11 10 Caroline Dufour 13 Benoît Benoît \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n", - "9 8 Sophie Delarue 11 sophie_delarue \n", + "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", - " _motive _block _score \n", - "0 [Same 'Age', Same 'City', >=1 overlap in 'webs... 0 52 \n", - "3 [Same 'Age', Same 'City', >=1 overlap in 'webs... 1 52 \n", - "1 [>=1 overlap in 'websites'] 0 27 \n", - "2 [>=1 overlap in 'websites'] 0 27 \n", - "4 [>=1 overlap in 'websites'] 1 27 \n", - "5 [>=1 overlap in 'websites'] 1 27 \n", - "6 [>=1 overlap in 'websites'] 1 27 \n", - "7 [>=1 overlap in 'websites'] 1 27 \n", - "10 [>=1 overlap in 'websites'] 4 27 \n", - "12 [>=1 overlap in 'websites'] 4 27 \n", - "11 [>=1 overlap in 'websites'] 4 27 \n", - "8 [Same 'Age', Same 'City'] 2 25 \n", - "9 [Same 'Age', Same 'City'] 3 25 " + " _motive _score _block \n", + "0 [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ... 3 0 \n", + "3 [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ... 3 1 \n", + "8 [\"Same 'Age'\", \"Same 'City'\"] 2 2 \n", + "9 [\"Same 'Age'\", \"Same 'City'\"] 2 3 \n", + "1 [\">=1 overlap in 'websites'\"] 1 0 \n", + "4 [\">=1 overlap in 'websites'\"] 1 1 \n", + "2 [\">=1 overlap in 'websites'\"] 1 0 \n", + "6 [\">=1 overlap in 'websites'\"] 1 1 \n", + "5 [\">=1 overlap in 'websites'\"] 1 1 \n", + "7 [\">=1 overlap in 'websites'\"] 1 1 \n", + "10 [\">=1 overlap in 'websites'\"] 1 4 \n", + "11 [\">=1 overlap in 'websites'\"] 1 4 \n", + "12 [\">=1 overlap in 'websites'\"] 1 4 " ], "text/html": [ "
\n", @@ -3430,8 +3438,8 @@ " id_r\n", " Name_r\n", " _motive\n", - " _block\n", " _score\n", + " _block\n", " \n", " \n", " \n", @@ -3441,9 +3449,9 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ...\n", + " 3\n", " 0\n", - " 52\n", " \n", " \n", " 3\n", @@ -3451,9 +3459,29 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ...\n", + " 3\n", " 1\n", - " 52\n", + " \n", + " \n", + " 8\n", + " 2\n", + " Pierre Dusquesnes\n", + " 5\n", + " pierre dusquesnes\n", + " [\"Same 'Age'\", \"Same 'City'\"]\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 9\n", + " 8\n", + " Sophie Delarue\n", + " 11\n", + " sophie_delarue\n", + " [\"Same 'Age'\", \"Same 'City'\"]\n", + " 2\n", + " 3\n", " \n", " \n", " 1\n", @@ -3461,19 +3489,9 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", - " 0\n", - " 27\n", - " \n", - " \n", - " 2\n", + " [\">=1 overlap in 'websites'\"]\n", " 1\n", - " Jacques Dupond\n", - " 10\n", - " Caroline Dufour\n", - " [>=1 overlap in 'websites']\n", " 0\n", - " 27\n", " \n", " \n", " 4\n", @@ -3481,19 +3499,19 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", + " 1\n", " 1\n", - " 27\n", " \n", " \n", - " 5\n", + " 2\n", " 1\n", " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", " 1\n", - " 27\n", + " 0\n", " \n", " \n", " 6\n", @@ -3501,9 +3519,19 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 5\n", + " 1\n", + " Jacques Dupond\n", + " 10\n", + " Caroline Dufour\n", + " [\">=1 overlap in 'websites'\"]\n", + " 
1\n", " 1\n", - " 27\n", " \n", " \n", " 7\n", @@ -3511,9 +3539,9 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", + " 1\n", " 1\n", - " 27\n", " \n", " \n", " 10\n", @@ -3521,19 +3549,9 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", - " 4\n", - " 27\n", - " \n", - " \n", - " 12\n", - " 13\n", - " Benoît Benoît\n", - " 6\n", - " Jean-Michel Python\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", + " 1\n", " 4\n", - " 27\n", " \n", " \n", " 11\n", @@ -3541,29 +3559,19 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [>=1 overlap in 'websites']\n", + " [\">=1 overlap in 'websites'\"]\n", + " 1\n", " 4\n", - " 27\n", - " \n", - " \n", - " 8\n", - " 2\n", - " Pierre Dusquesnes\n", - " 5\n", - " pierre dusquesnes\n", - " [Same 'Age', Same 'City']\n", - " 2\n", - " 25\n", " \n", " \n", - " 9\n", - " 8\n", - " Sophie Delarue\n", - " 11\n", - " sophie_delarue\n", - " [Same 'Age', Same 'City']\n", - " 3\n", - " 25\n", + " 12\n", + " 13\n", + " Benoît Benoît\n", + " 6\n", + " Jean-Michel Python\n", + " [\">=1 overlap in 'websites'\"]\n", + " 1\n", + " 4\n", " \n", " \n", "\n", From 41e154e0ff48b3fed132194f29620334a704f9c9 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 15:55:34 +0100 Subject: [PATCH 05/20] test: update tests to new Motives --- tests/test_ms_blocking.py | 117 ++++++++++++++++++++++++++++++-------- 1 file changed, 94 insertions(+), 23 deletions(-) diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py index d3f9ab2..3efb309 100644 --- a/tests/test_ms_blocking.py +++ b/tests/test_ms_blocking.py @@ -84,18 +84,29 @@ def attribute_city_keep_ungrouped_rows_false(): @pytest.fixture def attribute_city_motives_true_block(): return { - frozenset({3, 8}): {"Same 'City'"}, - frozenset({1, 4}): {"Same 'City'"}, - frozenset({8, 11}): {"Same 'City'"}, - frozenset({3, 11}): {"Same 
'City'"}, - frozenset({2, 5}): {"Same 'City'"}, - frozenset({10, 13}): {"Same 'City'"}, + frozenset({3, 8}): [msb.EquivalenceMotive("City")], + frozenset({1, 4}): [msb.EquivalenceMotive("City")], + frozenset({8, 11}): [msb.EquivalenceMotive("City")], + frozenset({3, 11}): [msb.EquivalenceMotive("City")], + frozenset({2, 5}): [msb.EquivalenceMotive("City")], + frozenset({10, 13}): [msb.EquivalenceMotive("City")], } @pytest.fixture def attribute_city_motives_true_add(): - return [{"Same 'City'"}] * 9 + return [ + "[\"Same 'City'\"]", + "[\"Same 'City'\"]", + "[\"Same 'City'\"]", + "[\"Same 'City'\"]", + "[\"Same 'City'\"]", + "[\"Same 'City'\"]", + "[\"Same 'City'\"]", + "[\"Same 'City'\"]", + "[\"Same 'City'\"]", + ] + # [msb.EquivalenceMotive("City")] * 9 @pytest.fixture @@ -116,25 +127,65 @@ def city_age_name_websites_pipelining_id(): @pytest.fixture def city_age_websites_pipelining_motives(): return [ - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({">=1 overlap in 'websites'"}), - frozenset({">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'"}), - frozenset({"Same 'Age'", "Same 'City'"}), - frozenset({">=1 overlap in 'websites'"}), - frozenset({">=1 overlap in 'websites'"}), - frozenset({">=1 overlap in 'websites'"}), + "[\"Same 'City'\", \"Same 'Age'\", \">=1 overlap in 'websites'\"]", + "[\">=1 overlap in 'websites'\"]", + "[\">=1 overlap in 'websites'\"]", + "[\"Same 'City'\", \"Same 'Age'\", \">=1 overlap in 'websites'\"]", + "[\">=1 overlap in 'websites'\"]", + "[\">=1 overlap in 'websites'\"]", + "[\">=1 overlap in 'websites'\"]", + "[\">=1 
overlap in 'websites'\"]", + "[\"Same 'City'\", \"Same 'Age'\"]", + "[\"Same 'City'\", \"Same 'Age'\"]", + "[\">=1 overlap in 'websites'\"]", + "[\">=1 overlap in 'websites'\"]", + "[\">=1 overlap in 'websites'\"]", ] + # [ + # [ + # msb.EquivalenceMotive("Age"), + # msb.EquivalenceMotive("City"), + # msb.OverlapMotive("websites", 1), + # ], + # [ + # msb.EquivalenceMotive("Age"), + # msb.EquivalenceMotive("City"), + # msb.OverlapMotive("websites", 1), + # ], + # [ + # msb.EquivalenceMotive("Age"), + # msb.EquivalenceMotive("City"), + # msb.OverlapMotive("websites", 1), + # ], + # [ + # msb.EquivalenceMotive("Age"), + # msb.EquivalenceMotive("City"), + # msb.OverlapMotive("websites", 1), + # ], + # [ + # msb.EquivalenceMotive("Age"), + # msb.EquivalenceMotive("City"), + # msb.OverlapMotive("websites", 1), + # ], + # [ + # msb.EquivalenceMotive("Age"), + # msb.EquivalenceMotive("City"), + # msb.OverlapMotive("websites", 1), + # ], + # [msb.OverlapMotive("websites", 1)], + # [msb.OverlapMotive("websites", 1)], + # [msb.EquivalenceMotive("Age"), msb.EquivalenceMotive("City")], + # [msb.EquivalenceMotive("Age"), msb.EquivalenceMotive("City")], + # [msb.OverlapMotive("websites", 1)], + # [msb.OverlapMotive("websites", 1)], + # [msb.OverlapMotive("websites", 1)], + # ] + @pytest.fixture def city_age_websites_pipelining_scores(): - return [3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1] + return [3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1] @pytest.fixture @@ -350,12 +401,32 @@ def test_pipelining_scores(city_age_websites_pipelining_scores): final_blocker = (city_blocker & age_blocker) | websites_blocker links = final_blocker.block(get_users(), motives=True) report = msb.add_blocks_to_dataset( - get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False + get_users(), + links, + show_as_pairs=True, + motives=True, + merge_blocks=False, + score=True, ) - actual = sorted(msb.scoring(report), reverse=True) + actual = sorted(report["_score"], reverse=True) assert actual == 
expected +# def test_pipelining_scores_without_show_as_pairs(city_age_websites_pipelining_scores): +# """Test that scoring does work as intended""" +# expected = city_age_websites_pipelining_scores +# city_blocker = msb.AttributeEquivalenceBlocker(["City"]) +# age_blocker = msb.AttributeEquivalenceBlocker(["Age"]) +# websites_blocker = msb.OverlapBlocker(["websites"]) +# final_blocker = (city_blocker & age_blocker) | websites_blocker +# links = final_blocker.block(get_users(), motives=True) +# report = msb.add_blocks_to_dataset( +# get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False, score=True +# ) +# actual = sorted(msb.scoring(report), reverse=True) +# assert actual == expected + + def test_merge_blockers_aa(): """Test that merging blockers does work as intended""" expected = msb.AttributeEquivalenceBlocker(["City", "Age"]) From c502c19864025391dba96e826f0164b416a1c291 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 15:56:27 +0100 Subject: [PATCH 06/20] style: reformat --- src/ms_blocking/utils.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index aaa5e08..15c7919 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -567,25 +567,40 @@ def solve_motives(motives: List[Motive]) -> List[Motive]: # Find biggest overlap among the non-word_level ones if not_word_level_motives_for_column: - max_overlap_not_word_level_for_column = max(not_word_level_motives_for_column, key=lambda m: m.overlap) - max_overlap_not_word_level_for_column_overlap = max_overlap_not_word_level_for_column.overlap + max_overlap_not_word_level_for_column = max( + not_word_level_motives_for_column, key=lambda m: m.overlap + ) + max_overlap_not_word_level_for_column_overlap = ( + max_overlap_not_word_level_for_column.overlap + ) else: max_overlap_not_word_level_for_column = [] - max_overlap_not_word_level_for_column_overlap = 0 # Will never be 
used, left for linter + max_overlap_not_word_level_for_column_overlap = ( + 0 # Will never be used, left for linter + ) # Now find biggest overlap among the word_level ones if word_level_motives_for_column: - max_overlap_word_level_for_column = max(word_level_motives_for_column, key=lambda m: m.overlap) - max_overlap_word_level_for_column_overlap = max_overlap_word_level_for_column.overlap + max_overlap_word_level_for_column = max( + word_level_motives_for_column, key=lambda m: m.overlap + ) + max_overlap_word_level_for_column_overlap = ( + max_overlap_word_level_for_column.overlap + ) if not_word_level_motives_for_column: # If there is already an OverlapMotive on same column with equal or greater overlap but not word_level, discard it - if max_overlap_word_level_for_column_overlap <= max_overlap_not_word_level_for_column_overlap: + if ( + max_overlap_word_level_for_column_overlap + <= max_overlap_not_word_level_for_column_overlap + ): max_overlap_word_level_for_column = [] else: max_overlap_word_level_for_column = [] if max_overlap_not_word_level_for_column: - max_overlap_not_word_level_for_column = [max_overlap_not_word_level_for_column] + max_overlap_not_word_level_for_column = [ + max_overlap_not_word_level_for_column + ] if max_overlap_word_level_for_column: max_overlap_word_level_for_column = [max_overlap_word_level_for_column] final_motives += ( From a1282199a27d7d4b0f42b4fea1947bd7f9cf7e48 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 16:30:14 +0100 Subject: [PATCH 07/20] docs: fix motives --- docs/example.ipynb | 318 +++++++++++++++++++-------------------- src/ms_blocking/utils.py | 1 - 2 files changed, 159 insertions(+), 160 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 45699af..32ee69d 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -32,8 +32,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:28.508876500Z", - "start_time": "2026-02-03T14:40:27.761433800Z" + 
"end_time": "2026-02-03T15:26:00.408434200Z", + "start_time": "2026-02-03T15:25:59.668629400Z" } }, "source": [ @@ -60,8 +60,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:28.563486200Z", - "start_time": "2026-02-03T14:40:28.512916Z" + "end_time": "2026-02-03T15:26:00.464804400Z", + "start_time": "2026-02-03T15:26:00.408434200Z" } }, "source": [ @@ -282,8 +282,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:28.741020400Z", - "start_time": "2026-02-03T14:40:28.615799300Z" + "end_time": "2026-02-03T15:26:00.723249900Z", + "start_time": "2026-02-03T15:26:00.545044Z" } }, "source": [ @@ -310,8 +310,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:29.068322900Z", - "start_time": "2026-02-03T14:40:28.915502800Z" + "end_time": "2026-02-03T15:26:00.930325600Z", + "start_time": "2026-02-03T15:26:00.842587Z" } }, "source": [ @@ -339,8 +339,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:29.328166900Z", - "start_time": "2026-02-03T14:40:29.309785500Z" + "end_time": "2026-02-03T15:26:01.002006Z", + "start_time": "2026-02-03T15:26:00.984929700Z" } }, "source": [ @@ -369,8 +369,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:29.547537Z", - "start_time": "2026-02-03T14:40:29.397273800Z" + "end_time": "2026-02-03T15:26:01.389874900Z", + "start_time": "2026-02-03T15:26:01.189496400Z" } }, "source": [ @@ -409,8 +409,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:29.599221700Z", - "start_time": "2026-02-03T14:40:29.572788900Z" + "end_time": "2026-02-03T15:26:01.488509700Z", + "start_time": "2026-02-03T15:26:01.458139Z" } }, "source": [ @@ -574,8 +574,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:29.841412500Z", - "start_time": "2026-02-03T14:40:29.660471200Z" + "end_time": 
"2026-02-03T15:26:01.849762800Z", + "start_time": "2026-02-03T15:26:01.604523100Z" } }, "source": [ @@ -622,8 +622,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:30.138487100Z", - "start_time": "2026-02-03T14:40:30.060590900Z" + "end_time": "2026-02-03T15:26:02.327630200Z", + "start_time": "2026-02-03T15:26:02.082466800Z" } }, "source": [ @@ -759,8 +759,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:30.619777700Z", - "start_time": "2026-02-03T14:40:30.422768900Z" + "end_time": "2026-02-03T15:26:02.765309400Z", + "start_time": "2026-02-03T15:26:02.567839300Z" } }, "source": [ @@ -796,8 +796,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:30.988164600Z", - "start_time": "2026-02-03T14:40:30.834419400Z" + "end_time": "2026-02-03T15:26:03.163072900Z", + "start_time": "2026-02-03T15:26:03.015158500Z" } }, "source": [ @@ -971,8 +971,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:31.490353400Z", - "start_time": "2026-02-03T14:40:31.385134Z" + "end_time": "2026-02-03T15:26:03.713908200Z", + "start_time": "2026-02-03T15:26:03.587227900Z" } }, "source": [ @@ -1075,8 +1075,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:32.118722700Z", - "start_time": "2026-02-03T14:40:31.995086900Z" + "end_time": "2026-02-03T15:26:04.512418700Z", + "start_time": "2026-02-03T15:26:04.371414700Z" } }, "source": [ @@ -1223,8 +1223,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:32.898993200Z", - "start_time": "2026-02-03T14:40:32.771388400Z" + "end_time": "2026-02-03T15:26:05.232296700Z", + "start_time": "2026-02-03T15:26:05.138463900Z" } }, "source": [ @@ -1342,8 +1342,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:33.431455700Z", - "start_time": "2026-02-03T14:40:33.206324Z" + "end_time": 
"2026-02-03T15:26:05.746292700Z", + "start_time": "2026-02-03T15:26:05.615214500Z" } }, "source": [ @@ -1440,8 +1440,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:34.177679600Z", - "start_time": "2026-02-03T14:40:34.059417200Z" + "end_time": "2026-02-03T15:26:06.550041700Z", + "start_time": "2026-02-03T15:26:06.378265100Z" } }, "source": [ @@ -1464,8 +1464,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:34.323788200Z", - "start_time": "2026-02-03T14:40:34.232749100Z" + "end_time": "2026-02-03T15:26:06.933740700Z", + "start_time": "2026-02-03T15:26:06.700136700Z" } }, "source": [ @@ -1589,8 +1589,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:35.068638700Z", - "start_time": "2026-02-03T14:40:34.966880900Z" + "end_time": "2026-02-03T15:26:07.724806800Z", + "start_time": "2026-02-03T15:26:07.416889200Z" } }, "source": [ @@ -1804,8 +1804,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:35.421514400Z", - "start_time": "2026-02-03T14:40:35.348243100Z" + "end_time": "2026-02-03T15:26:08.366574400Z", + "start_time": "2026-02-03T15:26:08.287314300Z" } }, "source": [ @@ -1828,8 +1828,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:35.616427400Z", - "start_time": "2026-02-03T14:40:35.568154600Z" + "end_time": "2026-02-03T15:26:08.624518900Z", + "start_time": "2026-02-03T15:26:08.604191500Z" } }, "source": [ @@ -1849,8 +1849,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:35.856681600Z", - "start_time": "2026-02-03T14:40:35.755378800Z" + "end_time": "2026-02-03T15:26:08.886089600Z", + "start_time": "2026-02-03T15:26:08.721474Z" } }, "source": [ @@ -1990,8 +1990,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:36.225054300Z", - "start_time": "2026-02-03T14:40:36.088658200Z" + "end_time": 
"2026-02-03T15:26:09.251246500Z", + "start_time": "2026-02-03T15:26:09.080396800Z" } }, "source": [ @@ -2034,8 +2034,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:36.546944200Z", - "start_time": "2026-02-03T14:40:36.506897100Z" + "end_time": "2026-02-03T15:26:09.530329Z", + "start_time": "2026-02-03T15:26:09.486287900Z" } }, "source": [ @@ -2213,8 +2213,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:36.899709500Z", - "start_time": "2026-02-03T14:40:36.769059100Z" + "end_time": "2026-02-03T15:26:09.985303200Z", + "start_time": "2026-02-03T15:26:09.845263800Z" } }, "source": [ @@ -2443,8 +2443,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:37.743563200Z", - "start_time": "2026-02-03T14:40:37.558867900Z" + "end_time": "2026-02-03T15:26:10.930371500Z", + "start_time": "2026-02-03T15:26:10.809849600Z" } }, "source": [ @@ -2593,8 +2593,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:38.350268900Z", - "start_time": "2026-02-03T14:40:38.156431100Z" + "end_time": "2026-02-03T15:26:11.634404900Z", + "start_time": "2026-02-03T15:26:11.403226800Z" } }, "source": [ @@ -2644,8 +2644,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:38.712869Z", - "start_time": "2026-02-03T14:40:38.617699300Z" + "end_time": "2026-02-03T15:26:12.214789500Z", + "start_time": "2026-02-03T15:26:12.007748800Z" } }, "source": [ @@ -2666,16 +2666,16 @@ "7 10 Caroline Dufour Lens 45 \n", "8 13 Benoît Benoît Lens 15 \n", "\n", - " websites _block _motive \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 
0 [\"Same 'City'\"] \n", - "1 ['jacquesdupond.fr'] 0 [\"Same 'City'\"] \n", - "2 ['somewebsite.com/users/rpz59'] 1 [\"Same 'City'\"] \n", - "3 [] 1 [\"Same 'City'\"] \n", - "4 ['roubaixlove.fr'] 2 [\"Same 'City'\"] \n", - "5 [] 2 [\"Same 'City'\"] \n", - "6 [] 2 [\"Same 'City'\"] \n", - "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 [\"Same 'City'\"] \n", - "8 ['lensfans.fr'] 3 [\"Same 'City'\"] " + " websites _block _motive \n", + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 [Same 'City'] \n", + "1 ['jacquesdupond.fr'] 0 [Same 'City'] \n", + "2 ['somewebsite.com/users/rpz59'] 1 [Same 'City'] \n", + "3 [] 1 [Same 'City'] \n", + "4 ['roubaixlove.fr'] 2 [Same 'City'] \n", + "5 [] 2 [Same 'City'] \n", + "6 [] 2 [Same 'City'] \n", + "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 [Same 'City'] \n", + "8 ['lensfans.fr'] 3 [Same 'City'] " ], "text/html": [ "
\n", @@ -2714,7 +2714,7 @@ " 37\n", " ['somewebsite.com/users/jacquesdupond', 'jacqu...\n", " 0\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", " 1\n", @@ -2724,7 +2724,7 @@ " 37\n", " ['jacquesdupond.fr']\n", " 0\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", " 2\n", @@ -2734,7 +2734,7 @@ " 24\n", " ['somewebsite.com/users/rpz59']\n", " 1\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", " 3\n", @@ -2744,7 +2744,7 @@ " 24\n", " []\n", " 1\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", " 4\n", @@ -2754,7 +2754,7 @@ " 32\n", " ['roubaixlove.fr']\n", " 2\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", " 5\n", @@ -2764,7 +2764,7 @@ " 33\n", " []\n", " 2\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", " 6\n", @@ -2774,7 +2774,7 @@ " 33\n", " []\n", " 2\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", " 7\n", @@ -2784,7 +2784,7 @@ " 45\n", " ['pythonensamusant.fr', 'lensfans.fr']\n", " 3\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", " 8\n", @@ -2794,7 +2794,7 @@ " 15\n", " ['lensfans.fr']\n", " 3\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " \n", " \n", "\n", @@ -2822,8 +2822,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:39.016063100Z", - "start_time": "2026-02-03T14:40:38.850306600Z" + "end_time": "2026-02-03T15:26:12.610291100Z", + "start_time": "2026-02-03T15:26:12.498335600Z" } }, "source": [ @@ -2849,13 +2849,13 @@ "4 [] 3 Paul Delarue \n", "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", "\n", - " City_r Age_r websites_r _motive _block \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] [\"Same 'City'\"] 0 \n", - "1 Phalempin 24 [] [\"Same 'City'\"] 1 \n", - "2 Roubaix 33 [] [\"Same 'City'\"] 2 \n", - "3 Roubaix 33 [] [\"Same 'City'\"] 2 \n", - "4 Roubaix 32 ['roubaixlove.fr'] [\"Same 'City'\"] 2 \n", - "5 Lens 15 ['lensfans.fr'] [\"Same 'City'\"] 3 " + " 
City_r Age_r websites_r _motive _block \n", + "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] [Same 'City'] 0 \n", + "1 Phalempin 24 [] [Same 'City'] 1 \n", + "2 Roubaix 33 [] [Same 'City'] 2 \n", + "3 Roubaix 33 [] [Same 'City'] 2 \n", + "4 Roubaix 32 ['roubaixlove.fr'] [Same 'City'] 2 \n", + "5 Lens 15 ['lensfans.fr'] [Same 'City'] 3 " ], "text/html": [ "
\n", @@ -2903,7 +2903,7 @@ " Villeneuve d'Ascq\n", " 37\n", " ['jacquesdupond.fr']\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 0\n", " \n", " \n", @@ -2918,7 +2918,7 @@ " Phalempin\n", " 24\n", " []\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 1\n", " \n", " \n", @@ -2933,7 +2933,7 @@ " Roubaix\n", " 33\n", " []\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 2\n", " \n", " \n", @@ -2948,7 +2948,7 @@ " Roubaix\n", " 33\n", " []\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 2\n", " \n", " \n", @@ -2963,7 +2963,7 @@ " Roubaix\n", " 32\n", " ['roubaixlove.fr']\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 2\n", " \n", " \n", @@ -2978,7 +2978,7 @@ " Lens\n", " 15\n", " ['lensfans.fr']\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 3\n", " \n", " \n", @@ -3004,8 +3004,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:39.771226400Z", - "start_time": "2026-02-03T14:40:39.536276Z" + "end_time": "2026-02-03T15:26:13.272171600Z", + "start_time": "2026-02-03T15:26:13.063070700Z" } }, "source": [ @@ -3017,13 +3017,13 @@ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r _motive _block\n", - "0 1 Jacques Dupond 4 Jacques Dupont [\"Same 'City'\"] 0\n", - "1 2 Pierre Dusquesnes 5 pierre dusquesnes [\"Same 'City'\"] 1\n", - "2 3 Paul Delarue 11 sophie_delarue [\"Same 'City'\"] 2\n", - "3 8 Sophie Delarue 11 sophie_delarue [\"Same 'City'\"] 2\n", - "4 8 Sophie Delarue 3 Paul Delarue [\"Same 'City'\"] 2\n", - "5 10 Caroline Dufour 13 Benoît Benoît [\"Same 'City'\"] 3" + " id_l Name_l id_r Name_r _motive _block\n", + "0 1 Jacques Dupond 4 Jacques Dupont [Same 'City'] 0\n", + "1 2 Pierre Dusquesnes 5 pierre dusquesnes [Same 'City'] 1\n", + "2 3 Paul Delarue 11 sophie_delarue [Same 'City'] 2\n", + "3 8 Sophie Delarue 11 sophie_delarue [Same 'City'] 2\n", + "4 8 Sophie Delarue 3 Paul Delarue [Same 'City'] 2\n", + "5 10 Caroline Dufour 13 Benoît Benoît [Same 'City'] 3" ], "text/html": [ "
\n", @@ -3059,7 +3059,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 0\n", " \n", " \n", @@ -3068,7 +3068,7 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 1\n", " \n", " \n", @@ -3077,7 +3077,7 @@ " Paul Delarue\n", " 11\n", " sophie_delarue\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 2\n", " \n", " \n", @@ -3086,7 +3086,7 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 2\n", " \n", " \n", @@ -3095,7 +3095,7 @@ " Sophie Delarue\n", " 3\n", " Paul Delarue\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 2\n", " \n", " \n", @@ -3104,7 +3104,7 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [\"Same 'City'\"]\n", + " [Same 'City']\n", " 3\n", " \n", " \n", @@ -3131,8 +3131,8 @@ "metadata": { "scrolled": true, "ExecuteTime": { - "end_time": "2026-02-03T14:40:41.596196Z", - "start_time": "2026-02-03T14:40:41.287210400Z" + "end_time": "2026-02-03T15:26:14.300257400Z", + "start_time": "2026-02-03T15:26:13.981549200Z" } }, "source": [ @@ -3179,19 +3179,19 @@ "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", " _motive _block \n", - "0 [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ... 0 \n", - "1 [\">=1 overlap in 'websites'\"] 0 \n", - "2 [\">=1 overlap in 'websites'\"] 0 \n", - "3 [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ... 1 \n", - "4 [\">=1 overlap in 'websites'\"] 1 \n", - "5 [\">=1 overlap in 'websites'\"] 1 \n", - "6 [\">=1 overlap in 'websites'\"] 1 \n", - "7 [\">=1 overlap in 'websites'\"] 1 \n", - "8 [\"Same 'Age'\", \"Same 'City'\"] 2 \n", - "9 [\"Same 'Age'\", \"Same 'City'\"] 3 \n", - "10 [\">=1 overlap in 'websites'\"] 4 \n", - "11 [\">=1 overlap in 'websites'\"] 4 \n", - "12 [\">=1 overlap in 'websites'\"] 4 " + "0 [Same 'Age', Same 'City', >=1 overlap in 'webs... 
0 \n", + "1 [>=1 overlap in 'websites'] 0 \n", + "2 [>=1 overlap in 'websites'] 0 \n", + "3 [Same 'Age', Same 'City', >=1 overlap in 'webs... 1 \n", + "4 [>=1 overlap in 'websites'] 1 \n", + "5 [>=1 overlap in 'websites'] 1 \n", + "6 [>=1 overlap in 'websites'] 1 \n", + "7 [>=1 overlap in 'websites'] 1 \n", + "8 [Same 'Age', Same 'City'] 2 \n", + "9 [Same 'Age', Same 'City'] 3 \n", + "10 [>=1 overlap in 'websites'] 4 \n", + "11 [>=1 overlap in 'websites'] 4 \n", + "12 [>=1 overlap in 'websites'] 4 " ], "text/html": [ "
\n", @@ -3227,7 +3227,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ...\n", + " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", " 0\n", " \n", " \n", @@ -3236,7 +3236,7 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 0\n", " \n", " \n", @@ -3245,7 +3245,7 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 0\n", " \n", " \n", @@ -3254,7 +3254,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ...\n", + " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", " 1\n", " \n", " \n", @@ -3263,7 +3263,7 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " \n", " \n", @@ -3272,7 +3272,7 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " \n", " \n", @@ -3281,7 +3281,7 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " \n", " \n", @@ -3290,7 +3290,7 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " \n", " \n", @@ -3299,7 +3299,7 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", - " [\"Same 'Age'\", \"Same 'City'\"]\n", + " [Same 'Age', Same 'City']\n", " 2\n", " \n", " \n", @@ -3308,7 +3308,7 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", - " [\"Same 'Age'\", \"Same 'City'\"]\n", + " [Same 'Age', Same 'City']\n", " 3\n", " \n", " \n", @@ -3317,7 +3317,7 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 4\n", " \n", " \n", @@ 
-3326,7 +3326,7 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 4\n", " \n", " \n", @@ -3335,7 +3335,7 @@ " Benoît Benoît\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 4\n", " \n", " \n", @@ -3364,8 +3364,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T14:40:42.388195200Z", - "start_time": "2026-02-03T14:40:42.261220800Z" + "end_time": "2026-02-03T15:26:15.195066300Z", + "start_time": "2026-02-03T15:26:14.996741600Z" } }, "source": [ @@ -3400,19 +3400,19 @@ "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", " _motive _score _block \n", - "0 [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ... 3 0 \n", - "3 [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ... 3 1 \n", - "8 [\"Same 'Age'\", \"Same 'City'\"] 2 2 \n", - "9 [\"Same 'Age'\", \"Same 'City'\"] 2 3 \n", - "1 [\">=1 overlap in 'websites'\"] 1 0 \n", - "4 [\">=1 overlap in 'websites'\"] 1 1 \n", - "2 [\">=1 overlap in 'websites'\"] 1 0 \n", - "6 [\">=1 overlap in 'websites'\"] 1 1 \n", - "5 [\">=1 overlap in 'websites'\"] 1 1 \n", - "7 [\">=1 overlap in 'websites'\"] 1 1 \n", - "10 [\">=1 overlap in 'websites'\"] 1 4 \n", - "11 [\">=1 overlap in 'websites'\"] 1 4 \n", - "12 [\">=1 overlap in 'websites'\"] 1 4 " + "0 [Same 'Age', Same 'City', >=1 overlap in 'webs... 3 0 \n", + "3 [Same 'Age', Same 'City', >=1 overlap in 'webs... 3 1 \n", + "8 [Same 'Age', Same 'City'] 2 2 \n", + "9 [Same 'Age', Same 'City'] 2 3 \n", + "1 [>=1 overlap in 'websites'] 1 0 \n", + "4 [>=1 overlap in 'websites'] 1 1 \n", + "2 [>=1 overlap in 'websites'] 1 0 \n", + "6 [>=1 overlap in 'websites'] 1 1 \n", + "5 [>=1 overlap in 'websites'] 1 1 \n", + "7 [>=1 overlap in 'websites'] 1 1 \n", + "10 [>=1 overlap in 'websites'] 1 4 \n", + "11 [>=1 overlap in 'websites'] 1 4 \n", + "12 [>=1 overlap in 'websites'] 1 4 " ], "text/html": [ "
\n", @@ -3449,7 +3449,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ...\n", + " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", " 3\n", " 0\n", " \n", @@ -3459,7 +3459,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [\"Same 'Age'\", \"Same 'City'\", \">=1 overlap in ...\n", + " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", " 3\n", " 1\n", " \n", @@ -3469,7 +3469,7 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", - " [\"Same 'Age'\", \"Same 'City'\"]\n", + " [Same 'Age', Same 'City']\n", " 2\n", " 2\n", " \n", @@ -3479,7 +3479,7 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", - " [\"Same 'Age'\", \"Same 'City'\"]\n", + " [Same 'Age', Same 'City']\n", " 2\n", " 3\n", " \n", @@ -3489,7 +3489,7 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 0\n", " \n", @@ -3499,7 +3499,7 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 1\n", " \n", @@ -3509,7 +3509,7 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 0\n", " \n", @@ -3519,7 +3519,7 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 1\n", " \n", @@ -3529,7 +3529,7 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 1\n", " \n", @@ -3539,7 +3539,7 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 1\n", " \n", @@ -3549,7 +3549,7 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 4\n", 
" \n", @@ -3559,7 +3559,7 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 4\n", " \n", @@ -3569,7 +3569,7 @@ " Benoît Benoît\n", " 6\n", " Jean-Michel Python\n", - " [\">=1 overlap in 'websites'\"]\n", + " [>=1 overlap in 'websites']\n", " 1\n", " 4\n", " \n", diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 15c7919..7f271f5 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -294,7 +294,6 @@ def merge_blocks_or(coords_1: Coords, coords_2: Coords) -> Coords: } else: return coords_1.union(coords_2) - # TODO: check for merging one with motive and one w/o def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords: From c0e18911b5eebb2466277f77529fa4e427c2dc6a Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 16:30:56 +0100 Subject: [PATCH 08/20] refactor: motive as list instead of string --- src/ms_blocking/ms_blocking.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index 9d61832..d47ceb7 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -585,7 +585,7 @@ def add_blocks_to_dataset( current_row.index = current_index if motives: motives_solved = solve_motives(coords[pair]) - current_row["_motive"] = str(list(map(str, motives_solved))) + current_row["_motive"] = [list(map(str, motives_solved))] if score: current_row["_score"] = len( motives_solved @@ -640,11 +640,12 @@ def add_blocks_to_dataset( if not show_as_pairs and motives: id_list = flatten(coords.keys()) motive_matcher = { - row_id: str(list(map(str, solve_motives(coords[pair])))) + row_id: list(map(str, solve_motives(coords[pair]))) for pair in coords.keys() for row_id in id_list if row_id in pair } + # noinspection PyTypeChecker output_data["_motive"] = output_data.index.map(motive_matcher) if score: output_data["_score"] = 0 From 
4822c87f342b88f9358c0c5e4ea104013fcb2f64 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 16:31:39 +0100 Subject: [PATCH 09/20] test: fix checks depending on (random) ordering of motives --- tests/test_ms_blocking.py | 126 +++++++++++++++----------------------- 1 file changed, 49 insertions(+), 77 deletions(-) diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py index 3efb309..7fdaaa5 100644 --- a/tests/test_ms_blocking.py +++ b/tests/test_ms_blocking.py @@ -96,17 +96,16 @@ def attribute_city_motives_true_block(): @pytest.fixture def attribute_city_motives_true_add(): return [ - "[\"Same 'City'\"]", - "[\"Same 'City'\"]", - "[\"Same 'City'\"]", - "[\"Same 'City'\"]", - "[\"Same 'City'\"]", - "[\"Same 'City'\"]", - "[\"Same 'City'\"]", - "[\"Same 'City'\"]", - "[\"Same 'City'\"]", + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], ] - # [msb.EquivalenceMotive("City")] * 9 @pytest.fixture @@ -127,67 +126,32 @@ def city_age_name_websites_pipelining_id(): @pytest.fixture def city_age_websites_pipelining_motives(): return [ - "[\"Same 'City'\", \"Same 'Age'\", \">=1 overlap in 'websites'\"]", - "[\">=1 overlap in 'websites'\"]", - "[\">=1 overlap in 'websites'\"]", - "[\"Same 'City'\", \"Same 'Age'\", \">=1 overlap in 'websites'\"]", - "[\">=1 overlap in 'websites'\"]", - "[\">=1 overlap in 'websites'\"]", - "[\">=1 overlap in 'websites'\"]", - "[\">=1 overlap in 'websites'\"]", - "[\"Same 'City'\", \"Same 'Age'\"]", - "[\"Same 'City'\", \"Same 'Age'\"]", - "[\">=1 overlap in 'websites'\"]", - "[\">=1 overlap in 'websites'\"]", - "[\">=1 overlap in 'websites'\"]", + {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 
overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {"Same 'City'", "Same 'Age'"}, + {"Same 'City'", "Same 'Age'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, ] - # [ - # [ - # msb.EquivalenceMotive("Age"), - # msb.EquivalenceMotive("City"), - # msb.OverlapMotive("websites", 1), - # ], - # [ - # msb.EquivalenceMotive("Age"), - # msb.EquivalenceMotive("City"), - # msb.OverlapMotive("websites", 1), - # ], - # [ - # msb.EquivalenceMotive("Age"), - # msb.EquivalenceMotive("City"), - # msb.OverlapMotive("websites", 1), - # ], - # [ - # msb.EquivalenceMotive("Age"), - # msb.EquivalenceMotive("City"), - # msb.OverlapMotive("websites", 1), - # ], - # [ - # msb.EquivalenceMotive("Age"), - # msb.EquivalenceMotive("City"), - # msb.OverlapMotive("websites", 1), - # ], - # [ - # msb.EquivalenceMotive("Age"), - # msb.EquivalenceMotive("City"), - # msb.OverlapMotive("websites", 1), - # ], - # [msb.OverlapMotive("websites", 1)], - # [msb.OverlapMotive("websites", 1)], - # [msb.EquivalenceMotive("Age"), msb.EquivalenceMotive("City")], - # [msb.EquivalenceMotive("Age"), msb.EquivalenceMotive("City")], - # [msb.OverlapMotive("websites", 1)], - # [msb.OverlapMotive("websites", 1)], - # [msb.OverlapMotive("websites", 1)], - # ] - @pytest.fixture def city_age_websites_pipelining_scores(): return [3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1] +@pytest.fixture +def city_age_websites_pipelining_scores_not_show_as_pairs(): + return [3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1] + + @pytest.fixture def city_age_not_different(): return {frozenset({1, 4}), frozenset({8, 11}), frozenset({2, 5})} @@ -386,9 +350,10 @@ def test_pipelining_motives(city_age_websites_pipelining_motives): websites_blocker = msb.OverlapBlocker(["websites"]) final_blocker = (city_blocker & age_blocker) | websites_blocker links = final_blocker.block(get_users(), motives=True) - actual = msb.add_blocks_to_dataset( + motives = msb.add_blocks_to_dataset( # Use set to ignore 
ordering get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False )["_motive"].to_list() + actual = [set(motive) for motive in motives] assert actual == expected @@ -412,19 +377,26 @@ def test_pipelining_scores(city_age_websites_pipelining_scores): assert actual == expected -# def test_pipelining_scores_without_show_as_pairs(city_age_websites_pipelining_scores): -# """Test that scoring does work as intended""" -# expected = city_age_websites_pipelining_scores -# city_blocker = msb.AttributeEquivalenceBlocker(["City"]) -# age_blocker = msb.AttributeEquivalenceBlocker(["Age"]) -# websites_blocker = msb.OverlapBlocker(["websites"]) -# final_blocker = (city_blocker & age_blocker) | websites_blocker -# links = final_blocker.block(get_users(), motives=True) -# report = msb.add_blocks_to_dataset( -# get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False, score=True -# ) -# actual = sorted(msb.scoring(report), reverse=True) -# assert actual == expected +def test_pipelining_scores_without_show_as_pairs( + city_age_websites_pipelining_scores_not_show_as_pairs, +): + """Test that scoring does work as intended""" + expected = city_age_websites_pipelining_scores_not_show_as_pairs + city_blocker = msb.AttributeEquivalenceBlocker(["City"]) + age_blocker = msb.AttributeEquivalenceBlocker(["Age"]) + websites_blocker = msb.OverlapBlocker(["websites"]) + final_blocker = (city_blocker & age_blocker) | websites_blocker + links = final_blocker.block(get_users(), motives=True) + report = msb.add_blocks_to_dataset( + get_users(), + links, + show_as_pairs=False, + motives=True, + merge_blocks=False, + score=True, + ) + actual = sorted(report["_score"], reverse=True) + assert actual == expected def test_merge_blockers_aa(): From 460808d7e4557bfd6947726a73a4573e376e29a7 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 16:33:06 +0100 Subject: [PATCH 10/20] style: reformat --- tests/test_ms_blocking.py | 24 ++++++++++++------------ 1 file changed, 
12 insertions(+), 12 deletions(-) diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py index 7fdaaa5..cf92924 100644 --- a/tests/test_ms_blocking.py +++ b/tests/test_ms_blocking.py @@ -127,18 +127,18 @@ def city_age_name_websites_pipelining_id(): def city_age_websites_pipelining_motives(): return [ {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"}, - {">=1 overlap in 'websites'"}, - {">=1 overlap in 'websites'"}, - {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"}, - {">=1 overlap in 'websites'"}, - {">=1 overlap in 'websites'"}, - {">=1 overlap in 'websites'"}, - {">=1 overlap in 'websites'"}, - {"Same 'City'", "Same 'Age'"}, - {"Same 'City'", "Same 'Age'"}, - {">=1 overlap in 'websites'"}, - {">=1 overlap in 'websites'"}, - {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {"Same 'City'", "Same 'Age'"}, + {"Same 'City'", "Same 'Age'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, ] From 9e722604e8a27973e01f5a452f57e6aedbf356bf Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 16:35:00 +0100 Subject: [PATCH 11/20] style: remove obsolete comments --- src/ms_blocking/ms_blocking.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index d47ceb7..5571ba0 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -657,9 +657,6 @@ def add_blocks_to_dataset( } output_data["_score"] = output_data.index.map(score_matcher) - # if "_block" not in output_data.columns: # Empty coords - # output_data["_block"] = -1 - output_data = output_data.reset_index(drop=True) output_data["_block"] = output_data["_block"].astype(int) From 
41e8defbbc31fec7fad983e70adcb33d8a8954c9 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 16:37:54 +0100 Subject: [PATCH 12/20] docs: fix discarded reference --- docs/example.ipynb | 270 ++++++++++++++++++++++----------------------- 1 file changed, 135 insertions(+), 135 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 32ee69d..f3d0353 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -32,15 +32,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:00.408434200Z", - "start_time": "2026-02-03T15:25:59.668629400Z" + "end_time": "2026-02-03T15:36:04.452948500Z", + "start_time": "2026-02-03T15:36:03.131330Z" } }, "source": [ "import ms_blocking.ms_blocking as msb" ], "outputs": [], - "execution_count": 1 + "execution_count": 2 }, { "cell_type": "markdown", @@ -60,8 +60,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:00.464804400Z", - "start_time": "2026-02-03T15:26:00.408434200Z" + "end_time": "2026-02-03T15:36:04.676076Z", + "start_time": "2026-02-03T15:36:04.488835200Z" } }, "source": [ @@ -250,12 +250,12 @@ "
" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 2 + "execution_count": 3 }, { "cell_type": "markdown", @@ -282,15 +282,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:00.723249900Z", - "start_time": "2026-02-03T15:26:00.545044Z" + "end_time": "2026-02-03T15:36:04.943687900Z", + "start_time": "2026-02-03T15:36:04.758421500Z" } }, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])" ], "outputs": [], - "execution_count": 3 + "execution_count": 4 }, { "cell_type": "markdown", @@ -310,8 +310,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:00.930325600Z", - "start_time": "2026-02-03T15:26:00.842587Z" + "end_time": "2026-02-03T15:36:05.205832200Z", + "start_time": "2026-02-03T15:36:05.172076200Z" } }, "source": [ @@ -326,7 +326,7 @@ ] } ], - "execution_count": 4 + "execution_count": 5 }, { "cell_type": "markdown", @@ -339,8 +339,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:01.002006Z", - "start_time": "2026-02-03T15:26:00.984929700Z" + "end_time": "2026-02-03T15:36:05.479610700Z", + "start_time": "2026-02-03T15:36:05.419422900Z" } }, "source": [ @@ -358,19 +358,19 @@ " frozenset({10, 13})}" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 5 + "execution_count": 6 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:01.389874900Z", - "start_time": "2026-02-03T15:26:01.189496400Z" + "end_time": "2026-02-03T15:36:05.687275800Z", + "start_time": "2026-02-03T15:36:05.545108700Z" } }, "source": [ @@ -396,7 +396,7 @@ } } ], - "execution_count": 6 + "execution_count": 7 }, { "cell_type": "markdown", @@ -409,8 +409,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:01.488509700Z", - "start_time": 
"2026-02-03T15:26:01.458139Z" + "end_time": "2026-02-03T15:36:05.722622300Z", + "start_time": "2026-02-03T15:36:05.695740900Z" } }, "source": [ @@ -556,12 +556,12 @@ "
" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 7 + "execution_count": 8 }, { "cell_type": "markdown", @@ -574,8 +574,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:01.849762800Z", - "start_time": "2026-02-03T15:26:01.604523100Z" + "end_time": "2026-02-03T15:36:06.032894900Z", + "start_time": "2026-02-03T15:36:05.817497100Z" } }, "source": [ @@ -590,12 +590,12 @@ "array([-1, 0, 1, 2, 0, 1, -1, -1, 2, -1, 3, 2, -1, 3])" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 8 + "execution_count": 9 }, { "cell_type": "markdown", @@ -622,8 +622,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:02.327630200Z", - "start_time": "2026-02-03T15:26:02.082466800Z" + "end_time": "2026-02-03T15:36:06.307293Z", + "start_time": "2026-02-03T15:36:06.224025900Z" } }, "source": [ @@ -734,12 +734,12 @@ "
" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 9 + "execution_count": 10 }, { "cell_type": "markdown", @@ -759,8 +759,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:02.765309400Z", - "start_time": "2026-02-03T15:26:02.567839300Z" + "end_time": "2026-02-03T15:36:07.067841300Z", + "start_time": "2026-02-03T15:36:06.923106400Z" } }, "source": [ @@ -783,7 +783,7 @@ } } ], - "execution_count": 10 + "execution_count": 11 }, { "cell_type": "markdown", @@ -796,8 +796,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:03.163072900Z", - "start_time": "2026-02-03T15:26:03.015158500Z" + "end_time": "2026-02-03T15:36:07.208452100Z", + "start_time": "2026-02-03T15:36:07.146327700Z" } }, "source": [ @@ -932,12 +932,12 @@ "
" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 11 + "execution_count": 12 }, { "cell_type": "markdown", @@ -971,8 +971,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:03.713908200Z", - "start_time": "2026-02-03T15:26:03.587227900Z" + "end_time": "2026-02-03T15:36:07.531853600Z", + "start_time": "2026-02-03T15:36:07.348460600Z" } }, "source": [ @@ -1050,12 +1050,12 @@ "
" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 12 + "execution_count": 13 }, { "cell_type": "markdown", @@ -1075,8 +1075,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:04.512418700Z", - "start_time": "2026-02-03T15:26:04.371414700Z" + "end_time": "2026-02-03T15:36:08.439928700Z", + "start_time": "2026-02-03T15:36:08.350546900Z" } }, "source": [ @@ -1089,7 +1089,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n" + "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n" ] }, { @@ -1198,12 +1198,12 @@ "
" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 13 + "execution_count": 14 }, { "cell_type": "markdown", @@ -1223,8 +1223,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:05.232296700Z", - "start_time": "2026-02-03T15:26:05.138463900Z" + "end_time": "2026-02-03T15:36:08.853151500Z", + "start_time": "2026-02-03T15:36:08.721190500Z" } }, "source": [ @@ -1324,12 +1324,12 @@ "
" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 14 + "execution_count": 15 }, { "cell_type": "markdown", @@ -1342,8 +1342,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:05.746292700Z", - "start_time": "2026-02-03T15:26:05.615214500Z" + "end_time": "2026-02-03T15:36:09.489967600Z", + "start_time": "2026-02-03T15:36:09.326530600Z" } }, "source": [ @@ -1401,12 +1401,12 @@ "
" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 15 + "execution_count": 16 }, { "cell_type": "markdown", @@ -1440,8 +1440,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:06.550041700Z", - "start_time": "2026-02-03T15:26:06.378265100Z" + "end_time": "2026-02-03T15:36:10.401390200Z", + "start_time": "2026-02-03T15:36:10.365448300Z" } }, "source": [ @@ -1458,14 +1458,14 @@ ] } ], - "execution_count": 16 + "execution_count": 17 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:06.933740700Z", - "start_time": "2026-02-03T15:26:06.700136700Z" + "end_time": "2026-02-03T15:36:10.570875500Z", + "start_time": "2026-02-03T15:36:10.489956600Z" } }, "source": [ @@ -1564,12 +1564,12 @@ "
" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 17 + "execution_count": 18 }, { "cell_type": "markdown", @@ -1589,8 +1589,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:07.724806800Z", - "start_time": "2026-02-03T15:26:07.416889200Z" + "end_time": "2026-02-03T15:36:10.930134200Z", + "start_time": "2026-02-03T15:36:10.722500400Z" } }, "source": [ @@ -1756,12 +1756,12 @@ "
" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 18 + "execution_count": 19 }, { "cell_type": "markdown", @@ -1804,8 +1804,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:08.366574400Z", - "start_time": "2026-02-03T15:26:08.287314300Z" + "end_time": "2026-02-03T15:36:11.472124800Z", + "start_time": "2026-02-03T15:36:11.410766500Z" } }, "source": [ @@ -1815,7 +1815,7 @@ "websites_blocker = msb.OverlapBlocker([\"websites\"])" ], "outputs": [], - "execution_count": 19 + "execution_count": 20 }, { "cell_type": "markdown", @@ -1828,15 +1828,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:08.624518900Z", - "start_time": "2026-02-03T15:26:08.604191500Z" + "end_time": "2026-02-03T15:36:11.730809800Z", + "start_time": "2026-02-03T15:36:11.717895300Z" } }, "source": [ "final_blocker = (city_blocker & age_blocker) | (name_blocker & websites_blocker)" ], "outputs": [], - "execution_count": 20 + "execution_count": 21 }, { "cell_type": "markdown", @@ -1849,8 +1849,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:08.886089600Z", - "start_time": "2026-02-03T15:26:08.721474Z" + "end_time": "2026-02-03T15:36:12.008762600Z", + "start_time": "2026-02-03T15:36:11.829817400Z" } }, "source": [ @@ -1862,7 +1862,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n", + "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n", "Processing MixedBlocker(['Name'], ['websites'], 1)\n" ] }, @@ -1972,12 +1972,12 @@ "
" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 21 + "execution_count": 22 }, { "cell_type": "markdown", @@ -1990,8 +1990,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:09.251246500Z", - "start_time": "2026-02-03T15:26:09.080396800Z" + "end_time": "2026-02-03T15:36:12.404335300Z", + "start_time": "2026-02-03T15:36:12.172638Z" } }, "source": [ @@ -2007,7 +2007,7 @@ ] } ], - "execution_count": 22 + "execution_count": 23 }, { "cell_type": "markdown", @@ -2034,8 +2034,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:09.530329Z", - "start_time": "2026-02-03T15:26:09.486287900Z" + "end_time": "2026-02-03T15:36:12.721833200Z", + "start_time": "2026-02-03T15:36:12.589340400Z" } }, "source": [ @@ -2181,12 +2181,12 @@ "
" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 23 + "execution_count": 24 }, { "cell_type": "markdown", @@ -2213,8 +2213,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:09.985303200Z", - "start_time": "2026-02-03T15:26:09.845263800Z" + "end_time": "2026-02-03T15:36:13.208456200Z", + "start_time": "2026-02-03T15:36:13.112548200Z" } }, "source": [ @@ -2415,12 +2415,12 @@ "
" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 24 + "execution_count": 25 }, { "cell_type": "markdown", @@ -2443,8 +2443,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:10.930371500Z", - "start_time": "2026-02-03T15:26:10.809849600Z" + "end_time": "2026-02-03T15:36:14.249378400Z", + "start_time": "2026-02-03T15:36:14.008531Z" } }, "source": [ @@ -2568,12 +2568,12 @@ "" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 25 + "execution_count": 26 }, { "cell_type": "markdown", @@ -2593,8 +2593,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:11.634404900Z", - "start_time": "2026-02-03T15:26:11.403226800Z" + "end_time": "2026-02-03T15:36:14.709861100Z", + "start_time": "2026-02-03T15:36:14.517552400Z" } }, "source": [ @@ -2621,12 +2621,12 @@ " frozenset({3, 11}): [EquivalenceMotive(['City'])]}" ] }, - "execution_count": 26, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 26 + "execution_count": 27 }, { "cell_type": "markdown", @@ -2644,8 +2644,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:12.214789500Z", - "start_time": "2026-02-03T15:26:12.007748800Z" + "end_time": "2026-02-03T15:36:15.116572300Z", + "start_time": "2026-02-03T15:36:15.007172300Z" } }, "source": [ @@ -2801,12 +2801,12 @@ "" ] }, - "execution_count": 27, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 27 + "execution_count": 28 }, { "cell_type": "markdown", @@ -2822,8 +2822,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:12.610291100Z", - "start_time": "2026-02-03T15:26:12.498335600Z" + "end_time": "2026-02-03T15:36:15.563997200Z", + "start_time": "2026-02-03T15:36:15.425225900Z" } 
}, "source": [ @@ -2986,12 +2986,12 @@ "" ] }, - "execution_count": 28, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 28 + "execution_count": 29 }, { "cell_type": "markdown", @@ -3004,8 +3004,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:13.272171600Z", - "start_time": "2026-02-03T15:26:13.063070700Z" + "end_time": "2026-02-03T15:36:16.181630Z", + "start_time": "2026-02-03T15:36:16.065192300Z" } }, "source": [ @@ -3112,12 +3112,12 @@ "" ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 29 + "execution_count": 30 }, { "cell_type": "markdown", @@ -3131,8 +3131,8 @@ "metadata": { "scrolled": true, "ExecuteTime": { - "end_time": "2026-02-03T15:26:14.300257400Z", - "start_time": "2026-02-03T15:26:13.981549200Z" + "end_time": "2026-02-03T15:36:17.213402500Z", + "start_time": "2026-02-03T15:36:17.028434800Z" } }, "source": [ @@ -3156,7 +3156,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n", + "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n", "Processing OverlapBlocker(['websites'], 1)\n" ] }, @@ -3179,16 +3179,16 @@ "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", " _motive _block \n", - "0 [Same 'Age', Same 'City', >=1 overlap in 'webs... 0 \n", + "0 [Same 'City', Same 'Age', >=1 overlap in 'webs... 0 \n", "1 [>=1 overlap in 'websites'] 0 \n", "2 [>=1 overlap in 'websites'] 0 \n", - "3 [Same 'Age', Same 'City', >=1 overlap in 'webs... 1 \n", + "3 [Same 'City', Same 'Age', >=1 overlap in 'webs... 
1 \n", "4 [>=1 overlap in 'websites'] 1 \n", "5 [>=1 overlap in 'websites'] 1 \n", "6 [>=1 overlap in 'websites'] 1 \n", "7 [>=1 overlap in 'websites'] 1 \n", - "8 [Same 'Age', Same 'City'] 2 \n", - "9 [Same 'Age', Same 'City'] 3 \n", + "8 [Same 'City', Same 'Age'] 2 \n", + "9 [Same 'City', Same 'Age'] 3 \n", "10 [>=1 overlap in 'websites'] 4 \n", "11 [>=1 overlap in 'websites'] 4 \n", "12 [>=1 overlap in 'websites'] 4 " @@ -3227,7 +3227,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " [Same 'City', Same 'Age', >=1 overlap in 'webs...\n", " 0\n", " \n", " \n", @@ -3254,7 +3254,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " [Same 'City', Same 'Age', >=1 overlap in 'webs...\n", " 1\n", " \n", " \n", @@ -3299,7 +3299,7 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", - " [Same 'Age', Same 'City']\n", + " [Same 'City', Same 'Age']\n", " 2\n", " \n", " \n", @@ -3308,7 +3308,7 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", - " [Same 'Age', Same 'City']\n", + " [Same 'City', Same 'Age']\n", " 3\n", " \n", " \n", @@ -3343,12 +3343,12 @@ "" ] }, - "execution_count": 30, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 30 + "execution_count": 31 }, { "cell_type": "markdown", @@ -3358,14 +3358,14 @@ { "cell_type": "markdown", "metadata": {}, - "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `scoring` gives you an indicator of the likelihood of rows behing duplicates based on the number of distinct motives." + "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `score=True` gives you an indicator of the likelihood of rows behing duplicates based on the number of distinct motives." 
}, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:26:15.195066300Z", - "start_time": "2026-02-03T15:26:14.996741600Z" + "end_time": "2026-02-03T15:36:17.696557900Z", + "start_time": "2026-02-03T15:36:17.550771100Z" } }, "source": [ @@ -3400,10 +3400,10 @@ "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", " _motive _score _block \n", - "0 [Same 'Age', Same 'City', >=1 overlap in 'webs... 3 0 \n", - "3 [Same 'Age', Same 'City', >=1 overlap in 'webs... 3 1 \n", - "8 [Same 'Age', Same 'City'] 2 2 \n", - "9 [Same 'Age', Same 'City'] 2 3 \n", + "0 [Same 'City', Same 'Age', >=1 overlap in 'webs... 3 0 \n", + "3 [Same 'City', Same 'Age', >=1 overlap in 'webs... 3 1 \n", + "8 [Same 'City', Same 'Age'] 2 2 \n", + "9 [Same 'City', Same 'Age'] 2 3 \n", "1 [>=1 overlap in 'websites'] 1 0 \n", "4 [>=1 overlap in 'websites'] 1 1 \n", "2 [>=1 overlap in 'websites'] 1 0 \n", @@ -3449,7 +3449,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " [Same 'City', Same 'Age', >=1 overlap in 'webs...\n", " 3\n", " 0\n", " \n", @@ -3459,7 +3459,7 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " [Same 'Age', Same 'City', >=1 overlap in 'webs...\n", + " [Same 'City', Same 'Age', >=1 overlap in 'webs...\n", " 3\n", " 1\n", " \n", @@ -3469,7 +3469,7 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", - " [Same 'Age', Same 'City']\n", + " [Same 'City', Same 'Age']\n", " 2\n", " 2\n", " \n", @@ -3479,7 +3479,7 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", - " [Same 'Age', Same 'City']\n", + " [Same 'City', Same 'Age']\n", " 2\n", " 3\n", " \n", @@ -3578,12 +3578,12 @@ "" ] }, - "execution_count": 31, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 31 + "execution_count": 32 } ], "metadata": { From b4441c402fe4d45389a120539c28b2b92628cb88 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 17:07:48 +0100 Subject: 
[PATCH 13/20] fix: switche must_not_be_different and normalize_strings --- docs/example.ipynb | 2087 +++----------------------------- src/ms_blocking/ms_blocking.py | 17 +- 2 files changed, 190 insertions(+), 1914 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index f3d0353..aef6ee6 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -32,15 +32,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:04.452948500Z", - "start_time": "2026-02-03T15:36:03.131330Z" + "end_time": "2026-02-03T16:02:56.751154300Z", + "start_time": "2026-02-03T16:02:55.924397100Z" } }, "source": [ "import ms_blocking.ms_blocking as msb" ], "outputs": [], - "execution_count": 2 + "execution_count": 1 }, { "cell_type": "markdown", @@ -60,8 +60,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:04.676076Z", - "start_time": "2026-02-03T15:36:04.488835200Z" + "end_time": "2026-02-03T16:02:56.810955300Z", + "start_time": "2026-02-03T16:02:56.751154300Z" } }, "source": [ @@ -250,12 +250,12 @@ "" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 3 + "execution_count": 2 }, { "cell_type": "markdown", @@ -282,15 +282,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:04.943687900Z", - "start_time": "2026-02-03T15:36:04.758421500Z" + "end_time": "2026-02-03T16:02:56.966380500Z", + "start_time": "2026-02-03T16:02:56.862834100Z" } }, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])" ], "outputs": [], - "execution_count": 4 + "execution_count": 3 }, { "cell_type": "markdown", @@ -310,8 +310,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:05.205832200Z", - "start_time": "2026-02-03T15:36:05.172076200Z" + "end_time": "2026-02-03T16:02:57.285912400Z", + "start_time": "2026-02-03T16:02:57.147878900Z" } }, "source": [ @@ -322,11 
+322,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" + "Processing AttributeEquivalenceBlocker(['City'])\n" ] } ], - "execution_count": 5 + "execution_count": 4 }, { "cell_type": "markdown", @@ -339,8 +339,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:05.479610700Z", - "start_time": "2026-02-03T15:36:05.419422900Z" + "end_time": "2026-02-03T16:02:57.479607Z", + "start_time": "2026-02-03T16:02:57.418159200Z" } }, "source": [ @@ -358,19 +358,19 @@ " frozenset({10, 13})}" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 6 + "execution_count": 5 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:05.687275800Z", - "start_time": "2026-02-03T15:36:05.545108700Z" + "end_time": "2026-02-03T16:02:57.776512200Z", + "start_time": "2026-02-03T16:02:57.565676Z" } }, "source": [ @@ -396,7 +396,7 @@ } } ], - "execution_count": 7 + "execution_count": 6 }, { "cell_type": "markdown", @@ -409,8 +409,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:05.722622300Z", - "start_time": "2026-02-03T15:36:05.695740900Z" + "end_time": "2026-02-03T16:02:57.810023Z", + "start_time": "2026-02-03T16:02:57.778482900Z" } }, "source": [ @@ -556,12 +556,12 @@ "" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 8 + "execution_count": 7 }, { "cell_type": "markdown", @@ -574,8 +574,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:06.032894900Z", - "start_time": "2026-02-03T15:36:05.817497100Z" + "end_time": "2026-02-03T16:02:58.075057800Z", + "start_time": "2026-02-03T16:02:57.893294100Z" } }, "source": [ @@ -590,12 +590,12 @@ "array([-1, 0, 1, 2, 0, 1, -1, -1, 2, -1, 3, 2, -1, 3])" ] }, - "execution_count": 9, + 
"execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 9 + "execution_count": 8 }, { "cell_type": "markdown", @@ -622,8 +622,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:06.307293Z", - "start_time": "2026-02-03T15:36:06.224025900Z" + "end_time": "2026-02-03T16:02:58.413477400Z", + "start_time": "2026-02-03T16:02:58.285492900Z" } }, "source": [ @@ -734,12 +734,12 @@ "" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 10 + "execution_count": 9 }, { "cell_type": "markdown", @@ -759,8 +759,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:07.067841300Z", - "start_time": "2026-02-03T15:36:06.923106400Z" + "end_time": "2026-02-03T16:02:58.887317800Z", + "start_time": "2026-02-03T16:02:58.675247500Z" } }, "source": [ @@ -783,7 +783,7 @@ } } ], - "execution_count": 11 + "execution_count": 10 }, { "cell_type": "markdown", @@ -796,8 +796,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:07.208452100Z", - "start_time": "2026-02-03T15:36:07.146327700Z" + "end_time": "2026-02-03T16:02:59.272554700Z", + "start_time": "2026-02-03T16:02:59.130460300Z" } }, "source": [ @@ -932,12 +932,12 @@ "" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 12 + "execution_count": 11 }, { "cell_type": "markdown", @@ -971,8 +971,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:07.531853600Z", - "start_time": "2026-02-03T15:36:07.348460600Z" + "end_time": "2026-02-03T16:02:59.806784300Z", + "start_time": "2026-02-03T16:02:59.686250600Z" } }, "source": [ @@ -1050,12 +1050,12 @@ "" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 13 + "execution_count": 12 }, { 
"cell_type": "markdown", @@ -1075,8 +1075,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:08.439928700Z", - "start_time": "2026-02-03T15:36:08.350546900Z" + "end_time": "2026-02-03T16:03:00.721777Z", + "start_time": "2026-02-03T16:03:00.603955400Z" } }, "source": [ @@ -1089,7 +1089,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n" + "Processing AttributeEquivalenceBlocker(['Age', 'City'])\n" ] }, { @@ -1198,12 +1198,12 @@ "" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 14 + "execution_count": 13 }, { "cell_type": "markdown", @@ -1223,8 +1223,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:08.853151500Z", - "start_time": "2026-02-03T15:36:08.721190500Z" + "end_time": "2026-02-03T16:03:01.209432600Z", + "start_time": "2026-02-03T16:03:01.048013600Z" } }, "source": [ @@ -1237,7 +1237,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Name'], [])\n" + "Processing AttributeEquivalenceBlocker(['Name'])\n" ] }, { @@ -1324,12 +1324,12 @@ "" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 15 + "execution_count": 14 }, { "cell_type": "markdown", @@ -1342,8 +1342,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:09.489967600Z", - "start_time": "2026-02-03T15:36:09.326530600Z" + "end_time": "2026-02-03T16:03:01.834433100Z", + "start_time": "2026-02-03T16:03:01.686309100Z" } }, "source": [ @@ -1358,7 +1358,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Name'], [])\n" + "Processing AttributeEquivalenceBlocker(['Name'], NON-NORMALIZED)\n" ] }, { @@ -1401,12 +1401,12 @@ "" ] }, - "execution_count": 16, + "execution_count": 
15, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 16 + "execution_count": 15 }, { "cell_type": "markdown", @@ -1440,8 +1440,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:10.401390200Z", - "start_time": "2026-02-03T15:36:10.365448300Z" + "end_time": "2026-02-03T16:03:02.711968Z", + "start_time": "2026-02-03T16:03:02.581163100Z" } }, "source": [ @@ -1453,19 +1453,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n", + "Processing AttributeEquivalenceBlocker(['City'])\n", "Processing OverlapBlocker(['websites'], 1)\n" ] } ], - "execution_count": 17 + "execution_count": 16 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-02-03T15:36:10.570875500Z", - "start_time": "2026-02-03T15:36:10.489956600Z" + "end_time": "2026-02-03T16:03:03.614029700Z", + "start_time": "2026-02-03T16:03:02.835393200Z" } }, "source": [ @@ -1477,99 +1477,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing MixedBlocker(['City'], ['websites'], 1)\n" + "Processing " ] }, { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 10 Caroline Dufour Lens 45 \n", - "3 13 Benoît Benoît Lens 15 \n", - "\n", - " websites _block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['pythonensamusant.fr', 'lensfans.fr'] 1 \n", - "3 ['lensfans.fr'] 1 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
210Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']1
313Benoît BenoîtLens15['lensfans.fr']1
\n", - "
" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" + "ename": "TypeError", + "evalue": "object of type 'bool' has no len()", + "output_type": "error", + "traceback": [ + "\u001B[31m---------------------------------------------------------------------------\u001B[39m", + "\u001B[31mTypeError\u001B[39m Traceback (most recent call last)", + "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[17]\u001B[39m\u001B[32m, line 1\u001B[39m\n\u001B[32m----> \u001B[39m\u001B[32m1\u001B[39m links = \u001B[43m(\u001B[49m\u001B[43mcity_blocker\u001B[49m\u001B[43m \u001B[49m\u001B[43m&\u001B[49m\u001B[43m \u001B[49m\u001B[43mwebsites_blocker\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43mblock\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 2\u001B[39m msb.add_blocks_to_dataset(df, links)\n", + "\u001B[36mFile \u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:383\u001B[39m, in \u001B[36mMixedBlocker.block\u001B[39m\u001B[34m(self, data, motives)\u001B[39m\n\u001B[32m 380\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34mblock\u001B[39m(\u001B[38;5;28mself\u001B[39m, data, motives=\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[32m 381\u001B[39m \u001B[38;5;250m \u001B[39m\u001B[33;03m\"\"\"Regroup rows based on overlap of one or more columns\"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m383\u001B[39m \u001B[38;5;28;43mprint\u001B[39;49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mProcessing\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[32m 385\u001B[39m total_columns = \u001B[38;5;28mself\u001B[39m.equivalence_columns + \u001B[38;5;28mself\u001B[39m.overlap_columns\n\u001B[32m 387\u001B[39m temp_data = data[total_columns].copy()\n", + "\u001B[36mFile 
\u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:345\u001B[39m, in \u001B[36mMixedBlocker.__repr__\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 342\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34m__repr__\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[32m 343\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(\n\u001B[32m 344\u001B[39m AndNode(\n\u001B[32m--> \u001B[39m\u001B[32m345\u001B[39m \u001B[43mAttributeEquivalenceBlocker\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 346\u001B[39m \u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mequivalence_columns\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mmust_not_be_different\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mnormalize\u001B[49m\n\u001B[32m 347\u001B[39m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m,\n\u001B[32m 348\u001B[39m OverlapBlocker(\n\u001B[32m 349\u001B[39m \u001B[38;5;28mself\u001B[39m.overlap_columns, \u001B[38;5;28mself\u001B[39m.overlap, \u001B[38;5;28mself\u001B[39m.word_level, \u001B[38;5;28mself\u001B[39m.normalize\n\u001B[32m 350\u001B[39m ),\n\u001B[32m 351\u001B[39m )\n\u001B[32m 352\u001B[39m )\n", + "\u001B[36mFile \u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:109\u001B[39m, in \u001B[36mAttributeEquivalenceBlocker.__init__\u001B[39m\u001B[34m(self, blocking_columns, normalize_strings, must_not_be_different)\u001B[39m\n\u001B[32m 107\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(must_not_be_different) \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28mstr\u001B[39m:\n\u001B[32m 108\u001B[39m must_not_be_different = [must_not_be_different]\n\u001B[32m--> \u001B[39m\u001B[32m109\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m 
\u001B[38;5;28;43mlen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mmust_not_be_different\u001B[49m\u001B[43m)\u001B[49m > \u001B[32m1\u001B[39m:\n\u001B[32m 110\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[33m\"\u001B[39m\u001B[33mThere must be only one extra column\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 111\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m (\n\u001B[32m 112\u001B[39m must_not_be_different\n\u001B[32m 113\u001B[39m \u001B[38;5;129;01mand\u001B[39;00m must_not_be_different[\u001B[32m0\u001B[39m] \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m.blocking_columns\n\u001B[32m 114\u001B[39m ):\n", + "\u001B[31mTypeError\u001B[39m: object of type 'bool' has no len()" + ] } ], - "execution_count": 18 + "execution_count": 17 }, { "cell_type": "markdown", @@ -1587,181 +1513,13 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:10.930134200Z", - "start_time": "2026-02-03T15:36:10.722500400Z" - } - }, + "metadata": {}, "source": [ "links = (city_blocker | websites_blocker).block(df)\n", "msb.add_blocks_to_dataset(df, links)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n", - "Processing OverlapBlocker(['websites'], 1)\n" - ] - }, - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 6 Jean-Michel Python Douai 49 \n", - "3 10 Caroline Dufour Lens 45 \n", - "4 13 Benoît Benoît Lens 15 \n", - "5 2 Pierre Dusquesnes Phalempin 24 \n", - "6 5 pierre dusquesnes Phalempin 24 \n", - "7 3 Paul Delarue Roubaix 32 \n", - "8 8 Sophie Delarue Roubaix 33 \n", - "9 11 sophie_delarue Roubaix 33 \n", - "\n", - " websites _block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 
0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['lensfans.fr', 'pythonensamusant.fr'] 0 \n", - "3 ['pythonensamusant.fr', 'lensfans.fr'] 0 \n", - "4 ['lensfans.fr'] 0 \n", - "5 ['somewebsite.com/users/rpz59'] 1 \n", - "6 [] 1 \n", - "7 ['roubaixlove.fr'] 2 \n", - "8 [] 2 \n", - "9 [] 2 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
26Jean-Michel PythonDouai49['lensfans.fr', 'pythonensamusant.fr']0
310Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']0
413Benoît BenoîtLens15['lensfans.fr']0
52Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1
65pierre dusquesnesPhalempin24[]1
73Paul DelarueRoubaix32['roubaixlove.fr']2
88Sophie DelarueRoubaix33[]2
911sophie_delarueRoubaix33[]2
\n", - "
" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 19 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -1802,12 +1560,7 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:11.472124800Z", - "start_time": "2026-02-03T15:36:11.410766500Z" - } - }, + "metadata": {}, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", "age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n", @@ -1815,7 +1568,7 @@ "websites_blocker = msb.OverlapBlocker([\"websites\"])" ], "outputs": [], - "execution_count": 20 + "execution_count": null }, { "cell_type": "markdown", @@ -1826,17 +1579,12 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:11.730809800Z", - "start_time": "2026-02-03T15:36:11.717895300Z" - } - }, + "metadata": {}, "source": [ "final_blocker = (city_blocker & age_blocker) | (name_blocker & websites_blocker)" ], "outputs": [], - "execution_count": 21 + "execution_count": null }, { "cell_type": "markdown", @@ -1847,137 +1595,13 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:12.008762600Z", - "start_time": "2026-02-03T15:36:11.829817400Z" - } - }, + "metadata": {}, "source": [ "links = final_blocker.block(df)\n", "msb.add_blocks_to_dataset(df, links)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n", - "Processing MixedBlocker(['Name'], ['websites'], 1)\n" - ] - }, - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 2 Pierre Dusquesnes Phalempin 24 \n", - "3 5 pierre dusquesnes Phalempin 24 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 11 sophie_delarue Roubaix 33 \n", - "\n", - " websites _block \n", - "0 
['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/rpz59'] 1 \n", - "3 [] 1 \n", - "4 [] 2 \n", - "5 [] 2 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
22Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1
35pierre dusquesnesPhalempin24[]1
48Sophie DelarueRoubaix33[]2
511sophie_delarueRoubaix33[]2
\n", - "
" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 22 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -1988,26 +1612,13 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:12.404335300Z", - "start_time": "2026-02-03T15:36:12.172638Z" - } - }, + "metadata": {}, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", "links = city_blocker.block(df)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" - ] - } - ], - "execution_count": 23 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2032,161 +1643,12 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:12.721833200Z", - "start_time": "2026-02-03T15:36:12.589340400Z" - } - }, + "metadata": {}, "source": [ "msb.add_blocks_to_dataset(df, links, sort=False)" ], - "outputs": [ - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 2 Pierre Dusquesnes Phalempin 24 \n", - "2 3 Paul Delarue Roubaix 32 \n", - "3 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "4 5 pierre dusquesnes Phalempin 24 \n", - "5 8 Sophie Delarue Roubaix 33 \n", - "6 10 Caroline Dufour Lens 45 \n", - "7 11 sophie_delarue Roubaix 33 \n", - "8 13 Benoît Benoît Lens 15 \n", - "\n", - " websites _block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['somewebsite.com/users/rpz59'] 1 \n", - "2 ['roubaixlove.fr'] 2 \n", - "3 ['jacquesdupond.fr'] 0 \n", - "4 [] 1 \n", - "5 [] 2 \n", - "6 ['pythonensamusant.fr', 'lensfans.fr'] 3 \n", - "7 [] 2 \n", - "8 ['lensfans.fr'] 3 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
12Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1
23Paul DelarueRoubaix32['roubaixlove.fr']2
34Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
45pierre dusquesnesPhalempin24[]1
58Sophie DelarueRoubaix33[]2
610Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']3
711sophie_delarueRoubaix33[]2
813Benoît BenoîtLens15['lensfans.fr']3
\n", - "
" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 24 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2211,216 +1673,12 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:13.208456200Z", - "start_time": "2026-02-03T15:36:13.112548200Z" - } - }, + "metadata": {}, "source": [ "msb.add_blocks_to_dataset(df, links, keep_ungrouped_rows=True)" ], - "outputs": [ - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 0 Jean d'Aux Lille 26 \n", - "1 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "2 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "3 2 Pierre Dusquesnes Phalempin 24 \n", - "4 5 pierre dusquesnes Phalempin 24 \n", - "5 3 Paul Delarue Roubaix 32 \n", - "6 8 Sophie Delarue Roubaix 33 \n", - "7 11 sophie_delarue Roubaix 33 \n", - "8 6 Jean-Michel Python Douai 49 \n", - "9 7 Gédéon Glincarné Paris 53 \n", - "10 9 Jeanne Verbrugge Valenciennes 41 \n", - "11 10 Caroline Dufour Lens 45 \n", - "12 13 Benoît Benoît Lens 15 \n", - "13 12 Marcel Vandermersch Fourmies 48 \n", - "\n", - " websites _block \n", - "0 ['jeandaux.fr', 'lillefans.fr'] 0 \n", - "1 ['somewebsite.com/users/jacquesdupond', 'jacqu... 1 \n", - "2 ['jacquesdupond.fr'] 1 \n", - "3 ['somewebsite.com/users/rpz59'] 2 \n", - "4 [] 2 \n", - "5 ['roubaixlove.fr'] 3 \n", - "6 [] 3 \n", - "7 [] 3 \n", - "8 ['lensfans.fr', 'pythonensamusant.fr'] 4 \n", - "9 ['lorem.fr'] 5 \n", - "10 ['somewebsite.com/users/jajanne59'] 6 \n", - "11 ['pythonensamusant.fr', 'lensfans.fr'] 7 \n", - "12 ['lensfans.fr'] 7 \n", - "13 ['lesrecettesdemarcel.fr'] 8 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
00Jean d'AuxLille26['jeandaux.fr', 'lillefans.fr']0
11Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...1
24Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']1
32Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']2
45pierre dusquesnesPhalempin24[]2
53Paul DelarueRoubaix32['roubaixlove.fr']3
68Sophie DelarueRoubaix33[]3
711sophie_delarueRoubaix33[]3
86Jean-Michel PythonDouai49['lensfans.fr', 'pythonensamusant.fr']4
97Gédéon GlincarnéParis53['lorem.fr']5
109Jeanne VerbruggeValenciennes41['somewebsite.com/users/jajanne59']6
1110Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']7
1213Benoît BenoîtLens15['lensfans.fr']7
1312Marcel VandermerschFourmies48['lesrecettesdemarcel.fr']8
\n", - "
" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 25 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2441,12 +1699,7 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:14.249378400Z", - "start_time": "2026-02-03T15:36:14.008531Z" - } - }, + "metadata": {}, "source": [ "city_blocker_not_different_age = msb.AttributeEquivalenceBlocker(\n", " [\"City\"], must_not_be_different=[\"Age\"]\n", @@ -2454,126 +1707,8 @@ "links = city_blocker_not_different_age.block(df)\n", "msb.add_blocks_to_dataset(df, links)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City'], ['Age'])\n" - ] - }, - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 2 Pierre Dusquesnes Phalempin 24 \n", - "3 5 pierre dusquesnes Phalempin 24 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 11 sophie_delarue Roubaix 33 \n", - "\n", - " websites _block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/rpz59'] 1 \n", - "3 [] 1 \n", - "4 [] 2 \n", - "5 [] 2 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
22Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1
35pierre dusquesnesPhalempin24[]1
48Sophie DelarueRoubaix33[]2
511sophie_delarueRoubaix33[]2
\n", - "
" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 26 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2591,42 +1726,14 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:14.709861100Z", - "start_time": "2026-02-03T15:36:14.517552400Z" - } - }, + "metadata": {}, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", "links = city_blocker.block(df, motives=True)\n", "links" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" - ] - }, - { - "data": { - "text/plain": [ - "{frozenset({1, 4}): [EquivalenceMotive(['City'])],\n", - " frozenset({8, 11}): [EquivalenceMotive(['City'])],\n", - " frozenset({2, 5}): [EquivalenceMotive(['City'])],\n", - " frozenset({10, 13}): [EquivalenceMotive(['City'])],\n", - " frozenset({3, 8}): [EquivalenceMotive(['City'])],\n", - " frozenset({3, 11}): [EquivalenceMotive(['City'])]}" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 27 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2640,715 +1747,82 @@ "Similarly, you may add `motives=True` to the `msb.add_blocks_to_dataset` function to see said motives:" ] }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "msb.add_blocks_to_dataset(df, links, motives=True)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "... though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..." + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "... 
which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "msb.add_blocks_to_dataset(df, links, motives=True, show_as_pairs=True)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If our dataset had many columns, the above output would be too large to easily be read, so we added the `output_columns` option:" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "msb.add_blocks_to_dataset(\n", + " df, links, motives=True, show_as_pairs=True, output_columns=[\"id\", \"Name\"]\n", + ")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Motives are dynamic:" + ] + }, { "cell_type": "code", "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:15.116572300Z", - "start_time": "2026-02-03T15:36:15.007172300Z" - } + "scrolled": true }, "source": [ - "msb.add_blocks_to_dataset(df, links, motives=True)" + "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", + "age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n", + "websites_blocker = msb.OverlapBlocker([\"websites\"])\n", + "final_blocker = (city_blocker & age_blocker) | websites_blocker\n", + "links = final_blocker.block(df, motives=True)\n", + "report = msb.add_blocks_to_dataset(\n", + " df,\n", + " links,\n", + " motives=True,\n", + " show_as_pairs=True,\n", + " output_columns=[\"id\", \"Name\"],\n", + " merge_blocks=False,\n", + ")\n", + "report" ], - "outputs": [ - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 2 Pierre Dusquesnes Phalempin 24 \n", - "3 5 pierre dusquesnes Phalempin 24 \n", - "4 3 Paul Delarue Roubaix 32 \n", - "5 8 Sophie Delarue Roubaix 33 \n", - "6 11 sophie_delarue Roubaix 33 
\n", - "7 10 Caroline Dufour Lens 45 \n", - "8 13 Benoît Benoît Lens 15 \n", - "\n", - " websites _block _motive \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 [Same 'City'] \n", - "1 ['jacquesdupond.fr'] 0 [Same 'City'] \n", - "2 ['somewebsite.com/users/rpz59'] 1 [Same 'City'] \n", - "3 [] 1 [Same 'City'] \n", - "4 ['roubaixlove.fr'] 2 [Same 'City'] \n", - "5 [] 2 [Same 'City'] \n", - "6 [] 2 [Same 'City'] \n", - "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 [Same 'City'] \n", - "8 ['lensfans.fr'] 3 [Same 'City'] " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block_motive
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0[Same 'City']
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0[Same 'City']
22Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1[Same 'City']
35pierre dusquesnesPhalempin24[]1[Same 'City']
43Paul DelarueRoubaix32['roubaixlove.fr']2[Same 'City']
58Sophie DelarueRoubaix33[]2[Same 'City']
611sophie_delarueRoubaix33[]2[Same 'City']
710Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']3[Same 'City']
813Benoît BenoîtLens15['lensfans.fr']3[Same 'City']
\n", - "
" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 28 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": "... though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..." - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": "... which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:" - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:15.563997200Z", - "start_time": "2026-02-03T15:36:15.425225900Z" - } - }, - "source": [ - "msb.add_blocks_to_dataset(df, links, motives=True, show_as_pairs=True)" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " id_l Name_l City_l Age_l \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 2 Pierre Dusquesnes Phalempin 24 \n", - "2 3 Paul Delarue Roubaix 32 \n", - "3 8 Sophie Delarue Roubaix 33 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 10 Caroline Dufour Lens 45 \n", - "\n", - " websites_l id_r Name_r \\\n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 4 Jacques Dupont \n", - "1 ['somewebsite.com/users/rpz59'] 5 pierre dusquesnes \n", - "2 ['roubaixlove.fr'] 11 sophie_delarue \n", - "3 [] 11 sophie_delarue \n", - "4 [] 3 Paul Delarue \n", - "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", - "\n", - " City_r Age_r websites_r _motive _block \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] [Same 'City'] 0 \n", - "1 Phalempin 24 [] [Same 'City'] 1 \n", - "2 Roubaix 33 [] [Same 'City'] 2 \n", - "3 Roubaix 33 [] [Same 'City'] 2 \n", - "4 Roubaix 32 ['roubaixlove.fr'] [Same 'City'] 2 \n", - "5 Lens 15 ['lensfans.fr'] [Same 'City'] 3 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lCity_lAge_lwebsites_lid_rName_rCity_rAge_rwebsites_r_motive_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...4Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr'][Same 'City']0
12Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']5pierre dusquesnesPhalempin24[][Same 'City']1
23Paul DelarueRoubaix32['roubaixlove.fr']11sophie_delarueRoubaix33[][Same 'City']2
38Sophie DelarueRoubaix33[]11sophie_delarueRoubaix33[][Same 'City']2
48Sophie DelarueRoubaix33[]3Paul DelarueRoubaix32['roubaixlove.fr'][Same 'City']2
510Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']13Benoît BenoîtLens15['lensfans.fr'][Same 'City']3
\n", - "
" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 29 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If our dataset had many columns, the above output would be too large to easily be read, so we added the `output_columns` option:" - ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:16.181630Z", - "start_time": "2026-02-03T15:36:16.065192300Z" - } - }, - "source": [ - "msb.add_blocks_to_dataset(\n", - " df, links, motives=True, show_as_pairs=True, output_columns=[\"id\", \"Name\"]\n", - ")" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " id_l Name_l id_r Name_r _motive _block\n", - "0 1 Jacques Dupond 4 Jacques Dupont [Same 'City'] 0\n", - "1 2 Pierre Dusquesnes 5 pierre dusquesnes [Same 'City'] 1\n", - "2 3 Paul Delarue 11 sophie_delarue [Same 'City'] 2\n", - "3 8 Sophie Delarue 11 sophie_delarue [Same 'City'] 2\n", - "4 8 Sophie Delarue 3 Paul Delarue [Same 'City'] 2\n", - "5 10 Caroline Dufour 13 Benoît Benoît [Same 'City'] 3" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lid_rName_r_motive_block
01Jacques Dupond4Jacques Dupont[Same 'City']0
12Pierre Dusquesnes5pierre dusquesnes[Same 'City']1
23Paul Delarue11sophie_delarue[Same 'City']2
38Sophie Delarue11sophie_delarue[Same 'City']2
48Sophie Delarue3Paul Delarue[Same 'City']2
510Caroline Dufour13Benoît Benoît[Same 'City']3
\n", - "
" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 30 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Motives are dynamic:" - ] - }, - { - "cell_type": "code", - "metadata": { - "scrolled": true, - "ExecuteTime": { - "end_time": "2026-02-03T15:36:17.213402500Z", - "start_time": "2026-02-03T15:36:17.028434800Z" - } - }, - "source": [ - "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", - "age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n", - "websites_blocker = msb.OverlapBlocker([\"websites\"])\n", - "final_blocker = (city_blocker & age_blocker) | websites_blocker\n", - "links = final_blocker.block(df, motives=True)\n", - "report = msb.add_blocks_to_dataset(\n", - " df,\n", - " links,\n", - " motives=True,\n", - " show_as_pairs=True,\n", - " output_columns=[\"id\", \"Name\"],\n", - " merge_blocks=False,\n", - ")\n", - "report" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City', 'Age'], [])\n", - "Processing OverlapBlocker(['websites'], 1)\n" - ] - }, - { - "data": { - "text/plain": [ - " id_l Name_l id_r Name_r \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont \n", - "1 1 Jacques Dupond 6 Jean-Michel Python \n", - "2 1 Jacques Dupond 10 Caroline Dufour \n", - "3 1 Jacques Dupond 4 Jacques Dupont \n", - "4 1 Jacques Dupond 6 Jean-Michel Python \n", - "5 1 Jacques Dupond 10 Caroline Dufour \n", - "6 10 Caroline Dufour 6 Jean-Michel Python \n", - "7 10 Caroline Dufour 13 Benoît Benoît \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n", - "9 8 Sophie Delarue 11 sophie_delarue \n", - "10 10 Caroline Dufour 6 Jean-Michel Python \n", - "11 10 Caroline Dufour 13 Benoît Benoît \n", - "12 13 Benoît Benoît 6 Jean-Michel Python \n", - "\n", - " _motive _block \n", - "0 [Same 'City', Same 'Age', >=1 overlap in 'webs... 
0 \n", - "1 [>=1 overlap in 'websites'] 0 \n", - "2 [>=1 overlap in 'websites'] 0 \n", - "3 [Same 'City', Same 'Age', >=1 overlap in 'webs... 1 \n", - "4 [>=1 overlap in 'websites'] 1 \n", - "5 [>=1 overlap in 'websites'] 1 \n", - "6 [>=1 overlap in 'websites'] 1 \n", - "7 [>=1 overlap in 'websites'] 1 \n", - "8 [Same 'City', Same 'Age'] 2 \n", - "9 [Same 'City', Same 'Age'] 3 \n", - "10 [>=1 overlap in 'websites'] 4 \n", - "11 [>=1 overlap in 'websites'] 4 \n", - "12 [>=1 overlap in 'websites'] 4 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lid_rName_r_motive_block
01Jacques Dupond4Jacques Dupont[Same 'City', Same 'Age', >=1 overlap in 'webs...0
11Jacques Dupond6Jean-Michel Python[>=1 overlap in 'websites']0
21Jacques Dupond10Caroline Dufour[>=1 overlap in 'websites']0
31Jacques Dupond4Jacques Dupont[Same 'City', Same 'Age', >=1 overlap in 'webs...1
41Jacques Dupond6Jean-Michel Python[>=1 overlap in 'websites']1
51Jacques Dupond10Caroline Dufour[>=1 overlap in 'websites']1
610Caroline Dufour6Jean-Michel Python[>=1 overlap in 'websites']1
710Caroline Dufour13Benoît Benoît[>=1 overlap in 'websites']1
82Pierre Dusquesnes5pierre dusquesnes[Same 'City', Same 'Age']2
98Sophie Delarue11sophie_delarue[Same 'City', Same 'Age']3
1010Caroline Dufour6Jean-Michel Python[>=1 overlap in 'websites']4
1110Caroline Dufour13Benoît Benoît[>=1 overlap in 'websites']4
1213Benoît Benoît6Jean-Michel Python[>=1 overlap in 'websites']4
\n", - "
" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 31 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -3362,12 +1836,7 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-02-03T15:36:17.696557900Z", - "start_time": "2026-02-03T15:36:17.550771100Z" - } - }, + "metadata": {}, "source": [ "report = msb.add_blocks_to_dataset(\n", " df,\n", @@ -3380,210 +1849,8 @@ ")\n", "report.sort_values(\"_score\", ascending=False)" ], - "outputs": [ - { - "data": { - "text/plain": [ - " id_l Name_l id_r Name_r \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont \n", - "3 1 Jacques Dupond 4 Jacques Dupont \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n", - "9 8 Sophie Delarue 11 sophie_delarue \n", - "1 1 Jacques Dupond 6 Jean-Michel Python \n", - "4 1 Jacques Dupond 6 Jean-Michel Python \n", - "2 1 Jacques Dupond 10 Caroline Dufour \n", - "6 10 Caroline Dufour 6 Jean-Michel Python \n", - "5 1 Jacques Dupond 10 Caroline Dufour \n", - "7 10 Caroline Dufour 13 Benoît Benoît \n", - "10 10 Caroline Dufour 6 Jean-Michel Python \n", - "11 10 Caroline Dufour 13 Benoît Benoît \n", - "12 13 Benoît Benoît 6 Jean-Michel Python \n", - "\n", - " _motive _score _block \n", - "0 [Same 'City', Same 'Age', >=1 overlap in 'webs... 3 0 \n", - "3 [Same 'City', Same 'Age', >=1 overlap in 'webs... 3 1 \n", - "8 [Same 'City', Same 'Age'] 2 2 \n", - "9 [Same 'City', Same 'Age'] 2 3 \n", - "1 [>=1 overlap in 'websites'] 1 0 \n", - "4 [>=1 overlap in 'websites'] 1 1 \n", - "2 [>=1 overlap in 'websites'] 1 0 \n", - "6 [>=1 overlap in 'websites'] 1 1 \n", - "5 [>=1 overlap in 'websites'] 1 1 \n", - "7 [>=1 overlap in 'websites'] 1 1 \n", - "10 [>=1 overlap in 'websites'] 1 4 \n", - "11 [>=1 overlap in 'websites'] 1 4 \n", - "12 [>=1 overlap in 'websites'] 1 4 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lid_rName_r_motive_score_block
01Jacques Dupond4Jacques Dupont[Same 'City', Same 'Age', >=1 overlap in 'webs...30
31Jacques Dupond4Jacques Dupont[Same 'City', Same 'Age', >=1 overlap in 'webs...31
82Pierre Dusquesnes5pierre dusquesnes[Same 'City', Same 'Age']22
98Sophie Delarue11sophie_delarue[Same 'City', Same 'Age']23
11Jacques Dupond6Jean-Michel Python[>=1 overlap in 'websites']10
41Jacques Dupond6Jean-Michel Python[>=1 overlap in 'websites']11
21Jacques Dupond10Caroline Dufour[>=1 overlap in 'websites']10
610Caroline Dufour6Jean-Michel Python[>=1 overlap in 'websites']11
51Jacques Dupond10Caroline Dufour[>=1 overlap in 'websites']11
710Caroline Dufour13Benoît Benoît[>=1 overlap in 'websites']11
1010Caroline Dufour6Jean-Michel Python[>=1 overlap in 'websites']14
1110Caroline Dufour13Benoît Benoît[>=1 overlap in 'websites']14
1213Benoît Benoît6Jean-Michel Python[>=1 overlap in 'websites']14
\n", - "
" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 32 + "outputs": [], + "execution_count": null } ], "metadata": { diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index 5571ba0..b27d3d1 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -92,7 +92,7 @@ class AttributeEquivalenceBlocker(BlockerNode): # Leaf """To regroup rows based on equality across columns.""" def __init__( - self, blocking_columns, normalize_strings=True, must_not_be_different=None + self, blocking_columns, must_not_be_different=None, normalize_strings=True ): super().__init__() @@ -121,7 +121,7 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"AttributeEquivalenceBlocker({self.blocking_columns}, {self.must_not_be_different})" + return f"AttributeEquivalenceBlocker({self.blocking_columns}{', ' + str(self.must_not_be_different) if self.must_not_be_different else ''}{', NON-NORMALIZED' if not self.normalize else ''})" def __eq__(self, other): if type(other) is AttributeEquivalenceBlocker: @@ -216,7 +216,7 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"OverlapBlocker({self.blocking_columns}, {self.overlap})" + return f"OverlapBlocker({self.blocking_columns}, {self.overlap}{', WORD-LEVEL' if self.word_level else ''}{', NON-NORMALIZED' if not self.normalize else ''})" def __eq__(self, other): if type(other) is OverlapBlocker: @@ -340,7 +340,16 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"MixedBlocker({self.equivalence_columns}, {self.overlap_columns}, {self.overlap})" + return str( + AndNode( 
+ AttributeEquivalenceBlocker( + self.equivalence_columns, self.must_not_be_different, self.normalize + ), + OverlapBlocker( + self.overlap_columns, self.overlap, self.word_level, self.normalize + ), + ) + ) def __eq__(self, other): if type(other) is AttributeEquivalenceBlocker: From 3643a0030afcfb4163a228c81a15a649b4b30bfc Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 17:24:49 +0100 Subject: [PATCH 14/20] docs: fix obsolete type in docstring --- src/ms_blocking/utils.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 7f271f5..100ab2e 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -12,12 +12,14 @@ class EquivalenceMotive: - def __init__(self, blocking_column): + def __init__(self, blocking_column: str): if not isinstance(blocking_column, str): raise TypeError("blocking_column for Motive must be a string") self.blocking_column = blocking_column - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: + if not isinstance(other, EquivalenceMotive | OverlapMotive): + raise TypeError("Can only compare Motives") return self.blocking_column == other.blocking_column def __str__(self): @@ -28,7 +30,9 @@ def __repr__(self): class OverlapMotive: - def __init__(self, blocking_column, overlap=1, word_level=False): + def __init__( + self, blocking_column: str, overlap: int = 1, word_level: bool = False + ): if not isinstance(blocking_column, str): raise TypeError("blocking_column for Motive must be a string") if not isinstance(overlap, int): @@ -39,7 +43,9 @@ def __init__(self, blocking_column, overlap=1, word_level=False): self.overlap = overlap self.word_level = word_level - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: + if not isinstance(other, EquivalenceMotive | OverlapMotive): + raise TypeError("Can only compare Motives") return ( self.blocking_column == other.blocking_column and self.overlap == other.overlap 
@@ -535,7 +541,7 @@ def solve_motives(motives: List[Motive]) -> List[Motive]: Examples -------- - >>> solve_motives([OverlapMotive(['websites'], 1), OverlapMotive(['websites'], 2), OverlapMotive(['websites'], 2, word_level=False)]) + >>> solve_motives([OverlapMotive('websites', 1), OverlapMotive('websites', 2), OverlapMotive('websites', 2, word_level=False)]) [OverlapMotive(['websites'], 2, word_level=False)] """ if not motives: From 86a443c9982ffb376ee61f6d0d7d444beda0337c Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Tue, 3 Feb 2026 17:40:18 +0100 Subject: [PATCH 15/20] docs: add typehints --- src/ms_blocking/ms_blocking.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index b27d3d1..cdfa664 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -48,7 +48,7 @@ def __init__(self, left, right): def __repr__(self): return f"AndNode{{{self.left}, {self.right}}}" - def block(self, df, motives=False): + def block(self, df: pd.DataFrame, motives: bool=False) -> Coords: # In order not to perform redundant computations, we first filter out the rows that were not considered by the first blocker before running the second blocker coords_left = self.left.block(df, motives=motives) @@ -78,7 +78,7 @@ def __init__(self, left, right): def __repr__(self): return f"OrNode{{{self.left}, {self.right}}}" - def block(self, df, motives=False): + def block(self, df: pd.DataFrame, motives: bool=False) -> Coords: # Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations coords_left = self.left.block(df, motives=motives) @@ -92,7 +92,7 @@ class AttributeEquivalenceBlocker(BlockerNode): # Leaf """To regroup rows based on equality across columns.""" def __init__( - self, blocking_columns, must_not_be_different=None, normalize_strings=True + self, 
blocking_columns: str|Collection[str], must_not_be_different: str|Collection[str]=None, normalize_strings: bool=True ): super().__init__() @@ -140,7 +140,7 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool=False) -> Coords: """Regroup rows based on equality of one or more columns""" print("Processing", self) @@ -196,7 +196,7 @@ class OverlapBlocker(BlockerNode): # Leaf """To regroup rows based on overlap of one or more columns.""" def __init__( - self, blocking_columns, overlap=1, word_level=False, normalize_strings=True + self, blocking_columns: str|Collection[str], overlap: int=1, word_level: bool=False, normalize_strings: bool=True ): super().__init__() @@ -237,7 +237,7 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool=False) -> Coords: """Regroup rows based on overlap of one or more columns""" print("Processing", self) @@ -291,12 +291,12 @@ class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM def __init__( self, - equivalence_columns, - overlap_columns, - must_not_be_different=None, - overlap=1, - word_level=False, - normalize_strings=True, + equivalence_columns: str|Collection[str], + overlap_columns: str|Collection[str], + must_not_be_different: str|Collection[str]=None, + overlap: int=1, + word_level: bool=False, + normalize_strings: bool=True, ): super().__init__() @@ -377,7 +377,7 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool=False) -> Coords: """Regroup rows based on overlap of one or more columns""" print("Processing", self) From 83b193229d03a43f473179ed8be0bb6ef9f6227c Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Wed, 4 Feb 2026 09:54:50 +0100 Subject: [PATCH 16/20] fix: parse_list crashes on strings that do not represent lists --- docs/example.ipynb | 45 
++++++++++++++++++++++++++++++++++ src/ms_blocking/ms_blocking.py | 33 +++++++++++++++---------- src/ms_blocking/utils.py | 21 +++++++++++----- 3 files changed, 80 insertions(+), 19 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index aef6ee6..8243053 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -1851,6 +1851,51 @@ ], "outputs": [], "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-03T16:43:50.398834500Z", + "start_time": "2026-02-03T16:43:50.048297Z" + } + }, + "cell_type": "code", + "source": [ + "city_blocker = msb.OverlapBlocker([\"City\"])\n", + "city_blocker.block(df)" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing OverlapBlocker(['City'], 1)\n" + ] + }, + { + "ename": "SyntaxError", + "evalue": "unterminated string literal (detected at line 1) (, line 1)", + "output_type": "error", + "traceback": [ + "Traceback \u001B[36m(most recent call last)\u001B[39m:\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\IPython\\core\\interactiveshell.py:3701\u001B[39m in \u001B[95mrun_code\u001B[39m\n exec(code_obj, self.user_global_ns, self.user_ns)\n", + " Cell \u001B[92mIn[19]\u001B[39m\u001B[92m, line 2\u001B[39m\n city_blocker.block(df)\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:250\u001B[39m in \u001B[95mblock\u001B[39m\n temp_data[col] = temp_data[col].apply(\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\series.py:4943\u001B[39m in \u001B[95mapply\u001B[39m\n ).apply()\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1422\u001B[39m in \u001B[95mapply\u001B[39m\n return self.apply_standard()\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1502\u001B[39m in \u001B[95mapply_standard\u001B[39m\n mapped = 
obj._map_values(\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\base.py:925\u001B[39m in \u001B[95m_map_values\u001B[39m\n return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001B[39m in \u001B[95mmap_array\u001B[39m\n return lib.map_infer(values, mapper, convert=convert)\n", + " File \u001B[92mpandas/_libs/lib.pyx:2999\u001B[39m in \u001B[95mpandas._libs.lib.map_infer\u001B[39m\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1491\u001B[39m in \u001B[95mcurried\u001B[39m\n return func(x, *self.args, **self.kwargs)\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\utils.py:374\u001B[39m in \u001B[95mparse_list\u001B[39m\n s = str(s).strip()\n", + " File \u001B[92m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\ast.py:66\u001B[39m in \u001B[95mliteral_eval\u001B[39m\n node_or_string = parse(node_or_string.lstrip(\" \\t\"), mode='eval')\n", + "\u001B[36m \u001B[39m\u001B[36mFile \u001B[39m\u001B[32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\ast.py:52\u001B[39m\u001B[36m in \u001B[39m\u001B[35mparse\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mreturn compile(source, filename, mode, flags,\u001B[39m\n", + " \u001B[36mFile \u001B[39m\u001B[32m:1\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mVilleneuve d'Ascq\u001B[39m\n ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m unterminated string literal (detected at line 1)\n" + ] + } + ], + "execution_count": 19 } ], "metadata": { diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index cdfa664..a6a6097 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -48,7 +48,7 @@ def __init__(self, left, right): def __repr__(self): return f"AndNode{{{self.left}, {self.right}}}" - def block(self, df: 
pd.DataFrame, motives: bool=False) -> Coords: + def block(self, df: pd.DataFrame, motives: bool = False) -> Coords: # In order not to perform redundant computations, we first filter out the rows that were not considered by the first blocker before running the second blocker coords_left = self.left.block(df, motives=motives) @@ -78,7 +78,7 @@ def __init__(self, left, right): def __repr__(self): return f"OrNode{{{self.left}, {self.right}}}" - def block(self, df: pd.DataFrame, motives: bool=False) -> Coords: + def block(self, df: pd.DataFrame, motives: bool = False) -> Coords: # Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations coords_left = self.left.block(df, motives=motives) @@ -92,7 +92,10 @@ class AttributeEquivalenceBlocker(BlockerNode): # Leaf """To regroup rows based on equality across columns.""" def __init__( - self, blocking_columns: str|Collection[str], must_not_be_different: str|Collection[str]=None, normalize_strings: bool=True + self, + blocking_columns: str | Collection[str], + must_not_be_different: str | Collection[str] = None, + normalize_strings: bool = True, ): super().__init__() @@ -140,7 +143,7 @@ def __eq__(self, other): else: return False - def block(self, data: pd.DataFrame, motives: bool=False) -> Coords: + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on equality of one or more columns""" print("Processing", self) @@ -196,7 +199,11 @@ class OverlapBlocker(BlockerNode): # Leaf """To regroup rows based on overlap of one or more columns.""" def __init__( - self, blocking_columns: str|Collection[str], overlap: int=1, word_level: bool=False, normalize_strings: bool=True + self, + blocking_columns: str | Collection[str], + overlap: int = 1, + word_level: bool = False, + normalize_strings: bool = True, ): super().__init__() @@ -237,7 +244,7 @@ def __eq__(self, other): else: return False - def 
block(self, data: pd.DataFrame, motives: bool=False) -> Coords: + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on overlap of one or more columns""" print("Processing", self) @@ -291,12 +298,12 @@ class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM def __init__( self, - equivalence_columns: str|Collection[str], - overlap_columns: str|Collection[str], - must_not_be_different: str|Collection[str]=None, - overlap: int=1, - word_level: bool=False, - normalize_strings: bool=True, + equivalence_columns: str | Collection[str], + overlap_columns: str | Collection[str], + must_not_be_different: str | Collection[str] = None, + overlap: int = 1, + word_level: bool = False, + normalize_strings: bool = True, ): super().__init__() @@ -377,7 +384,7 @@ def __eq__(self, other): else: return False - def block(self, data: pd.DataFrame, motives: bool=False) -> Coords: + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on overlap of one or more columns""" print("Processing", self) diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 100ab2e..5c7125c 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -347,7 +347,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: Stringified representation of a list e.g. 
"['string 1', 'string 2', ...]" word_level : bool - Whether to return a list of all words within s instead of a list of each comma-separated element + Whether to return a list of all words within s instead of a list of each comma-separated element; + Note that if passed a string that does not represent a list, this argument will be ignored and the function + will return a list of each word in the string Returns ------- @@ -363,7 +365,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: """ if type(s) is list: # If we already have a list - if len(s) == 1 and s[0][0] == "[" and s[0][-1] == "]": + if ( + len(s) == 1 and str(s[0]).startswith("[") and str(s[0]).startswith("]") + ): # In case we have a stringified list INSIDE a normal list s = s[0] else: return s @@ -376,10 +380,15 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: if not s: return [] - try: - parts = ast.literal_eval(s) - except ValueError: # doesn't seem to be a stringified list - parts = s.split("', '") + if s.startswith("[") and s.startswith("]"): # Stringified list? 
+ try: + parts = ast.literal_eval(s) + except ValueError: # doesn't seem to be a stringified list + parts = s.split("', '") + except SyntaxError: # In case we have a string surrounded by brackets + parts = s.split() + else: + parts = s.split() cleaned_items = [str(part).strip().strip("''") for part in parts] From b37e0178b4e0dafa77cb9d0c60433f64cc04e94c Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Wed, 4 Feb 2026 10:00:17 +0100 Subject: [PATCH 17/20] fix: parse_list use startswith instead of endswith --- src/ms_blocking/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 5c7125c..0e10488 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -366,7 +366,7 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: if type(s) is list: # If we already have a list if ( - len(s) == 1 and str(s[0]).startswith("[") and str(s[0]).startswith("]") + len(s) == 1 and str(s[0]).startswith("[") and str(s[0]).endswith("]") ): # In case we have a stringified list INSIDE a normal list s = s[0] else: @@ -380,7 +380,7 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: if not s: return [] - if s.startswith("[") and s.startswith("]"): # Stringified list? 
try: parts = ast.literal_eval(s) except ValueError: # doesn't seem to be a stringified list From 9fce21d39c1ee6187dd06ea7859a633fd60a0d70 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Wed, 4 Feb 2026 10:49:47 +0100 Subject: [PATCH 18/20] refactor: remove remove_value_if_appears_only_once since it was redundant with df.duplicated --- src/ms_blocking/ms_blocking.py | 17 +++++------------ src/ms_blocking/utils.py | 30 ------------------------------ 2 files changed, 5 insertions(+), 42 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index a6a6097..9301136 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -153,10 +153,7 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: for col in self.blocking_columns: if self.normalize: temp_data[col] = temp_data[col].apply(normalize) - temp_data = temp_data.dropna(subset=self.blocking_columns) - temp_data = remove_rows_if_value_appears_only_once( - temp_data, self.blocking_columns - ) + temp_data = temp_data[temp_data[col].duplicated(keep=False)] if len(temp_data) == 0: # No pairs if motives: @@ -249,9 +246,7 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: print("Processing", self) - temp_data = data.copy() - - temp_data = temp_data[self.blocking_columns].copy() + temp_data = data[self.blocking_columns].copy() for col in self.blocking_columns: temp_data[col] = temp_data[col].apply( @@ -260,12 +255,10 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: temp_data = temp_data.explode(col) if self.normalize: temp_data[col] = temp_data[col].apply(normalize) + temp_data = temp_data[temp_data[col].duplicated(keep=False)] temp_data = temp_data.dropna( subset=self.blocking_columns ) # Remove empty objects - temp_data = remove_rows_if_value_appears_only_once( - temp_data, self.blocking_columns - ) if len(temp_data) == 0: # No pairs fulfill any overlap if motives: @@ -274,7 +267,7 @@ def block(self, 
data: pd.DataFrame, motives: bool = False) -> Coords: return set() # Use the DataFrame index for grouping and forming pairs - # Using frozenset since they are ahshable and thus can be used as dictionary keys + # Using frozenset since they are hashable and thus can be used as dictionary keys groups = temp_data.groupby(self.blocking_columns).apply( lambda x: frozenset(x.index), include_groups=False ) @@ -405,9 +398,9 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: else parse_list(x, self.word_level) ) temp_data = temp_data.explode(col) + temp_data = temp_data[temp_data[col].duplicated(keep=False)] temp_data = temp_data.dropna(subset=total_columns) # Remove empty objects - temp_data = remove_rows_if_value_appears_only_once(temp_data, total_columns) if len(temp_data) == 0: # No pairs fulfill any overlap if motives: diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 0e10488..b644a43 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -70,36 +70,6 @@ def __repr__(self): _SPACE_RE = re.compile(r"\s+") -def remove_rows_if_value_appears_only_once( - data: pd.DataFrame, cols: Columns -) -> pd.DataFrame: - """Drop rows of a Pandas DataFrame where a certain column's values appears only once. 
- - Ensures all elements of provided columns appear at least twice in their column - - Parameters - ---------- - data : DataFrame - DataFrame to preprocess - - cols : List[str] - List of columns where rows that contain non-duplicated elements shall be discarded - - Returns - ------- - DataFrame - DataFrame with reduced number of rows - - Examples - -------- - >>> remove_rows_if_value_appears_only_once(data, ['name', 'city']) - """ - for col in cols: - counts = data[col].map(data[col].value_counts()) - data = data[counts >= 2] - return data - - def start_from_zero(figures: Collection[int]) -> List[int]: """Turns a list of integers into a same-length list that starts at 0, without gaps From 8cdd3bd30c1666ba23be717808214d044fc5d758 Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Wed, 4 Feb 2026 11:16:07 +0100 Subject: [PATCH 19/20] perf: move dropna higher in the block logic --- src/ms_blocking/ms_blocking.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index 9301136..ca99978 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -148,7 +148,11 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: print("Processing", self) - temp_data = data.copy() + temp_data = ( + data[self.blocking_columns + self.must_not_be_different] + .dropna(subset=self.blocking_columns) + .copy() + ) for col in self.blocking_columns: if self.normalize: @@ -246,7 +250,7 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: print("Processing", self) - temp_data = data[self.blocking_columns].copy() + temp_data = data[self.blocking_columns].dropna().copy() for col in self.blocking_columns: temp_data[col] = temp_data[col].apply( @@ -256,9 +260,6 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: if self.normalize: temp_data[col] = temp_data[col].apply(normalize) temp_data = 
temp_data[temp_data[col].duplicated(keep=False)] - temp_data = temp_data.dropna( - subset=self.blocking_columns - ) # Remove empty objects if len(temp_data) == 0: # No pairs fulfill any overlap if motives: @@ -384,7 +385,7 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: total_columns = self.equivalence_columns + self.overlap_columns - temp_data = data[total_columns].copy() + temp_data = data[total_columns].dropna().copy() for col in total_columns: if col in self.equivalence_columns: @@ -400,8 +401,6 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: temp_data = temp_data.explode(col) temp_data = temp_data[temp_data[col].duplicated(keep=False)] - temp_data = temp_data.dropna(subset=total_columns) # Remove empty objects - if len(temp_data) == 0: # No pairs fulfill any overlap if motives: return dict() From 9521338a5b354d29a25b763626c589a0eaeb08dc Mon Sep 17 00:00:00 2001 From: RTiedrez Date: Wed, 4 Feb 2026 12:04:23 +0100 Subject: [PATCH 20/20] refactor: make preprocessing in .block more compact and pandas-esque --- src/ms_blocking/ms_blocking.py | 64 +++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index ca99978..b3552af 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -154,11 +154,17 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: .copy() ) - for col in self.blocking_columns: - if self.normalize: - temp_data[col] = temp_data[col].apply(normalize) - temp_data = temp_data[temp_data[col].duplicated(keep=False)] + # Normalize strings if required + if self.normalize: + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(normalize) + ) + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[ + temp_data.duplicated(keep=False, subset=self.blocking_columns) + ] + # No need 
to run anything else if we already ran out of candidates if len(temp_data) == 0: # No pairs if motives: return dict() @@ -252,15 +258,24 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: temp_data = data[self.blocking_columns].dropna().copy() - for col in self.blocking_columns: - temp_data[col] = temp_data[col].apply( - parse_list, word_level=self.word_level + # Ensure we check for overlap between lists of strings + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(parse_list, word_level=self.word_level) + ) + # Split elements of said lists to compare them one by one + temp_data = temp_data.explode(self.blocking_columns) + # Normalize strings if required + if self.normalize: + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(normalize) ) - temp_data = temp_data.explode(col) - if self.normalize: - temp_data[col] = temp_data[col].apply(normalize) - temp_data = temp_data[temp_data[col].duplicated(keep=False)] + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[ + temp_data.duplicated(keep=False, subset=self.blocking_columns) + ] + + # No need to run anything else if we already ran out of candidates if len(temp_data) == 0: # No pairs fulfill any overlap if motives: return dict() @@ -387,20 +402,21 @@ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: temp_data = data[total_columns].dropna().copy() - for col in total_columns: - if col in self.equivalence_columns: - temp_data[col] = temp_data[col].apply(normalize) - elif col in self.overlap_columns: - temp_data[col] = temp_data[col].apply( - lambda x: [ - normalize(item) for item in parse_list(x, self.word_level) - ] - if self.normalize - else parse_list(x, self.word_level) - ) - temp_data = temp_data.explode(col) - temp_data = temp_data[temp_data[col].duplicated(keep=False)] + # Ensure we check for overlap between lists of strings + 
temp_data[self.overlap_columns] = temp_data[self.overlap_columns].apply( + lambda col: col.apply(parse_list, word_level=self.word_level) + ) + # Split elements of said lists to compare them one by one + temp_data = temp_data.explode(self.overlap_columns) + # Normalize strings if required + if self.normalize: + temp_data[total_columns] = temp_data[total_columns].apply( + lambda col: col.apply(normalize) + ) + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[temp_data.duplicated(keep=False, subset=total_columns)] + # No need to run anything else if we already ran out of candidates if len(temp_data) == 0: # No pairs fulfill any overlap if motives: return dict()