diff --git a/docs/example.ipynb b/docs/example.ipynb
index 6b82165..8243053 100644
--- a/docs/example.ipynb
+++ b/docs/example.ipynb
@@ -32,8 +32,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:14.010997600Z",
- "start_time": "2026-01-30T14:21:13.420790Z"
+ "end_time": "2026-02-03T16:02:56.751154300Z",
+ "start_time": "2026-02-03T16:02:55.924397100Z"
}
},
"source": [
@@ -60,8 +60,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:14.049404600Z",
- "start_time": "2026-01-30T14:21:14.010997600Z"
+ "end_time": "2026-02-03T16:02:56.810955300Z",
+ "start_time": "2026-02-03T16:02:56.751154300Z"
}
},
"source": [
@@ -282,8 +282,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:14.190107400Z",
- "start_time": "2026-01-30T14:21:14.089762400Z"
+ "end_time": "2026-02-03T16:02:56.966380500Z",
+ "start_time": "2026-02-03T16:02:56.862834100Z"
}
},
"source": [
@@ -310,8 +310,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:14.309413300Z",
- "start_time": "2026-01-30T14:21:14.278545600Z"
+ "end_time": "2026-02-03T16:02:57.285912400Z",
+ "start_time": "2026-02-03T16:02:57.147878900Z"
}
},
"source": [
@@ -322,7 +322,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing AttributeEquivalenceBlocker(['City'], [])\n"
+ "Processing AttributeEquivalenceBlocker(['City'])\n"
]
}
],
@@ -339,8 +339,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:14.378808Z",
- "start_time": "2026-01-30T14:21:14.349508200Z"
+ "end_time": "2026-02-03T16:02:57.479607Z",
+ "start_time": "2026-02-03T16:02:57.418159200Z"
}
},
"source": [
@@ -369,8 +369,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:14.558644200Z",
- "start_time": "2026-01-30T14:21:14.459573100Z"
+ "end_time": "2026-02-03T16:02:57.776512200Z",
+ "start_time": "2026-02-03T16:02:57.565676Z"
}
},
"source": [
@@ -409,8 +409,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:14.635514Z",
- "start_time": "2026-01-30T14:21:14.598913Z"
+ "end_time": "2026-02-03T16:02:57.810023Z",
+ "start_time": "2026-02-03T16:02:57.778482900Z"
}
},
"source": [
@@ -574,8 +574,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:14.829719100Z",
- "start_time": "2026-01-30T14:21:14.676157200Z"
+ "end_time": "2026-02-03T16:02:58.075057800Z",
+ "start_time": "2026-02-03T16:02:57.893294100Z"
}
},
"source": [
@@ -622,8 +622,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:15.027923700Z",
- "start_time": "2026-01-30T14:21:14.926401Z"
+ "end_time": "2026-02-03T16:02:58.413477400Z",
+ "start_time": "2026-02-03T16:02:58.285492900Z"
}
},
"source": [
@@ -759,8 +759,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:15.403596500Z",
- "start_time": "2026-01-30T14:21:15.279120300Z"
+ "end_time": "2026-02-03T16:02:58.887317800Z",
+ "start_time": "2026-02-03T16:02:58.675247500Z"
}
},
"source": [
@@ -796,8 +796,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:15.686136800Z",
- "start_time": "2026-01-30T14:21:15.608444400Z"
+ "end_time": "2026-02-03T16:02:59.272554700Z",
+ "start_time": "2026-02-03T16:02:59.130460300Z"
}
},
"source": [
@@ -971,8 +971,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:15.998425200Z",
- "start_time": "2026-01-30T14:21:15.931370100Z"
+ "end_time": "2026-02-03T16:02:59.806784300Z",
+ "start_time": "2026-02-03T16:02:59.686250600Z"
}
},
"source": [
@@ -1075,8 +1075,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:16.305679100Z",
- "start_time": "2026-01-30T14:21:16.212470400Z"
+ "end_time": "2026-02-03T16:03:00.721777Z",
+ "start_time": "2026-02-03T16:03:00.603955400Z"
}
},
"source": [
@@ -1089,7 +1089,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n"
+ "Processing AttributeEquivalenceBlocker(['Age', 'City'])\n"
]
},
{
@@ -1223,8 +1223,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:16.678653800Z",
- "start_time": "2026-01-30T14:21:16.558976200Z"
+ "end_time": "2026-02-03T16:03:01.209432600Z",
+ "start_time": "2026-02-03T16:03:01.048013600Z"
}
},
"source": [
@@ -1237,7 +1237,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing AttributeEquivalenceBlocker(['Name'], [])\n"
+ "Processing AttributeEquivalenceBlocker(['Name'])\n"
]
},
{
@@ -1342,8 +1342,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:17.354294400Z",
- "start_time": "2026-01-30T14:21:17.316050200Z"
+ "end_time": "2026-02-03T16:03:01.834433100Z",
+ "start_time": "2026-02-03T16:03:01.686309100Z"
}
},
"source": [
@@ -1358,7 +1358,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing AttributeEquivalenceBlocker(['Name'], [])\n"
+ "Processing AttributeEquivalenceBlocker(['Name'], NON-NORMALIZED)\n"
]
},
{
@@ -1440,8 +1440,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:17.537043700Z",
- "start_time": "2026-01-30T14:21:17.392490700Z"
+ "end_time": "2026-02-03T16:03:02.711968Z",
+ "start_time": "2026-02-03T16:03:02.581163100Z"
}
},
"source": [
@@ -1453,7 +1453,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing AttributeEquivalenceBlocker(['City'], [])\n",
+ "Processing AttributeEquivalenceBlocker(['City'])\n",
"Processing OverlapBlocker(['websites'], 1)\n"
]
}
@@ -1464,8 +1464,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:17.655177300Z",
- "start_time": "2026-01-30T14:21:17.573776300Z"
+ "end_time": "2026-02-03T16:03:03.614029700Z",
+ "start_time": "2026-02-03T16:03:02.835393200Z"
}
},
"source": [
@@ -1477,96 +1477,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing MixedBlocker(['City'], ['websites'], 1)\n"
+ "Processing "
]
},
{
- "data": {
- "text/plain": [
- " id Name City Age \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n",
- "2 10 Caroline Dufour Lens 45 \n",
- "3 13 Benoît Benoît Lens 15 \n",
- "\n",
- " websites _block \n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n",
- "1 ['jacquesdupond.fr'] 0 \n",
- "2 ['pythonensamusant.fr', 'lensfans.fr'] 1 \n",
- "3 ['lensfans.fr'] 1 "
- ],
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " Name | \n",
- " City | \n",
- " Age | \n",
- " websites | \n",
- " _block | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " Lens | \n",
- " 45 | \n",
- " ['pythonensamusant.fr', 'lensfans.fr'] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " Lens | \n",
- " 15 | \n",
- " ['lensfans.fr'] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
+ "ename": "TypeError",
+ "evalue": "object of type 'bool' has no len()",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[31m---------------------------------------------------------------------------\u001B[39m",
+ "\u001B[31mTypeError\u001B[39m Traceback (most recent call last)",
+ "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[17]\u001B[39m\u001B[32m, line 1\u001B[39m\n\u001B[32m----> \u001B[39m\u001B[32m1\u001B[39m links = \u001B[43m(\u001B[49m\u001B[43mcity_blocker\u001B[49m\u001B[43m \u001B[49m\u001B[43m&\u001B[49m\u001B[43m \u001B[49m\u001B[43mwebsites_blocker\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43mblock\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 2\u001B[39m msb.add_blocks_to_dataset(df, links)\n",
+ "\u001B[36mFile \u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:383\u001B[39m, in \u001B[36mMixedBlocker.block\u001B[39m\u001B[34m(self, data, motives)\u001B[39m\n\u001B[32m 380\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34mblock\u001B[39m(\u001B[38;5;28mself\u001B[39m, data, motives=\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[32m 381\u001B[39m \u001B[38;5;250m \u001B[39m\u001B[33;03m\"\"\"Regroup rows based on overlap of one or more columns\"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m383\u001B[39m \u001B[38;5;28;43mprint\u001B[39;49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mProcessing\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[32m 385\u001B[39m total_columns = \u001B[38;5;28mself\u001B[39m.equivalence_columns + \u001B[38;5;28mself\u001B[39m.overlap_columns\n\u001B[32m 387\u001B[39m temp_data = data[total_columns].copy()\n",
+ "\u001B[36mFile \u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:345\u001B[39m, in \u001B[36mMixedBlocker.__repr__\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 342\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34m__repr__\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[32m 343\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(\n\u001B[32m 344\u001B[39m AndNode(\n\u001B[32m--> \u001B[39m\u001B[32m345\u001B[39m \u001B[43mAttributeEquivalenceBlocker\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 346\u001B[39m \u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mequivalence_columns\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mmust_not_be_different\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mnormalize\u001B[49m\n\u001B[32m 347\u001B[39m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m,\n\u001B[32m 348\u001B[39m OverlapBlocker(\n\u001B[32m 349\u001B[39m \u001B[38;5;28mself\u001B[39m.overlap_columns, \u001B[38;5;28mself\u001B[39m.overlap, \u001B[38;5;28mself\u001B[39m.word_level, \u001B[38;5;28mself\u001B[39m.normalize\n\u001B[32m 350\u001B[39m ),\n\u001B[32m 351\u001B[39m )\n\u001B[32m 352\u001B[39m )\n",
+ "\u001B[36mFile \u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:109\u001B[39m, in \u001B[36mAttributeEquivalenceBlocker.__init__\u001B[39m\u001B[34m(self, blocking_columns, normalize_strings, must_not_be_different)\u001B[39m\n\u001B[32m 107\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(must_not_be_different) \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28mstr\u001B[39m:\n\u001B[32m 108\u001B[39m must_not_be_different = [must_not_be_different]\n\u001B[32m--> \u001B[39m\u001B[32m109\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28;43mlen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mmust_not_be_different\u001B[49m\u001B[43m)\u001B[49m > \u001B[32m1\u001B[39m:\n\u001B[32m 110\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[33m\"\u001B[39m\u001B[33mThere must be only one extra column\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 111\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m (\n\u001B[32m 112\u001B[39m must_not_be_different\n\u001B[32m 113\u001B[39m \u001B[38;5;129;01mand\u001B[39;00m must_not_be_different[\u001B[32m0\u001B[39m] \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m.blocking_columns\n\u001B[32m 114\u001B[39m ):\n",
+ "\u001B[31mTypeError\u001B[39m: object of type 'bool' has no len()"
+ ]
}
],
"execution_count": 17
@@ -1587,181 +1513,13 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:17.910335600Z",
- "start_time": "2026-01-30T14:21:17.821453400Z"
- }
- },
+ "metadata": {},
"source": [
"links = (city_blocker | websites_blocker).block(df)\n",
"msb.add_blocks_to_dataset(df, links)"
],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Processing AttributeEquivalenceBlocker(['City'], [])\n",
- "Processing OverlapBlocker(['websites'], 1)\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- " id Name City Age \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n",
- "2 6 Jean-Michel Python Douai 49 \n",
- "3 10 Caroline Dufour Lens 45 \n",
- "4 13 Benoît Benoît Lens 15 \n",
- "5 2 Pierre Dusquesnes Phalempin 24 \n",
- "6 5 pierre dusquesnes Phalempin 24 \n",
- "7 3 Paul Delarue Roubaix 32 \n",
- "8 8 Sophie Delarue Roubaix 33 \n",
- "9 11 sophie_delarue Roubaix 33 \n",
- "\n",
- " websites _block \n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n",
- "1 ['jacquesdupond.fr'] 0 \n",
- "2 ['lensfans.fr', 'pythonensamusant.fr'] 0 \n",
- "3 ['pythonensamusant.fr', 'lensfans.fr'] 0 \n",
- "4 ['lensfans.fr'] 0 \n",
- "5 ['somewebsite.com/users/rpz59'] 1 \n",
- "6 [] 1 \n",
- "7 ['roubaixlove.fr'] 2 \n",
- "8 [] 2 \n",
- "9 [] 2 "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " Name | \n",
- " City | \n",
- " Age | \n",
- " websites | \n",
- " _block | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " Douai | \n",
- " 49 | \n",
- " ['lensfans.fr', 'pythonensamusant.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " Lens | \n",
- " 45 | \n",
- " ['pythonensamusant.fr', 'lensfans.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " Lens | \n",
- " 15 | \n",
- " ['lensfans.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 18
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -1802,12 +1560,7 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:18.279899900Z",
- "start_time": "2026-01-30T14:21:18.250988900Z"
- }
- },
+ "metadata": {},
"source": [
"city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n",
"age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n",
@@ -1815,7 +1568,7 @@
"websites_blocker = msb.OverlapBlocker([\"websites\"])"
],
"outputs": [],
- "execution_count": 19
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -1826,17 +1579,12 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:18.481263300Z",
- "start_time": "2026-01-30T14:21:18.466284300Z"
- }
- },
+ "metadata": {},
"source": [
"final_blocker = (city_blocker & age_blocker) | (name_blocker & websites_blocker)"
],
"outputs": [],
- "execution_count": 20
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -1847,137 +1595,13 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:18.562779600Z",
- "start_time": "2026-01-30T14:21:18.520368200Z"
- }
- },
+ "metadata": {},
"source": [
"links = final_blocker.block(df)\n",
"msb.add_blocks_to_dataset(df, links)"
],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n",
- "Processing MixedBlocker(['Name'], ['websites'], 1)\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- " id Name City Age \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n",
- "2 2 Pierre Dusquesnes Phalempin 24 \n",
- "3 5 pierre dusquesnes Phalempin 24 \n",
- "4 8 Sophie Delarue Roubaix 33 \n",
- "5 11 sophie_delarue Roubaix 33 \n",
- "\n",
- " websites _block \n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n",
- "1 ['jacquesdupond.fr'] 0 \n",
- "2 ['somewebsite.com/users/rpz59'] 1 \n",
- "3 [] 1 \n",
- "4 [] 2 \n",
- "5 [] 2 "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " Name | \n",
- " City | \n",
- " Age | \n",
- " websites | \n",
- " _block | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 21
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -1988,26 +1612,13 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:18.843568700Z",
- "start_time": "2026-01-30T14:21:18.686911500Z"
- }
- },
+ "metadata": {},
"source": [
"city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n",
"links = city_blocker.block(df)"
],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Processing AttributeEquivalenceBlocker(['City'], [])\n"
- ]
- }
- ],
- "execution_count": 22
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -2032,161 +1643,12 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:18.967168700Z",
- "start_time": "2026-01-30T14:21:18.928864500Z"
- }
- },
+ "metadata": {},
"source": [
"msb.add_blocks_to_dataset(df, links, sort=False)"
],
- "outputs": [
- {
- "data": {
- "text/plain": [
- " id Name City Age \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 2 Pierre Dusquesnes Phalempin 24 \n",
- "2 3 Paul Delarue Roubaix 32 \n",
- "3 4 Jacques Dupont Villeneuve d'Ascq 37 \n",
- "4 5 pierre dusquesnes Phalempin 24 \n",
- "5 8 Sophie Delarue Roubaix 33 \n",
- "6 10 Caroline Dufour Lens 45 \n",
- "7 11 sophie_delarue Roubaix 33 \n",
- "8 13 Benoît Benoît Lens 15 \n",
- "\n",
- " websites _block \n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n",
- "1 ['somewebsite.com/users/rpz59'] 1 \n",
- "2 ['roubaixlove.fr'] 2 \n",
- "3 ['jacquesdupond.fr'] 0 \n",
- "4 [] 1 \n",
- "5 [] 2 \n",
- "6 ['pythonensamusant.fr', 'lensfans.fr'] 3 \n",
- "7 [] 2 \n",
- "8 ['lensfans.fr'] 3 "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " Name | \n",
- " City | \n",
- " Age | \n",
- " websites | \n",
- " _block | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " Lens | \n",
- " 45 | \n",
- " ['pythonensamusant.fr', 'lensfans.fr'] | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " Lens | \n",
- " 15 | \n",
- " ['lensfans.fr'] | \n",
- " 3 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 23
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -2211,216 +1673,12 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:19.276047300Z",
- "start_time": "2026-01-30T14:21:19.146886900Z"
- }
- },
+ "metadata": {},
"source": [
"msb.add_blocks_to_dataset(df, links, keep_ungrouped_rows=True)"
],
- "outputs": [
- {
- "data": {
- "text/plain": [
- " id Name City Age \\\n",
- "0 0 Jean d'Aux Lille 26 \n",
- "1 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "2 4 Jacques Dupont Villeneuve d'Ascq 37 \n",
- "3 2 Pierre Dusquesnes Phalempin 24 \n",
- "4 5 pierre dusquesnes Phalempin 24 \n",
- "5 3 Paul Delarue Roubaix 32 \n",
- "6 8 Sophie Delarue Roubaix 33 \n",
- "7 11 sophie_delarue Roubaix 33 \n",
- "8 6 Jean-Michel Python Douai 49 \n",
- "9 7 Gédéon Glincarné Paris 53 \n",
- "10 9 Jeanne Verbrugge Valenciennes 41 \n",
- "11 10 Caroline Dufour Lens 45 \n",
- "12 13 Benoît Benoît Lens 15 \n",
- "13 12 Marcel Vandermersch Fourmies 48 \n",
- "\n",
- " websites _block \n",
- "0 ['jeandaux.fr', 'lillefans.fr'] 0 \n",
- "1 ['somewebsite.com/users/jacquesdupond', 'jacqu... 1 \n",
- "2 ['jacquesdupond.fr'] 1 \n",
- "3 ['somewebsite.com/users/rpz59'] 2 \n",
- "4 [] 2 \n",
- "5 ['roubaixlove.fr'] 3 \n",
- "6 [] 3 \n",
- "7 [] 3 \n",
- "8 ['lensfans.fr', 'pythonensamusant.fr'] 4 \n",
- "9 ['lorem.fr'] 5 \n",
- "10 ['somewebsite.com/users/jajanne59'] 6 \n",
- "11 ['pythonensamusant.fr', 'lensfans.fr'] 7 \n",
- "12 ['lensfans.fr'] 7 \n",
- "13 ['lesrecettesdemarcel.fr'] 8 "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " Name | \n",
- " City | \n",
- " Age | \n",
- " websites | \n",
- " _block | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 0 | \n",
- " Jean d'Aux | \n",
- " Lille | \n",
- " 26 | \n",
- " ['jeandaux.fr', 'lillefans.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " Douai | \n",
- " 49 | \n",
- " ['lensfans.fr', 'pythonensamusant.fr'] | \n",
- " 4 | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " 7 | \n",
- " Gédéon Glincarné | \n",
- " Paris | \n",
- " 53 | \n",
- " ['lorem.fr'] | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " | 10 | \n",
- " 9 | \n",
- " Jeanne Verbrugge | \n",
- " Valenciennes | \n",
- " 41 | \n",
- " ['somewebsite.com/users/jajanne59'] | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " | 11 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " Lens | \n",
- " 45 | \n",
- " ['pythonensamusant.fr', 'lensfans.fr'] | \n",
- " 7 | \n",
- "
\n",
- " \n",
- " | 12 | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " Lens | \n",
- " 15 | \n",
- " ['lensfans.fr'] | \n",
- " 7 | \n",
- "
\n",
- " \n",
- " | 13 | \n",
- " 12 | \n",
- " Marcel Vandermersch | \n",
- " Fourmies | \n",
- " 48 | \n",
- " ['lesrecettesdemarcel.fr'] | \n",
- " 8 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 24
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -2441,12 +1699,7 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:19.820247800Z",
- "start_time": "2026-01-30T14:21:19.653280100Z"
- }
- },
+ "metadata": {},
"source": [
"city_blocker_not_different_age = msb.AttributeEquivalenceBlocker(\n",
" [\"City\"], must_not_be_different=[\"Age\"]\n",
@@ -2454,126 +1707,8 @@
"links = city_blocker_not_different_age.block(df)\n",
"msb.add_blocks_to_dataset(df, links)"
],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Processing AttributeEquivalenceBlocker(['City'], ['Age'])\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- " id Name City Age \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n",
- "2 2 Pierre Dusquesnes Phalempin 24 \n",
- "3 5 pierre dusquesnes Phalempin 24 \n",
- "4 8 Sophie Delarue Roubaix 33 \n",
- "5 11 sophie_delarue Roubaix 33 \n",
- "\n",
- " websites _block \n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n",
- "1 ['jacquesdupond.fr'] 0 \n",
- "2 ['somewebsite.com/users/rpz59'] 1 \n",
- "3 [] 1 \n",
- "4 [] 2 \n",
- "5 [] 2 "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " Name | \n",
- " City | \n",
- " Age | \n",
- " websites | \n",
- " _block | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 25
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -2591,49 +1726,19 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:20.335572Z",
- "start_time": "2026-01-30T14:21:20.302358700Z"
- }
- },
+ "metadata": {},
"source": [
"city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n",
"links = city_blocker.block(df, motives=True)\n",
"links"
],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Processing AttributeEquivalenceBlocker(['City'], [])\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "{frozenset({1, 4}): {\"Same 'City'\"},\n",
- " frozenset({8, 11}): {\"Same 'City'\"},\n",
- " frozenset({2, 5}): {\"Same 'City'\"},\n",
- " frozenset({10, 13}): {\"Same 'City'\"},\n",
- " frozenset({3, 8}): {\"Same 'City'\"},\n",
- " frozenset({3, 11}): {\"Same 'City'\"}}"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 26
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
"metadata": {},
- "source": [
- "Of course, this will induce some overhead."
- ]
+ "source": "This will induce some overhead."
},
{
"cell_type": "markdown",
@@ -2644,892 +1749,80 @@
},
{
"cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:20.409405100Z",
- "start_time": "2026-01-30T14:21:20.374573700Z"
- }
- },
+ "metadata": {},
"source": [
"msb.add_blocks_to_dataset(df, links, motives=True)"
],
- "outputs": [
- {
- "data": {
- "text/plain": [
- " id Name City Age \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n",
- "2 2 Pierre Dusquesnes Phalempin 24 \n",
- "3 5 pierre dusquesnes Phalempin 24 \n",
- "4 3 Paul Delarue Roubaix 32 \n",
- "5 8 Sophie Delarue Roubaix 33 \n",
- "6 11 sophie_delarue Roubaix 33 \n",
- "7 10 Caroline Dufour Lens 45 \n",
- "8 13 Benoît Benoît Lens 15 \n",
- "\n",
- " websites _block _motive \n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 (Same 'City') \n",
- "1 ['jacquesdupond.fr'] 0 (Same 'City') \n",
- "2 ['somewebsite.com/users/rpz59'] 1 (Same 'City') \n",
- "3 [] 1 (Same 'City') \n",
- "4 ['roubaixlove.fr'] 2 (Same 'City') \n",
- "5 [] 2 (Same 'City') \n",
- "6 [] 2 (Same 'City') \n",
- "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 (Same 'City') \n",
- "8 ['lensfans.fr'] 3 (Same 'City') "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " Name | \n",
- " City | \n",
- " Age | \n",
- " websites | \n",
- " _block | \n",
- " _motive | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 0 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 1 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 1 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " Lens | \n",
- " 45 | \n",
- " ['pythonensamusant.fr', 'lensfans.fr'] | \n",
- " 3 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " Lens | \n",
- " 15 | \n",
- " ['lensfans.fr'] | \n",
- " 3 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "... though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..."
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": "... which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:"
+ },
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "msb.add_blocks_to_dataset(df, links, motives=True, show_as_pairs=True)"
],
- "execution_count": 27
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "... Though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..."
+ "If our dataset had many columns, the above output would be too large to easily be read, so we added the `output_columns` option:"
]
},
+ {
+ "cell_type": "code",
+ "metadata": {},
+ "source": [
+ "msb.add_blocks_to_dataset(\n",
+ " df, links, motives=True, show_as_pairs=True, output_columns=[\"id\", \"Name\"]\n",
+ ")"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "... Which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:"
+ "Motives are dynamic:"
]
},
{
"cell_type": "code",
"metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:20.612990700Z",
- "start_time": "2026-01-30T14:21:20.483928200Z"
- }
+ "scrolled": true
},
"source": [
- "msb.add_blocks_to_dataset(df, links, motives=True, show_as_pairs=True)"
- ],
- "outputs": [
- {
- "data": {
- "text/plain": [
- " id_l Name_l City_l Age_l \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 2 Pierre Dusquesnes Phalempin 24 \n",
- "2 3 Paul Delarue Roubaix 32 \n",
- "3 8 Sophie Delarue Roubaix 33 \n",
- "4 8 Sophie Delarue Roubaix 33 \n",
- "5 10 Caroline Dufour Lens 45 \n",
- "\n",
- " websites_l id_r Name_r \\\n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 4 Jacques Dupont \n",
- "1 ['somewebsite.com/users/rpz59'] 5 pierre dusquesnes \n",
- "2 ['roubaixlove.fr'] 11 sophie_delarue \n",
- "3 [] 11 sophie_delarue \n",
- "4 [] 3 Paul Delarue \n",
- "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n",
- "\n",
- " City_r Age_r websites_r _block _motive \n",
- "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n",
- "1 Phalempin 24 [] 1 (Same 'City') \n",
- "2 Roubaix 33 [] 2 (Same 'City') \n",
- "3 Roubaix 33 [] 2 (Same 'City') \n",
- "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n",
- "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id_l | \n",
- " Name_l | \n",
- " City_l | \n",
- " Age_l | \n",
- " websites_l | \n",
- " id_r | \n",
- " Name_r | \n",
- " City_r | \n",
- " Age_r | \n",
- " websites_r | \n",
- " _block | \n",
- " _motive | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 1 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " Lens | \n",
- " 45 | \n",
- " ['pythonensamusant.fr', 'lensfans.fr'] | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " Lens | \n",
- " 15 | \n",
- " ['lensfans.fr'] | \n",
- " 3 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 28
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "If our dataset had many columns, the above output would be too large to easily be read, so we added the `output_columns` option:"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:20.944670700Z",
- "start_time": "2026-01-30T14:21:20.834495500Z"
- }
- },
- "source": [
- "msb.add_blocks_to_dataset(\n",
- " df, links, motives=True, show_as_pairs=True, output_columns=[\"id\", \"Name\"]\n",
- ")"
- ],
- "outputs": [
- {
- "data": {
- "text/plain": [
- " id_l Name_l id_r Name_r _block _motive\n",
- "0 1 Jacques Dupond 4 Jacques Dupont 0 (Same 'City')\n",
- "1 2 Pierre Dusquesnes 5 pierre dusquesnes 1 (Same 'City')\n",
- "2 3 Paul Delarue 11 sophie_delarue 2 (Same 'City')\n",
- "3 8 Sophie Delarue 11 sophie_delarue 2 (Same 'City')\n",
- "4 8 Sophie Delarue 3 Paul Delarue 2 (Same 'City')\n",
- "5 10 Caroline Dufour 13 Benoît Benoît 3 (Same 'City')"
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id_l | \n",
- " Name_l | \n",
- " id_r | \n",
- " Name_r | \n",
- " _block | \n",
- " _motive | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " 0 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " 1 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " 3 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 29
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Motives are dynamic:"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:21.591044600Z",
- "start_time": "2026-01-30T14:21:21.517777200Z"
- }
- },
- "source": [
- "msb.generate_blocking_report(df, links)"
- ],
- "outputs": [
- {
- "data": {
- "text/plain": [
- " id_l Name_l City_l Age_l \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 2 Pierre Dusquesnes Phalempin 24 \n",
- "2 3 Paul Delarue Roubaix 32 \n",
- "3 8 Sophie Delarue Roubaix 33 \n",
- "4 8 Sophie Delarue Roubaix 33 \n",
- "5 10 Caroline Dufour Lens 45 \n",
- "\n",
- " websites_l id_r Name_r \\\n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 4 Jacques Dupont \n",
- "1 ['somewebsite.com/users/rpz59'] 5 pierre dusquesnes \n",
- "2 ['roubaixlove.fr'] 11 sophie_delarue \n",
- "3 [] 11 sophie_delarue \n",
- "4 [] 3 Paul Delarue \n",
- "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n",
- "\n",
- " City_r Age_r websites_r _block _motive \n",
- "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n",
- "1 Phalempin 24 [] 1 (Same 'City') \n",
- "2 Roubaix 33 [] 2 (Same 'City') \n",
- "3 Roubaix 33 [] 2 (Same 'City') \n",
- "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n",
- "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id_l | \n",
- " Name_l | \n",
- " City_l | \n",
- " Age_l | \n",
- " websites_l | \n",
- " id_r | \n",
- " Name_r | \n",
- " City_r | \n",
- " Age_r | \n",
- " websites_r | \n",
- " _block | \n",
- " _motive | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 1 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " Lens | \n",
- " 45 | \n",
- " ['pythonensamusant.fr', 'lensfans.fr'] | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " Lens | \n",
- " 15 | \n",
- " ['lensfans.fr'] | \n",
- " 3 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 30
- },
- {
- "cell_type": "code",
- "metadata": {
- "scrolled": true,
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:21.867809800Z",
- "start_time": "2026-01-30T14:21:21.674986800Z"
- }
- },
- "source": [
- "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n",
- "age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n",
- "websites_blocker = msb.OverlapBlocker([\"websites\"])\n",
- "final_blocker = (city_blocker & age_blocker) | websites_blocker\n",
- "links = final_blocker.block(df, motives=True)\n",
- "report = msb.add_blocks_to_dataset(\n",
- " df,\n",
- " links,\n",
- " motives=True,\n",
- " show_as_pairs=True,\n",
- " output_columns=[\"id\", \"Name\"],\n",
- " merge_blocks=False,\n",
- ")\n",
- "report"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n",
- "Processing OverlapBlocker(['websites'], 1)\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- " id_l Name_l id_r Name_r _block \\\n",
- "0 1 Jacques Dupond 4 Jacques Dupont 0 \n",
- "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n",
- "2 1 Jacques Dupond 10 Caroline Dufour 0 \n",
- "3 1 Jacques Dupond 4 Jacques Dupont 1 \n",
- "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n",
- "5 1 Jacques Dupond 10 Caroline Dufour 1 \n",
- "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n",
- "7 10 Caroline Dufour 13 Benoît Benoît 1 \n",
- "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n",
- "9 8 Sophie Delarue 11 sophie_delarue 3 \n",
- "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n",
- "11 10 Caroline Dufour 13 Benoît Benoît 4 \n",
- "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n",
- "\n",
- " _motive \n",
- "0 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "1 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "2 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "3 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "4 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "5 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "6 (>=1 overlap in 'websites') \n",
- "7 (>=1 overlap in 'websites') \n",
- "8 (Same 'City', Same 'Age') \n",
- "9 (Same 'City', Same 'Age') \n",
- "10 (>=1 overlap in 'websites') \n",
- "11 (>=1 overlap in 'websites') \n",
- "12 (>=1 overlap in 'websites') "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id_l | \n",
- " Name_l | \n",
- " id_r | \n",
- " Name_r | \n",
- " _block | \n",
- " _motive | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites') | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites') | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " 2 | \n",
- " (Same 'City', Same 'Age') | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " 3 | \n",
- " (Same 'City', Same 'Age') | \n",
- "
\n",
- " \n",
- " | 10 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
- "
\n",
- " \n",
- " | 11 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
- "
\n",
- " \n",
- " | 12 | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
+ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n",
+ "age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n",
+ "websites_blocker = msb.OverlapBlocker([\"websites\"])\n",
+ "final_blocker = (city_blocker & age_blocker) | websites_blocker\n",
+ "links = final_blocker.block(df, motives=True)\n",
+ "report = msb.add_blocks_to_dataset(\n",
+ " df,\n",
+ " links,\n",
+ " motives=True,\n",
+ " show_as_pairs=True,\n",
+ " output_columns=[\"id\", \"Name\"],\n",
+ " merge_blocks=False,\n",
+ ")\n",
+ "report"
],
- "execution_count": 31
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
@@ -3539,224 +1832,70 @@
{
"cell_type": "markdown",
"metadata": {},
- "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `scoring` gives you an indicator of the likelihood of rows behing duplicates based on the number of motives."
+ "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `score=True` gives you an indicator of the likelihood of rows behing duplicates based on the number of distinct motives."
},
{
"cell_type": "code",
+ "metadata": {},
+ "source": [
+ "report = msb.add_blocks_to_dataset(\n",
+ " df,\n",
+ " links,\n",
+ " motives=True,\n",
+ " show_as_pairs=True,\n",
+ " output_columns=[\"id\", \"Name\"],\n",
+ " merge_blocks=False,\n",
+ " score=True,\n",
+ ")\n",
+ "report.sort_values(\"_score\", ascending=False)"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:22.186415700Z",
- "start_time": "2026-01-30T14:21:22.127304600Z"
+ "end_time": "2026-02-03T16:43:50.398834500Z",
+ "start_time": "2026-02-03T16:43:50.048297Z"
}
},
+ "cell_type": "code",
"source": [
- "report[\"score\"] = msb.scoring(report)\n",
- "report.sort_values(\"score\", ascending=False)"
+ "city_blocker = msb.OverlapBlocker([\"City\"])\n",
+ "city_blocker.block(df)"
],
"outputs": [
{
- "data": {
- "text/plain": [
- " id_l Name_l id_r Name_r _block \\\n",
- "0 1 Jacques Dupond 4 Jacques Dupont 0 \n",
- "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n",
- "2 1 Jacques Dupond 10 Caroline Dufour 0 \n",
- "3 1 Jacques Dupond 4 Jacques Dupont 1 \n",
- "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n",
- "5 1 Jacques Dupond 10 Caroline Dufour 1 \n",
- "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n",
- "9 8 Sophie Delarue 11 sophie_delarue 3 \n",
- "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n",
- "7 10 Caroline Dufour 13 Benoît Benoît 1 \n",
- "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n",
- "11 10 Caroline Dufour 13 Benoît Benoît 4 \n",
- "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n",
- "\n",
- " _motive score \n",
- "0 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "1 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "2 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "3 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "4 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "5 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "8 (Same 'City', Same 'Age') 2 \n",
- "9 (Same 'City', Same 'Age') 2 \n",
- "6 (>=1 overlap in 'websites') 1 \n",
- "7 (>=1 overlap in 'websites') 1 \n",
- "10 (>=1 overlap in 'websites') 1 \n",
- "11 (>=1 overlap in 'websites') 1 \n",
- "12 (>=1 overlap in 'websites') 1 "
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id_l | \n",
- " Name_l | \n",
- " id_r | \n",
- " Name_r | \n",
- " _block | \n",
- " _motive | \n",
- " score | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " 2 | \n",
- " (Same 'City', Same 'Age') | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " 3 | \n",
- " (Same 'City', Same 'Age') | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 6 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites') | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 7 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " 1 | \n",
- " (>=1 overlap in 'websites') | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 10 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 11 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " | 12 | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Processing OverlapBlocker(['City'], 1)\n"
+ ]
+ },
+ {
+ "ename": "SyntaxError",
+ "evalue": "unterminated string literal (detected at line 1) (, line 1)",
+ "output_type": "error",
+ "traceback": [
+ "Traceback \u001B[36m(most recent call last)\u001B[39m:\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\IPython\\core\\interactiveshell.py:3701\u001B[39m in \u001B[95mrun_code\u001B[39m\n exec(code_obj, self.user_global_ns, self.user_ns)\n",
+ " Cell \u001B[92mIn[19]\u001B[39m\u001B[92m, line 2\u001B[39m\n city_blocker.block(df)\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:250\u001B[39m in \u001B[95mblock\u001B[39m\n temp_data[col] = temp_data[col].apply(\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\series.py:4943\u001B[39m in \u001B[95mapply\u001B[39m\n ).apply()\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1422\u001B[39m in \u001B[95mapply\u001B[39m\n return self.apply_standard()\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1502\u001B[39m in \u001B[95mapply_standard\u001B[39m\n mapped = obj._map_values(\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\base.py:925\u001B[39m in \u001B[95m_map_values\u001B[39m\n return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001B[39m in \u001B[95mmap_array\u001B[39m\n return lib.map_infer(values, mapper, convert=convert)\n",
+ " File \u001B[92mpandas/_libs/lib.pyx:2999\u001B[39m in \u001B[95mpandas._libs.lib.map_infer\u001B[39m\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1491\u001B[39m in \u001B[95mcurried\u001B[39m\n return func(x, *self.args, **self.kwargs)\n",
+ " File \u001B[92m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\utils.py:374\u001B[39m in \u001B[95mparse_list\u001B[39m\n s = str(s).strip()\n",
+ " File \u001B[92m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\ast.py:66\u001B[39m in \u001B[95mliteral_eval\u001B[39m\n node_or_string = parse(node_or_string.lstrip(\" \\t\"), mode='eval')\n",
+ "\u001B[36m \u001B[39m\u001B[36mFile \u001B[39m\u001B[32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\ast.py:52\u001B[39m\u001B[36m in \u001B[39m\u001B[35mparse\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mreturn compile(source, filename, mode, flags,\u001B[39m\n",
+ " \u001B[36mFile \u001B[39m\u001B[32m:1\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mVilleneuve d'Ascq\u001B[39m\n ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m unterminated string literal (detected at line 1)\n"
+ ]
}
],
- "execution_count": 32
+ "execution_count": 19
}
],
"metadata": {
diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py
index fffbcc8..b3552af 100644
--- a/src/ms_blocking/ms_blocking.py
+++ b/src/ms_blocking/ms_blocking.py
@@ -1,5 +1,7 @@
from ms_blocking.utils import * # noqa: F403
+import networkx as nx
+
class BlockerNode:
"""Abstract class from which derive all classes in the module"""
@@ -46,7 +48,7 @@ def __init__(self, left, right):
def __repr__(self):
return f"AndNode{{{self.left}, {self.right}}}"
- def block(self, df, motives=False):
+ def block(self, df: pd.DataFrame, motives: bool = False) -> Coords:
# In order not to perform redundant computations, we first filter out the rows that were not considered by the first blocker before running the second blocker
coords_left = self.left.block(df, motives=motives)
@@ -76,8 +78,7 @@ def __init__(self, left, right):
def __repr__(self):
return f"OrNode{{{self.left}, {self.right}}}"
-
- def block(self, df, motives=False):
+ def block(self, df: pd.DataFrame, motives: bool = False) -> Coords:
# Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations
coords_left = self.left.block(df, motives=motives)
@@ -91,7 +92,10 @@ class AttributeEquivalenceBlocker(BlockerNode): # Leaf
"""To regroup rows based on equality across columns."""
def __init__(
- self, blocking_columns, normalize_strings=True, must_not_be_different=None
+ self,
+ blocking_columns: str | Collection[str],
+ must_not_be_different: str | Collection[str] | None = None,
+ normalize_strings: bool = True,
):
super().__init__()
@@ -120,7 +124,7 @@ def __init__(
self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them
def __repr__(self):
- return f"AttributeEquivalenceBlocker({self.blocking_columns}, {self.must_not_be_different})"
+ return f"AttributeEquivalenceBlocker({self.blocking_columns}{', ' + str(self.must_not_be_different) if self.must_not_be_different else ''}{', NON-NORMALIZED' if not self.normalize else ''})"
def __eq__(self, other):
if type(other) is AttributeEquivalenceBlocker:
@@ -139,21 +143,28 @@ def __eq__(self, other):
else:
return False
- def block(self, data, motives=False):
+ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords:
"""Regroup rows based on equality of one or more columns"""
print("Processing", self)
- temp_data = data.copy()
-
- for col in self.blocking_columns:
- if self.normalize:
- temp_data[col] = temp_data[col].apply(normalize)
- temp_data = temp_data.dropna(subset=self.blocking_columns)
- temp_data = remove_rows_if_value_appears_only_once(
- temp_data, self.blocking_columns
+ temp_data = (
+ data[self.blocking_columns + self.must_not_be_different]
+ .dropna(subset=self.blocking_columns)
+ .copy()
)
+ # Normalize strings if required
+ if self.normalize:
+ temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply(
+ lambda col: col.apply(normalize)
+ )
+ # Non-duplicated values cannot belong to any block; we discard them
+ temp_data = temp_data[
+ temp_data.duplicated(keep=False, subset=self.blocking_columns)
+ ]
+
+ # No need to run anything else if we already ran out of candidates
if len(temp_data) == 0: # No pairs
if motives:
return dict()
@@ -185,9 +196,7 @@ def block(self, data, motives=False):
}
if motives:
- explanations = {
- f"Same '{column_name}'" for column_name in self.blocking_columns
- }
+ explanations = [EquivalenceMotive(col) for col in self.blocking_columns]
return add_motives_to_coords(coords, explanations)
else:
return set(coords) # set is unnnecessary
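The rewrite above replaces the old per-column `remove_rows_if_value_appears_only_once` pass with a single `duplicated(keep=False, ...)` filter. A small self-contained sketch of what that filter keeps (toy data, not the notebook's users table):

import pandas as pd

df = pd.DataFrame({"City": ["Lille", "Paris", "Lille", "Lyon"]})
# keep=False flags every member of a duplicate group, so values that occur
# only once ("Paris", "Lyon") can never form a pair and are dropped early
candidates = df[df.duplicated(keep=False, subset=["City"])]
print(candidates.index.tolist())  # [0, 2]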
@@ -197,7 +206,11 @@ class OverlapBlocker(BlockerNode): # Leaf
"""To regroup rows based on overlap of one or more columns."""
def __init__(
- self, blocking_columns, overlap=1, word_level=False, normalize_strings=True
+ self,
+ blocking_columns: str | Collection[str],
+ overlap: int = 1,
+ word_level: bool = False,
+ normalize_strings: bool = True,
):
super().__init__()
@@ -217,7 +230,7 @@ def __init__(
self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them
def __repr__(self):
- return f"OverlapBlocker({self.blocking_columns}, {self.overlap})"
+ return f"OverlapBlocker({self.blocking_columns}, {self.overlap}{', WORD-LEVEL' if self.word_level else ''}{', NON-NORMALIZED' if not self.normalize else ''})"
def __eq__(self, other):
if type(other) is OverlapBlocker:
@@ -238,29 +251,31 @@ def __eq__(self, other):
else:
return False
- def block(self, data, motives=False):
+ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords:
"""Regroup rows based on overlap of one or more columns"""
print("Processing", self)
- temp_data = data.copy()
+ temp_data = data[self.blocking_columns].dropna().copy()
- temp_data = temp_data[self.blocking_columns].copy()
-
- for col in self.blocking_columns:
- temp_data[col] = temp_data[col].apply(
- parse_list, word_level=self.word_level
- )
- temp_data = temp_data.explode(col)
- if self.normalize:
- temp_data[col] = temp_data[col].apply(normalize)
- temp_data = temp_data.dropna(
- subset=self.blocking_columns
- ) # Remove empty objects
- temp_data = remove_rows_if_value_appears_only_once(
- temp_data, self.blocking_columns
+ # Ensure we check for overlap between lists of strings
+ temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply(
+ lambda col: col.apply(parse_list, word_level=self.word_level)
)
+ # Split elements of said lists to compare them one by one
+ temp_data = temp_data.explode(self.blocking_columns)
+ # Normalize strings if required
+ if self.normalize:
+ temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply(
+ lambda col: col.apply(normalize)
+ )
+
+ # Non-duplicated values cannot belong to any block; we discard them
+ temp_data = temp_data[
+ temp_data.duplicated(keep=False, subset=self.blocking_columns)
+ ]
+ # No need to run anything else if we already ran out of candidates
if len(temp_data) == 0: # No pairs fulfill any overlap
if motives:
return dict()
@@ -268,7 +283,7 @@ def block(self, data, motives=False):
return set()
# Use the DataFrame index for grouping and forming pairs
- # Using frozenset since they are ahshable and thus can be used as dictionary keys
+ # Using frozenset since they are hashable and thus can be used as dictionary keys
groups = temp_data.groupby(self.blocking_columns).apply(
lambda x: frozenset(x.index), include_groups=False
)
@@ -276,10 +291,10 @@ def block(self, data, motives=False):
coords = block_overlap(groups=groups, overlap=self.overlap)
if motives:
- explanations = {
- f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'"
- for column_name in self.blocking_columns
- }
+ explanations = [
+ OverlapMotive(col, self.overlap, self.word_level)
+ for col in self.blocking_columns
+ ]
return add_motives_to_coords(coords, explanations)
else:
return set(coords)
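The same pre-filter also drives the overlap path: parse each cell into a list, explode it, keep shared values, then group row indices per value. A toy walk-through (made-up `websites` values; `include_groups=False` assumes pandas >= 2.2, and `block_overlap` itself is elided):

import pandas as pd

df = pd.DataFrame({"websites": [["a.com", "b.com"], ["b.com"], ["c.com"]]})
# One row per list element, keeping the original row index
exploded = df.explode("websites")
# Values shared by fewer than two rows cannot produce a pair
exploded = exploded[exploded.duplicated(keep=False, subset=["websites"])]
# For each shared value, collect the hashable set of row indices it links
groups = exploded.groupby("websites").apply(
    lambda x: frozenset(x.index), include_groups=False
)
print(groups.to_dict())  # {'b.com': frozenset({0, 1})}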
@@ -287,17 +302,17 @@ def block(self, data, motives=False):
class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM
"""Represent the intersection of an AttributeEquivalenceBlocker and an OverlapBlocker.
- Designed for performance and RAM efficiency.
+ Used for performance and RAM efficiency.
"""
def __init__(
self,
- equivalence_columns,
- overlap_columns,
- must_not_be_different=None,
- overlap=1,
- word_level=False,
- normalize_strings=True,
+ equivalence_columns: str | Collection[str],
+ overlap_columns: str | Collection[str],
+ must_not_be_different: str | Collection[str] | None = None,
+ overlap: int = 1,
+ word_level: bool = False,
+ normalize_strings: bool = True,
):
super().__init__()
@@ -341,7 +356,16 @@ def __init__(
self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them
def __repr__(self):
- return f"MixedBlocker({self.equivalence_columns}, {self.overlap_columns}, {self.overlap})"
+ return str(
+ AndNode(
+ AttributeEquivalenceBlocker(
+ self.equivalence_columns, self.must_not_be_different, self.normalize
+ ),
+ OverlapBlocker(
+ self.overlap_columns, self.overlap, self.word_level, self.normalize
+ ),
+ )
+ )
def __eq__(self, other):
if type(other) is AttributeEquivalenceBlocker:
@@ -369,31 +393,30 @@ def __eq__(self, other):
else:
return False
- def block(self, data, motives=False):
+ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords:
"""Regroup rows based on overlap of one or more columns"""
print("Processing", self)
total_columns = self.equivalence_columns + self.overlap_columns
- temp_data = data[total_columns].copy()
-
- for col in total_columns:
- if col in self.equivalence_columns:
- temp_data[col] = temp_data[col].apply(normalize)
- elif col in self.overlap_columns:
- temp_data[col] = temp_data[col].apply(
- lambda x: [
- normalize(item) for item in parse_list(x, self.word_level)
- ]
- if self.normalize
- else parse_list(x, self.word_level)
- )
- temp_data = temp_data.explode(col)
+ temp_data = data[total_columns].dropna().copy()
- temp_data = temp_data.dropna(subset=total_columns) # Remove empty objects
- temp_data = remove_rows_if_value_appears_only_once(temp_data, total_columns)
+ # Ensure we check for overlap between lists of strings
+ temp_data[self.overlap_columns] = temp_data[self.overlap_columns].apply(
+ lambda col: col.apply(parse_list, word_level=self.word_level)
+ )
+ # Split elements of said lists to compare them one by one
+ temp_data = temp_data.explode(self.overlap_columns)
+ # Normalize strings if required
+ if self.normalize:
+ temp_data[total_columns] = temp_data[total_columns].apply(
+ lambda col: col.apply(normalize)
+ )
+ # Non-duplicated values cannot belong to any block; we discard them
+ temp_data = temp_data[temp_data.duplicated(keep=False, subset=total_columns)]
+ # No need to run anything else if we already ran out of candidates
if len(temp_data) == 0: # No pairs fulfill any overlap
if motives:
return dict()
@@ -426,17 +449,261 @@ def block(self, data, motives=False):
coords = coords_equivalence.intersection(coords_overlap)
if motives:
- explanations = {
- f"Same '{column_name}'" for column_name in self.equivalence_columns
- } | {
- f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'"
- for column_name in self.overlap_columns
- }
+ explanations = [
+ EquivalenceMotive(col) for col in self.equivalence_columns
+ ] + [
+ OverlapMotive(col, self.overlap, self.word_level)
+ for col in self.overlap_columns
+ ]
+
return add_motives_to_coords(coords, explanations)
else:
return set(coords)
+def add_blocks_to_dataset(
+ data: pd.DataFrame,
+ coords: Coords,
+ sort: bool = True,
+ keep_ungrouped_rows: bool = False,
+ merge_blocks: bool = True,
+ motives: bool = False,
+ show_as_pairs: bool = False,
+ output_columns: Columns | None = None,
+ score: bool = False,
+) -> pd.DataFrame:
+ """Returns the intersection of an array of links
+
+ Takes two lists of paired elements, with or without motives, returns their intersection
+
+ Parameters
+ ----------
+ data : DataFrame
+ DataFrame for blocking
+ coords : Coords
+ Blocked coordinates
+ sort : bool
+ Whether to sort the result by block, thereby regrouping rows of the same block
+ keep_ungrouped_rows : bool
+ Whether to display rows that do not belong to any block
+ merge_blocks : bool
+ Whether to transitively merge blocks
+ motives : bool
+ Whether to display the reason behind each block
+ show_as_pairs : bool
+ Whether to show the output as pairs of rows rather than simply reordering the initial DataFrame
+ output_columns : list
+ Columns to show. Useful in combination with show_as_pairs as column names are altered
+ score : bool
+ Whether to show a score (computed from the number of motives)
+
+ Returns
+ -------
+ DataFrame
+ Blocked DataFrame
+
+ Examples
+ --------
+ >>> add_blocks_to_dataset(data=pd.DataFrame(
+ [
+ [0, 'first', 4],
+ [1, 'second', 6],
+ [2, 'first', 2],
+ [3, 'third', 5]
+ ],
+ columns=['id', 'rank', 'score']),
+ coords=np.array([{0, 2}]),
+ show_as_pairs=True,
+ output_columns=['id', 'rank'])
+ id_l rank_l id_r rank_r _block
+ 0 0 first 2 first 0
+ """
+
+ if show_as_pairs and keep_ungrouped_rows:
+ raise ValueError("Cannot both return pairs and keep ungrouped rows")
+
+ if motives:
+ if type(coords) is not dict:
+ raise TypeError("Cannot specify 'motives=True' without passing motives")
+
+ # Ensure the index is a unique identifier
+ if not data.index.is_unique:
+ raise ValueError("DataFrame index must be unique to be used as an identifier.")
+
+ if score and not motives:
+ raise ValueError("Cannot specify 'score=True' without passing motives")
+
+ if "_motive" in data.columns:
+ if motives:
+ raise ValueError(
+ "Please rename existing '_motive' column OR do not pass 'motives=True'"
+ )
+
+ if "score" in data.columns:
+ if score:
+ raise ValueError(
+ "Please rename existing '_score' column OR do not pass 'score=True'"
+ )
+
+ if "_block" in data.columns:
+ raise ValueError("Please rename existing '_block' column")
+
+ if output_columns is None:
+ output_columns = data.columns
+
+ data = data[output_columns].copy()
+
+ if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph
+ if show_as_pairs:
+ columns = [col + "_l" for col in data.columns] + [
+ col + "_r" for col in data.columns
+ ]
+ output_data = pd.DataFrame(columns=columns)
+ else:
+ output_data = pd.DataFrame(columns=data.columns)
+
+ if motives:
+ output_data["_motive"] = ""
+ if score:
+ output_data["_score"] = 0
+ output_data["_block"] = -1
+
+ else:
+ output_data = data
+ # Map coords to connected component labels
+ if merge_blocks: # We solve the connected components problem
+ cc_labels = solve_connected_components_from_coords(coords)
+ # Match original index to new block ID
+ matcher = {
+ idx: label
+ for idx, label in enumerate(cc_labels)
+ if label != -1 and idx in data.index
+ }
+ else: # We solve the cliques problem
+ g = nx.Graph()
+ # noinspection PyTypeChecker
+ g.add_edges_from(coords)
+ complete_subgraphs = list(nx.find_cliques(g))
+ complete_subgraphs = sorted(complete_subgraphs)
+ # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))}
+ matcher = dict()
+ for i, clique in enumerate(complete_subgraphs):
+ for node_idx in clique:
+ if node_idx in matcher.keys():
+ matcher[node_idx].append(i)
+ else:
+ matcher[node_idx] = [i]
+
+ if show_as_pairs:
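+ # Build one output row per blocked pair, suffixing left/right columns with _l / _r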
+ output_data = pd.DataFrame()
+ for pair in coords:
+ left_row = data.loc[[tuple(pair)[0]]].copy()
+ current_index = left_row.index
+ right_row = data.loc[[tuple(pair)[1]]].copy()
+ left_row.columns = [col + "_l" for col in left_row.columns]
+ right_row.columns = [col + "_r" for col in right_row.columns]
+ current_row = pd.concat(
+ [left_row.reset_index(drop=True), right_row.reset_index(drop=True)],
+ axis=1,
+ )
+ current_row.index = current_index
+ if motives:
+ motives_solved = solve_motives(coords[pair])
+ current_row["_motive"] = [list(map(str, motives_solved))]
+ if score:
+ current_row["_score"] = len(
+ motives_solved
+ ) # Score is simply the number of non-redundant motives
+ output_data = pd.concat([output_data, current_row])
+
+ # Assign blocks to rows based on their original index
+ output_data["_block"] = output_data.index.map(matcher)
+ if not merge_blocks:
+ output_data = output_data.explode("_block")
+
+ if keep_ungrouped_rows:
+ output_data["_block"] = output_data["_block"].fillna(-1)
+ matcher_ungrouped_rows = {}
+ block_temp = []
+ i = 0 # Track # of blocks processed
+ for b in output_data["_block"]:
+ if b == -1:
+ block_temp.append(i)
+ i += 1
+ elif b not in matcher_ungrouped_rows:
+ matcher_ungrouped_rows[b] = i
+ block_temp.append(i)
+ i += 1
+ else:
+ block_temp.append(matcher_ungrouped_rows[b])
+ output_data["_block"] = block_temp
+ else:
+ if not show_as_pairs:
+ output_data = output_data[
+ output_data["_block"].duplicated(keep=False)
+ & output_data["_block"].notna()
+ ]
+
+ output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"])
+
+ if sort:
+ # Sort by block, then by original index
+ sort_cols = ["_block"]
+ if output_data.index.name:
+ output_data = output_data.sort_values(
+ sort_cols + [output_data.index.name]
+ )
+ else:
+ # If no named index, use the first column of the DataFrame
+ output_data = output_data.reset_index()
+ output_data = output_data.sort_values(
+ sort_cols + [output_data.columns[0]]
+ )
+ output_data = output_data.set_index(output_data.columns[0])
+
+ if not show_as_pairs and motives:
+ id_list = flatten(coords.keys())
+ motive_matcher = {
+ row_id: list(map(str, solve_motives(coords[pair])))
+ for pair in coords.keys()
+ for row_id in id_list
+ if row_id in pair
+ }
+ # noinspection PyTypeChecker
+ output_data["_motive"] = output_data.index.map(motive_matcher)
+ if score:
+ output_data["_score"] = 0
+ score_matcher = { # Horribly repetitive
+ row_id: len(solve_motives(coords[pair]))
+ for pair in coords.keys()
+ for row_id in id_list
+ if row_id in pair
+ }
+ output_data["_score"] = output_data.index.map(score_matcher)
+
+ output_data = output_data.reset_index(drop=True)
+ output_data["_block"] = output_data["_block"].astype(int)
+
+ return output_data
+
+
+def generate_blocking_report(
+ data: pd.DataFrame, coords: Coords, output_columns: Collection[str] | None = None
+) -> pd.DataFrame:
+ """
+ Shorthand for add_blocks_to_dataset with the arguments below
+ """
+ return add_blocks_to_dataset(
+ data,
+ coords,
+ sort=True,
+ merge_blocks=False,
+ motives=True,
+ show_as_pairs=True,
+ output_columns=output_columns,
+ )
+
+
def merge_blockers(
left: BlockerNode, right: BlockerNode
) -> AttributeEquivalenceBlocker | OverlapBlocker | MixedBlocker | AndNode:
@@ -592,3 +859,6 @@ def merge_blockers(
)
else:
return AndNode(left, right)
+
+
+# TODO: extract the apply logic in a way that enables .progress_apply
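Taken together, the new `score` flag and `generate_blocking_report` replace the removed `scoring` helper. A hypothetical end-to-end sketch (assuming the module is importable as below; data values are made up):

import pandas as pd
import ms_blocking.ms_blocking as msb

df = pd.DataFrame(
    {
        "City": ["Lille", "Lille", "Paris"],
        "websites": ["['a.com']", "['a.com', 'b.com']", "['c.com']"],
    }
)
# Pair rows that share a City OR overlap on at least one website
blocker = msb.AttributeEquivalenceBlocker(["City"]) | msb.OverlapBlocker(["websites"])
links = blocker.block(df, motives=True)

# score=True derives each pair's score from its deduplicated motives
report = msb.add_blocks_to_dataset(
    df, links, show_as_pairs=True, motives=True, merge_blocks=False, score=True
)
print(report[["_motive", "_score", "_block"]])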
diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py
index 837645f..b644a43 100644
--- a/src/ms_blocking/utils.py
+++ b/src/ms_blocking/utils.py
@@ -4,53 +4,72 @@
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
import pandas as pd
-import networkx as nx
import random
from collections import Counter
from itertools import combinations
from typing import List, Set, Iterable, Dict, Collection, Any
+
+class EquivalenceMotive:
+ def __init__(self, blocking_column: str):
+ if not isinstance(blocking_column, str):
+ raise TypeError("blocking_column for Motive must be a string")
+ self.blocking_column = blocking_column
+
+ def __eq__(self, other: Any) -> bool:
+ if not isinstance(other, EquivalenceMotive | OverlapMotive):
+ raise TypeError("Can only compare Motives")
+ # An OverlapMotive on the same column is a different motive
+ return (
+ isinstance(other, EquivalenceMotive)
+ and self.blocking_column == other.blocking_column
+ )
+
+ def __str__(self):
+ return f"Same '{self.blocking_column}'"
+
+ def __repr__(self):
+ return f"EquivalenceMotive(['{self.blocking_column}'])"
+
+
+class OverlapMotive:
+ def __init__(
+ self, blocking_column: str, overlap: int = 1, word_level: bool = False
+ ):
+ if not isinstance(blocking_column, str):
+ raise TypeError("blocking_column for Motive must be a string")
+ if not isinstance(overlap, int):
+ raise TypeError("overlap must be an int")
+ if not isinstance(word_level, bool):
+ raise TypeError("word_level must be a boolean")
+ self.blocking_column = blocking_column
+ self.overlap = overlap
+ self.word_level = word_level
+
+ def __eq__(self, other: Any) -> bool:
+ if not isinstance(other, EquivalenceMotive | OverlapMotive):
+ raise TypeError("Can only compare Motives")
+ return (
+ isinstance(other, OverlapMotive)
+ and self.blocking_column == other.blocking_column
+ and self.overlap == other.overlap
+ and self.word_level == other.word_level
+ )
+
+ def __str__(self):
+ return f">={self.overlap}{' word-level' if self.word_level else ''} overlap in '{self.blocking_column}'"
+
+ def __repr__(self):
+ return f"OverlapMotive(['{self.blocking_column}'], {self.overlap}{', word_level=True' if self.word_level else ''})"
+
+
Columns = List[str]
Pair = Collection[int]
+Motive = EquivalenceMotive | OverlapMotive
CoordsBasic = Set[Pair]
-CoordsMotives = Dict[Pair, Set[str]]
+CoordsMotives = Dict[Pair, List[Motive]]
Coords = CoordsBasic | CoordsMotives
_PUNCT_RE = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\\\]^_`{|}~]')
_SPACE_RE = re.compile(r"\s+")
-def remove_rows_if_value_appears_only_once(
- data: pd.DataFrame, cols: Columns
-) -> pd.DataFrame:
- """Drop rows of a Pandas DataFrame where a certain column's values appears only once.
-
- Ensures all elements of provided columns appear at least twice in their column
-
- Parameters
- ----------
- data : DataFrame
- DataFrame to preprocess
-
- cols : List[str]
- List of columns where rows that contain non-duplicated elements shall be discarded
-
- Returns
- -------
- DataFrame
- DataFrame with reduced number of rows
-
- Examples
- --------
- >>> remove_rows_if_value_appears_only_once(data, ['name', 'city'])
- """
- for col in cols:
- counts = data[col].map(data[col].value_counts())
- data = data[counts >= 2]
- return data
-
-
def start_from_zero(figures: Collection[int]) -> List[int]:
"""Turns a list of integers into a same-length list that starts at 0, without gaps
@@ -240,7 +259,7 @@ def merge_blocks_or(coords_1: Coords, coords_2: Coords) -> Coords:
if type(coords_1) is type(coords_2) is dict: # We have motives
return {
pair: (
- (coords_1[pair] | coords_2[pair])
+ coords_1[pair] + coords_2[pair]
if (pair in coords_1 and pair in coords_2)
else coords_1[pair]
if (pair in coords_1)
@@ -278,7 +297,7 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords:
"""
if type(coords_1) is type(coords_2) is dict: # We have motives
return {
- pair: (coords_1[pair] | coords_2[pair])
+ pair: coords_1[pair] + coords_2[pair]
for y in (coords_1, coords_2)
for pair in y.keys()
if (pair in coords_1 and pair in coords_2)
@@ -287,219 +306,6 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords:
return coords_1.intersection(coords_2)
-def add_blocks_to_dataset(
- data: pd.DataFrame,
- coords: Coords,
- sort: bool = True,
- keep_ungrouped_rows: bool = False,
- merge_blocks: bool = True,
- motives: bool = False,
- show_as_pairs: bool = False,
- output_columns: Columns = None,
-) -> pd.DataFrame:
- """Returns the intersection of an array of links
-
- Takes two lists of paired elements, with or without motives, returns their intersection
-
- Parameters
- ----------
- data : DataFrame
- DataFrame for blocking
- coords : Array
- Blocked coordinates
- sort : bool
- Whether to sort the result by block, thereby regrouping rows of the same block
- keep_ungrouped_rows : bool
- Whether to display rows that do not belong to any block
- merge_blocks : bool
- Whether to merge transitively merge blocks
- motives : bool
- Whether to display the reason behind each block
- show_as_pairs : bool
- Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame
- output_columns : list
- Columns to show. Useful in combination with show_as_pairs as column names are altered
-
- Returns
- -------
- DataFrame
- Blocked DataFrame
-
- Examples
- --------
- >>> add_blocks_to_dataset(data=pd.DataFrame(
- [
- [0, 'first', 4],
- [1, 'second', 6],
- [2, 'first', 2],
- [3, 'third', 5]
- ],
- columns=['id', 'rank', 'score']),
- coords=np.array([{0, 2}]),
- show_as_pairs=True,
- output_columns=['id', 'rank'])
- id_l rank_l id_r rank_r block
- 0 0 first 2 first 0
- """
-
- if show_as_pairs and keep_ungrouped_rows:
- raise ValueError("Cannot both return pairs and keep ungrouped rows")
-
- if motives:
- if type(coords) is not dict:
- raise TypeError("Cannot specify motives=True without passing motives")
-
- # Ensure the index is a unique identifier
- if not data.index.is_unique:
- raise ValueError("DataFrame index must be unique to be used as an identifier.")
-
- if "_motive" in data.columns:
- if motives:
- raise ValueError(
- "Please rename existing '_motive' column OR do not pass 'motives=True'"
- )
-
- if "_block" in data.columns:
- raise ValueError("Please rename existing '_block' column")
-
- if output_columns is None:
- output_columns = data.columns
- data = data[output_columns].copy()
-
- if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph
- if show_as_pairs:
- columns = [col + "_l" for col in data.columns] + [
- col + "_r" for col in data.columns
- ]
- output_data = pd.DataFrame(columns=columns)
- else:
- output_data = pd.DataFrame(columns=data.columns)
- else:
- output_data = data
- # Map coords to connected component labels
- if merge_blocks: # We solve the connected components problem
- cc_labels = solve_connected_components_from_coords(coords)
- # Match original index to new block ID
- matcher = {
- idx: label
- for idx, label in enumerate(cc_labels)
- if label != -1 and idx in data.index
- }
- else: # We solve the cliques problem
- g = nx.Graph()
- # noinspection PyTypeChecker
- g.add_edges_from(coords)
- complete_subgraphs = list(nx.find_cliques(g))
- complete_subgraphs = sorted(complete_subgraphs)
- # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))}
- matcher = dict()
- for i, clique in enumerate(complete_subgraphs):
- for node_idx in clique:
- if node_idx in matcher.keys():
- matcher[node_idx].append(i)
- else:
- matcher[node_idx] = [i]
-
- if show_as_pairs:
- output_data = pd.DataFrame()
- for pair in coords:
- left_row = data.loc[[tuple(pair)[0]]].copy()
- current_index = left_row.index
- right_row = data.loc[[tuple(pair)[1]]].copy()
- left_row.columns = [col + "_l" for col in left_row.columns]
- right_row.columns = [col + "_r" for col in right_row.columns]
- current_row = pd.concat(
- [left_row.reset_index(drop=True), right_row.reset_index(drop=True)],
- axis=1,
- )
- current_row.index = current_index
- output_data = pd.concat([output_data, current_row])
-
- # Assign blocks to rows based on their original index
- output_data["_block"] = output_data.index.map(matcher)
- if not merge_blocks:
- output_data = output_data.explode("_block")
-
- if keep_ungrouped_rows:
- output_data["_block"] = output_data["_block"].fillna(-1)
- matcher_ungrouped_rows = {}
- block_temp = []
- i = 0 # Track # of blocks processed
- for b in output_data["_block"]:
- if b == -1:
- block_temp.append(i)
- i += 1
- elif b not in matcher_ungrouped_rows:
- matcher_ungrouped_rows[b] = i
- block_temp.append(i)
- i += 1
- else:
- block_temp.append(matcher_ungrouped_rows[b])
- output_data["_block"] = block_temp
- else:
- if not show_as_pairs:
- output_data = output_data[
- output_data["_block"].duplicated(keep=False)
- & output_data["_block"].notna()
- ]
-
- output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"])
-
- if sort:
- # Sort by block, then by original index
- sort_cols = ["_block"]
- if output_data.index.name:
- output_data = output_data.sort_values(
- sort_cols + [output_data.index.name]
- )
- else:
- # If no named index, use the first column of the DataFrame
- output_data = output_data.reset_index()
- output_data = output_data.sort_values(
- sort_cols + [output_data.columns[0]]
- )
- output_data = output_data.set_index(output_data.columns[0])
-
- if motives:
- output_data["_motive"] = ""
- id_list = flatten(coords.keys())
- motive_matcher = {
- row_id: frozenset(
- reason
- for pair in coords.keys()
- if row_id in pair
- for reason in coords[pair]
- )
- for row_id in id_list
- }
- output_data["_motive"] = output_data.index.map(motive_matcher)
-
- if "_block" not in output_data.columns: # Empty coords
- output_data["_block"] = -1
-
- output_data = output_data.reset_index(drop=True)
- output_data["_block"] = output_data["_block"].astype(int)
-
- return output_data
-
-
-def generate_blocking_report(
- data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None
-) -> pd.DataFrame:
- """
- Shorthand for add_blocks_to_dataset with below arguments
- """
- return add_blocks_to_dataset(
- data,
- coords,
- sort=True,
- merge_blocks=False,
- motives=True,
- show_as_pairs=True,
- output_columns=output_columns,
- )
-
-
def parse_list(s: str | List, word_level: bool = False) -> List[str]:
"""Turns a stringified list into an actual python list, taking extra inner quotes into account
@@ -511,7 +317,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
Stringified representation of a list e.g. "['string 1', 'string 2', ...]"
word_level : bool
- Whether to return a list of all words within s instead of a list of each comma-separated element
+ Whether to return a list of all words within s instead of a list of each comma-separated element;
+ Note that if passed a string that does not represent a list, this argument will be ignored and the function
+ will return a list of each word in the string
Returns
-------
@@ -527,7 +335,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
"""
if type(s) is list: # If we already have a list
- if len(s) == 1 and s[0][0] == "[" and s[0][-1] == "]":
+ if (
+ len(s) == 1 and str(s[0]).startswith("[") and str(s[0]).endswith("]")
+ ): # In case we have a stringified list INSIDE a normal list
s = s[0]
else:
return s
@@ -540,10 +350,15 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
if not s:
return []
- try:
- parts = ast.literal_eval(s)
- except ValueError: # doesn't seem to be a stringified list
- parts = s.split("', '")
+ if s.startswith("[") and s.endswith("]"): # Stringified list?
+ try:
+ parts = ast.literal_eval(s)
+ except ValueError: # doesn't seem to be a stringified list
+ parts = s.split("', '")
+ except SyntaxError: # In case we have a string surrounded by brackets
+ parts = s.split()
+ else:
+ parts = s.split()
cleaned_items = [str(part).strip().strip("''") for part in parts]
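With this guard, `literal_eval` is only attempted on bracket-delimited values; everything else is split on whitespace. A quick behaviour sketch (illustrative inputs, assuming the module is importable):

from ms_blocking.utils import parse_list

print(parse_list("['a.com', 'b.com']"))  # ['a.com', 'b.com'] -- stringified list
print(parse_list("Villeneuve d'Ascq"))   # ['Villeneuve', "d'Ascq"] -- plain string, whitespace split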
@@ -553,40 +368,6 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
return [s for s in cleaned_items if len(s) > 0]
-def scoring(data: pd.DataFrame, motives_column: str = "_motive") -> pd.Series:
- """Add a score to a blocked DataFrame based on the number of motives
-
- Parameters
- ----------
- data : DataFrame
- DataFrame with motives
-
- motives_column : str
- Name of the column containing the motives
-
- Returns
- -------
- Series[int]
- A column of scores
- """
-
- # Check that we do have motives
- if motives_column not in data.columns:
- if motives_column == "_motive":
- raise ValueError("No motives in DataFrame")
- else:
- raise ValueError(
- f'Specified motives column "{motives_column}" does not exist'
- )
-
- if "score" in data.columns:
- print("Renaming 'score' column to 'score_old'")
- data = data.rename(columns={"score": "score_old"})
-
- scores = data[motives_column].apply(len)
- return scores
-
-
def must_not_be_different_apply( # WIP
temp_data: pd.DataFrame,
blocking_columns: List[str],
@@ -682,7 +463,9 @@ def block_overlap(groups: Iterable, overlap: int = 1) -> Coords:
return coords
-def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotives:
+def add_motives_to_coords(
+ coords: Coords, explanations: List[Motive]
+) -> Dict[Pair, List[Motive]]:
"""Block a DataFrame based on overlap accross columns
Parameters
@@ -690,7 +473,7 @@ def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotiv
coords : Coords
Coords obtained by blocking
- explanations : Set[str]
+ explanations : List[Motive]
Set of explanations
Returns
@@ -718,3 +501,99 @@ def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotiv
}
"""
return {pair: explanations for pair in coords}
+
+
+def solve_motives(motives: List[Motive]) -> List[Motive]:
+ """Remove duplicated and redundant motives from a list of motives
+
+ Redundant motives refer to OverlapMotives on the same column(s) but with different overlap or word-level condition
+
+ Parameters
+ ----------
+ motives : List[Motive]
+ List of motives attached to a blocked pair
+
+ Returns
+ -------
+ List[Motive]
+ A list of Motives whose length is less than or equal to that of the original list
+
+ Examples
+ --------
+ >>> solve_motives([OverlapMotive('websites', 1), OverlapMotive('websites', 2), OverlapMotive('websites', 2, word_level=False)])
+ [OverlapMotive('websites', 2)]
+ """
+ if not motives:
+ raise ValueError("Motives must not be empty")
+
+ final_motives = [
+ motive for motive in motives if type(motive) is EquivalenceMotive
+ ] # With EquivalenceMotive, equality check suffices
+ overlap_motives = [motive for motive in motives if type(motive) is OverlapMotive]
+ overlap_columns = [motive.blocking_column for motive in overlap_motives]
+
+ for column in overlap_columns:
+ overlap_motives_for_column = [
+ motive for motive in overlap_motives if motive.blocking_column == column
+ ]
+
+ # Select Blocker with stricter word/element-level condition
+ word_level_motives_for_column = [
+ motive for motive in overlap_motives_for_column if motive.word_level
+ ]
+ not_word_level_motives_for_column = [
+ motive for motive in overlap_motives_for_column if not motive.word_level
+ ]
+
+ # Find biggest overlap among the non-word_level ones
+ if not_word_level_motives_for_column:
+ max_overlap_not_word_level_for_column = max(
+ not_word_level_motives_for_column, key=lambda m: m.overlap
+ )
+ max_overlap_not_word_level_for_column_overlap = (
+ max_overlap_not_word_level_for_column.overlap
+ )
+ else:
+ max_overlap_not_word_level_for_column = []
+ max_overlap_not_word_level_for_column_overlap = (
+ 0 # Will never be used, left for linter
+ )
+
+ # Now find biggest overlap among the word_level ones
+ if word_level_motives_for_column:
+ max_overlap_word_level_for_column = max(
+ word_level_motives_for_column, key=lambda m: m.overlap
+ )
+ max_overlap_word_level_for_column_overlap = (
+ max_overlap_word_level_for_column.overlap
+ )
+ if not_word_level_motives_for_column:
+ # If there is already an OverlapMotive on same column with equal or greater overlap but not word_level, discard it
+ if (
+ max_overlap_word_level_for_column_overlap
+ <= max_overlap_not_word_level_for_column_overlap
+ ):
+ max_overlap_word_level_for_column = []
+ else:
+ max_overlap_word_level_for_column = []
+
+ if max_overlap_not_word_level_for_column:
+ max_overlap_not_word_level_for_column = [
+ max_overlap_not_word_level_for_column
+ ]
+ if max_overlap_word_level_for_column:
+ max_overlap_word_level_for_column = [max_overlap_word_level_for_column]
+ final_motives += (
+ max_overlap_word_level_for_column + max_overlap_not_word_level_for_column
+ )
+
+ # Remove duplicates
+ final_motives_no_duplicates = []
+ for motive in final_motives:
+ if motive not in final_motives_no_duplicates:
+ final_motives_no_duplicates.append(motive)
+ return final_motives_no_duplicates
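A minimal sketch of these dedup rules (assuming the classes defined above; column names are illustrative):

from ms_blocking.utils import EquivalenceMotive, OverlapMotive, solve_motives

motives = [
    EquivalenceMotive("City"),
    EquivalenceMotive("City"),             # exact duplicate -> kept once
    OverlapMotive("websites", overlap=1),
    OverlapMotive("websites", overlap=2),  # same column -> only the stricter overlap survives
]
print(solve_motives(motives))
# [EquivalenceMotive('City'), OverlapMotive('websites', 2)]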
diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py
index d3f9ab2..cf92924 100644
--- a/tests/test_ms_blocking.py
+++ b/tests/test_ms_blocking.py
@@ -84,18 +84,28 @@ def attribute_city_keep_ungrouped_rows_false():
@pytest.fixture
def attribute_city_motives_true_block():
return {
- frozenset({3, 8}): {"Same 'City'"},
- frozenset({1, 4}): {"Same 'City'"},
- frozenset({8, 11}): {"Same 'City'"},
- frozenset({3, 11}): {"Same 'City'"},
- frozenset({2, 5}): {"Same 'City'"},
- frozenset({10, 13}): {"Same 'City'"},
+ frozenset({3, 8}): [msb.EquivalenceMotive("City")],
+ frozenset({1, 4}): [msb.EquivalenceMotive("City")],
+ frozenset({8, 11}): [msb.EquivalenceMotive("City")],
+ frozenset({3, 11}): [msb.EquivalenceMotive("City")],
+ frozenset({2, 5}): [msb.EquivalenceMotive("City")],
+ frozenset({10, 13}): [msb.EquivalenceMotive("City")],
}
@pytest.fixture
def attribute_city_motives_true_add():
- return [{"Same 'City'"}] * 9
+ return [
+ ["Same 'City'"],
+ ["Same 'City'"],
+ ["Same 'City'"],
+ ["Same 'City'"],
+ ["Same 'City'"],
+ ["Same 'City'"],
+ ["Same 'City'"],
+ ["Same 'City'"],
+ ["Same 'City'"],
+ ]
@pytest.fixture
@@ -116,25 +126,30 @@ def city_age_name_websites_pipelining_id():
@pytest.fixture
def city_age_websites_pipelining_motives():
return [
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({">=1 overlap in 'websites'"}),
- frozenset({">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'"}),
- frozenset({"Same 'Age'", "Same 'City'"}),
- frozenset({">=1 overlap in 'websites'"}),
- frozenset({">=1 overlap in 'websites'"}),
- frozenset({">=1 overlap in 'websites'"}),
+ {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {"Same 'City'", "Same 'Age'"},
+ {"Same 'City'", "Same 'Age'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
]
@pytest.fixture
def city_age_websites_pipelining_scores():
- return [3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1]
+ return [3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+@pytest.fixture
+def city_age_websites_pipelining_scores_not_show_as_pairs():
+ return [3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1]
@pytest.fixture
@@ -335,9 +350,10 @@ def test_pipelining_motives(city_age_websites_pipelining_motives):
websites_blocker = msb.OverlapBlocker(["websites"])
final_blocker = (city_blocker & age_blocker) | websites_blocker
links = final_blocker.block(get_users(), motives=True)
- actual = msb.add_blocks_to_dataset(
+ motives = msb.add_blocks_to_dataset(
get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False
)["_motive"].to_list()
+ actual = [set(motive) for motive in motives] # use sets to ignore ordering
assert actual == expected
@@ -350,9 +366,36 @@ def test_pipelining_scores(city_age_websites_pipelining_scores):
final_blocker = (city_blocker & age_blocker) | websites_blocker
links = final_blocker.block(get_users(), motives=True)
report = msb.add_blocks_to_dataset(
- get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False
+ get_users(),
+ links,
+ show_as_pairs=True,
+ motives=True,
+ merge_blocks=False,
+ score=True,
+ )
+ actual = sorted(report["_score"], reverse=True)
+ assert actual == expected
+
+
+def test_pipelining_scores_without_show_as_pairs(
+ city_age_websites_pipelining_scores_not_show_as_pairs,
+):
+ """Test that scoring does work as intended"""
+ expected = city_age_websites_pipelining_scores_not_show_as_pairs
+ city_blocker = msb.AttributeEquivalenceBlocker(["City"])
+ age_blocker = msb.AttributeEquivalenceBlocker(["Age"])
+ websites_blocker = msb.OverlapBlocker(["websites"])
+ final_blocker = (city_blocker & age_blocker) | websites_blocker
+ links = final_blocker.block(get_users(), motives=True)
+ report = msb.add_blocks_to_dataset(
+ get_users(),
+ links,
+ show_as_pairs=False,
+ motives=True,
+ merge_blocks=False,
+ score=True,
)
- actual = sorted(msb.scoring(report), reverse=True)
+ actual = sorted(report["_score"], reverse=True)
assert actual == expected