diff --git a/docs/example.ipynb b/docs/example.ipynb index 6b82165..8243053 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -32,8 +32,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.010997600Z", - "start_time": "2026-01-30T14:21:13.420790Z" + "end_time": "2026-02-03T16:02:56.751154300Z", + "start_time": "2026-02-03T16:02:55.924397100Z" } }, "source": [ @@ -60,8 +60,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.049404600Z", - "start_time": "2026-01-30T14:21:14.010997600Z" + "end_time": "2026-02-03T16:02:56.810955300Z", + "start_time": "2026-02-03T16:02:56.751154300Z" } }, "source": [ @@ -282,8 +282,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.190107400Z", - "start_time": "2026-01-30T14:21:14.089762400Z" + "end_time": "2026-02-03T16:02:56.966380500Z", + "start_time": "2026-02-03T16:02:56.862834100Z" } }, "source": [ @@ -310,8 +310,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.309413300Z", - "start_time": "2026-01-30T14:21:14.278545600Z" + "end_time": "2026-02-03T16:02:57.285912400Z", + "start_time": "2026-02-03T16:02:57.147878900Z" } }, "source": [ @@ -322,7 +322,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" + "Processing AttributeEquivalenceBlocker(['City'])\n" ] } ], @@ -339,8 +339,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.378808Z", - "start_time": "2026-01-30T14:21:14.349508200Z" + "end_time": "2026-02-03T16:02:57.479607Z", + "start_time": "2026-02-03T16:02:57.418159200Z" } }, "source": [ @@ -369,8 +369,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.558644200Z", - "start_time": "2026-01-30T14:21:14.459573100Z" + "end_time": "2026-02-03T16:02:57.776512200Z", + "start_time": "2026-02-03T16:02:57.565676Z" } }, 
"source": [ @@ -409,8 +409,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.635514Z", - "start_time": "2026-01-30T14:21:14.598913Z" + "end_time": "2026-02-03T16:02:57.810023Z", + "start_time": "2026-02-03T16:02:57.778482900Z" } }, "source": [ @@ -574,8 +574,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.829719100Z", - "start_time": "2026-01-30T14:21:14.676157200Z" + "end_time": "2026-02-03T16:02:58.075057800Z", + "start_time": "2026-02-03T16:02:57.893294100Z" } }, "source": [ @@ -622,8 +622,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.027923700Z", - "start_time": "2026-01-30T14:21:14.926401Z" + "end_time": "2026-02-03T16:02:58.413477400Z", + "start_time": "2026-02-03T16:02:58.285492900Z" } }, "source": [ @@ -759,8 +759,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.403596500Z", - "start_time": "2026-01-30T14:21:15.279120300Z" + "end_time": "2026-02-03T16:02:58.887317800Z", + "start_time": "2026-02-03T16:02:58.675247500Z" } }, "source": [ @@ -796,8 +796,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.686136800Z", - "start_time": "2026-01-30T14:21:15.608444400Z" + "end_time": "2026-02-03T16:02:59.272554700Z", + "start_time": "2026-02-03T16:02:59.130460300Z" } }, "source": [ @@ -971,8 +971,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.998425200Z", - "start_time": "2026-01-30T14:21:15.931370100Z" + "end_time": "2026-02-03T16:02:59.806784300Z", + "start_time": "2026-02-03T16:02:59.686250600Z" } }, "source": [ @@ -1075,8 +1075,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:16.305679100Z", - "start_time": "2026-01-30T14:21:16.212470400Z" + "end_time": "2026-02-03T16:03:00.721777Z", + "start_time": "2026-02-03T16:03:00.603955400Z" } }, "source": [ @@ -1089,7 +1089,7 
@@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n" + "Processing AttributeEquivalenceBlocker(['Age', 'City'])\n" ] }, { @@ -1223,8 +1223,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:16.678653800Z", - "start_time": "2026-01-30T14:21:16.558976200Z" + "end_time": "2026-02-03T16:03:01.209432600Z", + "start_time": "2026-02-03T16:03:01.048013600Z" } }, "source": [ @@ -1237,7 +1237,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Name'], [])\n" + "Processing AttributeEquivalenceBlocker(['Name'])\n" ] }, { @@ -1342,8 +1342,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.354294400Z", - "start_time": "2026-01-30T14:21:17.316050200Z" + "end_time": "2026-02-03T16:03:01.834433100Z", + "start_time": "2026-02-03T16:03:01.686309100Z" } }, "source": [ @@ -1358,7 +1358,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Name'], [])\n" + "Processing AttributeEquivalenceBlocker(['Name'], NON-NORMALIZED)\n" ] }, { @@ -1440,8 +1440,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.537043700Z", - "start_time": "2026-01-30T14:21:17.392490700Z" + "end_time": "2026-02-03T16:03:02.711968Z", + "start_time": "2026-02-03T16:03:02.581163100Z" } }, "source": [ @@ -1453,7 +1453,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n", + "Processing AttributeEquivalenceBlocker(['City'])\n", "Processing OverlapBlocker(['websites'], 1)\n" ] } @@ -1464,8 +1464,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.655177300Z", - "start_time": "2026-01-30T14:21:17.573776300Z" + "end_time": "2026-02-03T16:03:03.614029700Z", + "start_time": "2026-02-03T16:03:02.835393200Z" } }, "source": [ @@ -1477,96 +1477,22 
@@ "name": "stdout", "output_type": "stream", "text": [ - "Processing MixedBlocker(['City'], ['websites'], 1)\n" + "Processing " ] }, { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 10 Caroline Dufour Lens 45 \n", - "3 13 Benoît Benoît Lens 15 \n", - "\n", - " websites _block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['pythonensamusant.fr', 'lensfans.fr'] 1 \n", - "3 ['lensfans.fr'] 1 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
210Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']1
313Benoît BenoîtLens15['lensfans.fr']1
\n", - "
" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" + "ename": "TypeError", + "evalue": "object of type 'bool' has no len()", + "output_type": "error", + "traceback": [ + "\u001B[31m---------------------------------------------------------------------------\u001B[39m", + "\u001B[31mTypeError\u001B[39m Traceback (most recent call last)", + "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[17]\u001B[39m\u001B[32m, line 1\u001B[39m\n\u001B[32m----> \u001B[39m\u001B[32m1\u001B[39m links = \u001B[43m(\u001B[49m\u001B[43mcity_blocker\u001B[49m\u001B[43m \u001B[49m\u001B[43m&\u001B[49m\u001B[43m \u001B[49m\u001B[43mwebsites_blocker\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43mblock\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 2\u001B[39m msb.add_blocks_to_dataset(df, links)\n", + "\u001B[36mFile \u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:383\u001B[39m, in \u001B[36mMixedBlocker.block\u001B[39m\u001B[34m(self, data, motives)\u001B[39m\n\u001B[32m 380\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34mblock\u001B[39m(\u001B[38;5;28mself\u001B[39m, data, motives=\u001B[38;5;28;01mFalse\u001B[39;00m):\n\u001B[32m 381\u001B[39m \u001B[38;5;250m \u001B[39m\u001B[33;03m\"\"\"Regroup rows based on overlap of one or more columns\"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m383\u001B[39m \u001B[38;5;28;43mprint\u001B[39;49m\u001B[43m(\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43mProcessing\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[32m 385\u001B[39m total_columns = \u001B[38;5;28mself\u001B[39m.equivalence_columns + \u001B[38;5;28mself\u001B[39m.overlap_columns\n\u001B[32m 387\u001B[39m temp_data = data[total_columns].copy()\n", + "\u001B[36mFile 
\u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:345\u001B[39m, in \u001B[36mMixedBlocker.__repr__\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 342\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34m__repr__\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[32m 343\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(\n\u001B[32m 344\u001B[39m AndNode(\n\u001B[32m--> \u001B[39m\u001B[32m345\u001B[39m \u001B[43mAttributeEquivalenceBlocker\u001B[49m\u001B[43m(\u001B[49m\n\u001B[32m 346\u001B[39m \u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mequivalence_columns\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mmust_not_be_different\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mnormalize\u001B[49m\n\u001B[32m 347\u001B[39m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m,\n\u001B[32m 348\u001B[39m OverlapBlocker(\n\u001B[32m 349\u001B[39m \u001B[38;5;28mself\u001B[39m.overlap_columns, \u001B[38;5;28mself\u001B[39m.overlap, \u001B[38;5;28mself\u001B[39m.word_level, \u001B[38;5;28mself\u001B[39m.normalize\n\u001B[32m 350\u001B[39m ),\n\u001B[32m 351\u001B[39m )\n\u001B[32m 352\u001B[39m )\n", + "\u001B[36mFile \u001B[39m\u001B[32m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:109\u001B[39m, in \u001B[36mAttributeEquivalenceBlocker.__init__\u001B[39m\u001B[34m(self, blocking_columns, normalize_strings, must_not_be_different)\u001B[39m\n\u001B[32m 107\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(must_not_be_different) \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28mstr\u001B[39m:\n\u001B[32m 108\u001B[39m must_not_be_different = [must_not_be_different]\n\u001B[32m--> \u001B[39m\u001B[32m109\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m 
\u001B[38;5;28;43mlen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mmust_not_be_different\u001B[49m\u001B[43m)\u001B[49m > \u001B[32m1\u001B[39m:\n\u001B[32m 110\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[33m\"\u001B[39m\u001B[33mThere must be only one extra column\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 111\u001B[39m \u001B[38;5;28;01melif\u001B[39;00m (\n\u001B[32m 112\u001B[39m must_not_be_different\n\u001B[32m 113\u001B[39m \u001B[38;5;129;01mand\u001B[39;00m must_not_be_different[\u001B[32m0\u001B[39m] \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m.blocking_columns\n\u001B[32m 114\u001B[39m ):\n", + "\u001B[31mTypeError\u001B[39m: object of type 'bool' has no len()" + ] } ], "execution_count": 17 @@ -1587,181 +1513,13 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.910335600Z", - "start_time": "2026-01-30T14:21:17.821453400Z" - } - }, + "metadata": {}, "source": [ "links = (city_blocker | websites_blocker).block(df)\n", "msb.add_blocks_to_dataset(df, links)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n", - "Processing OverlapBlocker(['websites'], 1)\n" - ] - }, - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 6 Jean-Michel Python Douai 49 \n", - "3 10 Caroline Dufour Lens 45 \n", - "4 13 Benoît Benoît Lens 15 \n", - "5 2 Pierre Dusquesnes Phalempin 24 \n", - "6 5 pierre dusquesnes Phalempin 24 \n", - "7 3 Paul Delarue Roubaix 32 \n", - "8 8 Sophie Delarue Roubaix 33 \n", - "9 11 sophie_delarue Roubaix 33 \n", - "\n", - " websites _block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 
0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['lensfans.fr', 'pythonensamusant.fr'] 0 \n", - "3 ['pythonensamusant.fr', 'lensfans.fr'] 0 \n", - "4 ['lensfans.fr'] 0 \n", - "5 ['somewebsite.com/users/rpz59'] 1 \n", - "6 [] 1 \n", - "7 ['roubaixlove.fr'] 2 \n", - "8 [] 2 \n", - "9 [] 2 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
26Jean-Michel PythonDouai49['lensfans.fr', 'pythonensamusant.fr']0
310Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']0
413Benoît BenoîtLens15['lensfans.fr']0
52Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1
65pierre dusquesnesPhalempin24[]1
73Paul DelarueRoubaix32['roubaixlove.fr']2
88Sophie DelarueRoubaix33[]2
911sophie_delarueRoubaix33[]2
\n", - "
" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 18 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -1802,12 +1560,7 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.279899900Z", - "start_time": "2026-01-30T14:21:18.250988900Z" - } - }, + "metadata": {}, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", "age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n", @@ -1815,7 +1568,7 @@ "websites_blocker = msb.OverlapBlocker([\"websites\"])" ], "outputs": [], - "execution_count": 19 + "execution_count": null }, { "cell_type": "markdown", @@ -1826,17 +1579,12 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.481263300Z", - "start_time": "2026-01-30T14:21:18.466284300Z" - } - }, + "metadata": {}, "source": [ "final_blocker = (city_blocker & age_blocker) | (name_blocker & websites_blocker)" ], "outputs": [], - "execution_count": 20 + "execution_count": null }, { "cell_type": "markdown", @@ -1847,137 +1595,13 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.562779600Z", - "start_time": "2026-01-30T14:21:18.520368200Z" - } - }, + "metadata": {}, "source": [ "links = final_blocker.block(df)\n", "msb.add_blocks_to_dataset(df, links)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n", - "Processing MixedBlocker(['Name'], ['websites'], 1)\n" - ] - }, - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 2 Pierre Dusquesnes Phalempin 24 \n", - "3 5 pierre dusquesnes Phalempin 24 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 11 sophie_delarue Roubaix 33 \n", - "\n", - " websites _block \n", - "0 
['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/rpz59'] 1 \n", - "3 [] 1 \n", - "4 [] 2 \n", - "5 [] 2 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
22Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1
35pierre dusquesnesPhalempin24[]1
48Sophie DelarueRoubaix33[]2
511sophie_delarueRoubaix33[]2
\n", - "
" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 21 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -1988,26 +1612,13 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.843568700Z", - "start_time": "2026-01-30T14:21:18.686911500Z" - } - }, + "metadata": {}, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", "links = city_blocker.block(df)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" - ] - } - ], - "execution_count": 22 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2032,161 +1643,12 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.967168700Z", - "start_time": "2026-01-30T14:21:18.928864500Z" - } - }, + "metadata": {}, "source": [ "msb.add_blocks_to_dataset(df, links, sort=False)" ], - "outputs": [ - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 2 Pierre Dusquesnes Phalempin 24 \n", - "2 3 Paul Delarue Roubaix 32 \n", - "3 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "4 5 pierre dusquesnes Phalempin 24 \n", - "5 8 Sophie Delarue Roubaix 33 \n", - "6 10 Caroline Dufour Lens 45 \n", - "7 11 sophie_delarue Roubaix 33 \n", - "8 13 Benoît Benoît Lens 15 \n", - "\n", - " websites _block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['somewebsite.com/users/rpz59'] 1 \n", - "2 ['roubaixlove.fr'] 2 \n", - "3 ['jacquesdupond.fr'] 0 \n", - "4 [] 1 \n", - "5 [] 2 \n", - "6 ['pythonensamusant.fr', 'lensfans.fr'] 3 \n", - "7 [] 2 \n", - "8 ['lensfans.fr'] 3 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
12Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1
23Paul DelarueRoubaix32['roubaixlove.fr']2
34Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
45pierre dusquesnesPhalempin24[]1
58Sophie DelarueRoubaix33[]2
610Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']3
711sophie_delarueRoubaix33[]2
813Benoît BenoîtLens15['lensfans.fr']3
\n", - "
" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 23 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2211,216 +1673,12 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:19.276047300Z", - "start_time": "2026-01-30T14:21:19.146886900Z" - } - }, + "metadata": {}, "source": [ "msb.add_blocks_to_dataset(df, links, keep_ungrouped_rows=True)" ], - "outputs": [ - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 0 Jean d'Aux Lille 26 \n", - "1 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "2 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "3 2 Pierre Dusquesnes Phalempin 24 \n", - "4 5 pierre dusquesnes Phalempin 24 \n", - "5 3 Paul Delarue Roubaix 32 \n", - "6 8 Sophie Delarue Roubaix 33 \n", - "7 11 sophie_delarue Roubaix 33 \n", - "8 6 Jean-Michel Python Douai 49 \n", - "9 7 Gédéon Glincarné Paris 53 \n", - "10 9 Jeanne Verbrugge Valenciennes 41 \n", - "11 10 Caroline Dufour Lens 45 \n", - "12 13 Benoît Benoît Lens 15 \n", - "13 12 Marcel Vandermersch Fourmies 48 \n", - "\n", - " websites _block \n", - "0 ['jeandaux.fr', 'lillefans.fr'] 0 \n", - "1 ['somewebsite.com/users/jacquesdupond', 'jacqu... 1 \n", - "2 ['jacquesdupond.fr'] 1 \n", - "3 ['somewebsite.com/users/rpz59'] 2 \n", - "4 [] 2 \n", - "5 ['roubaixlove.fr'] 3 \n", - "6 [] 3 \n", - "7 [] 3 \n", - "8 ['lensfans.fr', 'pythonensamusant.fr'] 4 \n", - "9 ['lorem.fr'] 5 \n", - "10 ['somewebsite.com/users/jajanne59'] 6 \n", - "11 ['pythonensamusant.fr', 'lensfans.fr'] 7 \n", - "12 ['lensfans.fr'] 7 \n", - "13 ['lesrecettesdemarcel.fr'] 8 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
00Jean d'AuxLille26['jeandaux.fr', 'lillefans.fr']0
11Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...1
24Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']1
32Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']2
45pierre dusquesnesPhalempin24[]2
53Paul DelarueRoubaix32['roubaixlove.fr']3
68Sophie DelarueRoubaix33[]3
711sophie_delarueRoubaix33[]3
86Jean-Michel PythonDouai49['lensfans.fr', 'pythonensamusant.fr']4
97Gédéon GlincarnéParis53['lorem.fr']5
109Jeanne VerbruggeValenciennes41['somewebsite.com/users/jajanne59']6
1110Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']7
1213Benoît BenoîtLens15['lensfans.fr']7
1312Marcel VandermerschFourmies48['lesrecettesdemarcel.fr']8
\n", - "
" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 24 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2441,12 +1699,7 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:19.820247800Z", - "start_time": "2026-01-30T14:21:19.653280100Z" - } - }, + "metadata": {}, "source": [ "city_blocker_not_different_age = msb.AttributeEquivalenceBlocker(\n", " [\"City\"], must_not_be_different=[\"Age\"]\n", @@ -2454,126 +1707,8 @@ "links = city_blocker_not_different_age.block(df)\n", "msb.add_blocks_to_dataset(df, links)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City'], ['Age'])\n" - ] - }, - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 2 Pierre Dusquesnes Phalempin 24 \n", - "3 5 pierre dusquesnes Phalempin 24 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 11 sophie_delarue Roubaix 33 \n", - "\n", - " websites _block \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 \n", - "1 ['jacquesdupond.fr'] 0 \n", - "2 ['somewebsite.com/users/rpz59'] 1 \n", - "3 [] 1 \n", - "4 [] 2 \n", - "5 [] 2 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0
22Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1
35pierre dusquesnesPhalempin24[]1
48Sophie DelarueRoubaix33[]2
511sophie_delarueRoubaix33[]2
\n", - "
" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 25 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -2591,49 +1726,19 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.335572Z", - "start_time": "2026-01-30T14:21:20.302358700Z" - } - }, + "metadata": {}, "source": [ "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", "links = city_blocker.block(df, motives=True)\n", "links" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" - ] - }, - { - "data": { - "text/plain": [ - "{frozenset({1, 4}): {\"Same 'City'\"},\n", - " frozenset({8, 11}): {\"Same 'City'\"},\n", - " frozenset({2, 5}): {\"Same 'City'\"},\n", - " frozenset({10, 13}): {\"Same 'City'\"},\n", - " frozenset({3, 8}): {\"Same 'City'\"},\n", - " frozenset({3, 11}): {\"Same 'City'\"}}" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 26 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "Of course, this will induce some overhead." - ] + "source": "This will induce some overhead." 
}, { "cell_type": "markdown", @@ -2644,892 +1749,80 @@ }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.409405100Z", - "start_time": "2026-01-30T14:21:20.374573700Z" - } - }, + "metadata": {}, "source": [ "msb.add_blocks_to_dataset(df, links, motives=True)" ], - "outputs": [ - { - "data": { - "text/plain": [ - " id Name City Age \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 4 Jacques Dupont Villeneuve d'Ascq 37 \n", - "2 2 Pierre Dusquesnes Phalempin 24 \n", - "3 5 pierre dusquesnes Phalempin 24 \n", - "4 3 Paul Delarue Roubaix 32 \n", - "5 8 Sophie Delarue Roubaix 33 \n", - "6 11 sophie_delarue Roubaix 33 \n", - "7 10 Caroline Dufour Lens 45 \n", - "8 13 Benoît Benoît Lens 15 \n", - "\n", - " websites _block _motive \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 (Same 'City') \n", - "1 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "2 ['somewebsite.com/users/rpz59'] 1 (Same 'City') \n", - "3 [] 1 (Same 'City') \n", - "4 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 [] 2 (Same 'City') \n", - "6 [] 2 (Same 'City') \n", - "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 (Same 'City') \n", - "8 ['lensfans.fr'] 3 (Same 'City') " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idNameCityAgewebsites_block_motive
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...0(Same 'City')
14Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0(Same 'City')
22Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']1(Same 'City')
35pierre dusquesnesPhalempin24[]1(Same 'City')
43Paul DelarueRoubaix32['roubaixlove.fr']2(Same 'City')
58Sophie DelarueRoubaix33[]2(Same 'City')
611sophie_delarueRoubaix33[]2(Same 'City')
710Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']3(Same 'City')
813Benoît BenoîtLens15['lensfans.fr']3(Same 'City')
\n", - "
" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "... though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..." + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "... which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "msb.add_blocks_to_dataset(df, links, motives=True, show_as_pairs=True)" ], - "execution_count": 27 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", "metadata": {}, "source": [ - "... Though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..." + "If our dataset had many columns, the above output would be too large to easily be read, so we added the `output_columns` option:" ] }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "msb.add_blocks_to_dataset(\n", + " df, links, motives=True, show_as_pairs=True, output_columns=[\"id\", \"Name\"]\n", + ")" + ], + "outputs": [], + "execution_count": null + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "... 
Which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:" + "Motives are dynamic:" ] }, { "cell_type": "code", "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.612990700Z", - "start_time": "2026-01-30T14:21:20.483928200Z" - } + "scrolled": true }, "source": [ - "msb.add_blocks_to_dataset(df, links, motives=True, show_as_pairs=True)" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " id_l Name_l City_l Age_l \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 2 Pierre Dusquesnes Phalempin 24 \n", - "2 3 Paul Delarue Roubaix 32 \n", - "3 8 Sophie Delarue Roubaix 33 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 10 Caroline Dufour Lens 45 \n", - "\n", - " websites_l id_r Name_r \\\n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 4 Jacques Dupont \n", - "1 ['somewebsite.com/users/rpz59'] 5 pierre dusquesnes \n", - "2 ['roubaixlove.fr'] 11 sophie_delarue \n", - "3 [] 11 sophie_delarue \n", - "4 [] 3 Paul Delarue \n", - "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", - "\n", - " City_r Age_r websites_r _block _motive \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "1 Phalempin 24 [] 1 (Same 'City') \n", - "2 Roubaix 33 [] 2 (Same 'City') \n", - "3 Roubaix 33 [] 2 (Same 'City') \n", - "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lCity_lAge_lwebsites_lid_rName_rCity_rAge_rwebsites_r_block_motive
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...4Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0(Same 'City')
12Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']5pierre dusquesnesPhalempin24[]1(Same 'City')
23Paul DelarueRoubaix32['roubaixlove.fr']11sophie_delarueRoubaix33[]2(Same 'City')
38Sophie DelarueRoubaix33[]11sophie_delarueRoubaix33[]2(Same 'City')
48Sophie DelarueRoubaix33[]3Paul DelarueRoubaix32['roubaixlove.fr']2(Same 'City')
510Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']13Benoît BenoîtLens15['lensfans.fr']3(Same 'City')
\n", - "
" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 28 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If our dataset had many columns, the above output would be too large to easily be read, so we added the `output_columns` option:" - ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.944670700Z", - "start_time": "2026-01-30T14:21:20.834495500Z" - } - }, - "source": [ - "msb.add_blocks_to_dataset(\n", - " df, links, motives=True, show_as_pairs=True, output_columns=[\"id\", \"Name\"]\n", - ")" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " id_l Name_l id_r Name_r _block _motive\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 (Same 'City')\n", - "1 2 Pierre Dusquesnes 5 pierre dusquesnes 1 (Same 'City')\n", - "2 3 Paul Delarue 11 sophie_delarue 2 (Same 'City')\n", - "3 8 Sophie Delarue 11 sophie_delarue 2 (Same 'City')\n", - "4 8 Sophie Delarue 3 Paul Delarue 2 (Same 'City')\n", - "5 10 Caroline Dufour 13 Benoît Benoît 3 (Same 'City')" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lid_rName_r_block_motive
01Jacques Dupond4Jacques Dupont0(Same 'City')
12Pierre Dusquesnes5pierre dusquesnes1(Same 'City')
23Paul Delarue11sophie_delarue2(Same 'City')
38Sophie Delarue11sophie_delarue2(Same 'City')
48Sophie Delarue3Paul Delarue2(Same 'City')
510Caroline Dufour13Benoît Benoît3(Same 'City')
\n", - "
" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 29 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Motives are dynamic:" - ] - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:21.591044600Z", - "start_time": "2026-01-30T14:21:21.517777200Z" - } - }, - "source": [ - "msb.generate_blocking_report(df, links)" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " id_l Name_l City_l Age_l \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 2 Pierre Dusquesnes Phalempin 24 \n", - "2 3 Paul Delarue Roubaix 32 \n", - "3 8 Sophie Delarue Roubaix 33 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 10 Caroline Dufour Lens 45 \n", - "\n", - " websites_l id_r Name_r \\\n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 4 Jacques Dupont \n", - "1 ['somewebsite.com/users/rpz59'] 5 pierre dusquesnes \n", - "2 ['roubaixlove.fr'] 11 sophie_delarue \n", - "3 [] 11 sophie_delarue \n", - "4 [] 3 Paul Delarue \n", - "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", - "\n", - " City_r Age_r websites_r _block _motive \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "1 Phalempin 24 [] 1 (Same 'City') \n", - "2 Roubaix 33 [] 2 (Same 'City') \n", - "3 Roubaix 33 [] 2 (Same 'City') \n", - "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lCity_lAge_lwebsites_lid_rName_rCity_rAge_rwebsites_r_block_motive
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...4Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0(Same 'City')
12Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']5pierre dusquesnesPhalempin24[]1(Same 'City')
23Paul DelarueRoubaix32['roubaixlove.fr']11sophie_delarueRoubaix33[]2(Same 'City')
38Sophie DelarueRoubaix33[]11sophie_delarueRoubaix33[]2(Same 'City')
48Sophie DelarueRoubaix33[]3Paul DelarueRoubaix32['roubaixlove.fr']2(Same 'City')
510Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']13Benoît BenoîtLens15['lensfans.fr']3(Same 'City')
\n", - "
" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 30 - }, - { - "cell_type": "code", - "metadata": { - "scrolled": true, - "ExecuteTime": { - "end_time": "2026-01-30T14:21:21.867809800Z", - "start_time": "2026-01-30T14:21:21.674986800Z" - } - }, - "source": [ - "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", - "age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n", - "websites_blocker = msb.OverlapBlocker([\"websites\"])\n", - "final_blocker = (city_blocker & age_blocker) | websites_blocker\n", - "links = final_blocker.block(df, motives=True)\n", - "report = msb.add_blocks_to_dataset(\n", - " df,\n", - " links,\n", - " motives=True,\n", - " show_as_pairs=True,\n", - " output_columns=[\"id\", \"Name\"],\n", - " merge_blocks=False,\n", - ")\n", - "report" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n", - "Processing OverlapBlocker(['websites'], 1)\n" - ] - }, - { - "data": { - "text/plain": [ - " id_l Name_l id_r Name_r _block \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", - "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", - "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", - "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", - "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", - "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", - "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", - "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", - "9 8 Sophie Delarue 11 sophie_delarue 3 \n", - "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", - "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", - "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", - "\n", - " _motive \n", - "0 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "1 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "2 (>=1 overlap in 'websites', Same 'City', Same ... 
\n", - "3 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "4 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "5 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "6 (>=1 overlap in 'websites') \n", - "7 (>=1 overlap in 'websites') \n", - "8 (Same 'City', Same 'Age') \n", - "9 (Same 'City', Same 'Age') \n", - "10 (>=1 overlap in 'websites') \n", - "11 (>=1 overlap in 'websites') \n", - "12 (>=1 overlap in 'websites') " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lid_rName_r_block_motive
01Jacques Dupond4Jacques Dupont0(>=1 overlap in 'websites', Same 'City', Same ...
11Jacques Dupond6Jean-Michel Python0(>=1 overlap in 'websites', Same 'City', Same ...
21Jacques Dupond10Caroline Dufour0(>=1 overlap in 'websites', Same 'City', Same ...
31Jacques Dupond4Jacques Dupont1(>=1 overlap in 'websites', Same 'City', Same ...
41Jacques Dupond6Jean-Michel Python1(>=1 overlap in 'websites', Same 'City', Same ...
51Jacques Dupond10Caroline Dufour1(>=1 overlap in 'websites', Same 'City', Same ...
610Caroline Dufour6Jean-Michel Python1(>=1 overlap in 'websites')
710Caroline Dufour13Benoît Benoît1(>=1 overlap in 'websites')
82Pierre Dusquesnes5pierre dusquesnes2(Same 'City', Same 'Age')
98Sophie Delarue11sophie_delarue3(Same 'City', Same 'Age')
1010Caroline Dufour6Jean-Michel Python4(>=1 overlap in 'websites')
1110Caroline Dufour13Benoît Benoît4(>=1 overlap in 'websites')
1213Benoît Benoît6Jean-Michel Python4(>=1 overlap in 'websites')
\n", - "
" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } + "city_blocker = msb.AttributeEquivalenceBlocker([\"City\"])\n", + "age_blocker = msb.AttributeEquivalenceBlocker([\"Age\"])\n", + "websites_blocker = msb.OverlapBlocker([\"websites\"])\n", + "final_blocker = (city_blocker & age_blocker) | websites_blocker\n", + "links = final_blocker.block(df, motives=True)\n", + "report = msb.add_blocks_to_dataset(\n", + " df,\n", + " links,\n", + " motives=True,\n", + " show_as_pairs=True,\n", + " output_columns=[\"id\", \"Name\"],\n", + " merge_blocks=False,\n", + ")\n", + "report" ], - "execution_count": 31 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -3539,224 +1832,70 @@ { "cell_type": "markdown", "metadata": {}, - "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `scoring` gives you an indicator of the likelihood of rows behing duplicates based on the number of motives." + "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `score=True` gives you an indicator of the likelihood of rows behing duplicates based on the number of distinct motives." 
}, { "cell_type": "code", + "metadata": {}, + "source": [ + "report = msb.add_blocks_to_dataset(\n", + " df,\n", + " links,\n", + " motives=True,\n", + " show_as_pairs=True,\n", + " output_columns=[\"id\", \"Name\"],\n", + " merge_blocks=False,\n", + " score=True,\n", + ")\n", + "report.sort_values(\"_score\", ascending=False)" + ], + "outputs": [], + "execution_count": null + }, + { "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:22.186415700Z", - "start_time": "2026-01-30T14:21:22.127304600Z" + "end_time": "2026-02-03T16:43:50.398834500Z", + "start_time": "2026-02-03T16:43:50.048297Z" } }, + "cell_type": "code", "source": [ - "report[\"score\"] = msb.scoring(report)\n", - "report.sort_values(\"score\", ascending=False)" + "city_blocker = msb.OverlapBlocker([\"City\"])\n", + "city_blocker.block(df)" ], "outputs": [ { - "data": { - "text/plain": [ - " id_l Name_l id_r Name_r _block \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", - "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", - "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", - "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", - "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", - "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", - "9 8 Sophie Delarue 11 sophie_delarue 3 \n", - "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", - "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", - "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", - "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", - "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", - "\n", - " _motive score \n", - "0 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "1 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "2 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "3 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "4 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "5 (>=1 overlap in 'websites', Same 'City', Same ... 
3 \n", - "8 (Same 'City', Same 'Age') 2 \n", - "9 (Same 'City', Same 'Age') 2 \n", - "6 (>=1 overlap in 'websites') 1 \n", - "7 (>=1 overlap in 'websites') 1 \n", - "10 (>=1 overlap in 'websites') 1 \n", - "11 (>=1 overlap in 'websites') 1 \n", - "12 (>=1 overlap in 'websites') 1 " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lid_rName_r_block_motivescore
01Jacques Dupond4Jacques Dupont0(>=1 overlap in 'websites', Same 'City', Same ...3
11Jacques Dupond6Jean-Michel Python0(>=1 overlap in 'websites', Same 'City', Same ...3
21Jacques Dupond10Caroline Dufour0(>=1 overlap in 'websites', Same 'City', Same ...3
31Jacques Dupond4Jacques Dupont1(>=1 overlap in 'websites', Same 'City', Same ...3
41Jacques Dupond6Jean-Michel Python1(>=1 overlap in 'websites', Same 'City', Same ...3
51Jacques Dupond10Caroline Dufour1(>=1 overlap in 'websites', Same 'City', Same ...3
82Pierre Dusquesnes5pierre dusquesnes2(Same 'City', Same 'Age')2
98Sophie Delarue11sophie_delarue3(Same 'City', Same 'Age')2
610Caroline Dufour6Jean-Michel Python1(>=1 overlap in 'websites')1
710Caroline Dufour13Benoît Benoît1(>=1 overlap in 'websites')1
1010Caroline Dufour6Jean-Michel Python4(>=1 overlap in 'websites')1
1110Caroline Dufour13Benoît Benoît4(>=1 overlap in 'websites')1
1213Benoît Benoît6Jean-Michel Python4(>=1 overlap in 'websites')1
\n", - "
" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing OverlapBlocker(['City'], 1)\n" + ] + }, + { + "ename": "SyntaxError", + "evalue": "unterminated string literal (detected at line 1) (, line 1)", + "output_type": "error", + "traceback": [ + "Traceback \u001B[36m(most recent call last)\u001B[39m:\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\IPython\\core\\interactiveshell.py:3701\u001B[39m in \u001B[95mrun_code\u001B[39m\n exec(code_obj, self.user_global_ns, self.user_ns)\n", + " Cell \u001B[92mIn[19]\u001B[39m\u001B[92m, line 2\u001B[39m\n city_blocker.block(df)\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\ms_blocking.py:250\u001B[39m in \u001B[95mblock\u001B[39m\n temp_data[col] = temp_data[col].apply(\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\series.py:4943\u001B[39m in \u001B[95mapply\u001B[39m\n ).apply()\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1422\u001B[39m in \u001B[95mapply\u001B[39m\n return self.apply_standard()\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1502\u001B[39m in \u001B[95mapply_standard\u001B[39m\n mapped = obj._map_values(\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\base.py:925\u001B[39m in \u001B[95m_map_values\u001B[39m\n return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001B[39m in \u001B[95mmap_array\u001B[39m\n return lib.map_infer(values, mapper, convert=convert)\n", + " File \u001B[92mpandas/_libs/lib.pyx:2999\u001B[39m in \u001B[95mpandas._libs.lib.map_infer\u001B[39m\n", + " File 
\u001B[92m~\\PycharmProjects\\MSBlock\\.venv\\Lib\\site-packages\\pandas\\core\\apply.py:1491\u001B[39m in \u001B[95mcurried\u001B[39m\n return func(x, *self.args, **self.kwargs)\n", + " File \u001B[92m~\\PycharmProjects\\MSBlock\\ms_blocking\\src\\ms_blocking\\utils.py:374\u001B[39m in \u001B[95mparse_list\u001B[39m\n s = str(s).strip()\n", + " File \u001B[92m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\ast.py:66\u001B[39m in \u001B[95mliteral_eval\u001B[39m\n node_or_string = parse(node_or_string.lstrip(\" \\t\"), mode='eval')\n", + "\u001B[36m \u001B[39m\u001B[36mFile \u001B[39m\u001B[32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\ast.py:52\u001B[39m\u001B[36m in \u001B[39m\u001B[35mparse\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mreturn compile(source, filename, mode, flags,\u001B[39m\n", + " \u001B[36mFile \u001B[39m\u001B[32m:1\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mVilleneuve d'Ascq\u001B[39m\n ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m unterminated string literal (detected at line 1)\n" + ] } ], - "execution_count": 32 + "execution_count": 19 } ], "metadata": { diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index fffbcc8..b3552af 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -1,5 +1,7 @@ from ms_blocking.utils import * # noqa: F403 +import networkx as nx + class BlockerNode: """Abstract class from which derive all classes in the module""" @@ -46,7 +48,7 @@ def __init__(self, left, right): def __repr__(self): return f"AndNode{{{self.left}, {self.right}}}" - def block(self, df, motives=False): + def block(self, df: pd.DataFrame, motives: bool = False) -> Coords: # In order not to perform redundant computations, we first filter out the rows that were not considered by the first blocker before running the second blocker coords_left = self.left.block(df, motives=motives) @@ -76,8 +78,7 @@ def __init__(self, left, right): def __repr__(self): return 
f"OrNode{{{self.left}, {self.right}}}" - - def block(self, df, motives=False): + def block(self, df: pd.DataFrame, motives: bool = False) -> Coords: # Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations coords_left = self.left.block(df, motives=motives) @@ -91,7 +92,10 @@ class AttributeEquivalenceBlocker(BlockerNode): # Leaf """To regroup rows based on equality across columns.""" def __init__( - self, blocking_columns, normalize_strings=True, must_not_be_different=None + self, + blocking_columns: str | Collection[str], + must_not_be_different: str | Collection[str] = None, + normalize_strings: bool = True, ): super().__init__() @@ -120,7 +124,7 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"AttributeEquivalenceBlocker({self.blocking_columns}, {self.must_not_be_different})" + return f"AttributeEquivalenceBlocker({self.blocking_columns}{', ' + str(self.must_not_be_different) if self.must_not_be_different else ''}{', NON-NORMALIZED' if not self.normalize else ''})" def __eq__(self, other): if type(other) is AttributeEquivalenceBlocker: @@ -139,21 +143,28 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on equality of one or more columns""" print("Processing", self) - temp_data = data.copy() - - for col in self.blocking_columns: - if self.normalize: - temp_data[col] = temp_data[col].apply(normalize) - temp_data = temp_data.dropna(subset=self.blocking_columns) - temp_data = remove_rows_if_value_appears_only_once( - temp_data, self.blocking_columns + temp_data = ( + data[self.blocking_columns + self.must_not_be_different] + .dropna(subset=self.blocking_columns) + .copy() ) + # Normalize strings if required + 
if self.normalize: + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(normalize) + ) + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[ + temp_data.duplicated(keep=False, subset=self.blocking_columns) + ] + + # No need to run anything else if we already ran out of candidates if len(temp_data) == 0: # No pairs if motives: return dict() @@ -185,9 +196,7 @@ def block(self, data, motives=False): } if motives: - explanations = { - f"Same '{column_name}'" for column_name in self.blocking_columns - } + explanations = [EquivalenceMotive(col) for col in self.blocking_columns] return add_motives_to_coords(coords, explanations) else: return set(coords) # set is unnnecessary @@ -197,7 +206,11 @@ class OverlapBlocker(BlockerNode): # Leaf """To regroup rows based on overlap of one or more columns.""" def __init__( - self, blocking_columns, overlap=1, word_level=False, normalize_strings=True + self, + blocking_columns: str | Collection[str], + overlap: int = 1, + word_level: bool = False, + normalize_strings: bool = True, ): super().__init__() @@ -217,7 +230,7 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"OverlapBlocker({self.blocking_columns}, {self.overlap})" + return f"OverlapBlocker({self.blocking_columns}, {self.overlap}{', WORD-LEVEL' if self.word_level else ''}{', NON-NORMALIZED' if not self.normalize else ''})" def __eq__(self, other): if type(other) is OverlapBlocker: @@ -238,29 +251,31 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on overlap of one or more columns""" print("Processing", self) - temp_data = data.copy() + temp_data = data[self.blocking_columns].dropna().copy() - temp_data = 
temp_data[self.blocking_columns].copy() - - for col in self.blocking_columns: - temp_data[col] = temp_data[col].apply( - parse_list, word_level=self.word_level - ) - temp_data = temp_data.explode(col) - if self.normalize: - temp_data[col] = temp_data[col].apply(normalize) - temp_data = temp_data.dropna( - subset=self.blocking_columns - ) # Remove empty objects - temp_data = remove_rows_if_value_appears_only_once( - temp_data, self.blocking_columns + # Ensure we check for overlap between lists of strings + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(parse_list, word_level=self.word_level) ) + # Split elements of said lists to compare them one by one + temp_data = temp_data.explode(self.blocking_columns) + # Normalize strings if required + if self.normalize: + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(normalize) + ) + + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[ + temp_data.duplicated(keep=False, subset=self.blocking_columns) + ] + # No need to run anything else if we already ran out of candidates if len(temp_data) == 0: # No pairs fulfill any overlap if motives: return dict() @@ -268,7 +283,7 @@ def block(self, data, motives=False): return set() # Use the DataFrame index for grouping and forming pairs - # Using frozenset since they are ahshable and thus can be used as dictionary keys + # Using frozenset since they are hashable and thus can be used as dictionary keys groups = temp_data.groupby(self.blocking_columns).apply( lambda x: frozenset(x.index), include_groups=False ) @@ -276,10 +291,10 @@ def block(self, data, motives=False): coords = block_overlap(groups=groups, overlap=self.overlap) if motives: - explanations = { - f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'" - for column_name in self.blocking_columns - } + explanations = [ + OverlapMotive(col, 
self.overlap, self.word_level) + for col in self.blocking_columns + ] return add_motives_to_coords(coords, explanations) else: return set(coords) @@ -287,17 +302,17 @@ def block(self, data, motives=False): class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM """Represent the intersection of an AttributeEquivalenceBlocker and an OverlapBlocker. - Designed for performance and RAM efficiency. + Used for performance and RAM efficiency. """ def __init__( self, - equivalence_columns, - overlap_columns, - must_not_be_different=None, - overlap=1, - word_level=False, - normalize_strings=True, + equivalence_columns: str | Collection[str], + overlap_columns: str | Collection[str], + must_not_be_different: str | Collection[str] = None, + overlap: int = 1, + word_level: bool = False, + normalize_strings: bool = True, ): super().__init__() @@ -341,7 +356,16 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"MixedBlocker({self.equivalence_columns}, {self.overlap_columns}, {self.overlap})" + return str( + AndNode( + AttributeEquivalenceBlocker( + self.equivalence_columns, self.must_not_be_different, self.normalize + ), + OverlapBlocker( + self.overlap_columns, self.overlap, self.word_level, self.normalize + ), + ) + ) def __eq__(self, other): if type(other) is AttributeEquivalenceBlocker: @@ -369,31 +393,30 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on overlap of one or more columns""" print("Processing", self) total_columns = self.equivalence_columns + self.overlap_columns - temp_data = data[total_columns].copy() - - for col in total_columns: - if col in self.equivalence_columns: - temp_data[col] = temp_data[col].apply(normalize) - elif col in self.overlap_columns: - temp_data[col] = temp_data[col].apply( - lambda 
x: [ - normalize(item) for item in parse_list(x, self.word_level) - ] - if self.normalize - else parse_list(x, self.word_level) - ) - temp_data = temp_data.explode(col) + temp_data = data[total_columns].dropna().copy() - temp_data = temp_data.dropna(subset=total_columns) # Remove empty objects - temp_data = remove_rows_if_value_appears_only_once(temp_data, total_columns) + # Ensure we check for overlap between lists of strings + temp_data[self.overlap_columns] = temp_data[self.overlap_columns].apply( + lambda col: col.apply(parse_list, word_level=self.word_level) + ) + # Split elements of said lists to compare them one by one + temp_data = temp_data.explode(self.overlap_columns) + # Normalize strings if required + if self.normalize: + temp_data[total_columns] = temp_data[total_columns].apply( + lambda col: col.apply(normalize) + ) + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[temp_data.duplicated(keep=False, subset=total_columns)] + # No need to run anything else if we already ran out of candidates if len(temp_data) == 0: # No pairs fulfill any overlap if motives: return dict() @@ -426,17 +449,261 @@ def block(self, data, motives=False): coords = coords_equivalence.intersection(coords_overlap) if motives: - explanations = { - f"Same '{column_name}'" for column_name in self.equivalence_columns - } | { - f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'" - for column_name in self.overlap_columns - } + explanations = [ + EquivalenceMotive(col) for col in self.equivalence_columns + ] + [ + OverlapMotive(col, self.overlap, self.word_level) + for col in self.overlap_columns + ] + return add_motives_to_coords(coords, explanations) else: return set(coords) +def add_blocks_to_dataset( + data: pd.DataFrame, + coords: Coords, + sort: bool = True, + keep_ungrouped_rows: bool = False, + merge_blocks: bool = True, + motives: bool = False, + show_as_pairs: bool = False, + output_columns: Columns 
= None, + score: bool = False, +) -> pd.DataFrame: + """Adds block identifiers to a DataFrame based on blocked coordinates + + Takes a DataFrame and blocked coordinates, with or without motives, and returns the DataFrame with block labels + + Parameters + ---------- + data : DataFrame + DataFrame for blocking + coords : Array + Blocked coordinates + sort : bool + Whether to sort the result by block, thereby regrouping rows of the same block + keep_ungrouped_rows : bool + Whether to display rows that do not belong to any block + merge_blocks : bool + Whether to transitively merge blocks + motives : bool + Whether to display the reason behind each block + show_as_pairs : bool + Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame + output_columns : list + Columns to show. Useful in combination with show_as_pairs as column names are altered + score : bool + Whether to show a score (computed from the number of motives) + + Returns + ------- + DataFrame + Blocked DataFrame + + Examples + -------- + >>> add_blocks_to_dataset(data=pd.DataFrame( + [ + [0, 'first', 4], + [1, 'second', 6], + [2, 'first', 2], + [3, 'third', 5] + ], + columns=['id', 'rank', 'score']), + coords=np.array([{0, 2}]), + show_as_pairs=True, + output_columns=['id', 'rank']) + id_l rank_l id_r rank_r block + 0 0 first 2 first 0 + """ + + if show_as_pairs and keep_ungrouped_rows: + raise ValueError("Cannot both return pairs and keep ungrouped rows") + + if motives: + if type(coords) is not dict: + raise TypeError("Cannot specify 'motives=True' without passing motives") + + # Ensure the index is a unique identifier + if not data.index.is_unique: + raise ValueError("DataFrame index must be unique to be used as an identifier.") + + if score and not motives: + raise ValueError("Cannot specify 'score=True' without passing motives") + + if "_motive" in data.columns: + if motives: + raise ValueError( + "Please rename existing '_motive' column OR do not pass 'motives=True'" + ) + + if "score" in
data.columns: + if score: + raise ValueError( + "Please rename existing '_score' column OR do not pass 'score=True'" + ) + + if "_block" in data.columns: + raise ValueError("Please rename existing '_block' column") + + if output_columns is None: + output_columns = data.columns + + data = data[output_columns].copy() + + if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph + if show_as_pairs: + columns = [col + "_l" for col in data.columns] + [ + col + "_r" for col in data.columns + ] + output_data = pd.DataFrame(columns=columns) + else: + output_data = pd.DataFrame(columns=data.columns) + + if motives: + output_data["_motive"] = "" + if score: + output_data["_score"] = 0 + output_data["_block"] = -1 + + else: + output_data = data + # Map coords to connected component labels + if merge_blocks: # We solve the connected components problem + cc_labels = solve_connected_components_from_coords(coords) + # Match original index to new block ID + matcher = { + idx: label + for idx, label in enumerate(cc_labels) + if label != -1 and idx in data.index + } + else: # We solve the cliques problem + g = nx.Graph() + # noinspection PyTypeChecker + g.add_edges_from(coords) + complete_subgraphs = list(nx.find_cliques(g)) + complete_subgraphs = sorted(complete_subgraphs) + # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))} + matcher = dict() + for i, clique in enumerate(complete_subgraphs): + for node_idx in clique: + if node_idx in matcher.keys(): + matcher[node_idx].append(i) + else: + matcher[node_idx] = [i] + + if show_as_pairs: + output_data = pd.DataFrame() + for pair in coords: + left_row = data.loc[[tuple(pair)[0]]].copy() + current_index = left_row.index + right_row = data.loc[[tuple(pair)[1]]].copy() + left_row.columns = [col + "_l" for col in left_row.columns] + right_row.columns = [col + "_r" for col in right_row.columns] + current_row = pd.concat( + 
[left_row.reset_index(drop=True), right_row.reset_index(drop=True)], + axis=1, + ) + current_row.index = current_index + if motives: + motives_solved = solve_motives(coords[pair]) + current_row["_motive"] = [list(map(str, motives_solved))] + if score: + current_row["_score"] = len( + motives_solved + ) # Score is simply the number of non-redundant motives + output_data = pd.concat([output_data, current_row]) + + # Assign blocks to rows based on their original index + output_data["_block"] = output_data.index.map(matcher) + if not merge_blocks: + output_data = output_data.explode("_block") + + if keep_ungrouped_rows: + output_data["_block"] = output_data["_block"].fillna(-1) + matcher_ungrouped_rows = {} + block_temp = [] + i = 0 # Track # of blocks processed + for b in output_data["_block"]: + if b == -1: + block_temp.append(i) + i += 1 + elif b not in matcher_ungrouped_rows: + matcher_ungrouped_rows[b] = i + block_temp.append(i) + i += 1 + else: + block_temp.append(matcher_ungrouped_rows[b]) + output_data["_block"] = block_temp + else: + if not show_as_pairs: + output_data = output_data[ + output_data["_block"].duplicated(keep=False) + & output_data["_block"].notna() + ] + + output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"]) + + if sort: + # Sort by block, then by original index + sort_cols = ["_block"] + if output_data.index.name: + output_data = output_data.sort_values( + sort_cols + [output_data.index.name] + ) + else: + # If no named index, use the first column of the DataFrame + output_data = output_data.reset_index() + output_data = output_data.sort_values( + sort_cols + [output_data.columns[0]] + ) + output_data = output_data.set_index(output_data.columns[0]) + + if not show_as_pairs and motives: + id_list = flatten(coords.keys()) + motive_matcher = { + row_id: list(map(str, solve_motives(coords[pair]))) + for pair in coords.keys() + for row_id in id_list + if row_id in pair + } + # noinspection PyTypeChecker + output_data["_motive"] = 
output_data.index.map(motive_matcher) + if score: + output_data["_score"] = 0 + score_matcher = { # Horribly repetitive + row_id: len(solve_motives(coords[pair])) + for pair in coords.keys() + for row_id in id_list + if row_id in pair + } + output_data["_score"] = output_data.index.map(score_matcher) + + output_data = output_data.reset_index(drop=True) + output_data["_block"] = output_data["_block"].astype(int) + + return output_data + + +def generate_blocking_report( + data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None +) -> pd.DataFrame: + """ + Shorthand for add_blocks_to_dataset with below arguments + """ + return add_blocks_to_dataset( + data, + coords, + sort=True, + merge_blocks=False, + motives=True, + show_as_pairs=True, + output_columns=output_columns, + ) + + def merge_blockers( left: BlockerNode, right: BlockerNode ) -> AttributeEquivalenceBlocker | OverlapBlocker | MixedBlocker | AndNode: @@ -592,3 +859,6 @@ def merge_blockers( ) else: return AndNode(left, right) + + +# TODO: deport logic in a way that enables .progress_apply diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 837645f..b644a43 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -4,53 +4,72 @@ from scipy.sparse import coo_matrix from scipy.sparse.csgraph import connected_components import pandas as pd -import networkx as nx import random from collections import Counter from itertools import combinations from typing import List, Set, Iterable, Dict, Collection, Any + +class EquivalenceMotive: + def __init__(self, blocking_column: str): + if not isinstance(blocking_column, str): + raise TypeError("blocking_column for Motive must be a string") + self.blocking_column = blocking_column + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, EquivalenceMotive | OverlapMotive): + raise TypeError("Can only compare Motives") + return self.blocking_column == other.blocking_column + + def __str__(self): + return f"Same 
'{self.blocking_column}'" + + def __repr__(self): + return f"EquivalenceMotive(['{self.blocking_column}'])" + + +class OverlapMotive: + def __init__( + self, blocking_column: str, overlap: int = 1, word_level: bool = False + ): + if not isinstance(blocking_column, str): + raise TypeError("blocking_column for Motive must be a string") + if not isinstance(overlap, int): + raise TypeError("overlap must be an int") + if not isinstance(word_level, bool): + raise TypeError("word_level must be a boolean") + self.blocking_column = blocking_column + self.overlap = overlap + self.word_level = word_level + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, EquivalenceMotive | OverlapMotive): + raise TypeError("Can only compare Motives") + return ( + self.blocking_column == other.blocking_column + and self.overlap == other.overlap + and self.word_level == other.word_level + ) + + def __str__(self): + return f">={self.overlap}{' word-level' if self.word_level else ''} overlap in '{self.blocking_column}'" + + def __repr__(self): + return f"OverlapMotive(['{self.blocking_column}'], {self.overlap}{', word_level=True' if self.word_level else ''})" + + Columns = List[str] Pair = Collection[int] +Motive = EquivalenceMotive | OverlapMotive CoordsBasic = Set[Pair] -CoordsMotives = Dict[Pair, Set[str]] +CoordsMotives = Dict[Pair, List[Motive]] Coords = CoordsBasic | CoordsMotives _PUNCT_RE = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\\\]^_`{|}~]') _SPACE_RE = re.compile(r"\s+") -def remove_rows_if_value_appears_only_once( - data: pd.DataFrame, cols: Columns -) -> pd.DataFrame: - """Drop rows of a Pandas DataFrame where a certain column's values appears only once. 
- - Ensures all elements of provided columns appear at least twice in their column - - Parameters - ---------- - data : DataFrame - DataFrame to preprocess - - cols : List[str] - List of columns where rows that contain non-duplicated elements shall be discarded - - Returns - ------- - DataFrame - DataFrame with reduced number of rows - - Examples - -------- - >>> remove_rows_if_value_appears_only_once(data, ['name', 'city']) - """ - for col in cols: - counts = data[col].map(data[col].value_counts()) - data = data[counts >= 2] - return data - - def start_from_zero(figures: Collection[int]) -> List[int]: """Turns a list of integers into a same-length list that starts at 0, without gaps @@ -240,7 +259,7 @@ def merge_blocks_or(coords_1: Coords, coords_2: Coords) -> Coords: if type(coords_1) is type(coords_2) is dict: # We have motives return { pair: ( - (coords_1[pair] | coords_2[pair]) + coords_1[pair] + coords_2[pair] if (pair in coords_1 and pair in coords_2) else coords_1[pair] if (pair in coords_1) @@ -278,7 +297,7 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords: """ if type(coords_1) is type(coords_2) is dict: # We have motives return { - pair: (coords_1[pair] | coords_2[pair]) + pair: coords_1[pair] + coords_2[pair] for y in (coords_1, coords_2) for pair in y.keys() if (pair in coords_1 and pair in coords_2) @@ -287,219 +306,6 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords: return coords_1.intersection(coords_2) -def add_blocks_to_dataset( - data: pd.DataFrame, - coords: Coords, - sort: bool = True, - keep_ungrouped_rows: bool = False, - merge_blocks: bool = True, - motives: bool = False, - show_as_pairs: bool = False, - output_columns: Columns = None, -) -> pd.DataFrame: - """Returns the intersection of an array of links - - Takes two lists of paired elements, with or without motives, returns their intersection - - Parameters - ---------- - data : DataFrame - DataFrame for blocking - coords : Array - Blocked 
coordinates - sort : bool - Whether to sort the result by block, thereby regrouping rows of the same block - keep_ungrouped_rows : bool - Whether to display rows that do not belong to any block - merge_blocks : bool - Whether to merge transitively merge blocks - motives : bool - Whether to display the reason behind each block - show_as_pairs : bool - Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame - output_columns : list - Columns to show. Useful in combination with show_as_pairs as column names are altered - - Returns - ------- - DataFrame - Blocked DataFrame - - Examples - -------- - >>> add_blocks_to_dataset(data=pd.DataFrame( - [ - [0, 'first', 4], - [1, 'second', 6], - [2, 'first', 2], - [3, 'third', 5] - ], - columns=['id', 'rank', 'score']), - coords=np.array([{0, 2}]), - show_as_pairs=True, - output_columns=['id', 'rank']) - id_l rank_l id_r rank_r block - 0 0 first 2 first 0 - """ - - if show_as_pairs and keep_ungrouped_rows: - raise ValueError("Cannot both return pairs and keep ungrouped rows") - - if motives: - if type(coords) is not dict: - raise TypeError("Cannot specify motives=True without passing motives") - - # Ensure the index is a unique identifier - if not data.index.is_unique: - raise ValueError("DataFrame index must be unique to be used as an identifier.") - - if "_motive" in data.columns: - if motives: - raise ValueError( - "Please rename existing '_motive' column OR do not pass 'motives=True'" - ) - - if "_block" in data.columns: - raise ValueError("Please rename existing '_block' column") - - if output_columns is None: - output_columns = data.columns - data = data[output_columns].copy() - - if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph - if show_as_pairs: - columns = [col + "_l" for col in data.columns] + [ - col + "_r" for col in data.columns - ] - output_data = pd.DataFrame(columns=columns) - else: - output_data = pd.DataFrame(columns=data.columns) - else: - output_data = 
data - # Map coords to connected component labels - if merge_blocks: # We solve the connected components problem - cc_labels = solve_connected_components_from_coords(coords) - # Match original index to new block ID - matcher = { - idx: label - for idx, label in enumerate(cc_labels) - if label != -1 and idx in data.index - } - else: # We solve the cliques problem - g = nx.Graph() - # noinspection PyTypeChecker - g.add_edges_from(coords) - complete_subgraphs = list(nx.find_cliques(g)) - complete_subgraphs = sorted(complete_subgraphs) - # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))} - matcher = dict() - for i, clique in enumerate(complete_subgraphs): - for node_idx in clique: - if node_idx in matcher.keys(): - matcher[node_idx].append(i) - else: - matcher[node_idx] = [i] - - if show_as_pairs: - output_data = pd.DataFrame() - for pair in coords: - left_row = data.loc[[tuple(pair)[0]]].copy() - current_index = left_row.index - right_row = data.loc[[tuple(pair)[1]]].copy() - left_row.columns = [col + "_l" for col in left_row.columns] - right_row.columns = [col + "_r" for col in right_row.columns] - current_row = pd.concat( - [left_row.reset_index(drop=True), right_row.reset_index(drop=True)], - axis=1, - ) - current_row.index = current_index - output_data = pd.concat([output_data, current_row]) - - # Assign blocks to rows based on their original index - output_data["_block"] = output_data.index.map(matcher) - if not merge_blocks: - output_data = output_data.explode("_block") - - if keep_ungrouped_rows: - output_data["_block"] = output_data["_block"].fillna(-1) - matcher_ungrouped_rows = {} - block_temp = [] - i = 0 # Track # of blocks processed - for b in output_data["_block"]: - if b == -1: - block_temp.append(i) - i += 1 - elif b not in matcher_ungrouped_rows: - matcher_ungrouped_rows[b] = i - block_temp.append(i) - i += 1 - else: - 
block_temp.append(matcher_ungrouped_rows[b]) - output_data["_block"] = block_temp - else: - if not show_as_pairs: - output_data = output_data[ - output_data["_block"].duplicated(keep=False) - & output_data["_block"].notna() - ] - - output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"]) - - if sort: - # Sort by block, then by original index - sort_cols = ["_block"] - if output_data.index.name: - output_data = output_data.sort_values( - sort_cols + [output_data.index.name] - ) - else: - # If no named index, use the first column of the DataFrame - output_data = output_data.reset_index() - output_data = output_data.sort_values( - sort_cols + [output_data.columns[0]] - ) - output_data = output_data.set_index(output_data.columns[0]) - - if motives: - output_data["_motive"] = "" - id_list = flatten(coords.keys()) - motive_matcher = { - row_id: frozenset( - reason - for pair in coords.keys() - if row_id in pair - for reason in coords[pair] - ) - for row_id in id_list - } - output_data["_motive"] = output_data.index.map(motive_matcher) - - if "_block" not in output_data.columns: # Empty coords - output_data["_block"] = -1 - - output_data = output_data.reset_index(drop=True) - output_data["_block"] = output_data["_block"].astype(int) - - return output_data - - -def generate_blocking_report( - data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None -) -> pd.DataFrame: - """ - Shorthand for add_blocks_to_dataset with below arguments - """ - return add_blocks_to_dataset( - data, - coords, - sort=True, - merge_blocks=False, - motives=True, - show_as_pairs=True, - output_columns=output_columns, - ) - - def parse_list(s: str | List, word_level: bool = False) -> List[str]: """Turns a stringified list into an actual python list, taking extra inner quotes into account @@ -511,7 +317,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: Stringified representation of a list e.g. 
"['string 1', 'string 2', ...]" word_level : bool - Whether to return a list of all words within s instead of a list of each comma-separated element + Whether to return a list of all words within s instead of a list of each comma-separated element; + Note that if passed a string that does not represent a list, this argument will be ignored and the function + will return a list of each word in the string Returns ------- @@ -527,7 +335,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: """ if type(s) is list: # If we already have a list - if len(s) == 1 and s[0][0] == "[" and s[0][-1] == "]": + if ( + len(s) == 1 and str(s[0]).startswith("[") and str(s[0]).endswith("]") + ): # In case we have a stringified list INSIDE a normal list s = s[0] else: return s @@ -540,10 +350,15 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: if not s: return [] - try: - parts = ast.literal_eval(s) - except ValueError: # doesn't seem to be a stringified list - parts = s.split("', '") + if s.startswith("[") and s.endswith("]"): # Stringified list? 
+ try: + parts = ast.literal_eval(s) + except ValueError: # doesn't seem to be a stringified list + parts = s.split("', '") + except SyntaxError: # In case we have a string surrounded by brackets + parts = s.split() + else: + parts = s.split() cleaned_items = [str(part).strip().strip("''") for part in parts] @@ -553,40 +368,6 @@ return [s for s in cleaned_items if len(s) > 0] -def scoring(data: pd.DataFrame, motives_column: str = "_motive") -> pd.Series: - """Add a score to a blocked DataFrame based on the number of motives - - Parameters - ---------- - data : DataFrame - DataFrame with motives - - motives_column : str - Name of the column containing the motives - - Returns - ------- - Series[int] - A column of scores - """ - - # Check that we do have motives - if motives_column not in data.columns: - if motives_column == "_motive": - raise ValueError("No motives in DataFrame") - else: - raise ValueError( - f'Specified motives column "{motives_column}" does not exist' - ) - - if "score" in data.columns: - print("Renaming 'score' column to 'score_old'") - data = data.rename(columns={"score": "score_old"}) - - scores = data[motives_column].apply(len) - return scores - - def must_not_be_different_apply( # WIP temp_data: pd.DataFrame, blocking_columns: List[str], @@ -682,7 +463,9 @@ def block_overlap(groups: Iterable, overlap: int = 1) -> Coords: return coords -def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotives: +def add_motives_to_coords( + coords: Coords, explanations: List[Motive] +) -> Dict[Pair, List[Motive]]: """Block a DataFrame based on overlap accross columns Parameters @@ -690,7 +473,7 @@ coords : Coords Coords obtained by blocking - explanations : Set[str] + explanations : Set[EquivalenceMotive|OverlapMotive] Set of explanations Returns @@ -718,3 +501,99 @@ def 
add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotiv } """ return {pair: explanations for pair in coords} + + +def solve_motives(motives: List[Motive]) -> List[Motive]: + """Remove duplicated and redundant motives from a list of motives + + Redundant motives refer to OverlapMotives on the same column(s) but with different overlap or word-level condition + + Parameters + ---------- + motives : List[Motive] + Coords obtained by blocking + + Returns + ------- + List[Motive] + A list of Motives whose length should be smaller or equal to the original list of motives + + Examples + -------- + >>> solve_motives([OverlapMotive('websites', 1), OverlapMotive('websites', 2), OverlapMotive('websites', 2, word_level=False)]) + [OverlapMotive(['websites'], 2, word_level=False)] + """ + if not motives: + raise ValueError("Motives must not be empty") + + # split_motives = [] + # for motive in motives: + # split_motives += split_motive(motive) + + final_motives = [ + motive for motive in motives if type(motive) is EquivalenceMotive + ] # With EquivalenceMotive, equality check suffices + overlap_motives = [motive for motive in motives if type(motive) is OverlapMotive] + overlap_columns = [motive.blocking_column for motive in overlap_motives] + + for column in overlap_columns: + overlap_motives_for_column = [ + motive for motive in overlap_motives if motive.blocking_column == column + ] + + # Select Blocker with stricter word/element-level condition + word_level_motives_for_column = [ + motive for motive in overlap_motives_for_column if motive.word_level + ] + not_word_level_motives_for_column = [ + motive for motive in overlap_motives_for_column if not motive.word_level + ] + + # Find biggest overlap among the non-word_level ones + if not_word_level_motives_for_column: + max_overlap_not_word_level_for_column = max( + not_word_level_motives_for_column, key=lambda m: m.overlap + ) + max_overlap_not_word_level_for_column_overlap = ( + 
max_overlap_not_word_level_for_column.overlap + ) + else: + max_overlap_not_word_level_for_column = [] + max_overlap_not_word_level_for_column_overlap = ( + 0 # Will never be used, left for linter + ) + + # Now find biggest overlap among the word_level ones + if word_level_motives_for_column: + max_overlap_word_level_for_column = max( + word_level_motives_for_column, key=lambda m: m.overlap + ) + max_overlap_word_level_for_column_overlap = ( + max_overlap_word_level_for_column.overlap + ) + if not_word_level_motives_for_column: + # If there is already an OverlapMotive on same column with equal or greater overlap but not word_level, discard it + if ( + max_overlap_word_level_for_column_overlap + <= max_overlap_not_word_level_for_column_overlap + ): + max_overlap_word_level_for_column = [] + else: + max_overlap_word_level_for_column = [] + + if max_overlap_not_word_level_for_column: + max_overlap_not_word_level_for_column = [ + max_overlap_not_word_level_for_column + ] + if max_overlap_word_level_for_column: + max_overlap_word_level_for_column = [max_overlap_word_level_for_column] + final_motives += ( + max_overlap_word_level_for_column + max_overlap_not_word_level_for_column + ) + + # Remove duplicates + final_motives_no_duplicates = [] + for motive in final_motives: + if motive not in final_motives_no_duplicates: + final_motives_no_duplicates.append(motive) + return final_motives_no_duplicates diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py index d3f9ab2..cf92924 100644 --- a/tests/test_ms_blocking.py +++ b/tests/test_ms_blocking.py @@ -84,18 +84,28 @@ def attribute_city_keep_ungrouped_rows_false(): @pytest.fixture def attribute_city_motives_true_block(): return { - frozenset({3, 8}): {"Same 'City'"}, - frozenset({1, 4}): {"Same 'City'"}, - frozenset({8, 11}): {"Same 'City'"}, - frozenset({3, 11}): {"Same 'City'"}, - frozenset({2, 5}): {"Same 'City'"}, - frozenset({10, 13}): {"Same 'City'"}, + frozenset({3, 8}): [msb.EquivalenceMotive("City")], + 
frozenset({1, 4}): [msb.EquivalenceMotive("City")], + frozenset({8, 11}): [msb.EquivalenceMotive("City")], + frozenset({3, 11}): [msb.EquivalenceMotive("City")], + frozenset({2, 5}): [msb.EquivalenceMotive("City")], + frozenset({10, 13}): [msb.EquivalenceMotive("City")], } @pytest.fixture def attribute_city_motives_true_add(): - return [{"Same 'City'"}] * 9 + return [ + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ["Same 'City'"], + ] @pytest.fixture @@ -116,25 +126,30 @@ def city_age_name_websites_pipelining_id(): @pytest.fixture def city_age_websites_pipelining_motives(): return [ - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}), - frozenset({">=1 overlap in 'websites'"}), - frozenset({">=1 overlap in 'websites'"}), - frozenset({"Same 'Age'", "Same 'City'"}), - frozenset({"Same 'Age'", "Same 'City'"}), - frozenset({">=1 overlap in 'websites'"}), - frozenset({">=1 overlap in 'websites'"}), - frozenset({">=1 overlap in 'websites'"}), + {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {"Same 'City'", "Same 'Age'"}, + {"Same 'City'", "Same 'Age'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, + {">=1 overlap in 'websites'"}, ] @pytest.fixture def city_age_websites_pipelining_scores(): - return 
[3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1] + return [3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +@pytest.fixture +def city_age_websites_pipelining_scores_not_show_as_pairs(): + return [3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1] @pytest.fixture @@ -335,9 +350,10 @@ def test_pipelining_motives(city_age_websites_pipelining_motives): websites_blocker = msb.OverlapBlocker(["websites"]) final_blocker = (city_blocker & age_blocker) | websites_blocker links = final_blocker.block(get_users(), motives=True) - actual = msb.add_blocks_to_dataset( + motives = msb.add_blocks_to_dataset( # Use set to ignore ordering get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False )["_motive"].to_list() + actual = [set(motive) for motive in motives] assert actual == expected @@ -350,9 +366,36 @@ def test_pipelining_scores(city_age_websites_pipelining_scores): final_blocker = (city_blocker & age_blocker) | websites_blocker links = final_blocker.block(get_users(), motives=True) report = msb.add_blocks_to_dataset( - get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False + get_users(), + links, + show_as_pairs=True, + motives=True, + merge_blocks=False, + score=True, + ) + actual = sorted(report["_score"], reverse=True) + assert actual == expected + + +def test_pipelining_scores_without_show_as_pairs( + city_age_websites_pipelining_scores_not_show_as_pairs, +): + """Test that scoring does work as intended""" + expected = city_age_websites_pipelining_scores_not_show_as_pairs + city_blocker = msb.AttributeEquivalenceBlocker(["City"]) + age_blocker = msb.AttributeEquivalenceBlocker(["Age"]) + websites_blocker = msb.OverlapBlocker(["websites"]) + final_blocker = (city_blocker & age_blocker) | websites_blocker + links = final_blocker.block(get_users(), motives=True) + report = msb.add_blocks_to_dataset( + get_users(), + links, + show_as_pairs=False, + motives=True, + merge_blocks=False, + score=True, ) - actual = sorted(msb.scoring(report), reverse=True) + actual = 
sorted(report["_score"], reverse=True) assert actual == expected