diff --git a/docs/example.ipynb b/docs/example.ipynb index 6b82165..612b705 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -32,11 +32,13 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.010997600Z", - "start_time": "2026-01-30T14:21:13.420790Z" + "end_time": "2026-02-04T11:08:15.717250Z", + "start_time": "2026-02-04T11:08:15.051987700Z" } }, "source": [ + "import ast\n", + "\n", "import ms_blocking.ms_blocking as msb" ], "outputs": [], @@ -60,8 +62,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.049404600Z", - "start_time": "2026-01-30T14:21:14.010997600Z" + "end_time": "2026-02-04T11:08:15.764256100Z", + "start_time": "2026-02-04T11:08:15.722778900Z" } }, "source": [ @@ -282,8 +284,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.190107400Z", - "start_time": "2026-01-30T14:21:14.089762400Z" + "end_time": "2026-02-04T11:08:15.931579300Z", + "start_time": "2026-02-04T11:08:15.807525700Z" } }, "source": [ @@ -310,8 +312,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.309413300Z", - "start_time": "2026-01-30T14:21:14.278545600Z" + "end_time": "2026-02-04T11:08:16.087038500Z", + "start_time": "2026-02-04T11:08:16.035029800Z" } }, "source": [ @@ -322,7 +324,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" + "Processing AttributeEquivalenceBlocker(['City'])\n" ] } ], @@ -339,8 +341,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.378808Z", - "start_time": "2026-01-30T14:21:14.349508200Z" + "end_time": "2026-02-04T11:08:16.144591700Z", + "start_time": "2026-02-04T11:08:16.125079Z" } }, "source": [ @@ -369,8 +371,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.558644200Z", - "start_time": "2026-01-30T14:21:14.459573100Z" + "end_time": "2026-02-04T11:08:16.394056600Z", + "start_time": "2026-02-04T11:08:16.254143400Z" } }, "source": [ @@ -409,8 +411,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.635514Z", - "start_time": "2026-01-30T14:21:14.598913Z" + "end_time": "2026-02-04T11:08:16.465615700Z", + "start_time": "2026-02-04T11:08:16.436149800Z" } }, "source": [ @@ -574,8 +576,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:14.829719100Z", - "start_time": "2026-01-30T14:21:14.676157200Z" + "end_time": "2026-02-04T11:08:16.669957500Z", + "start_time": "2026-02-04T11:08:16.517388400Z" } }, "source": [ @@ -622,8 +624,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.027923700Z", - "start_time": "2026-01-30T14:21:14.926401Z" + "end_time": "2026-02-04T11:08:16.897465Z", + "start_time": "2026-02-04T11:08:16.790223300Z" } }, "source": [ @@ -759,8 +761,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.403596500Z", - "start_time": "2026-01-30T14:21:15.279120300Z" + "end_time": "2026-02-04T11:08:17.178282600Z", + "start_time": "2026-02-04T11:08:17.085283500Z" } }, "source": [ @@ -796,8 +798,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:15.686136800Z", - "start_time": "2026-01-30T14:21:15.608444400Z" + "end_time": "2026-02-04T11:08:17.415392700Z", + "start_time": "2026-02-04T11:08:17.340879400Z" } }, "source": [ @@ -971,8 +973,8 @@ "cell_type": "code", "metadata": { 
"ExecuteTime": { - "end_time": "2026-01-30T14:21:15.998425200Z", - "start_time": "2026-01-30T14:21:15.931370100Z" + "end_time": "2026-02-04T11:08:17.748213300Z", + "start_time": "2026-02-04T11:08:17.686781800Z" } }, "source": [ @@ -1075,8 +1077,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:16.305679100Z", - "start_time": "2026-01-30T14:21:16.212470400Z" + "end_time": "2026-02-04T11:08:18.079269300Z", + "start_time": "2026-02-04T11:08:17.983904200Z" } }, "source": [ @@ -1089,7 +1091,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n" + "Processing AttributeEquivalenceBlocker(['City', 'Age'])\n" ] }, { @@ -1223,8 +1225,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:16.678653800Z", - "start_time": "2026-01-30T14:21:16.558976200Z" + "end_time": "2026-02-04T11:08:18.745005600Z", + "start_time": "2026-02-04T11:08:18.452951600Z" } }, "source": [ @@ -1237,7 +1239,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Name'], [])\n" + "Processing AttributeEquivalenceBlocker(['Name'])\n" ] }, { @@ -1342,8 +1344,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.354294400Z", - "start_time": "2026-01-30T14:21:17.316050200Z" + "end_time": "2026-02-04T11:08:19.619033Z", + "start_time": "2026-02-04T11:08:19.558837100Z" } }, "source": [ @@ -1358,7 +1360,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Name'], [])\n" + "Processing AttributeEquivalenceBlocker(['Name'], NON-NORMALIZED)\n" ] }, { @@ -1440,8 +1442,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.537043700Z", - "start_time": "2026-01-30T14:21:17.392490700Z" + "end_time": "2026-02-04T11:08:20.087558700Z", + "start_time": "2026-02-04T11:08:20.054190800Z" } }, "source": [ @@ -1453,7 +1455,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n", + "Processing AttributeEquivalenceBlocker(['City'])\n", "Processing OverlapBlocker(['websites'], 1)\n" ] } @@ -1464,8 +1466,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.655177300Z", - "start_time": "2026-01-30T14:21:17.573776300Z" + "end_time": "2026-02-04T11:08:20.185145300Z", + "start_time": "2026-02-04T11:08:20.126031Z" } }, "source": [ @@ -1477,7 +1479,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing MixedBlocker(['City'], ['websites'], 1)\n" + "Processing AndNode{AttributeEquivalenceBlocker(['City']), OverlapBlocker(['websites'], 1)}\n" ] }, { @@ -1589,8 +1591,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:17.910335600Z", - "start_time": "2026-01-30T14:21:17.821453400Z" + "end_time": "2026-02-04T11:08:20.399421200Z", + "start_time": "2026-02-04T11:08:20.263975500Z" } }, "source": [ @@ -1602,7 +1604,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n", + "Processing AttributeEquivalenceBlocker(['City'])\n", "Processing OverlapBlocker(['websites'], 1)\n" ] }, @@ -1804,8 +1806,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.279899900Z", - "start_time": "2026-01-30T14:21:18.250988900Z" + "end_time": "2026-02-04T11:08:20.711888Z", + "start_time": "2026-02-04T11:08:20.674721300Z" } }, "source": [ 
@@ -1828,8 +1830,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.481263300Z", - "start_time": "2026-01-30T14:21:18.466284300Z" + "end_time": "2026-02-04T11:08:21.107369500Z", + "start_time": "2026-02-04T11:08:21.084976300Z" } }, "source": [ @@ -1849,8 +1851,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.562779600Z", - "start_time": "2026-01-30T14:21:18.520368200Z" + "end_time": "2026-02-04T11:08:21.217014300Z", + "start_time": "2026-02-04T11:08:21.157874500Z" } }, "source": [ @@ -1862,8 +1864,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n", - "Processing MixedBlocker(['Name'], ['websites'], 1)\n" + "Processing AttributeEquivalenceBlocker(['City', 'Age'])\n", + "Processing AndNode{AttributeEquivalenceBlocker(['Name']), OverlapBlocker(['websites'], 1)}\n" ] }, { @@ -1990,8 +1992,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.843568700Z", - "start_time": "2026-01-30T14:21:18.686911500Z" + "end_time": "2026-02-04T11:08:21.451668200Z", + "start_time": "2026-02-04T11:08:21.321138300Z" } }, "source": [ @@ -2003,7 +2005,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" + "Processing AttributeEquivalenceBlocker(['City'])\n" ] } ], @@ -2034,8 +2036,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:18.967168700Z", - "start_time": "2026-01-30T14:21:18.928864500Z" + "end_time": "2026-02-04T11:08:21.692885600Z", + "start_time": "2026-02-04T11:08:21.621341500Z" } }, "source": [ @@ -2213,8 +2215,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:19.276047300Z", - "start_time": "2026-01-30T14:21:19.146886900Z" + "end_time": "2026-02-04T11:08:21.989978400Z", + "start_time": "2026-02-04T11:08:21.913152200Z" } }, "source": [ @@ -2443,8 +2445,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:19.820247800Z", - "start_time": "2026-01-30T14:21:19.653280100Z" + "end_time": "2026-02-04T11:08:22.361091600Z", + "start_time": "2026-02-04T11:08:22.283162500Z" } }, "source": [ @@ -2593,8 +2595,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.335572Z", - "start_time": "2026-01-30T14:21:20.302358700Z" + "end_time": "2026-02-04T11:08:22.741289200Z", + "start_time": "2026-02-04T11:08:22.663800600Z" } }, "source": [ @@ -2607,18 +2609,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['City'], [])\n" + "Processing AttributeEquivalenceBlocker(['City'])\n" ] }, { "data": { "text/plain": [ - "{frozenset({1, 4}): {\"Same 'City'\"},\n", - " frozenset({8, 11}): {\"Same 'City'\"},\n", - " frozenset({2, 5}): {\"Same 'City'\"},\n", - " frozenset({10, 13}): {\"Same 'City'\"},\n", - " frozenset({3, 8}): {\"Same 'City'\"},\n", - " frozenset({3, 11}): {\"Same 'City'\"}}" + "{frozenset({1, 4}): [EquivalenceMotive(['City'])],\n", + " frozenset({8, 11}): [EquivalenceMotive(['City'])],\n", + " frozenset({2, 5}): [EquivalenceMotive(['City'])],\n", + " frozenset({10, 13}): [EquivalenceMotive(['City'])],\n", + " frozenset({3, 8}): [EquivalenceMotive(['City'])],\n", + " frozenset({3, 11}): [EquivalenceMotive(['City'])]}" ] }, "execution_count": 26, @@ -2631,9 +2633,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "Of course, this will induce some 
overhead." - ] + "source": "This will induce some overhead." }, { "cell_type": "markdown", @@ -2646,8 +2646,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.409405100Z", - "start_time": "2026-01-30T14:21:20.374573700Z" + "end_time": "2026-02-04T11:08:23.024485100Z", + "start_time": "2026-02-04T11:08:22.987457200Z" } }, "source": [ @@ -2669,15 +2669,15 @@ "8 13 Benoît Benoît Lens 15 \n", "\n", " websites _block _motive \n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 (Same 'City') \n", - "1 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "2 ['somewebsite.com/users/rpz59'] 1 (Same 'City') \n", - "3 [] 1 (Same 'City') \n", - "4 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 [] 2 (Same 'City') \n", - "6 [] 2 (Same 'City') \n", - "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 (Same 'City') \n", - "8 ['lensfans.fr'] 3 (Same 'City') " + "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 0 [Same 'City'] \n", + "1 ['jacquesdupond.fr'] 0 [Same 'City'] \n", + "2 ['somewebsite.com/users/rpz59'] 1 [Same 'City'] \n", + "3 [] 1 [Same 'City'] \n", + "4 ['roubaixlove.fr'] 2 [Same 'City'] \n", + "5 [] 2 [Same 'City'] \n", + "6 [] 2 [Same 'City'] \n", + "7 ['pythonensamusant.fr', 'lensfans.fr'] 3 [Same 'City'] \n", + "8 ['lensfans.fr'] 3 [Same 'City'] " ], "text/html": [ "
\n", @@ -2716,7 +2716,7 @@ " 37\n", " ['somewebsite.com/users/jacquesdupond', 'jacqu...\n", " 0\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", " 1\n", @@ -2726,7 +2726,7 @@ " 37\n", " ['jacquesdupond.fr']\n", " 0\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", " 2\n", @@ -2736,7 +2736,7 @@ " 24\n", " ['somewebsite.com/users/rpz59']\n", " 1\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", " 3\n", @@ -2746,7 +2746,7 @@ " 24\n", " []\n", " 1\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", " 4\n", @@ -2756,7 +2756,7 @@ " 32\n", " ['roubaixlove.fr']\n", " 2\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", " 5\n", @@ -2766,7 +2766,7 @@ " 33\n", " []\n", " 2\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", " 6\n", @@ -2776,7 +2776,7 @@ " 33\n", " []\n", " 2\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", " 7\n", @@ -2786,7 +2786,7 @@ " 45\n", " ['pythonensamusant.fr', 'lensfans.fr']\n", " 3\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", " 8\n", @@ -2796,7 +2796,7 @@ " 15\n", " ['lensfans.fr']\n", " 3\n", - " (Same 'City')\n", + " [Same 'City']\n", " \n", " \n", "\n", @@ -2813,23 +2813,19 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "... Though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..." - ] + "source": "... though since motives make more sense when considering pairs of rows instead of full blocks, the above visualization is not that interesting..." }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "... Which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:" - ] + "source": "... which is the reason you can pass `show_as_pairs=True` to `msb.add_blocks_to_dataset` to see the output has a list of pairs:" }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.612990700Z", - "start_time": "2026-01-30T14:21:20.483928200Z" + "end_time": "2026-02-04T11:08:23.267363800Z", + "start_time": "2026-02-04T11:08:23.194418800Z" } }, "source": [ @@ -2855,13 +2851,13 @@ "4 [] 3 Paul Delarue \n", "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", "\n", - " City_r Age_r websites_r _block _motive \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "1 Phalempin 24 [] 1 (Same 'City') \n", - "2 Roubaix 33 [] 2 (Same 'City') \n", - "3 Roubaix 33 [] 2 (Same 'City') \n", - "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " + " City_r Age_r websites_r _motive _block \n", + "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] [Same 'City'] 0 \n", + "1 Phalempin 24 [] [Same 'City'] 1 \n", + "2 Roubaix 33 [] [Same 'City'] 2 \n", + "3 Roubaix 33 [] [Same 'City'] 2 \n", + "4 Roubaix 32 ['roubaixlove.fr'] [Same 'City'] 2 \n", + "5 Lens 15 ['lensfans.fr'] [Same 'City'] 3 " ], "text/html": [ "
\n", @@ -2892,8 +2888,8 @@ " City_r\n", " Age_r\n", " websites_r\n", - " _block\n", " _motive\n", + " _block\n", " \n", " \n", " \n", @@ -2909,8 +2905,8 @@ " Villeneuve d'Ascq\n", " 37\n", " ['jacquesdupond.fr']\n", + " [Same 'City']\n", " 0\n", - " (Same 'City')\n", " \n", " \n", " 1\n", @@ -2924,8 +2920,8 @@ " Phalempin\n", " 24\n", " []\n", + " [Same 'City']\n", " 1\n", - " (Same 'City')\n", " \n", " \n", " 2\n", @@ -2939,8 +2935,8 @@ " Roubaix\n", " 33\n", " []\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 3\n", @@ -2954,8 +2950,8 @@ " Roubaix\n", " 33\n", " []\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 4\n", @@ -2969,8 +2965,8 @@ " Roubaix\n", " 32\n", " ['roubaixlove.fr']\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 5\n", @@ -2984,8 +2980,8 @@ " Lens\n", " 15\n", " ['lensfans.fr']\n", + " [Same 'City']\n", " 3\n", - " (Same 'City')\n", " \n", " \n", "\n", @@ -3010,8 +3006,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:20.944670700Z", - "start_time": "2026-01-30T14:21:20.834495500Z" + "end_time": "2026-02-04T11:08:23.633084600Z", + "start_time": "2026-02-04T11:08:23.522566900Z" } }, "source": [ @@ -3023,13 +3019,13 @@ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r _block _motive\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 (Same 'City')\n", - "1 2 Pierre Dusquesnes 5 pierre dusquesnes 1 (Same 'City')\n", - "2 3 Paul Delarue 11 sophie_delarue 2 (Same 'City')\n", - "3 8 Sophie Delarue 11 sophie_delarue 2 (Same 'City')\n", - "4 8 Sophie Delarue 3 Paul Delarue 2 (Same 'City')\n", - "5 10 Caroline Dufour 13 Benoît Benoît 3 (Same 'City')" + " id_l Name_l id_r Name_r _motive _block\n", + "0 1 Jacques Dupond 4 Jacques Dupont [Same 'City'] 0\n", + "1 2 Pierre Dusquesnes 5 pierre dusquesnes [Same 'City'] 1\n", + "2 3 Paul Delarue 11 sophie_delarue [Same 'City'] 2\n", + "3 8 Sophie Delarue 11 sophie_delarue [Same 'City'] 2\n", + "4 8 Sophie Delarue 3 Paul Delarue [Same 'City'] 2\n", + "5 10 Caroline Dufour 13 Benoît Benoît [Same 'City'] 3" ], "text/html": [ "
\n", @@ -3054,8 +3050,8 @@ " Name_l\n", " id_r\n", " Name_r\n", - " _block\n", " _motive\n", + " _block\n", " \n", " \n", " \n", @@ -3065,8 +3061,8 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", + " [Same 'City']\n", " 0\n", - " (Same 'City')\n", " \n", " \n", " 1\n", @@ -3074,8 +3070,8 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", + " [Same 'City']\n", " 1\n", - " (Same 'City')\n", " \n", " \n", " 2\n", @@ -3083,8 +3079,8 @@ " Paul Delarue\n", " 11\n", " sophie_delarue\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 3\n", @@ -3092,8 +3088,8 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 4\n", @@ -3101,8 +3097,8 @@ " Sophie Delarue\n", " 3\n", " Paul Delarue\n", + " [Same 'City']\n", " 2\n", - " (Same 'City')\n", " \n", " \n", " 5\n", @@ -3110,8 +3106,8 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", + " [Same 'City']\n", " 3\n", - " (Same 'City')\n", " \n", " \n", "\n", @@ -3132,188 +3128,13 @@ "Motives are dynamic:" ] }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2026-01-30T14:21:21.591044600Z", - "start_time": "2026-01-30T14:21:21.517777200Z" - } - }, - "source": [ - "msb.generate_blocking_report(df, links)" - ], - "outputs": [ - { - "data": { - "text/plain": [ - " id_l Name_l City_l Age_l \\\n", - "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n", - "1 2 Pierre Dusquesnes Phalempin 24 \n", - "2 3 Paul Delarue Roubaix 32 \n", - "3 8 Sophie Delarue Roubaix 33 \n", - "4 8 Sophie Delarue Roubaix 33 \n", - "5 10 Caroline Dufour Lens 45 \n", - "\n", - " websites_l id_r Name_r \\\n", - "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 4 Jacques Dupont \n", - "1 ['somewebsite.com/users/rpz59'] 5 pierre dusquesnes \n", - "2 ['roubaixlove.fr'] 11 sophie_delarue \n", - "3 [] 11 sophie_delarue \n", - "4 [] 3 Paul Delarue \n", - "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n", - "\n", - " City_r Age_r websites_r _block _motive \n", - "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n", - "1 Phalempin 24 [] 1 (Same 'City') \n", - "2 Roubaix 33 [] 2 (Same 'City') \n", - "3 Roubaix 33 [] 2 (Same 'City') \n", - "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n", - "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') " - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_lName_lCity_lAge_lwebsites_lid_rName_rCity_rAge_rwebsites_r_block_motive
01Jacques DupondVilleneuve d'Ascq37['somewebsite.com/users/jacquesdupond', 'jacqu...4Jacques DupontVilleneuve d'Ascq37['jacquesdupond.fr']0(Same 'City')
12Pierre DusquesnesPhalempin24['somewebsite.com/users/rpz59']5pierre dusquesnesPhalempin24[]1(Same 'City')
23Paul DelarueRoubaix32['roubaixlove.fr']11sophie_delarueRoubaix33[]2(Same 'City')
38Sophie DelarueRoubaix33[]11sophie_delarueRoubaix33[]2(Same 'City')
48Sophie DelarueRoubaix33[]3Paul DelarueRoubaix32['roubaixlove.fr']2(Same 'City')
510Caroline DufourLens45['pythonensamusant.fr', 'lensfans.fr']13Benoît BenoîtLens15['lensfans.fr']3(Same 'City')
\n", - "
" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 30 - }, { "cell_type": "code", "metadata": { "scrolled": true, "ExecuteTime": { - "end_time": "2026-01-30T14:21:21.867809800Z", - "start_time": "2026-01-30T14:21:21.674986800Z" + "end_time": "2026-02-04T11:08:24.180719900Z", + "start_time": "2026-02-04T11:08:24.107699800Z" } }, "source": [ @@ -3337,42 +3158,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n", + "Processing AttributeEquivalenceBlocker(['City', 'Age'])\n", "Processing OverlapBlocker(['websites'], 1)\n" ] }, { "data": { "text/plain": [ - " id_l Name_l id_r Name_r _block \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", - "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", - "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", - "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", - "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", - "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", - "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", - "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", - "9 8 Sophie Delarue 11 sophie_delarue 3 \n", - "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", - "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", - "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", + " id_l Name_l id_r Name_r \\\n", + "0 1 Jacques Dupond 4 Jacques Dupont \n", + "1 1 Jacques Dupond 6 Jean-Michel Python \n", + "2 1 Jacques Dupond 10 Caroline Dufour \n", + "3 1 Jacques Dupond 4 Jacques Dupont \n", + "4 1 Jacques Dupond 6 Jean-Michel Python \n", + "5 1 Jacques Dupond 10 Caroline Dufour \n", + "6 10 Caroline Dufour 6 Jean-Michel Python \n", + "7 10 Caroline Dufour 13 Benoît Benoît \n", + "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n", + "9 8 Sophie Delarue 11 sophie_delarue \n", + "10 10 Caroline Dufour 6 Jean-Michel Python \n", + "11 10 Caroline Dufour 13 Benoît Benoît \n", + "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", - " _motive \n", - "0 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "1 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "2 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "3 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "4 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "5 (>=1 overlap in 'websites', Same 'City', Same ... \n", - "6 (>=1 overlap in 'websites') \n", - "7 (>=1 overlap in 'websites') \n", - "8 (Same 'City', Same 'Age') \n", - "9 (Same 'City', Same 'Age') \n", - "10 (>=1 overlap in 'websites') \n", - "11 (>=1 overlap in 'websites') \n", - "12 (>=1 overlap in 'websites') " + " _motive _block \n", + "0 [Same 'City', Same 'Age', >=1 overlap in 'webs... 0 \n", + "1 [>=1 overlap in 'websites'] 0 \n", + "2 [>=1 overlap in 'websites'] 0 \n", + "3 [Same 'City', Same 'Age', >=1 overlap in 'webs... 1 \n", + "4 [>=1 overlap in 'websites'] 1 \n", + "5 [>=1 overlap in 'websites'] 1 \n", + "6 [>=1 overlap in 'websites'] 1 \n", + "7 [>=1 overlap in 'websites'] 1 \n", + "8 [Same 'City', Same 'Age'] 2 \n", + "9 [Same 'City', Same 'Age'] 3 \n", + "10 [>=1 overlap in 'websites'] 4 \n", + "11 [>=1 overlap in 'websites'] 4 \n", + "12 [>=1 overlap in 'websites'] 4 " ], "text/html": [ "
\n", @@ -3397,8 +3218,8 @@ " Name_l\n", " id_r\n", " Name_r\n", - " _block\n", " _motive\n", + " _block\n", " \n", " \n", " \n", @@ -3408,8 +3229,8 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", + " [Same 'City', Same 'Age', >=1 overlap in 'webs...\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 1\n", @@ -3417,8 +3238,8 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 2\n", @@ -3426,8 +3247,8 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", + " [>=1 overlap in 'websites']\n", " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 3\n", @@ -3435,8 +3256,8 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", + " [Same 'City', Same 'Age', >=1 overlap in 'webs...\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 4\n", @@ -3444,8 +3265,8 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 5\n", @@ -3453,8 +3274,8 @@ " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", " \n", " \n", " 6\n", @@ -3462,8 +3283,8 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", " 7\n", @@ -3471,8 +3292,8 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", " 8\n", @@ -3480,8 +3301,8 @@ " Pierre Dusquesnes\n", " 5\n", " pierre dusquesnes\n", + " [Same 'City', Same 'Age']\n", " 2\n", - " (Same 'City', Same 'Age')\n", " \n", " \n", " 9\n", @@ -3489,8 +3310,8 @@ " Sophie Delarue\n", " 11\n", " sophie_delarue\n", + " [Same 'City', Same 'Age']\n", " 3\n", - " (Same 'City', Same 'Age')\n", " \n", " \n", " 10\n", @@ -3498,8 +3319,8 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", " 11\n", @@ -3507,8 +3328,8 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", " 12\n", @@ -3516,20 +3337,20 @@ " Benoît Benoît\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 4\n", - " (>=1 overlap in 'websites')\n", " \n", " \n", "\n", "
" ] }, - "execution_count": 31, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 31 + "execution_count": 30 }, { "cell_type": "markdown", @@ -3539,53 +3360,61 @@ { "cell_type": "markdown", "metadata": {}, - "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `scoring` gives you an indicator of the likelihood of rows behing duplicates based on the number of motives." + "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `score=True` gives you an indicator of the likelihood of rows behing duplicates based on the number of distinct motives." }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2026-01-30T14:21:22.186415700Z", - "start_time": "2026-01-30T14:21:22.127304600Z" + "end_time": "2026-02-04T11:08:24.439021100Z", + "start_time": "2026-02-04T11:08:24.368744500Z" } }, "source": [ - "report[\"score\"] = msb.scoring(report)\n", - "report.sort_values(\"score\", ascending=False)" + "report = msb.add_blocks_to_dataset(\n", + " df,\n", + " links,\n", + " motives=True,\n", + " show_as_pairs=True,\n", + " output_columns=[\"id\", \"Name\"],\n", + " merge_blocks=False,\n", + " score=True,\n", + ")\n", + "report.sort_values(\"_score\", ascending=False)" ], "outputs": [ { "data": { "text/plain": [ - " id_l Name_l id_r Name_r _block \\\n", - "0 1 Jacques Dupond 4 Jacques Dupont 0 \n", - "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n", - "2 1 Jacques Dupond 10 Caroline Dufour 0 \n", - "3 1 Jacques Dupond 4 Jacques Dupont 1 \n", - "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n", - "5 1 Jacques Dupond 10 Caroline Dufour 1 \n", - "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n", - "9 8 Sophie Delarue 11 sophie_delarue 3 \n", - "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n", - "7 10 Caroline Dufour 13 Benoît Benoît 1 \n", - "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n", - "11 10 Caroline Dufour 13 Benoît Benoît 4 \n", - "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n", + " id_l Name_l id_r Name_r \\\n", + "0 1 Jacques Dupond 4 Jacques Dupont \n", + "3 1 Jacques Dupond 4 Jacques Dupont \n", + "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n", + "9 8 Sophie Delarue 11 sophie_delarue \n", + "1 1 Jacques Dupond 6 Jean-Michel Python \n", + "4 1 Jacques Dupond 6 Jean-Michel Python \n", + "2 1 Jacques Dupond 10 Caroline Dufour \n", + "6 10 Caroline Dufour 6 Jean-Michel Python \n", + "5 1 Jacques Dupond 10 Caroline Dufour \n", + "7 10 Caroline Dufour 13 Benoît Benoît \n", + "10 10 Caroline Dufour 6 Jean-Michel Python \n", + "11 10 Caroline Dufour 13 Benoît Benoît \n", + "12 13 Benoît Benoît 6 Jean-Michel Python \n", "\n", - " _motive score \n", - "0 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "1 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "2 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "3 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "4 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "5 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n", - "8 (Same 'City', Same 'Age') 2 \n", - "9 (Same 'City', Same 'Age') 2 \n", - "6 (>=1 overlap in 'websites') 1 \n", - "7 (>=1 overlap in 'websites') 1 \n", - "10 (>=1 overlap in 'websites') 1 \n", - "11 (>=1 overlap in 'websites') 1 \n", - "12 (>=1 overlap in 'websites') 1 " + " _motive _score _block \n", + "0 [Same 'City', Same 'Age', >=1 overlap in 'webs... 3 0 \n", + "3 [Same 'City', Same 'Age', >=1 overlap in 'webs... 
3 1 \n", + "8 [Same 'City', Same 'Age'] 2 2 \n", + "9 [Same 'City', Same 'Age'] 2 3 \n", + "1 [>=1 overlap in 'websites'] 1 0 \n", + "4 [>=1 overlap in 'websites'] 1 1 \n", + "2 [>=1 overlap in 'websites'] 1 0 \n", + "6 [>=1 overlap in 'websites'] 1 1 \n", + "5 [>=1 overlap in 'websites'] 1 1 \n", + "7 [>=1 overlap in 'websites'] 1 1 \n", + "10 [>=1 overlap in 'websites'] 1 4 \n", + "11 [>=1 overlap in 'websites'] 1 4 \n", + "12 [>=1 overlap in 'websites'] 1 4 " ], "text/html": [ "
\n", @@ -3610,9 +3439,9 @@ " Name_l\n", " id_r\n", " Name_r\n", - " _block\n", " _motive\n", - " score\n", + " _score\n", + " _block\n", " \n", " \n", " \n", @@ -3622,39 +3451,49 @@ " Jacques Dupond\n", " 4\n", " Jacques Dupont\n", - " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", + " [Same 'City', Same 'Age', >=1 overlap in 'webs...\n", " 3\n", + " 0\n", " \n", " \n", - " 1\n", + " 3\n", " 1\n", " Jacques Dupond\n", - " 6\n", - " Jean-Michel Python\n", - " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", + " 4\n", + " Jacques Dupont\n", + " [Same 'City', Same 'Age', >=1 overlap in 'webs...\n", " 3\n", + " 1\n", " \n", " \n", - " 2\n", - " 1\n", - " Jacques Dupond\n", - " 10\n", - " Caroline Dufour\n", - " 0\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", + " 8\n", + " 2\n", + " Pierre Dusquesnes\n", + " 5\n", + " pierre dusquesnes\n", + " [Same 'City', Same 'Age']\n", + " 2\n", + " 2\n", + " \n", + " \n", + " 9\n", + " 8\n", + " Sophie Delarue\n", + " 11\n", + " sophie_delarue\n", + " [Same 'City', Same 'Age']\n", + " 2\n", " 3\n", " \n", " \n", - " 3\n", + " 1\n", " 1\n", " Jacques Dupond\n", - " 4\n", - " Jacques Dupont\n", + " 6\n", + " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", + " 0\n", " \n", " \n", " 4\n", @@ -3662,39 +3501,19 @@ " Jacques Dupond\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", + " 1\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", " \n", " \n", - " 5\n", + " 2\n", " 1\n", " Jacques Dupond\n", " 10\n", " Caroline Dufour\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites', Same 'City', Same ...\n", - " 3\n", - " \n", - " \n", - " 8\n", - " 2\n", - " Pierre Dusquesnes\n", - " 5\n", - " pierre dusquesnes\n", - " 2\n", - " (Same 'City', Same 'Age')\n", - " 2\n", - " \n", - " \n", - " 9\n", - " 8\n", - " Sophie Delarue\n", - " 11\n", - " sophie_delarue\n", - " 3\n", - " (Same 'City', Same 'Age')\n", - " 2\n", + " 0\n", " \n", " \n", " 6\n", @@ -3702,8 +3521,18 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", + " [>=1 overlap in 'websites']\n", + " 1\n", + " 1\n", + " \n", + " \n", + " 5\n", + " 1\n", + " Jacques Dupond\n", + " 10\n", + " Caroline Dufour\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites')\n", " 1\n", " \n", " \n", @@ -3712,8 +3541,8 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", + " [>=1 overlap in 'websites']\n", " 1\n", - " (>=1 overlap in 'websites')\n", " 1\n", " \n", " \n", @@ -3722,9 +3551,9 @@ " Caroline Dufour\n", " 6\n", " Jean-Michel Python\n", - " 4\n", - " (>=1 overlap in 'websites')\n", + " [>=1 overlap in 'websites']\n", " 1\n", + " 4\n", " \n", " \n", " 11\n", @@ -3732,9 +3561,9 @@ " Caroline Dufour\n", " 13\n", " Benoît Benoît\n", - " 4\n", - " (>=1 overlap in 'websites')\n", + " [>=1 overlap in 'websites']\n", " 1\n", + " 4\n", " \n", " \n", " 12\n", @@ -3742,15 +3571,53 @@ " Benoît Benoît\n", " 6\n", " Jean-Michel Python\n", - " 4\n", - " (>=1 overlap in 'websites')\n", + " [>=1 overlap in 'websites']\n", " 1\n", + " 4\n", " \n", " \n", "\n", "
" ] }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 31 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-02-04T11:08:24.877566800Z", + "start_time": "2026-02-04T11:08:24.843830900Z" + } + }, + "cell_type": "code", + "source": [ + "city_blocker = msb.OverlapBlocker([\"City\"])\n", + "city_blocker.block(df)" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing OverlapBlocker(['City'], 1)\n" + ] + }, + { + "data": { + "text/plain": [ + "{frozenset({3, 8}),\n", + " frozenset({1, 4}),\n", + " frozenset({8, 11}),\n", + " frozenset({3, 11}),\n", + " frozenset({2, 5}),\n", + " frozenset({10, 13})}" + ] + }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py index fffbcc8..b3552af 100644 --- a/src/ms_blocking/ms_blocking.py +++ b/src/ms_blocking/ms_blocking.py @@ -1,5 +1,7 @@ from ms_blocking.utils import * # noqa: F403 +import networkx as nx + class BlockerNode: """Abstract class from which derive all classes in the module""" @@ -46,7 +48,7 @@ def __init__(self, left, right): def __repr__(self): return f"AndNode{{{self.left}, {self.right}}}" - def block(self, df, motives=False): + def block(self, df: pd.DataFrame, motives: bool = False) -> Coords: # In order not to perform redundant computations, we first filter out the rows that were not considered by the first blocker before running the second blocker coords_left = self.left.block(df, motives=motives) @@ -76,8 +78,7 @@ def __init__(self, left, right): def __repr__(self): return f"OrNode{{{self.left}, {self.right}}}" - - def block(self, df, motives=False): + def block(self, df: pd.DataFrame, motives: bool = False) -> Coords: # Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations coords_left = self.left.block(df, motives=motives) @@ -91,7 +92,10 @@ class AttributeEquivalenceBlocker(BlockerNode): # Leaf """To regroup rows based on equality across columns.""" def __init__( - self, blocking_columns, normalize_strings=True, must_not_be_different=None + self, + blocking_columns: str | Collection[str], + must_not_be_different: str | Collection[str] = None, + normalize_strings: bool = True, ): super().__init__() @@ -120,7 +124,7 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"AttributeEquivalenceBlocker({self.blocking_columns}, {self.must_not_be_different})" + return f"AttributeEquivalenceBlocker({self.blocking_columns}{', ' + str(self.must_not_be_different) if self.must_not_be_different else ''}{', NON-NORMALIZED' if not self.normalize else ''})" def __eq__(self, other): if type(other) is AttributeEquivalenceBlocker: @@ -139,21 +143,28 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on equality of one or more columns""" print("Processing", self) - temp_data = data.copy() - - for col in self.blocking_columns: - if self.normalize: - temp_data[col] = temp_data[col].apply(normalize) - temp_data = temp_data.dropna(subset=self.blocking_columns) - temp_data = remove_rows_if_value_appears_only_once( - temp_data, self.blocking_columns + temp_data = ( + data[self.blocking_columns + 
self.must_not_be_different] + .dropna(subset=self.blocking_columns) + .copy() ) + # Normalize strings if required + if self.normalize: + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(normalize) + ) + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[ + temp_data.duplicated(keep=False, subset=self.blocking_columns) + ] + + # No need to run anything else if we already ran out of candidates if len(temp_data) == 0: # No pairs if motives: return dict() @@ -185,9 +196,7 @@ def block(self, data, motives=False): } if motives: - explanations = { - f"Same '{column_name}'" for column_name in self.blocking_columns - } + explanations = [EquivalenceMotive(col) for col in self.blocking_columns] return add_motives_to_coords(coords, explanations) else: return set(coords) # set is unnnecessary @@ -197,7 +206,11 @@ class OverlapBlocker(BlockerNode): # Leaf """To regroup rows based on overlap of one or more columns.""" def __init__( - self, blocking_columns, overlap=1, word_level=False, normalize_strings=True + self, + blocking_columns: str | Collection[str], + overlap: int = 1, + word_level: bool = False, + normalize_strings: bool = True, ): super().__init__() @@ -217,7 +230,7 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"OverlapBlocker({self.blocking_columns}, {self.overlap})" + return f"OverlapBlocker({self.blocking_columns}, {self.overlap}{', WORD-LEVEL' if self.word_level else ''}{', NON-NORMALIZED' if not self.normalize else ''})" def __eq__(self, other): if type(other) is OverlapBlocker: @@ -238,29 +251,31 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on overlap of one or more columns""" print("Processing", self) - temp_data = data.copy() + temp_data = data[self.blocking_columns].dropna().copy() - temp_data = temp_data[self.blocking_columns].copy() - - for col in self.blocking_columns: - temp_data[col] = temp_data[col].apply( - parse_list, word_level=self.word_level - ) - temp_data = temp_data.explode(col) - if self.normalize: - temp_data[col] = temp_data[col].apply(normalize) - temp_data = temp_data.dropna( - subset=self.blocking_columns - ) # Remove empty objects - temp_data = remove_rows_if_value_appears_only_once( - temp_data, self.blocking_columns + # Ensure we check for overlap between lists of strings + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(parse_list, word_level=self.word_level) ) + # Split elements of said lists to compare them one by one + temp_data = temp_data.explode(self.blocking_columns) + # Normalize strings if required + if self.normalize: + temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply( + lambda col: col.apply(normalize) + ) + + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[ + temp_data.duplicated(keep=False, subset=self.blocking_columns) + ] + # No need to run anything else if we already ran out of candidates if len(temp_data) == 0: # No pairs fulfill any overlap if motives: return dict() @@ -268,7 +283,7 @@ def block(self, data, motives=False): return set() # Use the DataFrame index for grouping and forming pairs - # Using frozenset since they are ahshable and thus can be used as dictionary keys + 
# Using frozenset since they are hashable and thus can be used as dictionary keys groups = temp_data.groupby(self.blocking_columns).apply( lambda x: frozenset(x.index), include_groups=False ) @@ -276,10 +291,10 @@ def block(self, data, motives=False): coords = block_overlap(groups=groups, overlap=self.overlap) if motives: - explanations = { - f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'" - for column_name in self.blocking_columns - } + explanations = [ + OverlapMotive(col, self.overlap, self.word_level) + for col in self.blocking_columns + ] return add_motives_to_coords(coords, explanations) else: return set(coords) @@ -287,17 +302,17 @@ def block(self, data, motives=False): class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM """Represent the intersection of an AttributeEquivalenceBlocker and an OverlapBlocker. - Designed for performance and RAM efficiency. + Used for performance and RAM efficiency. """ def __init__( self, - equivalence_columns, - overlap_columns, - must_not_be_different=None, - overlap=1, - word_level=False, - normalize_strings=True, + equivalence_columns: str | Collection[str], + overlap_columns: str | Collection[str], + must_not_be_different: str | Collection[str] = None, + overlap: int = 1, + word_level: bool = False, + normalize_strings: bool = True, ): super().__init__() @@ -341,7 +356,16 @@ def __init__( self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them def __repr__(self): - return f"MixedBlocker({self.equivalence_columns}, {self.overlap_columns}, {self.overlap})" + return str( + AndNode( + AttributeEquivalenceBlocker( + self.equivalence_columns, self.must_not_be_different, self.normalize + ), + OverlapBlocker( + self.overlap_columns, self.overlap, self.word_level, self.normalize + ), + ) + ) def __eq__(self, other): if type(other) is AttributeEquivalenceBlocker: @@ -369,31 +393,30 @@ def __eq__(self, other): else: return False - def block(self, data, motives=False): + def block(self, data: pd.DataFrame, motives: bool = False) -> Coords: """Regroup rows based on overlap of one or more columns""" print("Processing", self) total_columns = self.equivalence_columns + self.overlap_columns - temp_data = data[total_columns].copy() - - for col in total_columns: - if col in self.equivalence_columns: - temp_data[col] = temp_data[col].apply(normalize) - elif col in self.overlap_columns: - temp_data[col] = temp_data[col].apply( - lambda x: [ - normalize(item) for item in parse_list(x, self.word_level) - ] - if self.normalize - else parse_list(x, self.word_level) - ) - temp_data = temp_data.explode(col) + temp_data = data[total_columns].dropna().copy() - temp_data = temp_data.dropna(subset=total_columns) # Remove empty objects - temp_data = remove_rows_if_value_appears_only_once(temp_data, total_columns) + # Ensure we check for overlap between lists of strings + temp_data[self.overlap_columns] = temp_data[self.overlap_columns].apply( + lambda col: col.apply(parse_list, word_level=self.word_level) + ) + # Split elements of said lists to compare them one by one + temp_data = temp_data.explode(self.overlap_columns) + # Normalize strings if required + if self.normalize: + temp_data[total_columns] = temp_data[total_columns].apply( + lambda col: col.apply(normalize) + ) + # Non-duplicated values cannot belong to any block; We discard them + temp_data = temp_data[temp_data.duplicated(keep=False, subset=total_columns)] + # No need to run anything else if we 
already ran out of candidates if len(temp_data) == 0: # No pairs fulfill any overlap if motives: return dict() else: return set() @@ -426,17 +449,261 @@ coords = coords_equivalence.intersection(coords_overlap) if motives: - explanations = { - f"Same '{column_name}'" for column_name in self.equivalence_columns - } | { - f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'" - for column_name in self.overlap_columns - } + explanations = [ + EquivalenceMotive(col) for col in self.equivalence_columns + ] + [ + OverlapMotive(col, self.overlap, self.word_level) + for col in self.overlap_columns + ] + return add_motives_to_coords(coords, explanations) else: return set(coords) +def add_blocks_to_dataset( + data: pd.DataFrame, + coords: Coords, + sort: bool = True, + keep_ungrouped_rows: bool = False, + merge_blocks: bool = True, + motives: bool = False, + show_as_pairs: bool = False, + output_columns: Columns = None, + score: bool = False, +) -> pd.DataFrame: + """Adds block identifiers to a DataFrame + + Takes a DataFrame and its blocked coordinates, with or without motives, and returns the DataFrame annotated with a '_block' column (plus '_motive' and '_score' on demand) + + Parameters + ---------- + data : DataFrame + DataFrame for blocking + coords : Coords + Blocked coordinates + sort : bool + Whether to sort the result by block, thereby regrouping rows of the same block + keep_ungrouped_rows : bool + Whether to display rows that do not belong to any block + merge_blocks : bool + Whether to transitively merge overlapping blocks + motives : bool + Whether to display the reason behind each block + show_as_pairs : bool + Whether to show the output as pairs of rows rather than simply reordering the initial DataFrame + output_columns : list + Columns to show. Useful in combination with show_as_pairs as column names are altered + score : bool + Whether to show a score (computed from the number of motives) + + Returns + ------- + DataFrame + Blocked DataFrame + + Examples + -------- + >>> add_blocks_to_dataset(data=pd.DataFrame( + [ + [0, 'first', 4], + [1, 'second', 6], + [2, 'first', 2], + [3, 'third', 5] + ], + columns=['id', 'rank', 'score']), + coords={frozenset({0, 2})}, + show_as_pairs=True, + output_columns=['id', 'rank']) + id_l rank_l id_r rank_r _block + 0 0 first 2 first 0 + """ + + if show_as_pairs and keep_ungrouped_rows: + raise ValueError("Cannot both return pairs and keep ungrouped rows") + + if motives: + if type(coords) is not dict: + raise TypeError("Cannot specify 'motives=True' without passing motives") + + # Ensure the index is a unique identifier + if not data.index.is_unique: + raise ValueError("DataFrame index must be unique to be used as an identifier.") + + if score and not motives: + raise ValueError("Cannot specify 'score=True' without passing motives") + + if "_motive" in data.columns: + if motives: + raise ValueError( + "Please rename existing '_motive' column OR do not pass 'motives=True'" + ) + + if "_score" in data.columns: + if score: + raise ValueError( + "Please rename existing '_score' column OR do not pass 'score=True'" + ) + + if "_block" in data.columns: + raise ValueError("Please rename existing '_block' column") + + if output_columns is None: + output_columns = data.columns + + data = data[output_columns].copy() + + if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph + if show_as_pairs: + columns = [col + "_l" for col in data.columns] + [ + col + "_r" for col in data.columns + ] + output_data = pd.DataFrame(columns=columns) + else: + output_data =
pd.DataFrame(columns=data.columns) + + if motives: + output_data["_motive"] = "" + if score: + output_data["_score"] = 0 + output_data["_block"] = -1 + + else: + output_data = data + # Map coords to connected component labels + if merge_blocks: # We solve the connected components problem + cc_labels = solve_connected_components_from_coords(coords) + # Match original index to new block ID + matcher = { + idx: label + for idx, label in enumerate(cc_labels) + if label != -1 and idx in data.index + } + else: # We solve the cliques problem + g = nx.Graph() + # noinspection PyTypeChecker + g.add_edges_from(coords) + complete_subgraphs = list(nx.find_cliques(g)) + complete_subgraphs = sorted(complete_subgraphs) + # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))} + matcher = dict() + for i, clique in enumerate(complete_subgraphs): + for node_idx in clique: + if node_idx in matcher.keys(): + matcher[node_idx].append(i) + else: + matcher[node_idx] = [i] + + if show_as_pairs: + output_data = pd.DataFrame() + for pair in coords: + left_row = data.loc[[tuple(pair)[0]]].copy() + current_index = left_row.index + right_row = data.loc[[tuple(pair)[1]]].copy() + left_row.columns = [col + "_l" for col in left_row.columns] + right_row.columns = [col + "_r" for col in right_row.columns] + current_row = pd.concat( + [left_row.reset_index(drop=True), right_row.reset_index(drop=True)], + axis=1, + ) + current_row.index = current_index + if motives: + motives_solved = solve_motives(coords[pair]) + current_row["_motive"] = [list(map(str, motives_solved))] + if score: + current_row["_score"] = len( + motives_solved + ) # Score is simply the number of non-redundant motives + output_data = pd.concat([output_data, current_row]) + + # Assign blocks to rows based on their original index + output_data["_block"] = output_data.index.map(matcher) + if not merge_blocks: + output_data = output_data.explode("_block") + + if keep_ungrouped_rows: + output_data["_block"] = output_data["_block"].fillna(-1) + matcher_ungrouped_rows = {} + block_temp = [] + i = 0 # Track # of blocks processed + for b in output_data["_block"]: + if b == -1: + block_temp.append(i) + i += 1 + elif b not in matcher_ungrouped_rows: + matcher_ungrouped_rows[b] = i + block_temp.append(i) + i += 1 + else: + block_temp.append(matcher_ungrouped_rows[b]) + output_data["_block"] = block_temp + else: + if not show_as_pairs: + output_data = output_data[ + output_data["_block"].duplicated(keep=False) + & output_data["_block"].notna() + ] + + output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"]) + + if sort: + # Sort by block, then by original index + sort_cols = ["_block"] + if output_data.index.name: + output_data = output_data.sort_values( + sort_cols + [output_data.index.name] + ) + else: + # If no named index, use the first column of the DataFrame + output_data = output_data.reset_index() + output_data = output_data.sort_values( + sort_cols + [output_data.columns[0]] + ) + output_data = output_data.set_index(output_data.columns[0]) + + if not show_as_pairs and motives: + id_list = flatten(coords.keys()) + # Gather, for each row, the non-redundant motives of every pair it belongs to + motive_matcher = { + row_id: list( + map( + str, + solve_motives( + [m for pair in coords if row_id in pair for m in coords[pair]] + ), + ) + ) + for row_id in id_list + } + # noinspection PyTypeChecker + output_data["_motive"] = output_data.index.map(motive_matcher) + if score: + # Score is simply the number of non-redundant motives gathered above + output_data["_score"] = output_data["_motive"].str.len() + + output_data = output_data.reset_index(drop=True) + output_data["_block"] = output_data["_block"].astype(int) + + return output_data + + +def generate_blocking_report( + data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None +) -> pd.DataFrame: + """ + Shorthand for add_blocks_to_dataset with below arguments + """ + return add_blocks_to_dataset( + data, + coords, + sort=True, + merge_blocks=False, + motives=True, + show_as_pairs=True, + output_columns=output_columns, + ) + + def merge_blockers( left: BlockerNode, right: BlockerNode ) -> AttributeEquivalenceBlocker | OverlapBlocker | MixedBlocker | AndNode: @@ -592,3 +859,6 @@ ) else: return AndNode(left, right) + + +# TODO: deport logic in a way that enables .progress_apply diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py index 837645f..b644a43 100644 --- a/src/ms_blocking/utils.py +++ b/src/ms_blocking/utils.py @@ -4,53 +4,75 @@ from scipy.sparse import coo_matrix from scipy.sparse.csgraph import connected_components import pandas as pd -import networkx as nx import random from collections import Counter from itertools import combinations from typing import List, Set, Iterable, Dict, Collection, Any + + +class EquivalenceMotive: + def __init__(self, blocking_column: str): + if not isinstance(blocking_column, str): + raise TypeError("blocking_column for Motive must be a string") + self.blocking_column = blocking_column + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, EquivalenceMotive | OverlapMotive): + raise TypeError("Can only compare Motives") + return self.blocking_column == other.blocking_column + + def __str__(self): + return f"Same '{self.blocking_column}'" + + def __repr__(self): + return f"EquivalenceMotive(['{self.blocking_column}'])" + + +class OverlapMotive: + def __init__( + self, blocking_column: str, overlap: int = 1, word_level: bool = False + ): + if not isinstance(blocking_column, str): + raise TypeError("blocking_column for Motive must be a string") + if not isinstance(overlap, int): + raise TypeError("overlap must be an int") + if not isinstance(word_level, bool): + raise TypeError("word_level must be a boolean") + self.blocking_column = blocking_column + self.overlap = overlap + self.word_level = word_level + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, EquivalenceMotive | OverlapMotive): + raise TypeError("Can only compare Motives") + if isinstance(other, EquivalenceMotive): + # Mirror EquivalenceMotive.__eq__: equivalence on a column subsumes an overlap on it + return self.blocking_column == other.blocking_column + return ( + self.blocking_column == other.blocking_column + and self.overlap == other.overlap + and self.word_level == other.word_level + ) + + def __str__(self): + return f">={self.overlap}{' word-level' if self.word_level else ''} overlap in '{self.blocking_column}'" + + def __repr__(self): + return f"OverlapMotive(['{self.blocking_column}'], {self.overlap}{', word_level=True' if self.word_level else ''})" + + Columns = List[str] Pair = Collection[int] +Motive = EquivalenceMotive | OverlapMotive CoordsBasic = Set[Pair] -CoordsMotives = Dict[Pair, Set[str]] +CoordsMotives = Dict[Pair, List[Motive]] Coords = CoordsBasic | CoordsMotives _PUNCT_RE = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\\\]^_`{|}~]') _SPACE_RE = re.compile(r"\s+") -def remove_rows_if_value_appears_only_once( - data: pd.DataFrame, cols: Columns -) -> pd.DataFrame: - """Drop rows of a Pandas DataFrame where a certain column's values appears only once. 
- - Ensures all elements of provided columns appear at least twice in their column - - Parameters - ---------- - data : DataFrame - DataFrame to preprocess - - cols : List[str] - List of columns where rows that contain non-duplicated elements shall be discarded - - Returns - ------- - DataFrame - DataFrame with reduced number of rows - - Examples - -------- - >>> remove_rows_if_value_appears_only_once(data, ['name', 'city']) - """ - for col in cols: - counts = data[col].map(data[col].value_counts()) - data = data[counts >= 2] - return data - - def start_from_zero(figures: Collection[int]) -> List[int]: """Turns a list of integers into a same-length list that starts at 0, without gaps @@ -240,7 +259,7 @@ def merge_blocks_or(coords_1: Coords, coords_2: Coords) -> Coords: if type(coords_1) is type(coords_2) is dict: # We have motives return { pair: ( - (coords_1[pair] | coords_2[pair]) + coords_1[pair] + coords_2[pair] if (pair in coords_1 and pair in coords_2) else coords_1[pair] if (pair in coords_1) @@ -278,7 +297,7 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords: """ if type(coords_1) is type(coords_2) is dict: # We have motives return { - pair: (coords_1[pair] | coords_2[pair]) + pair: coords_1[pair] + coords_2[pair] for y in (coords_1, coords_2) for pair in y.keys() if (pair in coords_1 and pair in coords_2) @@ -287,219 +306,6 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords: return coords_1.intersection(coords_2) -def add_blocks_to_dataset( - data: pd.DataFrame, - coords: Coords, - sort: bool = True, - keep_ungrouped_rows: bool = False, - merge_blocks: bool = True, - motives: bool = False, - show_as_pairs: bool = False, - output_columns: Columns = None, -) -> pd.DataFrame: - """Returns the intersection of an array of links - - Takes two lists of paired elements, with or without motives, returns their intersection - - Parameters - ---------- - data : DataFrame - DataFrame for blocking - coords : Array - Blocked coordinates - sort : bool - Whether to sort the result by block, thereby regrouping rows of the same block - keep_ungrouped_rows : bool - Whether to display rows that do not belong to any block - merge_blocks : bool - Whether to merge transitively merge blocks - motives : bool - Whether to display the reason behind each block - show_as_pairs : bool - Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame - output_columns : list - Columns to show. 
Useful in combination with show_as_pairs as column names are altered - - Returns - ------- - DataFrame - Blocked DataFrame - - Examples - -------- - >>> add_blocks_to_dataset(data=pd.DataFrame( - [ - [0, 'first', 4], - [1, 'second', 6], - [2, 'first', 2], - [3, 'third', 5] - ], - columns=['id', 'rank', 'score']), - coords=np.array([{0, 2}]), - show_as_pairs=True, - output_columns=['id', 'rank']) - id_l rank_l id_r rank_r block - 0 0 first 2 first 0 - """ - - if show_as_pairs and keep_ungrouped_rows: - raise ValueError("Cannot both return pairs and keep ungrouped rows") - - if motives: - if type(coords) is not dict: - raise TypeError("Cannot specify motives=True without passing motives") - - # Ensure the index is a unique identifier - if not data.index.is_unique: - raise ValueError("DataFrame index must be unique to be used as an identifier.") - - if "_motive" in data.columns: - if motives: - raise ValueError( - "Please rename existing '_motive' column OR do not pass 'motives=True'" - ) - - if "_block" in data.columns: - raise ValueError("Please rename existing '_block' column") - - if output_columns is None: - output_columns = data.columns - data = data[output_columns].copy() - - if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph - if show_as_pairs: - columns = [col + "_l" for col in data.columns] + [ - col + "_r" for col in data.columns - ] - output_data = pd.DataFrame(columns=columns) - else: - output_data = pd.DataFrame(columns=data.columns) - else: - output_data = data - # Map coords to connected component labels - if merge_blocks: # We solve the connected components problem - cc_labels = solve_connected_components_from_coords(coords) - # Match original index to new block ID - matcher = { - idx: label - for idx, label in enumerate(cc_labels) - if label != -1 and idx in data.index - } - else: # We solve the cliques problem - g = nx.Graph() - # noinspection PyTypeChecker - g.add_edges_from(coords) - complete_subgraphs = list(nx.find_cliques(g)) - complete_subgraphs = sorted(complete_subgraphs) - # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))} - matcher = dict() - for i, clique in enumerate(complete_subgraphs): - for node_idx in clique: - if node_idx in matcher.keys(): - matcher[node_idx].append(i) - else: - matcher[node_idx] = [i] - - if show_as_pairs: - output_data = pd.DataFrame() - for pair in coords: - left_row = data.loc[[tuple(pair)[0]]].copy() - current_index = left_row.index - right_row = data.loc[[tuple(pair)[1]]].copy() - left_row.columns = [col + "_l" for col in left_row.columns] - right_row.columns = [col + "_r" for col in right_row.columns] - current_row = pd.concat( - [left_row.reset_index(drop=True), right_row.reset_index(drop=True)], - axis=1, - ) - current_row.index = current_index - output_data = pd.concat([output_data, current_row]) - - # Assign blocks to rows based on their original index - output_data["_block"] = output_data.index.map(matcher) - if not merge_blocks: - output_data = output_data.explode("_block") - - if keep_ungrouped_rows: - output_data["_block"] = output_data["_block"].fillna(-1) - matcher_ungrouped_rows = {} - block_temp = [] - i = 0 # Track # of blocks processed - for b in output_data["_block"]: - if b == -1: - block_temp.append(i) - i += 1 - elif b not in matcher_ungrouped_rows: - matcher_ungrouped_rows[b] = i - block_temp.append(i) - i += 1 - else: - block_temp.append(matcher_ungrouped_rows[b]) - output_data["_block"] = block_temp - 
else: - if not show_as_pairs: - output_data = output_data[ - output_data["_block"].duplicated(keep=False) - & output_data["_block"].notna() - ] - - output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"]) - - if sort: - # Sort by block, then by original index - sort_cols = ["_block"] - if output_data.index.name: - output_data = output_data.sort_values( - sort_cols + [output_data.index.name] - ) - else: - # If no named index, use the first column of the DataFrame - output_data = output_data.reset_index() - output_data = output_data.sort_values( - sort_cols + [output_data.columns[0]] - ) - output_data = output_data.set_index(output_data.columns[0]) - - if motives: - output_data["_motive"] = "" - id_list = flatten(coords.keys()) - motive_matcher = { - row_id: frozenset( - reason - for pair in coords.keys() - if row_id in pair - for reason in coords[pair] - ) - for row_id in id_list - } - output_data["_motive"] = output_data.index.map(motive_matcher) - - if "_block" not in output_data.columns: # Empty coords - output_data["_block"] = -1 - - output_data = output_data.reset_index(drop=True) - output_data["_block"] = output_data["_block"].astype(int) - - return output_data - - -def generate_blocking_report( - data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None -) -> pd.DataFrame: - """ - Shorthand for add_blocks_to_dataset with below arguments - """ - return add_blocks_to_dataset( - data, - coords, - sort=True, - merge_blocks=False, - motives=True, - show_as_pairs=True, - output_columns=output_columns, - ) - - def parse_list(s: str | List, word_level: bool = False) -> List[str]: """Turns a stringified list into an actual python list, taking extra inner quotes into account @@ -511,7 +317,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: Stringified representation of a list e.g. "['string 1', 'string 2', ...]" word_level : bool - Whether to return a list of all words within s instead of a list of each comma-separated element + Whether to return a list of all words within s instead of a list of each comma-separated element; + Note that if passed a string that does not represent a list, this argument will be ignored and the function + will return a list of each word in the string Returns ------- @@ -527,7 +335,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: """ if type(s) is list: # If we already have a list - if len(s) == 1 and s[0][0] == "[" and s[0][-1] == "]": + if ( + len(s) == 1 and str(s[0]).startswith("[") and str(s[0]).endswith("]") + ): # In case we have a stringified list INSIDE a normal list s = s[0] else: return s @@ -540,10 +350,15 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]: if not s: return [] - try: - parts = ast.literal_eval(s) - except ValueError: # doesn't seem to be a stringified list - parts = s.split("', '") + if s.startswith("[") and s.endswith("]"): # Stringified list? 
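+            # For illustration (comment only): "['a b', 'c']" parses cleanly;
+            # "[foo, bar]" is valid syntax but not literals, so literal_eval
+            # raises ValueError and we fall back to splitting on "', '";
+            # "[just some words]" is not parseable at all, raises SyntaxError,
+            # and gets split on whitespace instead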
+            try:
+                parts = ast.literal_eval(s)
+            except ValueError:  # doesn't seem to be a stringified list
+                parts = s.split("', '")
+            except SyntaxError:  # In case we have a string surrounded by brackets
+                parts = s.split()
+        else:
+            parts = s.split()
 
     cleaned_items = [str(part).strip().strip("''") for part in parts]
 
@@ -553,40 +368,6 @@
     return [s for s in cleaned_items if len(s) > 0]
 
 
-def scoring(data: pd.DataFrame, motives_column: str = "_motive") -> pd.Series:
-    """Add a score to a blocked DataFrame based on the number of motives
-
-    Parameters
-    ----------
-    data : DataFrame
-        DataFrame with motives
-
-    motives_column : str
-        Name of the column containing the motives
-
-    Returns
-    -------
-    Series[int]
-        A column of scores
-    """
-
-    # Check that we do have motives
-    if motives_column not in data.columns:
-        if motives_column == "_motive":
-            raise ValueError("No motives in DataFrame")
-        else:
-            raise ValueError(
-                f'Specified motives column "{motives_column}" does not exist'
-            )
-
-    if "score" in data.columns:
-        print("Renaming 'score' column to 'score_old'")
-        data = data.rename(columns={"score": "score_old"})
-
-    scores = data[motives_column].apply(len)
-    return scores
-
-
 def must_not_be_different_apply(  # WIP
     temp_data: pd.DataFrame,
     blocking_columns: List[str],
@@ -682,7 +463,9 @@ def block_overlap(groups: Iterable, overlap: int = 1) -> Coords:
     return coords
 
 
-def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotives:
+def add_motives_to_coords(
+    coords: Coords, explanations: List[Motive]
+) -> Dict[Pair, List[Motive]]:
     """Block a DataFrame based on overlap across columns
 
     Parameters
@@ -690,7 +473,7 @@
     coords : Coords
         Coords obtained by blocking
 
-    explanations : Set[str]
-        Set of explanations
+    explanations : List[EquivalenceMotive | OverlapMotive]
+        List of explanations
 
     Returns
@@ -718,3 +501,99 @@
     }
     """
     return {pair: explanations for pair in coords}
+
+
+def solve_motives(motives: List[Motive]) -> List[Motive]:
+    """Remove duplicated and redundant motives from a list of motives
+
+    Redundant motives are OverlapMotives on the same column that differ only
+    in their overlap or word-level condition
+
+    Parameters
+    ----------
+    motives : List[Motive]
+        List of motives to deduplicate
+
+    Returns
+    -------
+    List[Motive]
+        A list of Motives no longer than the original list
+
+    Examples
+    --------
+    >>> solve_motives([OverlapMotive('websites', 1), OverlapMotive('websites', 2), OverlapMotive('websites', 2, word_level=False)])
+    [OverlapMotive(['websites'], 2)]
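+
+    A mixed list keeps at most one motive of each kind per column (illustrative
+    extra example; relies on the cross-type __eq__ guard above):
+    >>> solve_motives([EquivalenceMotive('City'), OverlapMotive('City', 2, word_level=True)])
+    [EquivalenceMotive(['City']), OverlapMotive(['City'], 2, word_level=True)]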
+    """
+    if not motives:
+        raise ValueError("Motives must not be empty")
+
+    final_motives = [
+        motive for motive in motives if type(motive) is EquivalenceMotive
+    ]  # With EquivalenceMotive, equality check suffices
+    overlap_motives = [motive for motive in motives if type(motive) is OverlapMotive]
+    # Use a set so each column is reduced only once
+    overlap_columns = {motive.blocking_column for motive in overlap_motives}
+
+    for column in overlap_columns:
+        overlap_motives_for_column = [
+            motive for motive in overlap_motives if motive.blocking_column == column
+        ]
+
+        # Split the column's motives by word-level vs element-level condition
+        word_level_motives_for_column = [
+            motive for motive in overlap_motives_for_column if motive.word_level
+        ]
+        not_word_level_motives_for_column = [
+            motive for motive in overlap_motives_for_column if not motive.word_level
+        ]
+
+        # Find biggest overlap among the non-word_level ones
+        if not_word_level_motives_for_column:
+            max_overlap_not_word_level_for_column = max(
+                not_word_level_motives_for_column, key=lambda m: m.overlap
+            )
+            max_overlap_not_word_level_for_column_overlap = (
+                max_overlap_not_word_level_for_column.overlap
+            )
+        else:
+            max_overlap_not_word_level_for_column = []
+            max_overlap_not_word_level_for_column_overlap = (
+                0  # Never read; assigned only so the variable is always bound
+            )
+
+        # Now find biggest overlap among the word_level ones
+        if word_level_motives_for_column:
+            max_overlap_word_level_for_column = max(
+                word_level_motives_for_column, key=lambda m: m.overlap
+            )
+            max_overlap_word_level_for_column_overlap = (
+                max_overlap_word_level_for_column.overlap
+            )
+            if not_word_level_motives_for_column:
+                # Discard the word-level motive if an element-level OverlapMotive
+                # on the same column has an equal or greater overlap
+                if (
+                    max_overlap_word_level_for_column_overlap
+                    <= max_overlap_not_word_level_for_column_overlap
+                ):
+                    max_overlap_word_level_for_column = []
+        else:
+            max_overlap_word_level_for_column = []
+
+        if max_overlap_not_word_level_for_column:
+            max_overlap_not_word_level_for_column = [
+                max_overlap_not_word_level_for_column
+            ]
+        if max_overlap_word_level_for_column:
+            max_overlap_word_level_for_column = [max_overlap_word_level_for_column]
+        final_motives += (
+            max_overlap_word_level_for_column + max_overlap_not_word_level_for_column
+        )
+
+    # Remove duplicates
+    final_motives_no_duplicates = []
+    for motive in final_motives:
+        if motive not in final_motives_no_duplicates:
+            final_motives_no_duplicates.append(motive)
+    return final_motives_no_duplicates
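+
+
+# NOTE: add_blocks_to_dataset(score=True) applies solve_motives to each pair's
+# motives, so the _score column counts distinct, non-redundant motives per pair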
diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py
index d3f9ab2..cf92924 100644
--- a/tests/test_ms_blocking.py
+++ b/tests/test_ms_blocking.py
@@ -84,18 +84,28 @@ def attribute_city_keep_ungrouped_rows_false():
 @pytest.fixture
 def attribute_city_motives_true_block():
     return {
-        frozenset({3, 8}): {"Same 'City'"},
-        frozenset({1, 4}): {"Same 'City'"},
-        frozenset({8, 11}): {"Same 'City'"},
-        frozenset({3, 11}): {"Same 'City'"},
-        frozenset({2, 5}): {"Same 'City'"},
-        frozenset({10, 13}): {"Same 'City'"},
+        frozenset({3, 8}): [msb.EquivalenceMotive("City")],
+        frozenset({1, 4}): [msb.EquivalenceMotive("City")],
+        frozenset({8, 11}): [msb.EquivalenceMotive("City")],
+        frozenset({3, 11}): [msb.EquivalenceMotive("City")],
+        frozenset({2, 5}): [msb.EquivalenceMotive("City")],
+        frozenset({10, 13}): [msb.EquivalenceMotive("City")],
     }
 
 
 @pytest.fixture
 def attribute_city_motives_true_add():
-    return [{"Same 'City'"}] * 9
+    return [["Same 'City'"]] * 9
 
 
 @pytest.fixture
@@ -116,25 +126,30 @@ def city_age_name_websites_pipelining_id():
 @pytest.fixture
 def city_age_websites_pipelining_motives():
     return [
-        frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
-        frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
-        frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
-        frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
-        frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
-        frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
-        frozenset({">=1 overlap in 'websites'"}),
-        frozenset({">=1 overlap in 'websites'"}),
-        frozenset({"Same 'Age'", "Same 'City'"}),
-        frozenset({"Same 'Age'", "Same 'City'"}),
-        frozenset({">=1 overlap in 'websites'"}),
-        frozenset({">=1 overlap in 'websites'"}),
-        frozenset({">=1 overlap in 'websites'"}),
+        {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"},
+        {">=1 overlap in 'websites'"},
+        {">=1 overlap in 'websites'"},
+        {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"},
+        {">=1 overlap in 'websites'"},
+        {">=1 overlap in 'websites'"},
+        {">=1 overlap in 'websites'"},
+        {">=1 overlap in 'websites'"},
+        {"Same 'City'", "Same 'Age'"},
+        {"Same 'City'", "Same 'Age'"},
+        {">=1 overlap in 'websites'"},
+        {">=1 overlap in 'websites'"},
+        {">=1 overlap in 'websites'"},
     ]
 
 
 @pytest.fixture
 def city_age_websites_pipelining_scores():
-    return [3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1]
+    return [3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+@pytest.fixture
+def city_age_websites_pipelining_scores_not_show_as_pairs():
+    return [3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1]
 
 
 @pytest.fixture
@@ -335,9 +350,10 @@ def test_pipelining_motives(city_age_websites_pipelining_motives):
     websites_blocker = msb.OverlapBlocker(["websites"])
     final_blocker = (city_blocker & age_blocker) | websites_blocker
     links = final_blocker.block(get_users(), motives=True)
-    actual = msb.add_blocks_to_dataset(
+    motives = msb.add_blocks_to_dataset(
         get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False
     )["_motive"].to_list()
+    actual = [set(motive) for motive in motives]  # Use sets to ignore ordering
     assert actual == expected
 
 
@@ -350,9 +366,36 @@ def test_pipelining_scores(city_age_websites_pipelining_scores):
     final_blocker = (city_blocker & age_blocker) | websites_blocker
     links = final_blocker.block(get_users(), motives=True)
     report = msb.add_blocks_to_dataset(
-        get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False
+        get_users(),
+        links,
+        show_as_pairs=True,
+        motives=True,
+        merge_blocks=False,
+        score=True,
+    )
+    actual = sorted(report["_score"], reverse=True)
+    assert actual == expected
+
+
+def test_pipelining_scores_without_show_as_pairs(
+    city_age_websites_pipelining_scores_not_show_as_pairs,
+):
+    """Test that scoring also works when rows, rather than pairs, are returned"""
+    expected = city_age_websites_pipelining_scores_not_show_as_pairs
+    city_blocker = msb.AttributeEquivalenceBlocker(["City"])
+    age_blocker = msb.AttributeEquivalenceBlocker(["Age"])
+    websites_blocker = msb.OverlapBlocker(["websites"])
+    final_blocker = (city_blocker & age_blocker) | websites_blocker
+    links = final_blocker.block(get_users(), motives=True)
+    report = msb.add_blocks_to_dataset(
+        get_users(),
+        links,
+        show_as_pairs=False,
+        motives=True,
+        merge_blocks=False,
+        score=True,
     )
-    actual = sorted(msb.scoring(report), reverse=True)
+    actual = sorted(report["_score"], reverse=True)
     assert actual == expected