\n",
"\n",
@@ -3010,8 +3006,8 @@
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:20.944670700Z",
- "start_time": "2026-01-30T14:21:20.834495500Z"
+ "end_time": "2026-02-04T11:08:23.633084600Z",
+ "start_time": "2026-02-04T11:08:23.522566900Z"
}
},
"source": [
@@ -3023,13 +3019,13 @@
{
"data": {
"text/plain": [
- " id_l Name_l id_r Name_r _block _motive\n",
- "0 1 Jacques Dupond 4 Jacques Dupont 0 (Same 'City')\n",
- "1 2 Pierre Dusquesnes 5 pierre dusquesnes 1 (Same 'City')\n",
- "2 3 Paul Delarue 11 sophie_delarue 2 (Same 'City')\n",
- "3 8 Sophie Delarue 11 sophie_delarue 2 (Same 'City')\n",
- "4 8 Sophie Delarue 3 Paul Delarue 2 (Same 'City')\n",
- "5 10 Caroline Dufour 13 Benoît Benoît 3 (Same 'City')"
+ " id_l Name_l id_r Name_r _motive _block\n",
+ "0 1 Jacques Dupond 4 Jacques Dupont [Same 'City'] 0\n",
+ "1 2 Pierre Dusquesnes 5 pierre dusquesnes [Same 'City'] 1\n",
+ "2 3 Paul Delarue 11 sophie_delarue [Same 'City'] 2\n",
+ "3 8 Sophie Delarue 11 sophie_delarue [Same 'City'] 2\n",
+ "4 8 Sophie Delarue 3 Paul Delarue [Same 'City'] 2\n",
+ "5 10 Caroline Dufour 13 Benoît Benoît [Same 'City'] 3"
],
"text/html": [
"
\n",
@@ -3054,8 +3050,8 @@
"
Name_l | \n",
"
id_r | \n",
"
Name_r | \n",
- "
_block | \n",
"
_motive | \n",
+ "
_block | \n",
" \n",
" \n",
"
\n",
@@ -3065,8 +3061,8 @@
" Jacques Dupond | \n",
" 4 | \n",
" Jacques Dupont | \n",
+ " [Same 'City'] | \n",
" 0 | \n",
- " (Same 'City') | \n",
" \n",
" \n",
" | 1 | \n",
@@ -3074,8 +3070,8 @@
" Pierre Dusquesnes | \n",
" 5 | \n",
" pierre dusquesnes | \n",
+ " [Same 'City'] | \n",
" 1 | \n",
- " (Same 'City') | \n",
"
\n",
" \n",
" | 2 | \n",
@@ -3083,8 +3079,8 @@
" Paul Delarue | \n",
" 11 | \n",
" sophie_delarue | \n",
+ " [Same 'City'] | \n",
" 2 | \n",
- " (Same 'City') | \n",
"
\n",
" \n",
" | 3 | \n",
@@ -3092,8 +3088,8 @@
" Sophie Delarue | \n",
" 11 | \n",
" sophie_delarue | \n",
+ " [Same 'City'] | \n",
" 2 | \n",
- " (Same 'City') | \n",
"
\n",
" \n",
" | 4 | \n",
@@ -3101,8 +3097,8 @@
" Sophie Delarue | \n",
" 3 | \n",
" Paul Delarue | \n",
+ " [Same 'City'] | \n",
" 2 | \n",
- " (Same 'City') | \n",
"
\n",
" \n",
" | 5 | \n",
@@ -3110,8 +3106,8 @@
" Caroline Dufour | \n",
" 13 | \n",
" Benoît Benoît | \n",
+ " [Same 'City'] | \n",
" 3 | \n",
- " (Same 'City') | \n",
"
\n",
" \n",
"\n",
@@ -3132,188 +3128,13 @@
"Motives are dynamic:"
]
},
- {
- "cell_type": "code",
- "metadata": {
- "ExecuteTime": {
- "end_time": "2026-01-30T14:21:21.591044600Z",
- "start_time": "2026-01-30T14:21:21.517777200Z"
- }
- },
- "source": [
- "msb.generate_blocking_report(df, links)"
- ],
- "outputs": [
- {
- "data": {
- "text/plain": [
- " id_l Name_l City_l Age_l \\\n",
- "0 1 Jacques Dupond Villeneuve d'Ascq 37 \n",
- "1 2 Pierre Dusquesnes Phalempin 24 \n",
- "2 3 Paul Delarue Roubaix 32 \n",
- "3 8 Sophie Delarue Roubaix 33 \n",
- "4 8 Sophie Delarue Roubaix 33 \n",
- "5 10 Caroline Dufour Lens 45 \n",
- "\n",
- " websites_l id_r Name_r \\\n",
- "0 ['somewebsite.com/users/jacquesdupond', 'jacqu... 4 Jacques Dupont \n",
- "1 ['somewebsite.com/users/rpz59'] 5 pierre dusquesnes \n",
- "2 ['roubaixlove.fr'] 11 sophie_delarue \n",
- "3 [] 11 sophie_delarue \n",
- "4 [] 3 Paul Delarue \n",
- "5 ['pythonensamusant.fr', 'lensfans.fr'] 13 Benoît Benoît \n",
- "\n",
- " City_r Age_r websites_r _block _motive \n",
- "0 Villeneuve d'Ascq 37 ['jacquesdupond.fr'] 0 (Same 'City') \n",
- "1 Phalempin 24 [] 1 (Same 'City') \n",
- "2 Roubaix 33 [] 2 (Same 'City') \n",
- "3 Roubaix 33 [] 2 (Same 'City') \n",
- "4 Roubaix 32 ['roubaixlove.fr'] 2 (Same 'City') \n",
- "5 Lens 15 ['lensfans.fr'] 3 (Same 'City') "
- ],
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id_l | \n",
- " Name_l | \n",
- " City_l | \n",
- " Age_l | \n",
- " websites_l | \n",
- " id_r | \n",
- " Name_r | \n",
- " City_r | \n",
- " Age_r | \n",
- " websites_r | \n",
- " _block | \n",
- " _motive | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['somewebsite.com/users/jacquesdupond', 'jacqu... | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
- " Villeneuve d'Ascq | \n",
- " 37 | \n",
- " ['jacquesdupond.fr'] | \n",
- " 0 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " ['somewebsite.com/users/rpz59'] | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " Phalempin | \n",
- " 24 | \n",
- " [] | \n",
- " 1 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " Roubaix | \n",
- " 33 | \n",
- " [] | \n",
- " 3 | \n",
- " Paul Delarue | \n",
- " Roubaix | \n",
- " 32 | \n",
- " ['roubaixlove.fr'] | \n",
- " 2 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- " | 5 | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " Lens | \n",
- " 45 | \n",
- " ['pythonensamusant.fr', 'lensfans.fr'] | \n",
- " 13 | \n",
- " Benoît Benoît | \n",
- " Lens | \n",
- " 15 | \n",
- " ['lensfans.fr'] | \n",
- " 3 | \n",
- " (Same 'City') | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 30
- },
{
"cell_type": "code",
"metadata": {
"scrolled": true,
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:21.867809800Z",
- "start_time": "2026-01-30T14:21:21.674986800Z"
+ "end_time": "2026-02-04T11:08:24.180719900Z",
+ "start_time": "2026-02-04T11:08:24.107699800Z"
}
},
"source": [
@@ -3337,42 +3158,42 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Processing AttributeEquivalenceBlocker(['Age', 'City'], [])\n",
+ "Processing AttributeEquivalenceBlocker(['City', 'Age'])\n",
"Processing OverlapBlocker(['websites'], 1)\n"
]
},
{
"data": {
"text/plain": [
- " id_l Name_l id_r Name_r _block \\\n",
- "0 1 Jacques Dupond 4 Jacques Dupont 0 \n",
- "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n",
- "2 1 Jacques Dupond 10 Caroline Dufour 0 \n",
- "3 1 Jacques Dupond 4 Jacques Dupont 1 \n",
- "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n",
- "5 1 Jacques Dupond 10 Caroline Dufour 1 \n",
- "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n",
- "7 10 Caroline Dufour 13 Benoît Benoît 1 \n",
- "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n",
- "9 8 Sophie Delarue 11 sophie_delarue 3 \n",
- "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n",
- "11 10 Caroline Dufour 13 Benoît Benoît 4 \n",
- "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n",
+ " id_l Name_l id_r Name_r \\\n",
+ "0 1 Jacques Dupond 4 Jacques Dupont \n",
+ "1 1 Jacques Dupond 6 Jean-Michel Python \n",
+ "2 1 Jacques Dupond 10 Caroline Dufour \n",
+ "3 1 Jacques Dupond 4 Jacques Dupont \n",
+ "4 1 Jacques Dupond 6 Jean-Michel Python \n",
+ "5 1 Jacques Dupond 10 Caroline Dufour \n",
+ "6 10 Caroline Dufour 6 Jean-Michel Python \n",
+ "7 10 Caroline Dufour 13 Benoît Benoît \n",
+ "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n",
+ "9 8 Sophie Delarue 11 sophie_delarue \n",
+ "10 10 Caroline Dufour 6 Jean-Michel Python \n",
+ "11 10 Caroline Dufour 13 Benoît Benoît \n",
+ "12 13 Benoît Benoît 6 Jean-Michel Python \n",
"\n",
- " _motive \n",
- "0 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "1 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "2 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "3 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "4 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "5 (>=1 overlap in 'websites', Same 'City', Same ... \n",
- "6 (>=1 overlap in 'websites') \n",
- "7 (>=1 overlap in 'websites') \n",
- "8 (Same 'City', Same 'Age') \n",
- "9 (Same 'City', Same 'Age') \n",
- "10 (>=1 overlap in 'websites') \n",
- "11 (>=1 overlap in 'websites') \n",
- "12 (>=1 overlap in 'websites') "
+ " _motive _block \n",
+ "0 [Same 'City', Same 'Age', >=1 overlap in 'webs... 0 \n",
+ "1 [>=1 overlap in 'websites'] 0 \n",
+ "2 [>=1 overlap in 'websites'] 0 \n",
+ "3 [Same 'City', Same 'Age', >=1 overlap in 'webs... 1 \n",
+ "4 [>=1 overlap in 'websites'] 1 \n",
+ "5 [>=1 overlap in 'websites'] 1 \n",
+ "6 [>=1 overlap in 'websites'] 1 \n",
+ "7 [>=1 overlap in 'websites'] 1 \n",
+ "8 [Same 'City', Same 'Age'] 2 \n",
+ "9 [Same 'City', Same 'Age'] 3 \n",
+ "10 [>=1 overlap in 'websites'] 4 \n",
+ "11 [>=1 overlap in 'websites'] 4 \n",
+ "12 [>=1 overlap in 'websites'] 4 "
],
"text/html": [
"
\n",
@@ -3397,8 +3218,8 @@
"
Name_l | \n",
" id_r | \n",
" Name_r | \n",
- " _block | \n",
" _motive | \n",
+ " _block | \n",
" \n",
" \n",
" \n",
@@ -3408,8 +3229,8 @@
" Jacques Dupond | \n",
" 4 | \n",
" Jacques Dupont | \n",
+ " [Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
" 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
" \n",
" \n",
" | 1 | \n",
@@ -3417,8 +3238,8 @@
" Jacques Dupond | \n",
" 6 | \n",
" Jean-Michel Python | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 2 | \n",
@@ -3426,8 +3247,8 @@
" Jacques Dupond | \n",
" 10 | \n",
" Caroline Dufour | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 3 | \n",
@@ -3435,8 +3256,8 @@
" Jacques Dupond | \n",
" 4 | \n",
" Jacques Dupont | \n",
+ " [Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
" 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 4 | \n",
@@ -3444,8 +3265,8 @@
" Jacques Dupond | \n",
" 6 | \n",
" Jean-Michel Python | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 5 | \n",
@@ -3453,8 +3274,8 @@
" Jacques Dupond | \n",
" 10 | \n",
" Caroline Dufour | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
"
\n",
" \n",
" | 6 | \n",
@@ -3462,8 +3283,8 @@
" Caroline Dufour | \n",
" 6 | \n",
" Jean-Michel Python | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
- " (>=1 overlap in 'websites') | \n",
"
\n",
" \n",
" | 7 | \n",
@@ -3471,8 +3292,8 @@
" Caroline Dufour | \n",
" 13 | \n",
" Benoît Benoît | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
- " (>=1 overlap in 'websites') | \n",
"
\n",
" \n",
" | 8 | \n",
@@ -3480,8 +3301,8 @@
" Pierre Dusquesnes | \n",
" 5 | \n",
" pierre dusquesnes | \n",
+ " [Same 'City', Same 'Age'] | \n",
" 2 | \n",
- " (Same 'City', Same 'Age') | \n",
"
\n",
" \n",
" | 9 | \n",
@@ -3489,8 +3310,8 @@
" Sophie Delarue | \n",
" 11 | \n",
" sophie_delarue | \n",
+ " [Same 'City', Same 'Age'] | \n",
" 3 | \n",
- " (Same 'City', Same 'Age') | \n",
"
\n",
" \n",
" | 10 | \n",
@@ -3498,8 +3319,8 @@
" Caroline Dufour | \n",
" 6 | \n",
" Jean-Michel Python | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 4 | \n",
- " (>=1 overlap in 'websites') | \n",
"
\n",
" \n",
" | 11 | \n",
@@ -3507,8 +3328,8 @@
" Caroline Dufour | \n",
" 13 | \n",
" Benoît Benoît | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 4 | \n",
- " (>=1 overlap in 'websites') | \n",
"
\n",
" \n",
" | 12 | \n",
@@ -3516,20 +3337,20 @@
" Benoît Benoît | \n",
" 6 | \n",
" Jean-Michel Python | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 4 | \n",
- " (>=1 overlap in 'websites') | \n",
"
\n",
" \n",
"\n",
""
]
},
- "execution_count": 31,
+ "execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
- "execution_count": 31
+ "execution_count": 30
},
{
"cell_type": "markdown",
@@ -3539,53 +3360,61 @@
{
"cell_type": "markdown",
"metadata": {},
- "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `scoring` gives you an indicator of the likelihood of rows behing duplicates based on the number of motives."
+    "source": "For reports, it can be interesting to have numbers to drive decision-making. Using `score=True` gives you an indicator of the likelihood of rows being duplicates based on the number of distinct motives."
},
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
- "end_time": "2026-01-30T14:21:22.186415700Z",
- "start_time": "2026-01-30T14:21:22.127304600Z"
+ "end_time": "2026-02-04T11:08:24.439021100Z",
+ "start_time": "2026-02-04T11:08:24.368744500Z"
}
},
"source": [
- "report[\"score\"] = msb.scoring(report)\n",
- "report.sort_values(\"score\", ascending=False)"
+ "report = msb.add_blocks_to_dataset(\n",
+ " df,\n",
+ " links,\n",
+ " motives=True,\n",
+ " show_as_pairs=True,\n",
+ " output_columns=[\"id\", \"Name\"],\n",
+ " merge_blocks=False,\n",
+ " score=True,\n",
+ ")\n",
+ "report.sort_values(\"_score\", ascending=False)"
],
"outputs": [
{
"data": {
"text/plain": [
- " id_l Name_l id_r Name_r _block \\\n",
- "0 1 Jacques Dupond 4 Jacques Dupont 0 \n",
- "1 1 Jacques Dupond 6 Jean-Michel Python 0 \n",
- "2 1 Jacques Dupond 10 Caroline Dufour 0 \n",
- "3 1 Jacques Dupond 4 Jacques Dupont 1 \n",
- "4 1 Jacques Dupond 6 Jean-Michel Python 1 \n",
- "5 1 Jacques Dupond 10 Caroline Dufour 1 \n",
- "8 2 Pierre Dusquesnes 5 pierre dusquesnes 2 \n",
- "9 8 Sophie Delarue 11 sophie_delarue 3 \n",
- "6 10 Caroline Dufour 6 Jean-Michel Python 1 \n",
- "7 10 Caroline Dufour 13 Benoît Benoît 1 \n",
- "10 10 Caroline Dufour 6 Jean-Michel Python 4 \n",
- "11 10 Caroline Dufour 13 Benoît Benoît 4 \n",
- "12 13 Benoît Benoît 6 Jean-Michel Python 4 \n",
+ " id_l Name_l id_r Name_r \\\n",
+ "0 1 Jacques Dupond 4 Jacques Dupont \n",
+ "3 1 Jacques Dupond 4 Jacques Dupont \n",
+ "8 2 Pierre Dusquesnes 5 pierre dusquesnes \n",
+ "9 8 Sophie Delarue 11 sophie_delarue \n",
+ "1 1 Jacques Dupond 6 Jean-Michel Python \n",
+ "4 1 Jacques Dupond 6 Jean-Michel Python \n",
+ "2 1 Jacques Dupond 10 Caroline Dufour \n",
+ "6 10 Caroline Dufour 6 Jean-Michel Python \n",
+ "5 1 Jacques Dupond 10 Caroline Dufour \n",
+ "7 10 Caroline Dufour 13 Benoît Benoît \n",
+ "10 10 Caroline Dufour 6 Jean-Michel Python \n",
+ "11 10 Caroline Dufour 13 Benoît Benoît \n",
+ "12 13 Benoît Benoît 6 Jean-Michel Python \n",
"\n",
- " _motive score \n",
- "0 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "1 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "2 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "3 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "4 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "5 (>=1 overlap in 'websites', Same 'City', Same ... 3 \n",
- "8 (Same 'City', Same 'Age') 2 \n",
- "9 (Same 'City', Same 'Age') 2 \n",
- "6 (>=1 overlap in 'websites') 1 \n",
- "7 (>=1 overlap in 'websites') 1 \n",
- "10 (>=1 overlap in 'websites') 1 \n",
- "11 (>=1 overlap in 'websites') 1 \n",
- "12 (>=1 overlap in 'websites') 1 "
+ " _motive _score _block \n",
+ "0 [Same 'City', Same 'Age', >=1 overlap in 'webs... 3 0 \n",
+ "3 [Same 'City', Same 'Age', >=1 overlap in 'webs... 3 1 \n",
+ "8 [Same 'City', Same 'Age'] 2 2 \n",
+ "9 [Same 'City', Same 'Age'] 2 3 \n",
+ "1 [>=1 overlap in 'websites'] 1 0 \n",
+ "4 [>=1 overlap in 'websites'] 1 1 \n",
+ "2 [>=1 overlap in 'websites'] 1 0 \n",
+ "6 [>=1 overlap in 'websites'] 1 1 \n",
+ "5 [>=1 overlap in 'websites'] 1 1 \n",
+ "7 [>=1 overlap in 'websites'] 1 1 \n",
+ "10 [>=1 overlap in 'websites'] 1 4 \n",
+ "11 [>=1 overlap in 'websites'] 1 4 \n",
+ "12 [>=1 overlap in 'websites'] 1 4 "
],
"text/html": [
"
\n",
@@ -3610,9 +3439,9 @@
"
Name_l | \n",
" id_r | \n",
" Name_r | \n",
- " _block | \n",
" _motive | \n",
- " score | \n",
+ " _score | \n",
+ " _block | \n",
" \n",
" \n",
" \n",
@@ -3622,39 +3451,49 @@
" Jacques Dupond | \n",
" 4 | \n",
" Jacques Dupont | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
+ " [Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
" 3 | \n",
+ " 0 | \n",
" \n",
" \n",
- " | 1 | \n",
+ " 3 | \n",
" 1 | \n",
" Jacques Dupond | \n",
- " 6 | \n",
- " Jean-Michel Python | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
+ " 4 | \n",
+ " Jacques Dupont | \n",
+ " [Same 'City', Same 'Age', >=1 overlap in 'webs... | \n",
" 3 | \n",
+ " 1 | \n",
"
\n",
" \n",
- " | 2 | \n",
- " 1 | \n",
- " Jacques Dupond | \n",
- " 10 | \n",
- " Caroline Dufour | \n",
- " 0 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
+ " 8 | \n",
+ " 2 | \n",
+ " Pierre Dusquesnes | \n",
+ " 5 | \n",
+ " pierre dusquesnes | \n",
+ " [Same 'City', Same 'Age'] | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 8 | \n",
+ " Sophie Delarue | \n",
+ " 11 | \n",
+ " sophie_delarue | \n",
+ " [Same 'City', Same 'Age'] | \n",
+ " 2 | \n",
" 3 | \n",
"
\n",
" \n",
- " | 3 | \n",
+ " 1 | \n",
" 1 | \n",
" Jacques Dupond | \n",
- " 4 | \n",
- " Jacques Dupont | \n",
+ " 6 | \n",
+ " Jean-Michel Python | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
+ " 0 | \n",
"
\n",
" \n",
" | 4 | \n",
@@ -3662,39 +3501,19 @@
" Jacques Dupond | \n",
" 6 | \n",
" Jean-Michel Python | \n",
+ " [>=1 overlap in 'websites'] | \n",
+ " 1 | \n",
" 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
"
\n",
" \n",
- " | 5 | \n",
+ " 2 | \n",
" 1 | \n",
" Jacques Dupond | \n",
" 10 | \n",
" Caroline Dufour | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
- " (>=1 overlap in 'websites', Same 'City', Same ... | \n",
- " 3 | \n",
- "
\n",
- " \n",
- " | 8 | \n",
- " 2 | \n",
- " Pierre Dusquesnes | \n",
- " 5 | \n",
- " pierre dusquesnes | \n",
- " 2 | \n",
- " (Same 'City', Same 'Age') | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 9 | \n",
- " 8 | \n",
- " Sophie Delarue | \n",
- " 11 | \n",
- " sophie_delarue | \n",
- " 3 | \n",
- " (Same 'City', Same 'Age') | \n",
- " 2 | \n",
+ " 0 | \n",
"
\n",
" \n",
" | 6 | \n",
@@ -3702,8 +3521,18 @@
" Caroline Dufour | \n",
" 6 | \n",
" Jean-Michel Python | \n",
+ " [>=1 overlap in 'websites'] | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 1 | \n",
+ " Jacques Dupond | \n",
+ " 10 | \n",
+ " Caroline Dufour | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
- " (>=1 overlap in 'websites') | \n",
" 1 | \n",
"
\n",
" \n",
@@ -3712,8 +3541,8 @@
" | Caroline Dufour | \n",
" 13 | \n",
" Benoît Benoît | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
- " (>=1 overlap in 'websites') | \n",
" 1 | \n",
"
\n",
" \n",
@@ -3722,9 +3551,9 @@
" | Caroline Dufour | \n",
" 6 | \n",
" Jean-Michel Python | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
+ " 4 | \n",
"
\n",
" \n",
" | 11 | \n",
@@ -3732,9 +3561,9 @@
" Caroline Dufour | \n",
" 13 | \n",
" Benoît Benoît | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
+ " 4 | \n",
"
\n",
" \n",
" | 12 | \n",
@@ -3742,15 +3571,53 @@
" Benoît Benoît | \n",
" 6 | \n",
" Jean-Michel Python | \n",
- " 4 | \n",
- " (>=1 overlap in 'websites') | \n",
+ " [>=1 overlap in 'websites'] | \n",
" 1 | \n",
+ " 4 | \n",
"
\n",
" \n",
"\n",
""
]
},
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 31
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2026-02-04T11:08:24.877566800Z",
+ "start_time": "2026-02-04T11:08:24.843830900Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "city_blocker = msb.OverlapBlocker([\"City\"])\n",
+ "city_blocker.block(df)"
+ ],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Processing OverlapBlocker(['City'], 1)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{frozenset({3, 8}),\n",
+ " frozenset({1, 4}),\n",
+ " frozenset({8, 11}),\n",
+ " frozenset({3, 11}),\n",
+ " frozenset({2, 5}),\n",
+ " frozenset({10, 13})}"
+ ]
+ },
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
diff --git a/src/ms_blocking/ms_blocking.py b/src/ms_blocking/ms_blocking.py
index fffbcc8..b3552af 100644
--- a/src/ms_blocking/ms_blocking.py
+++ b/src/ms_blocking/ms_blocking.py
@@ -1,5 +1,7 @@
from ms_blocking.utils import * # noqa: F403
+import networkx as nx
+
class BlockerNode:
"""Abstract class from which derive all classes in the module"""
@@ -46,7 +48,7 @@ def __init__(self, left, right):
def __repr__(self):
return f"AndNode{{{self.left}, {self.right}}}"
- def block(self, df, motives=False):
+ def block(self, df: pd.DataFrame, motives: bool = False) -> Coords:
# In order not to perform redundant computations, we first filter out the rows that were not considered by the first blocker before running the second blocker
coords_left = self.left.block(df, motives=motives)
@@ -76,8 +78,7 @@ def __init__(self, left, right):
def __repr__(self):
return f"OrNode{{{self.left}, {self.right}}}"
-
- def block(self, df, motives=False):
+ def block(self, df: pd.DataFrame, motives: bool = False) -> Coords:
# Note: for performance, it would be wise to remove rows that are already paired with all other rows, though this case should be pretty rare in real situations
coords_left = self.left.block(df, motives=motives)
@@ -91,7 +92,10 @@ class AttributeEquivalenceBlocker(BlockerNode): # Leaf
"""To regroup rows based on equality across columns."""
def __init__(
- self, blocking_columns, normalize_strings=True, must_not_be_different=None
+ self,
+ blocking_columns: str | Collection[str],
+ must_not_be_different: str | Collection[str] = None,
+ normalize_strings: bool = True,
):
super().__init__()
@@ -120,7 +124,7 @@ def __init__(
self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them
def __repr__(self):
- return f"AttributeEquivalenceBlocker({self.blocking_columns}, {self.must_not_be_different})"
+ return f"AttributeEquivalenceBlocker({self.blocking_columns}{', ' + str(self.must_not_be_different) if self.must_not_be_different else ''}{', NON-NORMALIZED' if not self.normalize else ''})"
def __eq__(self, other):
if type(other) is AttributeEquivalenceBlocker:
@@ -139,21 +143,28 @@ def __eq__(self, other):
else:
return False
- def block(self, data, motives=False):
+ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords:
"""Regroup rows based on equality of one or more columns"""
print("Processing", self)
- temp_data = data.copy()
-
- for col in self.blocking_columns:
- if self.normalize:
- temp_data[col] = temp_data[col].apply(normalize)
- temp_data = temp_data.dropna(subset=self.blocking_columns)
- temp_data = remove_rows_if_value_appears_only_once(
- temp_data, self.blocking_columns
+ temp_data = (
+ data[self.blocking_columns + self.must_not_be_different]
+ .dropna(subset=self.blocking_columns)
+ .copy()
)
+ # Normalize strings if required
+ if self.normalize:
+ temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply(
+ lambda col: col.apply(normalize)
+ )
+        # Non-duplicated values cannot belong to any block; we discard them
+ temp_data = temp_data[
+ temp_data.duplicated(keep=False, subset=self.blocking_columns)
+ ]
+
+ # No need to run anything else if we already ran out of candidates
if len(temp_data) == 0: # No pairs
if motives:
return dict()
@@ -185,9 +196,7 @@ def block(self, data, motives=False):
}
if motives:
- explanations = {
- f"Same '{column_name}'" for column_name in self.blocking_columns
- }
+ explanations = [EquivalenceMotive(col) for col in self.blocking_columns]
return add_motives_to_coords(coords, explanations)
else:
return set(coords) # set is unnnecessary
@@ -197,7 +206,11 @@ class OverlapBlocker(BlockerNode): # Leaf
"""To regroup rows based on overlap of one or more columns."""
def __init__(
- self, blocking_columns, overlap=1, word_level=False, normalize_strings=True
+ self,
+ blocking_columns: str | Collection[str],
+ overlap: int = 1,
+ word_level: bool = False,
+ normalize_strings: bool = True,
):
super().__init__()
@@ -217,7 +230,7 @@ def __init__(
self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them
def __repr__(self):
- return f"OverlapBlocker({self.blocking_columns}, {self.overlap})"
+ return f"OverlapBlocker({self.blocking_columns}, {self.overlap}{', WORD-LEVEL' if self.word_level else ''}{', NON-NORMALIZED' if not self.normalize else ''})"
def __eq__(self, other):
if type(other) is OverlapBlocker:
@@ -238,29 +251,31 @@ def __eq__(self, other):
else:
return False
- def block(self, data, motives=False):
+ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords:
"""Regroup rows based on overlap of one or more columns"""
print("Processing", self)
- temp_data = data.copy()
+ temp_data = data[self.blocking_columns].dropna().copy()
- temp_data = temp_data[self.blocking_columns].copy()
-
- for col in self.blocking_columns:
- temp_data[col] = temp_data[col].apply(
- parse_list, word_level=self.word_level
- )
- temp_data = temp_data.explode(col)
- if self.normalize:
- temp_data[col] = temp_data[col].apply(normalize)
- temp_data = temp_data.dropna(
- subset=self.blocking_columns
- ) # Remove empty objects
- temp_data = remove_rows_if_value_appears_only_once(
- temp_data, self.blocking_columns
+ # Ensure we check for overlap between lists of strings
+ temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply(
+ lambda col: col.apply(parse_list, word_level=self.word_level)
)
+ # Split elements of said lists to compare them one by one
+ temp_data = temp_data.explode(self.blocking_columns)
+ # Normalize strings if required
+ if self.normalize:
+ temp_data[self.blocking_columns] = temp_data[self.blocking_columns].apply(
+ lambda col: col.apply(normalize)
+ )
+
+        # Non-duplicated values cannot belong to any block; we discard them
+ temp_data = temp_data[
+ temp_data.duplicated(keep=False, subset=self.blocking_columns)
+ ]
+ # No need to run anything else if we already ran out of candidates
if len(temp_data) == 0: # No pairs fulfill any overlap
if motives:
return dict()
@@ -268,7 +283,7 @@ def block(self, data, motives=False):
return set()
# Use the DataFrame index for grouping and forming pairs
- # Using frozenset since they are ahshable and thus can be used as dictionary keys
+ # Using frozenset since they are hashable and thus can be used as dictionary keys
groups = temp_data.groupby(self.blocking_columns).apply(
lambda x: frozenset(x.index), include_groups=False
)
@@ -276,10 +291,10 @@ def block(self, data, motives=False):
coords = block_overlap(groups=groups, overlap=self.overlap)
if motives:
- explanations = {
- f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'"
- for column_name in self.blocking_columns
- }
+ explanations = [
+ OverlapMotive(col, self.overlap, self.word_level)
+ for col in self.blocking_columns
+ ]
return add_motives_to_coords(coords, explanations)
else:
return set(coords)
@@ -287,17 +302,17 @@ def block(self, data, motives=False):
class MixedBlocker(BlockerNode): # Leaf; For ANDs and RAM
"""Represent the intersection of an AttributeEquivalenceBlocker and an OverlapBlocker.
- Designed for performance and RAM efficiency.
+ Used for performance and RAM efficiency.
"""
def __init__(
self,
- equivalence_columns,
- overlap_columns,
- must_not_be_different=None,
- overlap=1,
- word_level=False,
- normalize_strings=True,
+ equivalence_columns: str | Collection[str],
+ overlap_columns: str | Collection[str],
+ must_not_be_different: str | Collection[str] = None,
+ overlap: int = 1,
+ word_level: bool = False,
+ normalize_strings: bool = True,
):
super().__init__()
@@ -341,7 +356,16 @@ def __init__(
self.normalize = normalize_strings # if True, will casefold+remove punctation+strip spaces for all strings before comparing them
def __repr__(self):
- return f"MixedBlocker({self.equivalence_columns}, {self.overlap_columns}, {self.overlap})"
+ return str(
+ AndNode(
+ AttributeEquivalenceBlocker(
+ self.equivalence_columns, self.must_not_be_different, self.normalize
+ ),
+ OverlapBlocker(
+ self.overlap_columns, self.overlap, self.word_level, self.normalize
+ ),
+ )
+ )
def __eq__(self, other):
if type(other) is AttributeEquivalenceBlocker:
@@ -369,31 +393,30 @@ def __eq__(self, other):
else:
return False
- def block(self, data, motives=False):
+ def block(self, data: pd.DataFrame, motives: bool = False) -> Coords:
"""Regroup rows based on overlap of one or more columns"""
print("Processing", self)
total_columns = self.equivalence_columns + self.overlap_columns
- temp_data = data[total_columns].copy()
-
- for col in total_columns:
- if col in self.equivalence_columns:
- temp_data[col] = temp_data[col].apply(normalize)
- elif col in self.overlap_columns:
- temp_data[col] = temp_data[col].apply(
- lambda x: [
- normalize(item) for item in parse_list(x, self.word_level)
- ]
- if self.normalize
- else parse_list(x, self.word_level)
- )
- temp_data = temp_data.explode(col)
+ temp_data = data[total_columns].dropna().copy()
- temp_data = temp_data.dropna(subset=total_columns) # Remove empty objects
- temp_data = remove_rows_if_value_appears_only_once(temp_data, total_columns)
+ # Ensure we check for overlap between lists of strings
+ temp_data[self.overlap_columns] = temp_data[self.overlap_columns].apply(
+ lambda col: col.apply(parse_list, word_level=self.word_level)
+ )
+ # Split elements of said lists to compare them one by one
+ temp_data = temp_data.explode(self.overlap_columns)
+ # Normalize strings if required
+ if self.normalize:
+ temp_data[total_columns] = temp_data[total_columns].apply(
+ lambda col: col.apply(normalize)
+ )
+        # Non-duplicated values cannot belong to any block; we discard them
+ temp_data = temp_data[temp_data.duplicated(keep=False, subset=total_columns)]
+ # No need to run anything else if we already ran out of candidates
if len(temp_data) == 0: # No pairs fulfill any overlap
if motives:
return dict()
@@ -426,17 +449,261 @@ def block(self, data, motives=False):
coords = coords_equivalence.intersection(coords_overlap)
if motives:
- explanations = {
- f"Same '{column_name}'" for column_name in self.equivalence_columns
- } | {
- f">={self.overlap}{' word_level' if self.word_level else ''} overlap in '{column_name}'"
- for column_name in self.overlap_columns
- }
+ explanations = [
+ EquivalenceMotive(col) for col in self.equivalence_columns
+ ] + [
+ OverlapMotive(col, self.overlap, self.word_level)
+ for col in self.overlap_columns
+ ]
+
return add_motives_to_coords(coords, explanations)
else:
return set(coords)
+def add_blocks_to_dataset(
+    data: pd.DataFrame,
+    coords: Coords,
+    sort: bool = True,
+    keep_ungrouped_rows: bool = False,
+    merge_blocks: bool = True,
+    motives: bool = False,
+    show_as_pairs: bool = False,
+    output_columns: Columns = None,
+    score: bool = False,
+) -> pd.DataFrame:
+    """Assign block identifiers to the rows of a DataFrame
+
+    Takes a DataFrame and blocked coordinates (with or without motives) and
+    returns the rows annotated with a '_block' column, optionally together
+    with the motives behind each block and a score derived from them
+
+    Parameters
+    ----------
+    data : DataFrame
+        DataFrame for blocking
+    coords : Coords
+        Blocked coordinates
+    sort : bool
+        Whether to sort the result by block, thereby regrouping rows of the same block
+    keep_ungrouped_rows : bool
+        Whether to display rows that do not belong to any block
+    merge_blocks : bool
+        Whether to transitively merge blocks
+    motives : bool
+        Whether to display the reason behind each block
+    show_as_pairs : bool
+        Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame
+    output_columns : list
+        Columns to show. Useful in combination with show_as_pairs as column names are altered
+    score : bool
+        Whether to show a score (computed from the number of motives)
+
+    Returns
+    -------
+    DataFrame
+        Blocked DataFrame
+
+    Examples
+    --------
+    >>> add_blocks_to_dataset(data=pd.DataFrame(
+        [
+            [0, 'first', 4],
+            [1, 'second', 6],
+            [2, 'first', 2],
+            [3, 'third', 5]
+        ],
+        columns=['id', 'rank', 'score']),
+        coords=np.array([{0, 2}]),
+        show_as_pairs=True,
+        output_columns=['id', 'rank'])
+        id_l rank_l id_r rank_r _block
+    0   0    first  2    first 0
+    """
+
+    if show_as_pairs and keep_ungrouped_rows:
+        raise ValueError("Cannot both return pairs and keep ungrouped rows")
+
+    if motives:
+        if type(coords) is not dict:
+            raise TypeError("Cannot specify 'motives=True' without passing motives")
+
+    # Ensure the index is a unique identifier
+    if not data.index.is_unique:
+        raise ValueError("DataFrame index must be unique to be used as an identifier.")
+
+    if score and not motives:
+        raise ValueError("Cannot specify 'score=True' without passing motives")
+
+    if "_motive" in data.columns:
+        if motives:
+            raise ValueError(
+                "Please rename existing '_motive' column OR do not pass 'motives=True'"
+            )
+
+    # Check the column this function actually writes ('_score', not 'score')
+    if "_score" in data.columns:
+        if score:
+            raise ValueError(
+                "Please rename existing '_score' column OR do not pass 'score=True'"
+            )
+
+    if "_block" in data.columns:
+        raise ValueError("Please rename existing '_block' column")
+
+    if output_columns is None:
+        output_columns = data.columns
+
+    data = data[output_columns].copy()
+
+    if len(coords) == 0 and not keep_ungrouped_rows:  # Empty graph
+        if show_as_pairs:
+            columns = [col + "_l" for col in data.columns] + [
+                col + "_r" for col in data.columns
+            ]
+            output_data = pd.DataFrame(columns=columns)
+        else:
+            output_data = pd.DataFrame(columns=data.columns)
+
+        if motives:
+            output_data["_motive"] = ""
+        if score:
+            output_data["_score"] = 0
+        output_data["_block"] = -1
+
+    else:
+        output_data = data
+        # Map coords to connected component labels
+        if merge_blocks:  # We solve the connected components problem
+            cc_labels = solve_connected_components_from_coords(coords)
+            # Match original index to new block ID
+            # NOTE(review): enumerate positions are compared against index
+            # labels — this assumes a positional (0..n-1) index; confirm upstream
+            matcher = {
+                idx: label
+                for idx, label in enumerate(cc_labels)
+                if label != -1 and idx in data.index
+            }
+        else:  # We solve the cliques problem
+            g = nx.Graph()
+            # noinspection PyTypeChecker
+            g.add_edges_from(coords)
+            complete_subgraphs = sorted(nx.find_cliques(g))
+            # A row may belong to several cliques: collect every clique ID per row
+            matcher = dict()
+            for i, clique in enumerate(complete_subgraphs):
+                for node_idx in clique:
+                    if node_idx in matcher.keys():
+                        matcher[node_idx].append(i)
+                    else:
+                        matcher[node_idx] = [i]
+
+        if show_as_pairs:
+            pair_rows = []
+            for pair in coords:
+                left_row = data.loc[[tuple(pair)[0]]].copy()
+                current_index = left_row.index
+                right_row = data.loc[[tuple(pair)[1]]].copy()
+                left_row.columns = [col + "_l" for col in left_row.columns]
+                right_row.columns = [col + "_r" for col in right_row.columns]
+                current_row = pd.concat(
+                    [left_row.reset_index(drop=True), right_row.reset_index(drop=True)],
+                    axis=1,
+                )
+                current_row.index = current_index
+                if motives:
+                    motives_solved = solve_motives(coords[pair])
+                    current_row["_motive"] = [list(map(str, motives_solved))]
+                    if score:
+                        # Score is simply the number of non-redundant motives
+                        current_row["_score"] = len(motives_solved)
+                pair_rows.append(current_row)
+            # Concatenate once instead of growing a DataFrame inside the loop
+            output_data = pd.concat(pair_rows) if pair_rows else pd.DataFrame()
+
+        # Assign blocks to rows based on their original index
+        output_data["_block"] = output_data.index.map(matcher)
+        if not merge_blocks:
+            output_data = output_data.explode("_block")
+
+        if keep_ungrouped_rows:
+            output_data["_block"] = output_data["_block"].fillna(-1)
+            matcher_ungrouped_rows = {}
+            block_temp = []
+            i = 0  # Track # of blocks processed
+            for b in output_data["_block"]:
+                if b == -1:
+                    block_temp.append(i)
+                    i += 1
+                elif b not in matcher_ungrouped_rows:
+                    matcher_ungrouped_rows[b] = i
+                    block_temp.append(i)
+                    i += 1
+                else:
+                    block_temp.append(matcher_ungrouped_rows[b])
+            output_data["_block"] = block_temp
+        else:
+            if not show_as_pairs:
+                output_data = output_data[
+                    output_data["_block"].duplicated(keep=False)
+                    & output_data["_block"].notna()
+                ]
+
+        output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"])
+
+        if sort:
+            # Sort by block, then by original index
+            sort_cols = ["_block"]
+            if output_data.index.name:
+                output_data = output_data.sort_values(
+                    sort_cols + [output_data.index.name]
+                )
+            else:
+                # If no named index, use the first column of the DataFrame
+                output_data = output_data.reset_index()
+                output_data = output_data.sort_values(
+                    sort_cols + [output_data.columns[0]]
+                )
+                output_data = output_data.set_index(output_data.columns[0])
+
+    if not show_as_pairs and motives:
+        # NOTE(review): when a row appears in several pairs, the motives of the
+        # last pair iterated win — confirm this is the intended behaviour
+        motive_matcher = {
+            row_id: list(map(str, solve_motives(reasons)))
+            for pair, reasons in coords.items()
+            for row_id in pair
+        }
+        # noinspection PyTypeChecker
+        output_data["_motive"] = output_data.index.map(motive_matcher)
+        if score:
+            # Score is the number of non-redundant motives; reuse the motives
+            # solved above instead of recomputing them for every row
+            score_matcher = {
+                row_id: len(reasons) for row_id, reasons in motive_matcher.items()
+            }
+            output_data["_score"] = output_data.index.map(score_matcher)
+
+    output_data = output_data.reset_index(drop=True)
+    output_data["_block"] = output_data["_block"].astype(int)
+
+    return output_data
+
+
+def generate_blocking_report(
+    data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None
+) -> pd.DataFrame:
+    """
+    Shorthand for add_blocks_to_dataset: sorted pairs with motives, blocks unmerged
+    """
+    report_options = {
+        "sort": True,
+        "merge_blocks": False,
+        "motives": True,
+        "show_as_pairs": True,
+        "output_columns": output_columns,
+    }
+    return add_blocks_to_dataset(data, coords, **report_options)
+
+
def merge_blockers(
left: BlockerNode, right: BlockerNode
) -> AttributeEquivalenceBlocker | OverlapBlocker | MixedBlocker | AndNode:
@@ -592,3 +859,6 @@ def merge_blockers(
)
else:
return AndNode(left, right)
+
+
+# TODO: deport logic in a way that enables .progress_apply
diff --git a/src/ms_blocking/utils.py b/src/ms_blocking/utils.py
index 837645f..b644a43 100644
--- a/src/ms_blocking/utils.py
+++ b/src/ms_blocking/utils.py
@@ -4,53 +4,72 @@
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
import pandas as pd
-import networkx as nx
import random
from collections import Counter
from itertools import combinations
from typing import List, Set, Iterable, Dict, Collection, Any
+
+class EquivalenceMotive:
+    """Explanation for a pair blocked by strict equality on a column"""
+
+    def __init__(self, blocking_column: str):
+        if not isinstance(blocking_column, str):
+            raise TypeError("blocking_column for Motive must be a string")
+        self.blocking_column = blocking_column
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, EquivalenceMotive | OverlapMotive):
+            raise TypeError("Can only compare Motives")
+        # An OverlapMotive on the same column is a *different* reason:
+        # require the same type so deduplication does not drop it
+        return (
+            type(other) is EquivalenceMotive
+            and self.blocking_column == other.blocking_column
+        )
+
+    def __hash__(self):
+        # Defining __eq__ disables the default hash; restore one consistent with it
+        return hash((EquivalenceMotive, self.blocking_column))
+
+    def __str__(self):
+        return f"Same '{self.blocking_column}'"
+
+    def __repr__(self):
+        return f"EquivalenceMotive(['{self.blocking_column}'])"
+
+
+class OverlapMotive:
+    """Explanation for a pair blocked by element overlap on a column"""
+
+    def __init__(
+        self, blocking_column: str, overlap: int = 1, word_level: bool = False
+    ):
+        if not isinstance(blocking_column, str):
+            raise TypeError("blocking_column for Motive must be a string")
+        if not isinstance(overlap, int):
+            raise TypeError("overlap must be an int")
+        if not isinstance(word_level, bool):
+            raise TypeError("word_level must be a boolean")
+        self.blocking_column = blocking_column
+        self.overlap = overlap
+        self.word_level = word_level
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, EquivalenceMotive | OverlapMotive):
+            raise TypeError("Can only compare Motives")
+        # Guard on type first: an EquivalenceMotive has no 'overlap' attribute,
+        # so comparing attributes unconditionally would raise AttributeError
+        return (
+            type(other) is OverlapMotive
+            and self.blocking_column == other.blocking_column
+            and self.overlap == other.overlap
+            and self.word_level == other.word_level
+        )
+
+    def __hash__(self):
+        # Defining __eq__ disables the default hash; restore one consistent with it
+        return hash((OverlapMotive, self.blocking_column, self.overlap, self.word_level))
+
+    def __str__(self):
+        return f">={self.overlap}{' word-level' if self.word_level else ''} overlap in '{self.blocking_column}'"
+
+    def __repr__(self):
+        return f"OverlapMotive(['{self.blocking_column}'], {self.overlap}{', word_level=True' if self.word_level else ''})"
+
+
Columns = List[str]
Pair = Collection[int]
+Motive = EquivalenceMotive | OverlapMotive
CoordsBasic = Set[Pair]
-CoordsMotives = Dict[Pair, Set[str]]
+CoordsMotives = Dict[Pair, List[Motive]]
Coords = CoordsBasic | CoordsMotives
_PUNCT_RE = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\\\]^_`{|}~]')
_SPACE_RE = re.compile(r"\s+")
-def remove_rows_if_value_appears_only_once(
- data: pd.DataFrame, cols: Columns
-) -> pd.DataFrame:
- """Drop rows of a Pandas DataFrame where a certain column's values appears only once.
-
- Ensures all elements of provided columns appear at least twice in their column
-
- Parameters
- ----------
- data : DataFrame
- DataFrame to preprocess
-
- cols : List[str]
- List of columns where rows that contain non-duplicated elements shall be discarded
-
- Returns
- -------
- DataFrame
- DataFrame with reduced number of rows
-
- Examples
- --------
- >>> remove_rows_if_value_appears_only_once(data, ['name', 'city'])
- """
- for col in cols:
- counts = data[col].map(data[col].value_counts())
- data = data[counts >= 2]
- return data
-
-
def start_from_zero(figures: Collection[int]) -> List[int]:
"""Turns a list of integers into a same-length list that starts at 0, without gaps
@@ -240,7 +259,7 @@ def merge_blocks_or(coords_1: Coords, coords_2: Coords) -> Coords:
if type(coords_1) is type(coords_2) is dict: # We have motives
return {
pair: (
- (coords_1[pair] | coords_2[pair])
+ coords_1[pair] + coords_2[pair]
if (pair in coords_1 and pair in coords_2)
else coords_1[pair]
if (pair in coords_1)
@@ -278,7 +297,7 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords:
"""
if type(coords_1) is type(coords_2) is dict: # We have motives
return {
- pair: (coords_1[pair] | coords_2[pair])
+ pair: coords_1[pair] + coords_2[pair]
for y in (coords_1, coords_2)
for pair in y.keys()
if (pair in coords_1 and pair in coords_2)
@@ -287,219 +306,6 @@ def merge_blocks_and(coords_1: Coords, coords_2: Coords) -> Coords:
return coords_1.intersection(coords_2)
-def add_blocks_to_dataset(
- data: pd.DataFrame,
- coords: Coords,
- sort: bool = True,
- keep_ungrouped_rows: bool = False,
- merge_blocks: bool = True,
- motives: bool = False,
- show_as_pairs: bool = False,
- output_columns: Columns = None,
-) -> pd.DataFrame:
- """Returns the intersection of an array of links
-
- Takes two lists of paired elements, with or without motives, returns their intersection
-
- Parameters
- ----------
- data : DataFrame
- DataFrame for blocking
- coords : Array
- Blocked coordinates
- sort : bool
- Whether to sort the result by block, thereby regrouping rows of the same block
- keep_ungrouped_rows : bool
- Whether to display rows that do not belong to any block
- merge_blocks : bool
- Whether to merge transitively merge blocks
- motives : bool
- Whether to display the reason behind each block
- show_as_pairs : bool
- Whether to show the output as pairs or rows rather than simply reordering the initial DataFrame
- output_columns : list
- Columns to show. Useful in combination with show_as_pairs as column names are altered
-
- Returns
- -------
- DataFrame
- Blocked DataFrame
-
- Examples
- --------
- >>> add_blocks_to_dataset(data=pd.DataFrame(
- [
- [0, 'first', 4],
- [1, 'second', 6],
- [2, 'first', 2],
- [3, 'third', 5]
- ],
- columns=['id', 'rank', 'score']),
- coords=np.array([{0, 2}]),
- show_as_pairs=True,
- output_columns=['id', 'rank'])
- id_l rank_l id_r rank_r block
- 0 0 first 2 first 0
- """
-
- if show_as_pairs and keep_ungrouped_rows:
- raise ValueError("Cannot both return pairs and keep ungrouped rows")
-
- if motives:
- if type(coords) is not dict:
- raise TypeError("Cannot specify motives=True without passing motives")
-
- # Ensure the index is a unique identifier
- if not data.index.is_unique:
- raise ValueError("DataFrame index must be unique to be used as an identifier.")
-
- if "_motive" in data.columns:
- if motives:
- raise ValueError(
- "Please rename existing '_motive' column OR do not pass 'motives=True'"
- )
-
- if "_block" in data.columns:
- raise ValueError("Please rename existing '_block' column")
-
- if output_columns is None:
- output_columns = data.columns
- data = data[output_columns].copy()
-
- if len(coords) == 0 and not keep_ungrouped_rows: # Empty graph
- if show_as_pairs:
- columns = [col + "_l" for col in data.columns] + [
- col + "_r" for col in data.columns
- ]
- output_data = pd.DataFrame(columns=columns)
- else:
- output_data = pd.DataFrame(columns=data.columns)
- else:
- output_data = data
- # Map coords to connected component labels
- if merge_blocks: # We solve the connected components problem
- cc_labels = solve_connected_components_from_coords(coords)
- # Match original index to new block ID
- matcher = {
- idx: label
- for idx, label in enumerate(cc_labels)
- if label != -1 and idx in data.index
- }
- else: # We solve the cliques problem
- g = nx.Graph()
- # noinspection PyTypeChecker
- g.add_edges_from(coords)
- complete_subgraphs = list(nx.find_cliques(g))
- complete_subgraphs = sorted(complete_subgraphs)
- # matcher = {row_id:([i for i in range(len(complete_subgraphs)) if row_id in complete_subgraphs[i]]) for row_id in set(flatten(complete_subgraphs))}
- matcher = dict()
- for i, clique in enumerate(complete_subgraphs):
- for node_idx in clique:
- if node_idx in matcher.keys():
- matcher[node_idx].append(i)
- else:
- matcher[node_idx] = [i]
-
- if show_as_pairs:
- output_data = pd.DataFrame()
- for pair in coords:
- left_row = data.loc[[tuple(pair)[0]]].copy()
- current_index = left_row.index
- right_row = data.loc[[tuple(pair)[1]]].copy()
- left_row.columns = [col + "_l" for col in left_row.columns]
- right_row.columns = [col + "_r" for col in right_row.columns]
- current_row = pd.concat(
- [left_row.reset_index(drop=True), right_row.reset_index(drop=True)],
- axis=1,
- )
- current_row.index = current_index
- output_data = pd.concat([output_data, current_row])
-
- # Assign blocks to rows based on their original index
- output_data["_block"] = output_data.index.map(matcher)
- if not merge_blocks:
- output_data = output_data.explode("_block")
-
- if keep_ungrouped_rows:
- output_data["_block"] = output_data["_block"].fillna(-1)
- matcher_ungrouped_rows = {}
- block_temp = []
- i = 0 # Track # of blocks processed
- for b in output_data["_block"]:
- if b == -1:
- block_temp.append(i)
- i += 1
- elif b not in matcher_ungrouped_rows:
- matcher_ungrouped_rows[b] = i
- block_temp.append(i)
- i += 1
- else:
- block_temp.append(matcher_ungrouped_rows[b])
- output_data["_block"] = block_temp
- else:
- if not show_as_pairs:
- output_data = output_data[
- output_data["_block"].duplicated(keep=False)
- & output_data["_block"].notna()
- ]
-
- output_data.loc[:, ["_block"]] = start_from_zero(output_data["_block"])
-
- if sort:
- # Sort by block, then by original index
- sort_cols = ["_block"]
- if output_data.index.name:
- output_data = output_data.sort_values(
- sort_cols + [output_data.index.name]
- )
- else:
- # If no named index, use the first column of the DataFrame
- output_data = output_data.reset_index()
- output_data = output_data.sort_values(
- sort_cols + [output_data.columns[0]]
- )
- output_data = output_data.set_index(output_data.columns[0])
-
- if motives:
- output_data["_motive"] = ""
- id_list = flatten(coords.keys())
- motive_matcher = {
- row_id: frozenset(
- reason
- for pair in coords.keys()
- if row_id in pair
- for reason in coords[pair]
- )
- for row_id in id_list
- }
- output_data["_motive"] = output_data.index.map(motive_matcher)
-
- if "_block" not in output_data.columns: # Empty coords
- output_data["_block"] = -1
-
- output_data = output_data.reset_index(drop=True)
- output_data["_block"] = output_data["_block"].astype(int)
-
- return output_data
-
-
-def generate_blocking_report(
- data: pd.DataFrame, coords: Coords, output_columns: Collection[str] = None
-) -> pd.DataFrame:
- """
- Shorthand for add_blocks_to_dataset with below arguments
- """
- return add_blocks_to_dataset(
- data,
- coords,
- sort=True,
- merge_blocks=False,
- motives=True,
- show_as_pairs=True,
- output_columns=output_columns,
- )
-
-
def parse_list(s: str | List, word_level: bool = False) -> List[str]:
"""Turns a stringified list into an actual python list, taking extra inner quotes into account
@@ -511,7 +317,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
Stringified representation of a list e.g. "['string 1', 'string 2', ...]"
word_level : bool
- Whether to return a list of all words within s instead of a list of each comma-separated element
+ Whether to return a list of all words within s instead of a list of each comma-separated element;
+ Note that if passed a string that does not represent a list, this argument will be ignored and the function
+ will return a list of each word in the string
Returns
-------
@@ -527,7 +335,9 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
"""
if type(s) is list: # If we already have a list
- if len(s) == 1 and s[0][0] == "[" and s[0][-1] == "]":
+ if (
+ len(s) == 1 and str(s[0]).startswith("[") and str(s[0]).endswith("]")
+ ): # In case we have a stringified list INSIDE a normal list
s = s[0]
else:
return s
@@ -540,10 +350,15 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
if not s:
return []
- try:
- parts = ast.literal_eval(s)
- except ValueError: # doesn't seem to be a stringified list
- parts = s.split("', '")
+ if s.startswith("[") and s.endswith("]"): # Stringified list?
+ try:
+ parts = ast.literal_eval(s)
+ except ValueError: # doesn't seem to be a stringified list
+ parts = s.split("', '")
+        except SyntaxError: # In case we have a string surrounded by brackets
+ parts = s.split()
+ else:
+ parts = s.split()
cleaned_items = [str(part).strip().strip("''") for part in parts]
@@ -553,40 +368,6 @@ def parse_list(s: str | List, word_level: bool = False) -> List[str]:
return [s for s in cleaned_items if len(s) > 0]
-def scoring(data: pd.DataFrame, motives_column: str = "_motive") -> pd.Series:
- """Add a score to a blocked DataFrame based on the number of motives
-
- Parameters
- ----------
- data : DataFrame
- DataFrame with motives
-
- motives_column : str
- Name of the column containing the motives
-
- Returns
- -------
- Series[int]
- A column of scores
- """
-
- # Check that we do have motives
- if motives_column not in data.columns:
- if motives_column == "_motive":
- raise ValueError("No motives in DataFrame")
- else:
- raise ValueError(
- f'Specified motives column "{motives_column}" does not exist'
- )
-
- if "score" in data.columns:
- print("Renaming 'score' column to 'score_old'")
- data = data.rename(columns={"score": "score_old"})
-
- scores = data[motives_column].apply(len)
- return scores
-
-
def must_not_be_different_apply( # WIP
temp_data: pd.DataFrame,
blocking_columns: List[str],
@@ -682,7 +463,9 @@ def block_overlap(groups: Iterable, overlap: int = 1) -> Coords:
return coords
-def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotives:
+def add_motives_to_coords(
+ coords: Coords, explanations: List[Motive]
+) -> Dict[Pair, List[Motive]]:
"""Block a DataFrame based on overlap accross columns
Parameters
@@ -690,7 +473,7 @@ def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotiv
coords : Coords
Coords obtained by blocking
- explanations : Set[str]
+    explanations : List[Motive]
Set of explanations
Returns
@@ -718,3 +501,99 @@ def add_motives_to_coords(coords: Coords, explanations: Set[str]) -> CoordsMotiv
}
"""
return {pair: explanations for pair in coords}
+
+
+def solve_motives(motives: List[Motive]) -> List[Motive]:
+    """Remove duplicated and redundant motives from a list of motives
+
+    Redundant motives refer to OverlapMotives on the same column(s) but with different overlap or word-level condition
+
+    Parameters
+    ----------
+    motives : List[Motive]
+        Motives attached to a blocked pair
+
+    Returns
+    -------
+    List[Motive]
+        A list of Motives whose length should be smaller or equal to the original list of motives
+
+    Raises
+    ------
+    ValueError
+        If the list of motives is empty
+
+    Examples
+    --------
+    >>> solve_motives([OverlapMotive('websites', 1), OverlapMotive('websites', 2), OverlapMotive('websites', 2, word_level=False)])
+    [OverlapMotive(['websites'], 2)]
+    """
+    if not motives:
+        raise ValueError("Motives must not be empty")
+
+    # EquivalenceMotives never subsume one another: the equality check in the
+    # deduplication step below suffices
+    final_motives = [
+        motive for motive in motives if type(motive) is EquivalenceMotive
+    ]
+    overlap_motives = [motive for motive in motives if type(motive) is OverlapMotive]
+
+    # Visit each overlapped column exactly once, preserving first-seen order
+    for column in dict.fromkeys(motive.blocking_column for motive in overlap_motives):
+        motives_for_column = [
+            motive for motive in overlap_motives if motive.blocking_column == column
+        ]
+        word_level = [m for m in motives_for_column if m.word_level]
+        element_level = [m for m in motives_for_column if not m.word_level]
+
+        # Only the strictest (largest) overlap of each kind can matter
+        best_element = (
+            max(element_level, key=lambda m: m.overlap) if element_level else None
+        )
+        best_word = max(word_level, key=lambda m: m.overlap) if word_level else None
+
+        # A word-level motive is redundant when an element-level motive on the
+        # same column already guarantees an equal or greater overlap
+        if best_word is not None and best_element is not None:
+            if best_word.overlap <= best_element.overlap:
+                best_word = None
+
+        # Word-level motive first, matching the historical ordering
+        final_motives += [m for m in (best_word, best_element) if m is not None]
+
+    # Remove exact duplicates while preserving order
+    final_motives_no_duplicates = []
+    for motive in final_motives:
+        if motive not in final_motives_no_duplicates:
+            final_motives_no_duplicates.append(motive)
+    return final_motives_no_duplicates
diff --git a/tests/test_ms_blocking.py b/tests/test_ms_blocking.py
index d3f9ab2..cf92924 100644
--- a/tests/test_ms_blocking.py
+++ b/tests/test_ms_blocking.py
@@ -84,18 +84,28 @@ def attribute_city_keep_ungrouped_rows_false():
@pytest.fixture
def attribute_city_motives_true_block():
return {
- frozenset({3, 8}): {"Same 'City'"},
- frozenset({1, 4}): {"Same 'City'"},
- frozenset({8, 11}): {"Same 'City'"},
- frozenset({3, 11}): {"Same 'City'"},
- frozenset({2, 5}): {"Same 'City'"},
- frozenset({10, 13}): {"Same 'City'"},
+ frozenset({3, 8}): [msb.EquivalenceMotive("City")],
+ frozenset({1, 4}): [msb.EquivalenceMotive("City")],
+ frozenset({8, 11}): [msb.EquivalenceMotive("City")],
+ frozenset({3, 11}): [msb.EquivalenceMotive("City")],
+ frozenset({2, 5}): [msb.EquivalenceMotive("City")],
+ frozenset({10, 13}): [msb.EquivalenceMotive("City")],
}
@pytest.fixture
def attribute_city_motives_true_add():
-    return [{"Same 'City'"}] * 9
+    # Nine pairs, each explained by the single 'City' equivalence motive
+    return [["Same 'City'"] for _ in range(9)]
@pytest.fixture
@@ -116,25 +126,30 @@ def city_age_name_websites_pipelining_id():
@pytest.fixture
def city_age_websites_pipelining_motives():
return [
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'", ">=1 overlap in 'websites'"}),
- frozenset({">=1 overlap in 'websites'"}),
- frozenset({">=1 overlap in 'websites'"}),
- frozenset({"Same 'Age'", "Same 'City'"}),
- frozenset({"Same 'Age'", "Same 'City'"}),
- frozenset({">=1 overlap in 'websites'"}),
- frozenset({">=1 overlap in 'websites'"}),
- frozenset({">=1 overlap in 'websites'"}),
+ {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {"Same 'City'", "Same 'Age'", ">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {"Same 'City'", "Same 'Age'"},
+ {"Same 'City'", "Same 'Age'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
+ {">=1 overlap in 'websites'"},
]
@pytest.fixture
def city_age_websites_pipelining_scores():
- return [3, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1]
+ return [3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+@pytest.fixture
+def city_age_websites_pipelining_scores_not_show_as_pairs():
+ return [3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1]
@pytest.fixture
@@ -335,9 +350,10 @@ def test_pipelining_motives(city_age_websites_pipelining_motives):
websites_blocker = msb.OverlapBlocker(["websites"])
final_blocker = (city_blocker & age_blocker) | websites_blocker
links = final_blocker.block(get_users(), motives=True)
- actual = msb.add_blocks_to_dataset(
+ motives = msb.add_blocks_to_dataset( # Use set to ignore ordering
get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False
)["_motive"].to_list()
+ actual = [set(motive) for motive in motives]
assert actual == expected
@@ -350,9 +366,36 @@ def test_pipelining_scores(city_age_websites_pipelining_scores):
final_blocker = (city_blocker & age_blocker) | websites_blocker
links = final_blocker.block(get_users(), motives=True)
report = msb.add_blocks_to_dataset(
- get_users(), links, show_as_pairs=True, motives=True, merge_blocks=False
+ get_users(),
+ links,
+ show_as_pairs=True,
+ motives=True,
+ merge_blocks=False,
+ score=True,
+ )
+ actual = sorted(report["_score"], reverse=True)
+ assert actual == expected
+
+
+def test_pipelining_scores_without_show_as_pairs(
+ city_age_websites_pipelining_scores_not_show_as_pairs,
+):
+ """Test that scoring does work as intended"""
+ expected = city_age_websites_pipelining_scores_not_show_as_pairs
+ city_blocker = msb.AttributeEquivalenceBlocker(["City"])
+ age_blocker = msb.AttributeEquivalenceBlocker(["Age"])
+ websites_blocker = msb.OverlapBlocker(["websites"])
+ final_blocker = (city_blocker & age_blocker) | websites_blocker
+ links = final_blocker.block(get_users(), motives=True)
+ report = msb.add_blocks_to_dataset(
+ get_users(),
+ links,
+ show_as_pairs=False,
+ motives=True,
+ merge_blocks=False,
+ score=True,
)
- actual = sorted(msb.scoring(report), reverse=True)
+ actual = sorted(report["_score"], reverse=True)
assert actual == expected