Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions backend/deduplication_testing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import pytest
from ks_search_tool import deduplicate_datasets


# Test1 : basic deduplication by _id
def test_deduplicate_basic():
    """Two records sharing the same _id should collapse into one entry."""
    duplicated = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
    ]
    assert len(deduplicate_datasets(duplicated)) == 1





# Test2 : URL variations deduplication
def test_deduplicate_url_variation():
    """The same link with different query strings counts as one dataset."""
    variants = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1?version=1"},
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1?version=2"},
    ]
    assert len(deduplicate_datasets(variants)) == 1





# Test3: title normalization (punctuation, spaces, case)
def test_deduplicate_title_variation():
    """Titles differing only in punctuation should be treated as duplicates."""
    punctuated = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "EEG Data!", "primary_link": "https://site.com/ds2"},
    ]
    assert len(deduplicate_datasets(punctuated)) == 1





# Test4 : fuzzy title matching
def test_deduplicate_fuzzy_title():
    """Near-identical titles (one-letter typo) should be fuzzy-matched."""
    near_identical = [
        {"_id": "ds1", "title": "Anesthesia EEG Dataset", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "Anesthesia EGG Dataset", "primary_link": "https://site.com/ds2"},
    ]
    assert len(deduplicate_datasets(near_identical)) == 1





# Test5 : multiple duplicates
def test_deduplicate_multiple_duplicates():
    """A mixed batch keeps one EEG entry plus the unrelated MRI entry."""
    batch = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "EEG Data!", "primary_link": "https://site.com/ds2"},
        {"_id": "ds3", "title": "MRI Data", "primary_link": "https://site.com/ds3"},
    ]
    survivors = deduplicate_datasets(batch)

    assert len(survivors) == 2
    kept_titles = [entry["title"] for entry in survivors]
    # Either EEG variant may win depending on which was seen first.
    assert "EEG Data" in kept_titles or "EEG Data!" in kept_titles
    assert "MRI Data" in kept_titles





# Test6 : empty input
def test_deduplicate_empty():
    """An empty input list comes back as an empty list."""
    assert deduplicate_datasets([]) == []




# Test7 : datasets with different _id but same normalized title
def test_deduplicate_different_id_same_title():
    """Distinct ids with an identical title still count as one dataset."""
    same_title = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "EEG Data", "primary_link": "https://site.com/ds2"},
    ]
    assert len(deduplicate_datasets(same_title)) == 1






# Test8 : datasets with same _id but different capitalization
def test_deduplicate_same_id_diff_case():
    """Id comparison is case-insensitive, so 'ds1' and 'DS1' collide."""
    cased = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "DS1", "title": "eeg data", "primary_link": "https://site.com/ds1"},
    ]
    assert len(deduplicate_datasets(cased)) == 1


# Test9 : that unique datasets remain

def test_deduplicate_unique_datasets():
    """Genuinely distinct datasets must all survive deduplication."""
    distinct = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "MRI Data", "primary_link": "https://site.com/ds2"},
        {"_id": "ds3", "title": "CT Scan Data", "primary_link": "https://site.com/ds3"},
    ]
    assert len(deduplicate_datasets(distinct)) == 3


# Test10 : large dataset

def test_deduplicate_large_dataset():
    """100 unique records plus 50 exact repeats dedupe back to 100.

    The original version only printed the result size, so it asserted
    nothing and could never fail. Titles are zero-padded ("Dataset 007")
    because un-padded titles such as "Dataset 1" and "Dataset 10" exceed
    the 0.93 fuzzy-similarity threshold and would be merged, making the
    expected count unstable.
    """
    unique = [
        {"_id": f"ds{i}", "title": f"Dataset {i:03d}", "primary_link": f"https://site.com/ds{i}"}
        for i in range(100)
    ]
    repeats = [
        {"_id": f"ds{i}", "title": f"Dataset {i:03d}", "primary_link": f"https://site.com/ds{i}"}
        for i in range(50)
    ]
    result = deduplicate_datasets(unique + repeats)
    assert len(result) == 100
108 changes: 106 additions & 2 deletions backend/ks_search_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import requests
import asyncio
import aiohttp
from typing import Dict, Optional, Set, Union, List, Any, Iterable
import re
from urllib.parse import urlparse
from difflib import SequenceMatcher
from typing import Dict, Optional, Set, Union, List, Any, Iterable
from urllib.parse import urlparse, urlunparse


def tool(args_schema):
Expand Down Expand Up @@ -443,6 +443,110 @@ def _perform_search(data_source_id: str, query: str, filters: dict, all_configs:
except requests.RequestException as e:
print(f" -> Error searching {data_source_id}: {e}")
return []


# Deduplication feature

def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for duplicate detection.

    Scheme, host and path are kept; params, query string and fragment
    are discarded. The result is lowercased with any trailing slash
    removed, so cosmetic URL variants compare equal. An empty or falsy
    *url* yields "".
    """
    if not url:
        return ""
    parts = urlparse(url)
    # Rebuild with only the first three components populated.
    bare = urlunparse((parts.scheme, parts.netloc, parts.path, "", "", ""))
    return bare.lower().rstrip("/")


def normalize_title(title: str) -> str:
    """Return *title* lowercased, stripped of punctuation, with runs of
    whitespace collapsed to single spaces and outer spaces removed.

    An empty or falsy *title* yields "".
    """
    if not title:
        return ""
    lowered = title.lower()
    # Drop everything that is neither a word character nor whitespace.
    depunctuated = re.sub(r"[^\w\s]", "", lowered)
    return re.sub(r"\s+", " ", depunctuated).strip()


def titles_reordered_match(t1: str, t2: str) -> bool:
    """Return True when *t1* and *t2* contain exactly the same words
    (including repeats), merely in a different order.

    Comparing sorted token lists — rather than sets, as before —
    preserves word multiplicity, so "big data data" no longer matches
    "big data". Inputs are expected to be pre-normalized titles.
    """
    return sorted(t1.split()) == sorted(t2.split())


def deduplicate_datasets(all_datasets: List[dict]) -> List[dict]:
    """Deduplicate datasets using canonical ID, normalized URL, fuzzy title,
    and reordered title detection.

    A dataset is dropped when any of these matches an already-kept one:
      1. canonical key ``datasource_id:dataset_id`` (case-insensitive),
      2. normalized primary link (query/fragment stripped),
      3. normalized title — fuzzy similarity > 0.93 or same words reordered.
    First occurrence wins; input order of survivors is preserved.

    Improvements over the previous version:
      * each kept record's normalized title is cached once instead of being
        re-normalized on every comparison (was repeated work, O(n*m) calls);
      * the cached title uses the same lookup chain as the incoming record
        (``title`` -> ``title_guess`` -> ``metadata['title']``) — previously
        the metadata fallback was omitted for kept records, so entries titled
        only via metadata could never be matched against.
    """
    if not all_datasets:
        return []

    cleaned: List[dict] = []
    kept_titles: List[str] = []  # normalized titles, parallel to `cleaned`
    seen_canonical: Set[str] = set()
    seen_urls: Set[str] = set()

    for dataset in all_datasets:
        # `metadata` may be absent/empty; fall back to Elasticsearch-style _source.
        metadata = dataset.get("metadata", {}) or dataset.get("_source", {})

        # --- 1. Canonical ID deduplication ---
        dataset_id = metadata.get("id") or metadata.get("dataset_id") or dataset.get("_id")
        dataset_id = str(dataset_id).lower() if dataset_id else ""

        datasource_id = str(dataset.get("datasource_id") or "default_source").lower()
        canonical_key = f"{datasource_id}:{dataset_id}"

        if dataset_id:
            if canonical_key in seen_canonical:
                continue
            seen_canonical.add(canonical_key)

        # --- 2. URL deduplication ---
        normalized_url = normalize_url(dataset.get("primary_link", ""))
        if normalized_url:
            if normalized_url in seen_urls:
                continue
            seen_urls.add(normalized_url)

        # --- 3. Title deduplication ---
        title = normalize_title(
            dataset.get("title")
            or dataset.get("title_guess")
            or metadata.get("title")
            or ""
        )

        duplicate_found = False
        if title:
            for existing_title in kept_titles:
                if not existing_title:
                    continue

                # Fuzzy match (catches typos / tiny edits).
                if SequenceMatcher(None, title, existing_title).ratio() > 0.93:
                    duplicate_found = True
                    break

                # Same words in a different order.
                if titles_reordered_match(title, existing_title):
                    duplicate_found = True
                    break

        if not duplicate_found:
            cleaned.append(dataset)
            kept_titles.append(title)

    return cleaned



@tool(args_schema=BaseModel)
Expand Down