Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions backend/deduplication_testing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import pytest
from ks_search_tool import deduplicate_datasets


# Test1 : basic deduplication by _id
def test_deduplicate_basic():
    """Two records sharing the same _id should collapse into one entry."""
    duplicated = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
    ]
    assert len(deduplicate_datasets(duplicated)) == 1





# Test2 : URL variations deduplication
def test_deduplicate_url_variation():
    """The same link with different query strings counts as one dataset."""
    variants = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1?version=1"},
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1?version=2"},
    ]
    assert len(deduplicate_datasets(variants)) == 1





# Test3: title normalization (punctuation, spaces, case)
def test_deduplicate_title_variation():
    """Titles differing only in punctuation should be treated as duplicates."""
    punctuated = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "EEG Data!", "primary_link": "https://site.com/ds2"},
    ]
    assert len(deduplicate_datasets(punctuated)) == 1





# Test4 : fuzzy title matching
def test_deduplicate_fuzzy_title():
    """Near-identical titles (one-letter typo) should be fuzzy-matched."""
    near_identical = [
        {"_id": "ds1", "title": "Anesthesia EEG Dataset", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "Anesthesia EGG Dataset", "primary_link": "https://site.com/ds2"},
    ]
    assert len(deduplicate_datasets(near_identical)) == 1





# Test5 : multiple duplicates
def test_deduplicate_multiple_duplicates():
    """A mixed batch keeps one EEG entry plus the unrelated MRI entry."""
    batch = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "EEG Data!", "primary_link": "https://site.com/ds2"},
        {"_id": "ds3", "title": "MRI Data", "primary_link": "https://site.com/ds3"},
    ]
    survivors = deduplicate_datasets(batch)

    assert len(survivors) == 2
    kept_titles = [entry["title"] for entry in survivors]
    # Either EEG variant may win depending on which was seen first.
    assert "EEG Data" in kept_titles or "EEG Data!" in kept_titles
    assert "MRI Data" in kept_titles





# Test6 : empty input
def test_deduplicate_empty():
    """An empty input list comes back as an empty list."""
    assert deduplicate_datasets([]) == []




# Test7 : datasets with different _id but same normalized title
def test_deduplicate_different_id_same_title():
    """Distinct ids with an identical title still count as one dataset."""
    same_title = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "EEG Data", "primary_link": "https://site.com/ds2"},
    ]
    assert len(deduplicate_datasets(same_title)) == 1






# Test8 : datasets with same _id but different capitalization
def test_deduplicate_same_id_diff_case():
    """Id comparison is case-insensitive, so 'ds1' and 'DS1' collide."""
    cased = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "DS1", "title": "eeg data", "primary_link": "https://site.com/ds1"},
    ]
    assert len(deduplicate_datasets(cased)) == 1


# Test9 : that unique datasets remain

def test_deduplicate_unique_datasets():
    """Genuinely distinct datasets must all survive deduplication."""
    distinct = [
        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
        {"_id": "ds2", "title": "MRI Data", "primary_link": "https://site.com/ds2"},
        {"_id": "ds3", "title": "CT Scan Data", "primary_link": "https://site.com/ds3"},
    ]
    assert len(deduplicate_datasets(distinct)) == 3


# Test10 : large dataset

def test_deduplicate_large_dataset():
    """100 unique records plus 50 exact repeats dedupe back to 100.

    The original version only printed the result size, so it asserted
    nothing and could never fail. Titles are zero-padded ("Dataset 007")
    because un-padded titles such as "Dataset 1" and "Dataset 10" exceed
    the 0.93 fuzzy-similarity threshold and would be merged, making the
    expected count unstable.
    """
    unique = [
        {"_id": f"ds{i}", "title": f"Dataset {i:03d}", "primary_link": f"https://site.com/ds{i}"}
        for i in range(100)
    ]
    repeats = [
        {"_id": f"ds{i}", "title": f"Dataset {i:03d}", "primary_link": f"https://site.com/ds{i}"}
        for i in range(50)
    ]
    result = deduplicate_datasets(unique + repeats)
    assert len(result) == 100
108 changes: 106 additions & 2 deletions backend/ks_search_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import requests
import asyncio
import aiohttp
from typing import Dict, Optional, Set, Union, List, Any, Iterable
import re
from urllib.parse import urlparse
from difflib import SequenceMatcher
from typing import Dict, Optional, Set, Union, List, Any, Iterable
from urllib.parse import urlparse, urlunparse


def tool(args_schema):
Expand Down Expand Up @@ -443,6 +443,110 @@ def _perform_search(data_source_id: str, query: str, filters: dict, all_configs:
except requests.RequestException as e:
print(f" -> Error searching {data_source_id}: {e}")
return []


# Deduplication feature

def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for duplicate detection.

    Scheme, host and path are kept; params, query string and fragment
    are discarded. The result is lowercased with any trailing slash
    removed, so cosmetic URL variants compare equal. An empty or falsy
    *url* yields "".
    """
    if not url:
        return ""
    parts = urlparse(url)
    # Rebuild with only the first three components populated.
    bare = urlunparse((parts.scheme, parts.netloc, parts.path, "", "", ""))
    return bare.lower().rstrip("/")


def normalize_title(title: str) -> str:
    """Return *title* lowercased, stripped of punctuation, with runs of
    whitespace collapsed to single spaces and outer spaces removed.

    An empty or falsy *title* yields "".
    """
    if not title:
        return ""
    lowered = title.lower()
    # Drop everything that is neither a word character nor whitespace.
    depunctuated = re.sub(r"[^\w\s]", "", lowered)
    return re.sub(r"\s+", " ", depunctuated).strip()


def titles_reordered_match(t1: str, t2: str) -> bool:
    """Return True when *t1* and *t2* contain exactly the same words
    (including repeats), merely in a different order.

    Comparing sorted token lists — rather than sets, as before —
    preserves word multiplicity, so "big data data" no longer matches
    "big data". Inputs are expected to be pre-normalized titles.
    """
    return sorted(t1.split()) == sorted(t2.split())


def deduplicate_datasets(all_datasets: List[dict]) -> List[dict]:
    """Deduplicate datasets using canonical ID, normalized URL, fuzzy title,
    and reordered title detection.

    A dataset is dropped when any of these matches an already-kept one:
      1. canonical key ``datasource_id:dataset_id`` (case-insensitive),
      2. normalized primary link (query/fragment stripped),
      3. normalized title — fuzzy similarity > 0.93 or same words reordered.
    First occurrence wins; input order of survivors is preserved.

    Improvements over the previous version:
      * each kept record's normalized title is cached once instead of being
        re-normalized on every comparison (was repeated work, O(n*m) calls);
      * the cached title uses the same lookup chain as the incoming record
        (``title`` -> ``title_guess`` -> ``metadata['title']``) — previously
        the metadata fallback was omitted for kept records, so entries titled
        only via metadata could never be matched against.
    """
    if not all_datasets:
        return []

    cleaned: List[dict] = []
    kept_titles: List[str] = []  # normalized titles, parallel to `cleaned`
    seen_canonical: Set[str] = set()
    seen_urls: Set[str] = set()

    for dataset in all_datasets:
        # `metadata` may be absent/empty; fall back to Elasticsearch-style _source.
        metadata = dataset.get("metadata", {}) or dataset.get("_source", {})

        # --- 1. Canonical ID deduplication ---
        dataset_id = metadata.get("id") or metadata.get("dataset_id") or dataset.get("_id")
        dataset_id = str(dataset_id).lower() if dataset_id else ""

        datasource_id = str(dataset.get("datasource_id") or "default_source").lower()
        canonical_key = f"{datasource_id}:{dataset_id}"

        if dataset_id:
            if canonical_key in seen_canonical:
                continue
            seen_canonical.add(canonical_key)

        # --- 2. URL deduplication ---
        normalized_url = normalize_url(dataset.get("primary_link", ""))
        if normalized_url:
            if normalized_url in seen_urls:
                continue
            seen_urls.add(normalized_url)

        # --- 3. Title deduplication ---
        title = normalize_title(
            dataset.get("title")
            or dataset.get("title_guess")
            or metadata.get("title")
            or ""
        )

        duplicate_found = False
        if title:
            for existing_title in kept_titles:
                if not existing_title:
                    continue

                # Fuzzy match (catches typos / tiny edits).
                if SequenceMatcher(None, title, existing_title).ratio() > 0.93:
                    duplicate_found = True
                    break

                # Same words in a different order.
                if titles_reordered_match(title, existing_title):
                    duplicate_found = True
                    break

        if not duplicate_found:
            cleaned.append(dataset)
            kept_titles.append(title)

    return cleaned



@tool(args_schema=BaseModel)
Expand Down