From ffa86e4b97a411bd6d1b6acbf7df7f24e68dc520 Mon Sep 17 00:00:00 2001
From: Areeba-Tahir-18
Date: Sun, 15 Mar 2026 07:48:05 +0500
Subject: [PATCH 1/2] Add testing file

---
 backend/deduplication_testing.py | 127 +++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 backend/deduplication_testing.py

diff --git a/backend/deduplication_testing.py b/backend/deduplication_testing.py
new file mode 100644
index 0000000..9aeb906
--- /dev/null
+++ b/backend/deduplication_testing.py
@@ -0,0 +1,127 @@
+import pytest
+from ks_search_tool import deduplicate_datasets
+
+
+# Test 1: exact duplicates (same _id, title, and URL) collapse to one entry
+def test_deduplicate_basic():
+    datasets = [
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"}
+    ]
+    result = deduplicate_datasets(datasets)
+    assert len(result) == 1
+
+
+
+
+
+# Test 2: URLs differing only in query parameters dedupe to one entry
+def test_deduplicate_url_variation():
+    datasets = [
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1?version=1"},
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1?version=2"}
+    ]
+    result = deduplicate_datasets(datasets)
+    assert len(result) == 1
+
+
+
+
+
+# Test 3: title normalization (punctuation, spaces, case) dedupes across distinct ids
+def test_deduplicate_title_variation():
+    datasets = [
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
+        {"_id": "ds2", "title": "EEG Data!", "primary_link": "https://site.com/ds2"}
+    ]
+    result = deduplicate_datasets(datasets)
+    assert len(result) == 1
+
+
+
+
+
+# Test 4: fuzzy title matching catches near-identical titles (one-letter typo)
+def test_deduplicate_fuzzy_title():
+    datasets = [
+        {"_id": "ds1", "title": "Anesthesia EEG Dataset", "primary_link": "https://site.com/ds1"},
+        {"_id": "ds2", "title": "Anesthesia EGG Dataset", "primary_link": "https://site.com/ds2"}
+    ]
+    result = deduplicate_datasets(datasets)
+    assert len(result) == 1
+
+
+
+
+
+# Test 5: multiple duplicates
+def test_deduplicate_multiple_duplicates():
+    datasets = [
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
+        {"_id": "ds2", "title": "EEG Data!", "primary_link": "https://site.com/ds2"},
+        {"_id": "ds3", "title": "MRI Data", "primary_link": "https://site.com/ds3"}
+    ]
+    result = deduplicate_datasets(datasets)
+
+    assert len(result) == 2
+    titles = [d["title"] for d in result]
+    assert "EEG Data" in titles or "EEG Data!" in titles
+    assert "MRI Data" in titles
+
+
+
+
+# Test 6: an empty input list yields an empty result
+def test_deduplicate_empty():
+    datasets = []
+    result = deduplicate_datasets(datasets)
+    assert result == []
+
+
+
+# Test 7: different _id but identical normalized title still deduplicates
+def test_deduplicate_different_id_same_title():
+    datasets = [
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
+        {"_id": "ds2", "title": "EEG Data", "primary_link": "https://site.com/ds2"}
+    ]
+    result = deduplicate_datasets(datasets)
+    assert len(result) == 1
+
+
+
+
+
+# Test 8: same _id in different capitalization is still one dataset
+def test_deduplicate_same_id_diff_case():
+    datasets = [
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
+        {"_id": "DS1", "title": "eeg data", "primary_link": "https://site.com/ds1"}
+    ]
+    result = deduplicate_datasets(datasets)
+    assert len(result) == 1
+
+
+# Test 9: genuinely distinct datasets all survive deduplication
+
+def test_deduplicate_unique_datasets():
+    datasets = [
+        {"_id": "ds1", "title": "EEG Data", "primary_link": "https://site.com/ds1"},
+        {"_id": "ds2", "title": "MRI Data", "primary_link": "https://site.com/ds2"},
+        {"_id": "ds3", "title": "CT Scan Data", "primary_link": "https://site.com/ds3"}
+    ]
+    result = deduplicate_datasets(datasets)
+    assert len(result) == 3
+
+
+# Test 10: large input — 100 unique datasets plus 50 exact duplicates
+
+def test_deduplicate_large_dataset():
+    datasets = [{"_id": f"ds{i}", "title": f"Dataset {i}", "primary_link": f"https://site.com/ds{i}"} for i in range(100)]
"primary_link": f"https://site.com/ds{i}"} for i in range(100)] + datasets += [{"_id": f"ds{i}", "title": f"Dataset {i}", "primary_link": f"https://site.com/ds{i}"} for i in range(50)] + result = deduplicate_datasets(datasets) + print(len(result)) \ No newline at end of file From eeed0aa84d18b20274b4852dc80afaf5fe6eee53 Mon Sep 17 00:00:00 2001 From: Areeba-Tahir-18 Date: Sun, 15 Mar 2026 08:39:25 +0500 Subject: [PATCH 2/2] Add/Update Knowledge Space Search Tool --- backend/ks_search_tool.py | 108 +++++++++++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/backend/ks_search_tool.py b/backend/ks_search_tool.py index 3004a02..f204ecb 100644 --- a/backend/ks_search_tool.py +++ b/backend/ks_search_tool.py @@ -4,10 +4,10 @@ import requests import asyncio import aiohttp -from typing import Dict, Optional, Set, Union, List, Any, Iterable import re -from urllib.parse import urlparse from difflib import SequenceMatcher +from typing import Dict, Optional, Set, Union, List, Any, Iterable +from urllib.parse import urlparse, urlunparse def tool(args_schema): @@ -443,6 +443,110 @@ def _perform_search(data_source_id: str, query: str, filters: dict, all_configs: except requests.RequestException as e: print(f" -> Error searching {data_source_id}: {e}") return [] + + + # Deduplication feature + +def normalize_url(url: str) -> str: + """Normalize URLs by stripping query params and fragments.""" + if not url: + return "" + parsed = urlparse(url) + normalized = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", "")) + return normalized.lower().rstrip("/") + + +def normalize_title(title: str) -> str: + """Normalize title: lowercase, strip punctuation, extra spaces.""" + if not title: + return "" + title = title.lower() + title = re.sub(r"[^\w\s]", "", title) + title = re.sub(r"\s+", " ", title) + return title.strip() + + +def titles_reordered_match(t1: str, t2: str) -> bool: + """Detect titles with same words but different order.""" 
+    tokens1 = set(t1.split())
+    tokens2 = set(t2.split())
+    return tokens1 == tokens2
+
+
+def deduplicate_datasets(all_datasets: List[dict]) -> List[dict]:
+    """Deduplicate datasets using canonical ID, normalized URL, fuzzy title, and reordered title detection."""
+
+    if not all_datasets:
+        return []
+
+    cleaned = []
+    seen_canonical = set()
+    seen_urls = set()
+
+    for dataset in all_datasets:
+        # Title/ID metadata may live under "metadata" or (Elasticsearch-style) "_source"
+        metadata = dataset.get("metadata", {}) or dataset.get("_source", {})
+
+        # Canonical ID: prefer explicit ids from metadata, fall back to the raw "_id"
+        dataset_id = metadata.get("id") or metadata.get("dataset_id") or dataset.get("_id")
+        dataset_id = str(dataset_id).lower() if dataset_id else ""
+
+        datasource_id = str(dataset.get("datasource_id") or "default_source").lower()
+        canonical_key = f"{datasource_id}:{dataset_id}"
+
+        if dataset_id and canonical_key in seen_canonical:
+            continue
+
+        if dataset_id:
+            seen_canonical.add(canonical_key)
+
+        # URL deduplication (query params / fragments stripped by normalize_url)
+        raw_url = dataset.get("primary_link", "")
+        normalized_url = normalize_url(raw_url)
+
+        if normalized_url and normalized_url in seen_urls:
+            continue
+
+        if normalized_url:
+            seen_urls.add(normalized_url)
+
+        # Title normalization
+        title = normalize_title(
+            dataset.get("title")
+            or dataset.get("title_guess")
+            or metadata.get("title")
+            or ""
+        )
+
+        duplicate_found = False
+
+        if title:
+            for existing in cleaned:
+                existing_title = normalize_title(
+                    existing.get("title")
+                    or existing.get("title_guess")
+                    or (existing.get("metadata") or existing.get("_source") or {}).get("title")  # fix: mirror insertion-path fallback
+                )
+
+                if not existing_title:
+                    continue
+
+                # Fuzzy match
+                similarity = SequenceMatcher(None, title, existing_title).ratio()
+
+                if similarity > 0.93:
+                    duplicate_found = True
+                    break
+
+                # Reordered title match
+                if titles_reordered_match(title, existing_title):
+                    duplicate_found = True
+                    break
+
+        if not duplicate_found:
+            cleaned.append(dataset)
+
+    return cleaned
+
 
 @tool(args_schema=BaseModel)