Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
15 changes: 8 additions & 7 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
GOOGLE_API_KEY=
#GOOGLE_API_KEY=false

# Set GEMINI_USE_VERTEX=true to use Vertex AI, or false to use the Gemini API directly
GEMINI_USE_VERTEX=true
GEMINI_USE_VERTEX=true

GCP_PROJECT_ID=
GCP_REGION=
GCP_PROJECT_ID=silicon-shape-484306-u7

GCP_REGION=us-central1

# BigQuery
BQ_DATASET_ID=
BQ_TABLE_ID=
BQ_LOCATION=
BQ_DATASET_ID=silicon-shape-484306-u7.knowledgespace_metadata
BQ_TABLE_ID=silicon-shape-484306-u7.knowledgespace_metadata.knowledge_table
BQ_LOCATION=US

INDEX_ENDPOINT_ID=
DEPLOYED_INDEX_ID=
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ uv venv

# Activate it:
# On Windows (cmd):
.venv/bin/activate

.venv\Scripts\activate # Corrected cmd for Windows users (uv venv creates the .venv directory)



```

Expand Down Expand Up @@ -144,6 +147,12 @@ The backend requires specific environment variables to connect to **Google Cloud

In one terminal, from the project root with the virtual environment active:


```bash
cd backend
```


```bash
uv run main.py
```
Expand Down
Binary file added Users - Shortcut.lnk
Binary file not shown.
17 changes: 5 additions & 12 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,11 @@
FROM python:3.12-slim
WORKDIR /app

RUN pip install uv

COPY pyproject.toml ./
COPY .env ./
COPY backend/service-account.json ./service-account.json
FROM python:3.11-slim

WORKDIR /app

RUN UV_HTTP_TIMEOUT=300 uv sync
COPY . .

COPY backend/ ./
RUN pip install --no-cache-dir -r requirements.txt

EXPOSE 8000

ENV GOOGLE_APPLICATION_CREDENTIALS=/app/service-account.json
CMD ["uv", "run", "main.py", "--host", "0.0.0.0", "--port", "8000", "--no-access-log"]
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
159 changes: 139 additions & 20 deletions backend/ks_search_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@
from difflib import SequenceMatcher



def normalize_title(title: str) -> str:
    """Canonicalize a dataset title for fuzzy comparison.

    Lowercases the text, strips all punctuation, and collapses runs of
    whitespace into single spaces. Falsy input (None or "") yields "".
    """
    if not title:
        return ""
    lowered = title.lower()
    no_punct = re.sub(r"[^\w\s]", "", lowered)  # drop punctuation, keep word chars/spaces
    return re.sub(r"\s+", " ", no_punct).strip()  # collapse whitespace runs


def tool(args_schema):
def decorator(func):
func.args_schema = args_schema
Expand Down Expand Up @@ -361,16 +377,16 @@ def general_search(query: str, top_k: int = 10, enrich_details: bool = True) ->
or "https://knowledge-space.org"
)
normalized_results.append(
{
"id": item.get("id", f"ks{i}"),
"_source": item,
"_score": 1.0,
"title_guess": title,
"content": description,
"primary_link": url,
"metadata": item,
}
)
{
"_id": item.get("id") or item.get("_id") or url,
"_source": item,
"_score": 1.0,
"title_guess": title,
"content": description,
"primary_link": url,
"metadata": item,
}
)
print(f" -> General search returned {len(normalized_results)} results")
if enrich_details and normalized_results:
print(" -> Enriching results with detailed dataset information (parallel)...")
Expand Down Expand Up @@ -443,23 +459,126 @@ def _perform_search(data_source_id: str, query: str, filters: dict, all_configs:
except requests.RequestException as e:
print(f" -> Error searching {data_source_id}: {e}")
return []


# Duplicate Removal Feature

def normalize_url(url: str) -> str:
    """Return scheme://host/path with query string, fragment, and any
    trailing slashes removed; falsy input yields ""."""
    if not url:
        return ""
    parts = urlparse(url)
    base = "".join((parts.scheme, "://", parts.netloc, parts.path))
    return base.rstrip("/")

def deduplicate_datasets(all_datasets: List[dict]) -> List[dict]:
    """Remove duplicate dataset records from a merged result list.

    Three identity signals are checked, strongest first:
      1. canonical identity: ``datasource_id`` + per-source dataset id
      2. normalized URL (query string and fragment stripped)
      3. fuzzy title similarity (ratio > 0.93) against already-kept entries

    Untitled records are never matched by the fuzzy-title check: two empty
    titles have a SequenceMatcher ratio of 1.0, which would otherwise drop
    every untitled record after the first one.

    Args:
        all_datasets: normalized search-result dicts (possibly from
            multiple data sources).

    Returns:
        A new list in first-seen order with duplicates removed; the input
        list is not modified.
    """
    cleaned: List[dict] = []
    # Normalized titles of kept entries, cached so the fuzzy pass does not
    # re-normalize every kept record on every iteration.
    kept_titles: List[str] = []
    seen_canonical = set()
    seen_urls = set()

    for dataset in all_datasets:
        metadata = dataset.get("metadata", {}) or dataset.get("_source", {})

        # 1) Canonical identity: datasource + per-source dataset id.
        datasource_id = dataset.get("datasource_id")
        dataset_id = (
            metadata.get("id")
            or metadata.get("dataset_id")
            or dataset.get("_id")
        )
        if datasource_id and dataset_id:
            canonical_key = f"{datasource_id}:{dataset_id}"
            if canonical_key in seen_canonical:
                continue
            seen_canonical.add(canonical_key)

        # 2) URL identity, ignoring query params / fragments.
        normalized_url = normalize_url(dataset.get("primary_link", ""))
        if normalized_url:
            if normalized_url in seen_urls:
                continue
            seen_urls.add(normalized_url)

        # 3) Fuzzy title match against everything kept so far. Uses the
        # same title-extraction fallback chain as the candidate itself
        # (including metadata["title"]) for consistency.
        title = normalize_title(
            dataset.get("title")
            or dataset.get("title_guess")
            or metadata.get("title")
            or ""
        )
        if title and any(
            SequenceMatcher(None, title, existing).ratio() > 0.93
            for existing in kept_titles
        ):
            continue

        cleaned.append(dataset)
        kept_titles.append(title)

    return cleaned



# ks_search_tool.py

@tool(args_schema=BaseModel)
def smart_knowledge_search(
    query: Optional[str] = None,
    filters: Optional[Union[Dict, Set]] = None,
    data_source: Optional[str] = None,
    top_k: int = 10,
) -> dict:
    """Search knowledge-space data sources and return deduplicated results.

    When ``filters`` are given and ``data_source`` resolves to a configured
    source (by friendly name via DATASOURCE_NAME_TO_ID, or directly by id),
    a targeted ``_perform_search`` is run; otherwise falls back to
    ``general_search``. All results pass through ``deduplicate_datasets``
    before being truncated to ``top_k``.

    Args:
        query: search text; ``None``/"" becomes the wildcard "*".
        filters: optional source-specific filters.
        data_source: optional source name or id to restrict the search.
        top_k: maximum number of results to return.

    Returns:
        ``{"combined_results": [...]}``; empty list on any error.
    """
    q = query or "*"
    try:
        if filters:
            # Filters provided: try a targeted per-source search.
            config_path = "datasources_config.json"
            if os.path.exists(config_path):
                with open(config_path, "r", encoding="utf-8") as fh:
                    all_configs = json.load(fh)
                target_id = DATASOURCE_NAME_TO_ID.get(data_source) or (
                    data_source if data_source in all_configs else None
                )
                if target_id:
                    results = _perform_search(target_id, q, dict(filters), all_configs)
                    results = deduplicate_datasets(results)
                    return {"combined_results": results[:top_k]}

        # No filters, missing config, or unknown data source: general search.
        # Over-fetch (top_k * 2) so deduplication still leaves enough results.
        results = general_search(q, top_k * 2, enrich_details=True).get("combined_results", [])
        results = deduplicate_datasets(results)
        return {"combined_results": results[:top_k]}

    except Exception as e:
        # Tool boundary: never propagate — report and return an empty result set.
        print(f"Error in smart_knowledge_search: {e}")
        return {"combined_results": []}
Binary file added backend/requirements.txt
Binary file not shown.
Empty file added backend/service-account.json
Empty file.
44 changes: 44 additions & 0 deletions frontend/google-cloud-sdk/.install/bq-nix.snapshot.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"components": [
{
"data": {
"contents_checksum": "239d13b0bd2ce8c978287c3a07407946f459b829fda560a93665230e0b6f7338",
"source": "",
"type": "tar"
},
"dependencies": [
"bq",
"core"
],
"details": {
"description": "Provides the bq tool for interacting with the BigQuery service.",
"display_name": "BigQuery Command Line Tool (Platform Specific)"
},
"gdu_only": false,
"id": "bq-nix",
"is_configuration": false,
"is_hidden": true,
"is_required": false,
"platform": {
"operating_systems": [
"CYGWIN",
"LINUX",
"MACOSX",
"MSYS"
]
},
"platform_required": false,
"version": {
"build_number": 20260109121340,
"version_string": "2.1.27"
}
}
],
"revision": 20260109121340,
"schema_version": {
"no_update": false,
"url": "https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz",
"version": 3
},
"version": "552.0.0"
}
38 changes: 38 additions & 0 deletions frontend/google-cloud-sdk/.install/bq.snapshot.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"components": [
{
"data": {
"contents_checksum": "f8ce8c5d2fd09e489df1b824959d65129207935942fdb31c4da9bb9e9ab4e375",
"source": "",
"type": "tar"
},
"dependencies": [
"bq-nix",
"bq-win",
"core"
],
"details": {
"description": "Provides the bq tool for interacting with the BigQuery service.",
"display_name": "BigQuery Command Line Tool"
},
"gdu_only": false,
"id": "bq",
"is_configuration": false,
"is_hidden": false,
"is_required": false,
"platform": {},
"platform_required": false,
"version": {
"build_number": 20260109121340,
"version_string": "2.1.27"
}
}
],
"revision": 20260109121340,
"schema_version": {
"no_update": false,
"url": "https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz",
"version": 3
},
"version": "552.0.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"components": [
{
"data": {
"contents_checksum": "8512cea4de7af7cc63b12501e7a23b45ba0f07c7debbf1fc52af75ce07539c8b",
"source": "",
"type": "tar"
},
"dependencies": [
"bundled-python3-unix",
"core"
],
"details": {
"description": "Provides stand-alone Python 3.13.10 installation for UNIX.",
"display_name": "Bundled Python 3.13 (Platform Specific)"
},
"gdu_only": false,
"id": "bundled-python3-unix-linux-x86_64",
"is_configuration": false,
"is_hidden": true,
"is_required": false,
"platform": {
"architectures": [
"x86_64"
],
"operating_systems": [
"LINUX"
]
},
"platform_required": false,
"version": {
"build_number": 20260109121340,
"version_string": "3.13.10"
}
}
],
"revision": 20260109121340,
"schema_version": {
"no_update": false,
"url": "https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz",
"version": 3
},
"version": "552.0.0"
}
Loading