Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
15 changes: 8 additions & 7 deletions .env.template
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
GOOGLE_API_KEY=
#GOOGLE_API_KEY=false

# Set GEMINI_USE_VERTEX=true to use Vertex AI, or false to use the Gemini API directly
GEMINI_USE_VERTEX=true
GEMINI_USE_VERTEX=true

GCP_PROJECT_ID=
GCP_REGION=
GCP_PROJECT_ID=silicon-shape-484306-u7

GCP_REGION=us-central1

# BigQuery
BQ_DATASET_ID=
BQ_TABLE_ID=
BQ_LOCATION=
BQ_DATASET_ID=silicon-shape-484306-u7.knowledgespace_metadata
BQ_TABLE_ID=silicon-shape-484306-u7.knowledgespace_metadata.knowledge_table
BQ_LOCATION=US

INDEX_ENDPOINT_ID=
DEPLOYED_INDEX_ID=
Expand Down
11 changes: 10 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ uv venv

# Activate it:
# On Windows (cmd):
.venv/bin/activate

.venv\Scripts\activate # Corrected cmd for Windows users (uv venv creates the .venv directory)



```

Expand Down Expand Up @@ -144,6 +147,12 @@ The backend requires specific environment variables to connect to **Google Cloud

In one terminal, from the project root with the virtual environment active:


```bash
cd backend
```


```bash
uv run main.py
```
Expand Down
Binary file added Users - Shortcut.lnk
Binary file not shown.
17 changes: 5 additions & 12 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,11 @@
FROM python:3.12-slim
WORKDIR /app

RUN pip install uv

COPY pyproject.toml ./
COPY .env ./
COPY backend/service-account.json ./service-account.json
FROM python:3.11-slim

WORKDIR /app

RUN UV_HTTP_TIMEOUT=300 uv sync
COPY . .

COPY backend/ ./
RUN pip install --no-cache-dir -r requirements.txt

EXPOSE 8000

ENV GOOGLE_APPLICATION_CREDENTIALS=/app/service-account.json
CMD ["uv", "run", "main.py", "--host", "0.0.0.0", "--port", "8000", "--no-access-log"]
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
159 changes: 139 additions & 20 deletions backend/ks_search_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@
from difflib import SequenceMatcher



def normalize_title(title: str) -> str:
    """Canonicalize a dataset title for fuzzy comparison.

    Lowercases the text, strips all punctuation, and collapses runs of
    whitespace into single spaces. Falsy input (None or "") yields "".
    """
    if not title:
        return ""
    lowered = title.lower()
    no_punct = re.sub(r"[^\w\s]", "", lowered)  # drop punctuation, keep word chars/spaces
    return re.sub(r"\s+", " ", no_punct).strip()  # collapse whitespace runs


def tool(args_schema):
def decorator(func):
func.args_schema = args_schema
Expand Down Expand Up @@ -361,16 +377,16 @@ def general_search(query: str, top_k: int = 10, enrich_details: bool = True) ->
or "https://knowledge-space.org"
)
normalized_results.append(
{
"id": item.get("id", f"ks{i}"),
"_source": item,
"_score": 1.0,
"title_guess": title,
"content": description,
"primary_link": url,
"metadata": item,
}
)
{
"_id": item.get("id") or item.get("_id") or url,
"_source": item,
"_score": 1.0,
"title_guess": title,
"content": description,
"primary_link": url,
"metadata": item,
}
)
print(f" -> General search returned {len(normalized_results)} results")
if enrich_details and normalized_results:
print(" -> Enriching results with detailed dataset information (parallel)...")
Expand Down Expand Up @@ -443,23 +459,126 @@ def _perform_search(data_source_id: str, query: str, filters: dict, all_configs:
except requests.RequestException as e:
print(f" -> Error searching {data_source_id}: {e}")
return []


# Duplicate Removal Feature

def normalize_url(url: str) -> str:
    """Return scheme://host/path with query string, fragment, and any
    trailing slashes removed; falsy input yields ""."""
    if not url:
        return ""
    parts = urlparse(url)
    base = "".join((parts.scheme, "://", parts.netloc, parts.path))
    return base.rstrip("/")

def deduplicate_datasets(all_datasets: List[dict]) -> List[dict]:
    """Remove duplicate dataset records from a merged result list.

    Three identity signals are checked, strongest first:
      1. canonical identity: ``datasource_id`` + per-source dataset id
      2. normalized URL (query string and fragment stripped)
      3. fuzzy title similarity (ratio > 0.93) against already-kept entries

    Untitled records are never matched by the fuzzy-title check: two empty
    titles have a SequenceMatcher ratio of 1.0, which would otherwise drop
    every untitled record after the first one.

    Args:
        all_datasets: normalized search-result dicts (possibly from
            multiple data sources).

    Returns:
        A new list in first-seen order with duplicates removed; the input
        list is not modified.
    """
    cleaned: List[dict] = []
    # Normalized titles of kept entries, cached so the fuzzy pass does not
    # re-normalize every kept record on every iteration.
    kept_titles: List[str] = []
    seen_canonical = set()
    seen_urls = set()

    for dataset in all_datasets:
        metadata = dataset.get("metadata", {}) or dataset.get("_source", {})

        # 1) Canonical identity: datasource + per-source dataset id.
        datasource_id = dataset.get("datasource_id")
        dataset_id = (
            metadata.get("id")
            or metadata.get("dataset_id")
            or dataset.get("_id")
        )
        if datasource_id and dataset_id:
            canonical_key = f"{datasource_id}:{dataset_id}"
            if canonical_key in seen_canonical:
                continue
            seen_canonical.add(canonical_key)

        # 2) URL identity, ignoring query params / fragments.
        normalized_url = normalize_url(dataset.get("primary_link", ""))
        if normalized_url:
            if normalized_url in seen_urls:
                continue
            seen_urls.add(normalized_url)

        # 3) Fuzzy title match against everything kept so far. Uses the
        # same title-extraction fallback chain as the candidate itself
        # (including metadata["title"]) for consistency.
        title = normalize_title(
            dataset.get("title")
            or dataset.get("title_guess")
            or metadata.get("title")
            or ""
        )
        if title and any(
            SequenceMatcher(None, title, existing).ratio() > 0.93
            for existing in kept_titles
        ):
            continue

        cleaned.append(dataset)
        kept_titles.append(title)

    return cleaned



# ks_search_tool.py

@tool(args_schema=BaseModel)
def smart_knowledge_search(
    query: Optional[str] = None,
    filters: Optional[Union[Dict, Set]] = None,
    data_source: Optional[str] = None,
    top_k: int = 10,
) -> dict:
    """Search knowledge-space data sources and return deduplicated results.

    When ``filters`` are given and ``data_source`` resolves to a configured
    source (by friendly name via DATASOURCE_NAME_TO_ID, or directly by id),
    a targeted ``_perform_search`` is run; otherwise falls back to
    ``general_search``. All results pass through ``deduplicate_datasets``
    before being truncated to ``top_k``.

    Args:
        query: search text; ``None``/"" becomes the wildcard "*".
        filters: optional source-specific filters.
        data_source: optional source name or id to restrict the search.
        top_k: maximum number of results to return.

    Returns:
        ``{"combined_results": [...]}``; empty list on any error.
    """
    q = query or "*"
    try:
        if filters:
            # Filters provided: try a targeted per-source search.
            config_path = "datasources_config.json"
            if os.path.exists(config_path):
                with open(config_path, "r", encoding="utf-8") as fh:
                    all_configs = json.load(fh)
                target_id = DATASOURCE_NAME_TO_ID.get(data_source) or (
                    data_source if data_source in all_configs else None
                )
                if target_id:
                    results = _perform_search(target_id, q, dict(filters), all_configs)
                    results = deduplicate_datasets(results)
                    return {"combined_results": results[:top_k]}

        # No filters, missing config, or unknown data source: general search.
        # Over-fetch (top_k * 2) so deduplication still leaves enough results.
        results = general_search(q, top_k * 2, enrich_details=True).get("combined_results", [])
        results = deduplicate_datasets(results)
        return {"combined_results": results[:top_k]}

    except Exception as e:
        # Tool boundary: never propagate — report and return an empty result set.
        print(f"Error in smart_knowledge_search: {e}")
        return {"combined_results": []}
Binary file added backend/requirements.txt
Binary file not shown.
Empty file added backend/service-account.json
Empty file.
44 changes: 44 additions & 0 deletions frontend/google-cloud-sdk/.install/bq-nix.snapshot.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"components": [
{
"data": {
"contents_checksum": "239d13b0bd2ce8c978287c3a07407946f459b829fda560a93665230e0b6f7338",
"source": "",
"type": "tar"
},
"dependencies": [
"bq",
"core"
],
"details": {
"description": "Provides the bq tool for interacting with the BigQuery service.",
"display_name": "BigQuery Command Line Tool (Platform Specific)"
},
"gdu_only": false,
"id": "bq-nix",
"is_configuration": false,
"is_hidden": true,
"is_required": false,
"platform": {
"operating_systems": [
"CYGWIN",
"LINUX",
"MACOSX",
"MSYS"
]
},
"platform_required": false,
"version": {
"build_number": 20260109121340,
"version_string": "2.1.27"
}
}
],
"revision": 20260109121340,
"schema_version": {
"no_update": false,
"url": "https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz",
"version": 3
},
"version": "552.0.0"
}
38 changes: 38 additions & 0 deletions frontend/google-cloud-sdk/.install/bq.snapshot.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"components": [
{
"data": {
"contents_checksum": "f8ce8c5d2fd09e489df1b824959d65129207935942fdb31c4da9bb9e9ab4e375",
"source": "",
"type": "tar"
},
"dependencies": [
"bq-nix",
"bq-win",
"core"
],
"details": {
"description": "Provides the bq tool for interacting with the BigQuery service.",
"display_name": "BigQuery Command Line Tool"
},
"gdu_only": false,
"id": "bq",
"is_configuration": false,
"is_hidden": false,
"is_required": false,
"platform": {},
"platform_required": false,
"version": {
"build_number": 20260109121340,
"version_string": "2.1.27"
}
}
],
"revision": 20260109121340,
"schema_version": {
"no_update": false,
"url": "https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz",
"version": 3
},
"version": "552.0.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"components": [
{
"data": {
"contents_checksum": "8512cea4de7af7cc63b12501e7a23b45ba0f07c7debbf1fc52af75ce07539c8b",
"source": "",
"type": "tar"
},
"dependencies": [
"bundled-python3-unix",
"core"
],
"details": {
"description": "Provides stand-alone Python 3.13.10 installation for UNIX.",
"display_name": "Bundled Python 3.13 (Platform Specific)"
},
"gdu_only": false,
"id": "bundled-python3-unix-linux-x86_64",
"is_configuration": false,
"is_hidden": true,
"is_required": false,
"platform": {
"architectures": [
"x86_64"
],
"operating_systems": [
"LINUX"
]
},
"platform_required": false,
"version": {
"build_number": 20260109121340,
"version_string": "3.13.10"
}
}
],
"revision": 20260109121340,
"schema_version": {
"no_update": false,
"url": "https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz",
"version": 3
},
"version": "552.0.0"
}
Loading