Edge-Intelligence-Lab · pomegranar · Apr 16, 2026 · Apr 15, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.gitignore b/.gitignore
@@ -224,3 +224,13 @@ benchmarks/questions.json
 
 #For testing
 **/benchmarks
+
+# Claude Code local settings and transient state
+.claude/settings.local.json
+.claude/skills/.cache/
+.claude/skills/*.log
+.claude/skills/state/
+.claude/skills/tmp/
+.claude/conversations/
+.claude/sessions/
+.claude/checkpoints/
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,96 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+ChatDKU is an agentic RAG (Retrieval-Augmented Generation) system for Duke Kunshan University. It answers student questions about courses, policies, requirements, and campus resources using a three-stage DSPy pipeline: **Planner** -> **Executor** (assess-act-distill loop) -> **Synthesizer**.
+
+## Commands
+
+```bash
+# Install dependencies
+uv sync
+
+# Run the agent CLI
+python -m chatdku.core.agent
+
+# Run Django backend
+python manage.py runserver
+
+# Run tests
+python -m pytest tests/
+python -m pytest tests/test_retriever.py          # single file
+
+# Lint (CI runs these on changed .py files in PRs)
+black --check <files>
+flake8 --ignore=E203,W503 --max-line-length 120 <files>
+
+# Format
+black <files>
+
+# Sync and run on dev server
+./devsync.sh                                      # runs the agent remotely
+./devsync.sh chatdku/core/tools/your_file.py      # runs a specific file
+```
+
+## Architecture
+
+### Agent Pipeline (`chatdku/core/`)
+
+The agent runs three DSPy modules in sequence per user message (items 1–3); a fourth component (item 4) maintains chat history across turns:
+
+1. **Planner** (`dspy_classes/plan.py`) — Decides whether to answer directly (`send_message`), ask clarifying questions, or produce a plan for what information to gather.
+2. **Executor** (`dspy_classes/executor.py`) — Runs an Assess-Act loop guided by the plan, calling tools to gather information. A Distill step extracts only relevant context from the trajectory.
+3. **Synthesizer** (`dspy_classes/synthesizer.py`) — Generates the final cited response from distilled context.
+4. **ConversationMemory** (`dspy_classes/conversation_memory.py`) — Compresses/maintains chat history across turns.
+
+Entry point: `chatdku/core/agent.py`
+
+### Tools (`chatdku/core/tools/`)
+
+Tools available to the executor: `VectorQuery` (ChromaDB semantic search), `KeywordQuery` (Redis BM25), `MajorRequirementsLookup`, `QueryCurriculum` (PostgreSQL course/syllabus DB), `PrerequisiteLookup`, and others (calculator, campus service, email, search, GraphRAG).
+
+### Document Ingestion (`chatdku/ingestion/`)
+
+Parses, chunks, and loads documents into vector/keyword stores:
+- `update_data.py` — Parse and chunk documents
+- `load_chroma.py` — Load into ChromaDB
+- `load_redis.py` — Load into Redis (BM25 index)
+- `load_postgres.py` — Load course data into PostgreSQL
+- `clean_classdata.py` — Clean class schedule CSV data
+
+### Configuration (`chatdku/config.py`)
+
+Singleton `Config` class loaded from environment variables (`.env`). Access via `from chatdku.config import config`. Supports attribute access (`config.llm_temperature`), `.set()`, `.update()`, and a read-only `.view()`. All paths, model names, DB connections, and tuning parameters live here.
+
+### Backend (`chatdku/backend/`)
+
+There are two backends: a legacy Flask backend and a Django backend (`chatdku/django/`). The Django app uses `manage.py` at the repo root.
+
+### Infrastructure
+
+- **LLM**: OpenAI-compatible endpoint (vLLM/SGLang with Qwen models)
+- **Embeddings**: TEI with `BAAI/bge-m3`
+- **Vector DB**: ChromaDB
+- **Keyword Index**: Redis
+- **Course DB**: PostgreSQL
+- **Observability**: Arize Phoenix
+- **Framework**: DSPy for agent logic, LlamaIndex for document ingestion
+
+## Code Style
+
+- **Formatter**: Black (pre-commit hook, CI)
+- **Linter**: Flake8 — `max-line-length=120`, ignores `E203,W503`
+- **Python**: 3.11 (pinned in `.python-version`); `pyproject.toml` requires `>=3.11, <3.13`
+- **Docstrings**: NumPy format preferred (per GUIDE.md)
+
+## Key Environment Variables
+
+`LLM_BASE_URL`, `LLM_API_KEY`, `LLM_MODEL`, `TEI_URL`, `EMBEDDING_MODEL`, `CHROMA_HOST`, `CHROMA_DB_PORT`, `REDIS_HOST`, `REDIS_PORT`, `REDIS_PASSWORD`, `DB_USER`, `DB_PASSWORD`, `DB_HOST`, `DB_PORT`, `DB_NAME`. See `.env.example` or `chatdku/config.py` for the full set.
+
+## Git Workflow
+
+- `main` branch is protected — never push directly; always use PRs with review.
+- Create a GitHub issue before starting work.
+- Use `devsync.sh` to iterate on the shared dev server (rsyncs code, runs `uv sync`, starts a live session).
diff --git a/chatdku/ingestion/major_ingest.py b/chatdku/ingestion/major_ingest.py
@@ -137,7 +137,7 @@ def sanitize_filename(name: str) -> str:
     # Remove or replace unsafe characters
     safe = re.sub(r"[^\w\s-]", "", name)
     safe = re.sub(r"[-\s]+", "-", safe)
-    return safe.strip("-").lower()
+    return safe.strip().lower()
 
 
 def save_major_content(major_name: str, content: Dict, output_dir: Path):

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,147 @@
+"""Shared fixtures for ChatDKU tool tests."""
+
+from contextlib import contextmanager
+from unittest.mock import MagicMock
+
+import pandas as pd
+import pytest
+
+
+@pytest.fixture()
+def mock_span_ctx(monkeypatch):
+    """Mock span_ctx_start so no real tracer/Phoenix is needed.
+
+    Patches at every import site since each tool module binds the name at import time.
+    Returns the mock span for assertions on set_attributes / set_status.
+    """
+    mock_span = MagicMock()
+
+    @contextmanager
+    def fake_span_ctx_start(name, kind, parent_context=None):
+        yield mock_span
+
+    targets = [
+        "chatdku.core.utils.span_ctx_start",
+        "chatdku.core.tools.course_schedule.span_ctx_start",
+        "chatdku.core.tools.get_prerequisites.span_ctx_start",
+        "chatdku.core.tools.major_requirements.span_ctx_start",
+        "chatdku.core.tools.syllabi_tool.query_curriculum_db.span_ctx_start",
+        "chatdku.core.tools.retriever.base_retriever.span_ctx_start",
+    ]
+    for target in targets:
+        try:
+            monkeypatch.setattr(target, fake_span_ctx_start)
+        except (AttributeError, ImportError):
+            pass  # module not yet imported — safe to skip
+
+    return mock_span
+
+
+@pytest.fixture()
+def mock_get_current_span(monkeypatch):
+    """Mock get_current_span for llama_index_tools which uses it directly."""
+    mock_span = MagicMock()
+    monkeypatch.setattr(
+        "chatdku.core.tools.llama_index_tools.get_current_span", lambda: mock_span
+    )
+    return mock_span
+
+
+@pytest.fixture()
+def sample_classdata_csv(tmp_path):
+    """Create a temporary class schedule CSV with representative data."""
+    csv_path = tmp_path / "classdata.csv"
+    df = pd.DataFrame(
+        {
+            "Subject": ["COMPSCI", "COMPSCI", "MATH", "BIOL", "CHINESE"],
+            "Catalog": ["101", "201", "201", "305", "101A"],
+            "Section": ["01", "01", "01", "01", "01"],
+            "Component": ["LEC", "LEC", "LEC", "LAB", "LEC"],
+            "Instructor": [
+                "Alice Smith",
+                "Bob Jones",
+                "Carol Lee",
+                "Dave Kim",
+                "Eve Wu",
+            ],
+            "Days": ["MWF", "TTh", "MWF", "TTh", "MWF"],
+            "Start Time": ["09:00", "10:30", "11:00", "14:00", "13:00"],
+            "End Time": ["09:50", "11:45", "11:50", "15:15", "13:50"],
+            "Enrollment": [30, 25, 40, 15, 20],
+        }
+    )
+    df.to_csv(csv_path, index=False)
+    return str(csv_path)
+
+
+@pytest.fixture()
+def sample_prereq_csv(tmp_path):
+    """Create a temporary prerequisites CSV with UTF-16LE encoding.
+
+    Column layout matches positional access in get_prereq:
+      col 0: ID
+      col 1: Effective Date (MM/DD/YYYY)
+      col 2: Subject
+      col 3: Catalog
+      cols 4-12: padding
+      col 13: Description (prerequisite text)
+    """
+    csv_path = tmp_path / "prereq.csv"
+    rows = [
+        # COMPSCI 201 with prereqs, two rows with different dates
+        [
+            1,
+            "01/15/2023",
+            "COMPSCI",
+            "201",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "Prereq: COMPSCI 101",
+        ],
+        [
+            2,
+            "09/01/2024",
+            "COMPSCI",
+            "201",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "Prereq: COMPSCI 101 or COMPSCI 102",
+        ],
+        # MATH 201 with prereqs
+        [
+            3,
+            "03/10/2024",
+            "MATH",
+            "201",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "Prereq: MATH 101",
+        ],
+        # BIOL 305 with empty description
+        [4, "06/01/2024", "BIOL", "305", "", "", "", "", "", "", "", "", "", ""],
+    ]
+    columns = [f"col{i}" for i in range(14)]
+    df = pd.DataFrame(rows, columns=columns)
+    df.to_csv(csv_path, index=False, encoding="utf-16le")
+    return str(csv_path)