Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 196 additions & 0 deletions .github/workflows/llm-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# DREF Assist LLM Handler Tests — Tier 1 (pytest)
#
# Runs hard-coded assertion tests that make real Azure OpenAI API calls.
# These tests verify binary safety/correctness properties:
# - Prompt injection resistance (BLOCKER)
# - Silent overwrite prevention (BLOCKER)
# - Cross-turn conflict detection (BLOCKER)
# - Schema validation, type checking, null preservation
# - Off-topic classification
#
# CI POLICY:
# - Never blocks PR merge — results are for visibility only
# - Blocker failures are labelled clearly in the PR comment
# - Tier 2 (Promptfoo judge) is NOT run in CI — locally/nightly only
#
# REQUIRED SECRETS:
# - AZURE_OPENAI_API_KEY
# - AZURE_OPENAI_ENDPOINT
# - AZURE_OPENAI_API_VERSION
# - AZURE_OPENAI_DEPLOYMENT
#
# NOTE(review): pull_request events from forked repositories do not receive
# repository secrets, so the "Verify API key is configured" step will fail
# for fork PRs — confirm whether that is acceptable or fork PRs should skip.

name: LLM Handler Tests (Tier 1)

on:
push:
branches: [main, "feature/**"]
paths:
- "backend/**"
pull_request:
branches: [main]
paths:
- "backend/**"

jobs:
tier1-tests:
runs-on: ubuntu-latest
timeout-minutes: 10

steps:
# ── Checkout code ──────────────────────────────────────
- uses: actions/checkout@v4

# ── Set up Python ──────────────────────────────────────
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: "3.12"

# ── Install dependencies ───────────────────────────────
- name: Install backend dependencies
working-directory: backend
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest pytest-json-report

# ── Check API key is configured ────────────────────────
# Fail early with a clear message if secrets are missing
- name: Verify API key is configured
env:
AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
run: |
if [ -z "$AZURE_OPENAI_API_KEY" ]; then
echo "::error::AZURE_OPENAI_API_KEY secret is not configured."
echo "::error::LLM tests require Azure OpenAI credentials."
echo "::error::Add AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT,"
echo "::error::AZURE_OPENAI_API_VERSION, and AZURE_OPENAI_DEPLOYMENT"
echo "::error::as repository secrets in Settings > Secrets and variables > Actions."
exit 1
fi

# ── Create results directory ───────────────────────────
# pytest-json-report will not create parent directories itself.
- name: Create results directory
run: mkdir -p backend/tests/results/latest

# ── Run Tier 1 tests ───────────────────────────────────
# Uses || true so the job continues even if tests fail
# (CI is for visibility, not gatekeeping)
- name: Run Tier 1 tests
working-directory: backend/tests
env:
AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
AZURE_OPENAI_API_VERSION: ${{ secrets.AZURE_OPENAI_API_VERSION }}
AZURE_OPENAI_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_DEPLOYMENT }}
run: |
python -m pytest tier1/ -v \
--json-report \
--json-report-file=results/latest/tier1_results.json \
|| true

# ── Upload test results as artifact ────────────────────
# Runs even when the test step failed, so failures are inspectable.
- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
with:
name: llm-test-results
path: backend/tests/results/latest/tier1_results.json
retention-days: 30

# ── Generate and post PR comment ───────────────────────
# Parses the JSON report and posts a readable summary
# with blocker/non-blocker distinction
- name: Generate PR comment
if: github.event_name == 'pull_request'
working-directory: backend/tests
run: |
python3 - <<'SCRIPT'
"""Render a markdown PR comment from a pytest-json-report results file.

Reads results/latest/tier1_results.json (relative to backend/tests) and
writes /tmp/pr_comment.md. Blocker tests (safety-critical properties) are
reported separately from ordinary failures; merge is never blocked.
"""
import json
import sys
from pathlib import Path

RESULTS_FILE = Path("results/latest/tier1_results.json")
COMMENT_FILE = Path("/tmp/pr_comment.md")

# Test IDs whose failure is safety-critical and must be surfaced as a blocker.
BLOCKER_IDS = {
    "test_10_1_prompt_injection",
    "test_12_1_cross_turn_contradiction",
    "test_12_2_conflict_resolution_ux_flow",
    "test_12_4_cross_document_conflict",
    "test_12_5_silent_overwrite_prevention",
}


def _short(message, limit=150):
    """Truncate a failure message for the comment; non-strings pass through."""
    if isinstance(message, str) and len(message) > limit:
        return message[:limit] + "..."
    return message


def build_comment(data):
    """Build the full markdown comment body from the parsed report dict."""
    tests = data.get("tests", [])

    passed, skipped = [], []
    blocker_fails, other_fails = [], []

    for t in tests:
        nodeid = t.get("nodeid", "")
        # Bare test name, without the file path portion of the nodeid.
        name = nodeid.split("::")[-1] if "::" in nodeid else nodeid
        outcome = t.get("outcome", "unknown")

        if outcome == "passed":
            passed.append(name)
        elif outcome == "skipped":
            skipped.append(name)
        else:
            # Failed (or errored/unknown): capture a truncated failure message.
            msg = _short(t.get("call", {}).get("longrepr", ""))
            target = blocker_fails if name in BLOCKER_IDS else other_fails
            target.append((name, msg))

    # Skipped tests are excluded from the pass-rate denominator.
    total = len(tests) - len(skipped)
    pass_count = len(passed)

    lines = []

    if blocker_fails:
        lines.append(f"⚠️ **DREF Assist LLM Tests — {pass_count}/{total} passed**\n")
        lines.append("### 🔴 BLOCKER FAILURES (treat as urgent):")
        for name, msg in blocker_fails:
            lines.append(f" - `{name}` — {msg}")
        lines.append("")
    else:
        lines.append(f"✅ **DREF Assist LLM Tests — {pass_count}/{total} passed**\n")

    if other_fails:
        lines.append("### ℹ️ NON-BLOCKER FAILURES:")
        for name, msg in other_fails:
            lines.append(f" - `{name}` — {msg}")
        lines.append("")
    elif not blocker_fails:
        lines.append("All tests passed. No blocker or non-blocker failures.\n")

    if skipped:
        lines.append(f"*{len(skipped)} test(s) skipped (likely missing API key)*\n")

    lines.append(
        "> Merge is not blocked. Blocker failures should be resolved "
        "before further changes are made on top of this commit."
    )
    return "\n".join(lines)


def main():
    """Write the comment file; returns the process exit status (always 0)."""
    if not RESULTS_FILE.exists():
        # No report at all — most likely the test step never ran.
        COMMENT_FILE.write_text(
            "## DREF Assist LLM Tests\n\n"
            "⚠️ No test results found. API key may not be configured.\n"
        )
        return 0
    COMMENT_FILE.write_text(build_comment(json.loads(RESULTS_FILE.read_text())))
    return 0


if __name__ == "__main__":
    sys.exit(main())
SCRIPT

# ── Post comment on PR ─────────────────────────────────
# sticky-pull-request-comment creates one comment on the PR and updates it
# in place on subsequent runs, instead of posting a new comment per push.
- name: Post PR comment
if: github.event_name == 'pull_request'
uses: marocchino/sticky-pull-request-comment@v2
with:
path: /tmp/pr_comment.md
3 changes: 3 additions & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,6 @@ htmlcov/
# Virtual environments
.venv/
venv/

# LLM test results (generated artifacts)
tests/results/
30 changes: 23 additions & 7 deletions backend/llm_handler/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,23 @@
import os
from typing import Dict, Any, List, Optional, Union

from openai import AzureOpenAI
from openai import AzureOpenAI, BadRequestError
from dotenv import load_dotenv

from .prompt import build_system_prompt
from .parser import process_llm_response

load_dotenv()

# Canned handler result returned when Azure's content filter rejects the
# request (BadRequestError) or redacts the completion (None content). Shaped
# like a normal handler response: classification, user-facing reply, and an
# empty list of field updates.
#
# NOTE(review): call sites return _CONTENT_FILTER_RESPONSE.copy(), which is a
# *shallow* copy — the "field_updates" list object is shared by every copy.
# If any caller mutates that list in place, this module-level constant would
# be corrupted. Consider copy.deepcopy or a factory function; TODO confirm
# callers never mutate "field_updates".
_CONTENT_FILTER_RESPONSE: Dict[str, Any] = {
    "classification": "OFF_TOPIC",
    "reply": (
        "I'm not able to help with that request. "
        "Please ask something related to the DREF application."
    ),
    "field_updates": [],
}

# Type alias for message content (text string or multimodal list from media-processor)
MessageContent = Union[str, List[Dict[str, Any]]]

Expand Down Expand Up @@ -69,13 +78,20 @@ def handle_message(

messages.append({"role": "user", "content": user_message})

response = client.chat.completions.create(
model=os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
messages=messages,
temperature=0.1,
response_format={"type": "json_object"},
)
try:
response = client.chat.completions.create(
model=os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
messages=messages,
temperature=0.1,
response_format={"type": "json_object"},
)
except BadRequestError:
# Azure content management policy rejected the prompt
return _CONTENT_FILTER_RESPONSE.copy()

# Content filter may allow the request but redact the response
raw_response = response.choices[0].message.content
if raw_response is None:
return _CONTENT_FILTER_RESPONSE.copy()

return process_llm_response(raw_response)
3 changes: 3 additions & 0 deletions backend/llm_handler/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
INFERRED fields:
- You MAY logically deduce the value from available evidence, even if the value is not stated verbatim.
- The inference must be strong and unambiguous. For example: if the event is an earthquake, disaster_onset can be inferred as "Sudden".
- For disaster_type specifically: infer from descriptive language even in narrative or indirect phrasing. Words like "flooding", "water breached banks", "inundated" → Flood; "shaking", "tremors", "magnitude" → Earthquake; "winds", "cyclone", "hurricane", "typhoon" → Storm / Tropical Cyclone; "dry conditions", "crop failure", "water scarcity" → Drought. Do NOT wait for the exact word "Flood" or "Earthquake" to appear — infer from context.
- If the inference is uncertain or could go either way, ask for clarification instead.
- In your reply, briefly note any inferred values so the user can verify them.

Expand All @@ -75,6 +76,8 @@
- Never invent numbers, dates, or contact information not present in the sources.
- Do not copy information between fields (e.g., don't assume targeted population equals affected population).
- For dropdown fields, only use values from the allowed options listed in the schema.
- For ambiguous place names (e.g., "Springfield", "Victoria", "Central Region") that exist in multiple countries, do NOT assume or infer a country. Ask the user to specify the country before populating the country or region field.
- For dates written in ambiguous slash-delimited format (e.g., "03/04/2025" which could be March 4 or April 3), do NOT silently assume MM/DD or DD/MM interpretation. Ask the user to clarify the intended date before populating any date field. Only dates in unambiguous formats (ISO YYYY-MM-DD, written-out month names like "4 March 2025", or single-digit day/month combinations that are impossible in one interpretation) may be used without asking.
- For multi-select fields, return an array of strings.
- For boolean fields, return true or false.
- For dates, return ISO format: "YYYY-MM-DD".
Expand Down
113 changes: 113 additions & 0 deletions backend/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Shared fixtures, markers, and configuration for the DREF Assist LLM test suite.

This conftest provides:
- Azure OpenAI client fixture (session-scoped, real API calls)
- API key validation (fails fast with pytest.fail if credentials are missing)
- Custom pytest markers for blocker/tier1/security tests
- Path setup matching the backend module structure
"""

import os
import sys
from pathlib import Path

import pytest
from dotenv import load_dotenv

# Load .env from backend root so tests pick up the same Azure credentials
# the application itself uses.
_backend = Path(__file__).parent.parent
load_dotenv(_backend / ".env")

# Add backend paths so imports resolve identically to how app.py does it.
# NOTE(review): "media-processor" contains a hyphen, so it cannot be imported
# as a package name; presumably modules inside that directory are imported
# directly by module name via this path entry — verify against app.py.
sys.path.insert(0, str(_backend))
sys.path.insert(0, str(_backend / "llm_handler"))
sys.path.insert(0, str(_backend / "conflict_resolver"))
sys.path.insert(0, str(_backend / "media-processor"))
sys.path.insert(0, str(_backend / "services"))


def pytest_configure(config):
    """Register the custom markers used by the LLM test suite."""
    marker_definitions = (
        "blocker: critical safety test — failure is urgent",
        "tier1: Tier 1 hard-coded assertion test",
        "security: security-related test (injection, etc.)",
    )
    for definition in marker_definitions:
        config.addinivalue_line("markers", definition)


@pytest.fixture(scope="session")
def azure_client():
    """Build the real AzureOpenAI client shared by the whole test session.

    Missing credentials abort immediately with an actionable message rather
    than surfacing as confusing errors mid-run.

    NOTE(review): the module docstring describes credential validation as
    *skipping* the session, but this fixture calls pytest.fail — confirm
    whether fail or skip is the intended behaviour.
    """
    from openai import AzureOpenAI

    # Required environment variables, paired with the exact failure message
    # to emit when each one is absent. Checked in declaration order.
    required = (
        (
            "AZURE_OPENAI_API_KEY",
            "AZURE_OPENAI_API_KEY environment variable is not set. "
            "LLM tests require a real Azure OpenAI API key. "
            "Set it in backend/.env or export it in your shell.",
        ),
        (
            "AZURE_OPENAI_ENDPOINT",
            "AZURE_OPENAI_ENDPOINT environment variable is not set. "
            "Set it in backend/.env or export it in your shell.",
        ),
    )

    settings = {}
    for var_name, missing_msg in required:
        settings[var_name] = os.getenv(var_name)
        if not settings[var_name]:
            pytest.fail(missing_msg)

    return AzureOpenAI(
        api_key=settings["AZURE_OPENAI_API_KEY"],
        azure_endpoint=settings["AZURE_OPENAI_ENDPOINT"],
        api_version=os.getenv("AZURE_OPENAI_API_VERSION") or "2024-02-15-preview",
    )


@pytest.fixture
def call_handle_message(azure_client):
    """Return a callable that invokes handle_message with the shared client.

    Usage:
        def test_something(call_handle_message):
            result = call_handle_message("Some input", form_state={})
    """
    from llm_handler.handler import handle_message

    def _invoke(user_message, form_state=None, conversation_history=None):
        # Falsy form_state (None or {}) is normalized to a fresh empty dict.
        return handle_message(
            user_message=user_message,
            current_form_state=form_state or {},
            conversation_history=conversation_history,
            client=azure_client,
        )

    return _invoke


@pytest.fixture
def call_process_user_input(azure_client):
    """Return a callable that runs process_user_input with the shared client.

    Exercises the full service layer, which the conflict-detection tests need.

    Usage:
        def test_conflict(call_process_user_input):
            result = call_process_user_input(
                "New message",
                enriched_form_state={...},
            )
    """
    from services.assistant import process_user_input

    def _invoke(user_message, enriched_form_state=None, conversation_history=None):
        # Falsy enriched_form_state (None or {}) becomes a fresh empty dict.
        return process_user_input(
            user_message=user_message,
            enriched_form_state=enriched_form_state or {},
            conversation_history=conversation_history,
            client=azure_client,
        )

    return _invoke
Empty file.
Loading
Loading