Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 196 additions & 0 deletions .github/workflows/llm-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# DREF Assist LLM Handler Tests — Tier 1 (pytest)
#
# Runs hard-coded assertion tests that make real Azure OpenAI API calls.
# These tests verify binary safety/correctness properties:
# - Prompt injection resistance (BLOCKER)
# - Silent overwrite prevention (BLOCKER)
# - Cross-turn conflict detection (BLOCKER)
# - Schema validation, type checking, null preservation
# - Off-topic classification
#
# CI POLICY:
# - Never blocks PR merge — results are for visibility only
# - Blocker failures are labelled clearly in the PR comment
# - Tier 2 (Promptfoo judge) is NOT run in CI — locally/nightly only
#
# REQUIRED SECRETS:
# - AZURE_OPENAI_API_KEY
# - AZURE_OPENAI_ENDPOINT
# - AZURE_OPENAI_API_VERSION
# - AZURE_OPENAI_DEPLOYMENT
#
# NOTE(review): pull_request events from forked repositories do not receive
# repository secrets, so the "Verify API key is configured" step will fail
# for fork PRs — confirm whether that is acceptable or fork PRs should skip.

name: LLM Handler Tests (Tier 1)

on:
push:
branches: [main, "feature/**"]
paths:
- "backend/**"
pull_request:
branches: [main]
paths:
- "backend/**"

jobs:
tier1-tests:
runs-on: ubuntu-latest
timeout-minutes: 10

steps:
# ── Checkout code ──────────────────────────────────────
- uses: actions/checkout@v4

# ── Set up Python ──────────────────────────────────────
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: "3.12"

# ── Install dependencies ───────────────────────────────
- name: Install backend dependencies
working-directory: backend
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest pytest-json-report

# ── Check API key is configured ────────────────────────
# Fail early with a clear message if secrets are missing
- name: Verify API key is configured
env:
AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
run: |
if [ -z "$AZURE_OPENAI_API_KEY" ]; then
echo "::error::AZURE_OPENAI_API_KEY secret is not configured."
echo "::error::LLM tests require Azure OpenAI credentials."
echo "::error::Add AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT,"
echo "::error::AZURE_OPENAI_API_VERSION, and AZURE_OPENAI_DEPLOYMENT"
echo "::error::as repository secrets in Settings > Secrets and variables > Actions."
exit 1
fi

# ── Create results directory ───────────────────────────
# pytest-json-report will not create parent directories itself.
- name: Create results directory
run: mkdir -p backend/tests/results/latest

# ── Run Tier 1 tests ───────────────────────────────────
# Uses || true so the job continues even if tests fail
# (CI is for visibility, not gatekeeping)
- name: Run Tier 1 tests
working-directory: backend/tests
env:
AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
AZURE_OPENAI_API_VERSION: ${{ secrets.AZURE_OPENAI_API_VERSION }}
AZURE_OPENAI_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_DEPLOYMENT }}
run: |
python -m pytest tier1/ -v \
--json-report \
--json-report-file=results/latest/tier1_results.json \
|| true

# ── Upload test results as artifact ────────────────────
# Runs even when the test step failed, so failures are inspectable.
- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
with:
name: llm-test-results
path: backend/tests/results/latest/tier1_results.json
retention-days: 30

# ── Generate and post PR comment ───────────────────────
# Parses the JSON report and posts a readable summary
# with blocker/non-blocker distinction
- name: Generate PR comment
if: github.event_name == 'pull_request'
working-directory: backend/tests
run: |
python3 - <<'SCRIPT'
"""Render a markdown PR comment from a pytest-json-report results file.

Reads results/latest/tier1_results.json (relative to backend/tests) and
writes /tmp/pr_comment.md. Blocker tests (safety-critical properties) are
reported separately from ordinary failures; merge is never blocked.
"""
import json
import sys
from pathlib import Path

RESULTS_FILE = Path("results/latest/tier1_results.json")
COMMENT_FILE = Path("/tmp/pr_comment.md")

# Test IDs whose failure is safety-critical and must be surfaced as a blocker.
BLOCKER_IDS = {
    "test_10_1_prompt_injection",
    "test_12_1_cross_turn_contradiction",
    "test_12_2_conflict_resolution_ux_flow",
    "test_12_4_cross_document_conflict",
    "test_12_5_silent_overwrite_prevention",
}


def _short(message, limit=150):
    """Truncate a failure message for the comment; non-strings pass through."""
    if isinstance(message, str) and len(message) > limit:
        return message[:limit] + "..."
    return message


def build_comment(data):
    """Build the full markdown comment body from the parsed report dict."""
    tests = data.get("tests", [])

    passed, skipped = [], []
    blocker_fails, other_fails = [], []

    for t in tests:
        nodeid = t.get("nodeid", "")
        # Bare test name, without the file path portion of the nodeid.
        name = nodeid.split("::")[-1] if "::" in nodeid else nodeid
        outcome = t.get("outcome", "unknown")

        if outcome == "passed":
            passed.append(name)
        elif outcome == "skipped":
            skipped.append(name)
        else:
            # Failed (or errored/unknown): capture a truncated failure message.
            msg = _short(t.get("call", {}).get("longrepr", ""))
            target = blocker_fails if name in BLOCKER_IDS else other_fails
            target.append((name, msg))

    # Skipped tests are excluded from the pass-rate denominator.
    total = len(tests) - len(skipped)
    pass_count = len(passed)

    lines = []

    if blocker_fails:
        lines.append(f"⚠️ **DREF Assist LLM Tests — {pass_count}/{total} passed**\n")
        lines.append("### 🔴 BLOCKER FAILURES (treat as urgent):")
        for name, msg in blocker_fails:
            lines.append(f" - `{name}` — {msg}")
        lines.append("")
    else:
        lines.append(f"✅ **DREF Assist LLM Tests — {pass_count}/{total} passed**\n")

    if other_fails:
        lines.append("### ℹ️ NON-BLOCKER FAILURES:")
        for name, msg in other_fails:
            lines.append(f" - `{name}` — {msg}")
        lines.append("")
    elif not blocker_fails:
        lines.append("All tests passed. No blocker or non-blocker failures.\n")

    if skipped:
        lines.append(f"*{len(skipped)} test(s) skipped (likely missing API key)*\n")

    lines.append(
        "> Merge is not blocked. Blocker failures should be resolved "
        "before further changes are made on top of this commit."
    )
    return "\n".join(lines)


def main():
    """Write the comment file; returns the process exit status (always 0)."""
    if not RESULTS_FILE.exists():
        # No report at all — most likely the test step never ran.
        COMMENT_FILE.write_text(
            "## DREF Assist LLM Tests\n\n"
            "⚠️ No test results found. API key may not be configured.\n"
        )
        return 0
    COMMENT_FILE.write_text(build_comment(json.loads(RESULTS_FILE.read_text())))
    return 0


if __name__ == "__main__":
    sys.exit(main())
SCRIPT

# ── Post comment on PR ─────────────────────────────────
# sticky-pull-request-comment creates one comment on the PR and updates it
# in place on subsequent runs, instead of posting a new comment per push.
- name: Post PR comment
if: github.event_name == 'pull_request'
uses: marocchino/sticky-pull-request-comment@v2
with:
path: /tmp/pr_comment.md
3 changes: 3 additions & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,6 @@ htmlcov/
# Virtual environments
.venv/
venv/

# LLM test results (generated artifacts)
tests/results/
30 changes: 23 additions & 7 deletions backend/llm_handler/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,23 @@
import os
from typing import Dict, Any, List, Optional, Union

from openai import AzureOpenAI
from openai import AzureOpenAI, BadRequestError
from dotenv import load_dotenv

from .prompt import build_system_prompt
from .parser import process_llm_response

load_dotenv()

# Canned handler result returned when Azure's content filter rejects the
# request (BadRequestError) or redacts the completion (None content). Shaped
# like a normal handler response: classification, user-facing reply, and an
# empty list of field updates.
#
# NOTE(review): call sites return _CONTENT_FILTER_RESPONSE.copy(), which is a
# *shallow* copy — the "field_updates" list object is shared by every copy.
# If any caller mutates that list in place, this module-level constant would
# be corrupted. Consider copy.deepcopy or a factory function; TODO confirm
# callers never mutate "field_updates".
_CONTENT_FILTER_RESPONSE: Dict[str, Any] = {
    "classification": "OFF_TOPIC",
    "reply": (
        "I'm not able to help with that request. "
        "Please ask something related to the DREF application."
    ),
    "field_updates": [],
}

# Type alias for message content (text string or multimodal list from media-processor)
MessageContent = Union[str, List[Dict[str, Any]]]

Expand Down Expand Up @@ -69,13 +78,20 @@ def handle_message(

messages.append({"role": "user", "content": user_message})

response = client.chat.completions.create(
model=os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
messages=messages,
temperature=0.1,
response_format={"type": "json_object"},
)
try:
response = client.chat.completions.create(
model=os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
messages=messages,
temperature=0.1,
response_format={"type": "json_object"},
)
except BadRequestError:
# Azure content management policy rejected the prompt
return _CONTENT_FILTER_RESPONSE.copy()

# Content filter may allow the request but redact the response
raw_response = response.choices[0].message.content
if raw_response is None:
return _CONTENT_FILTER_RESPONSE.copy()

return process_llm_response(raw_response)
3 changes: 3 additions & 0 deletions backend/llm_handler/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
INFERRED fields:
- You MAY logically deduce the value from available evidence, even if the value is not stated verbatim.
- The inference must be strong and unambiguous. For example: if the event is an earthquake, disaster_onset can be inferred as "Sudden".
- For disaster_type specifically: infer from descriptive language even in narrative or indirect phrasing. Words like "flooding", "water breached banks", "inundated" → Flood; "shaking", "tremors", "magnitude" → Earthquake; "winds", "cyclone", "hurricane", "typhoon" → Storm / Tropical Cyclone; "dry conditions", "crop failure", "water scarcity" → Drought. Do NOT wait for the exact word "Flood" or "Earthquake" to appear — infer from context.
- If the inference is uncertain or could go either way, ask for clarification instead.
- In your reply, briefly note any inferred values so the user can verify them.

Expand All @@ -75,6 +76,8 @@
- Never invent numbers, dates, or contact information not present in the sources.
- Do not copy information between fields (e.g., don't assume targeted population equals affected population).
- For dropdown fields, only use values from the allowed options listed in the schema.
- For ambiguous place names (e.g., "Springfield", "Victoria", "Central Region") that exist in multiple countries, do NOT assume or infer a country. Ask the user to specify the country before populating the country or region field.
- For dates written in ambiguous slash-delimited format (e.g., "03/04/2025" which could be March 4 or April 3), do NOT silently assume MM/DD or DD/MM interpretation. Ask the user to clarify the intended date before populating any date field. Only dates in unambiguous formats (ISO YYYY-MM-DD, written-out month names like "4 March 2025", or single-digit day/month combinations that are impossible in one interpretation) may be used without asking.
- For multi-select fields, return an array of strings.
- For boolean fields, return true or false.
- For dates, return ISO format: "YYYY-MM-DD".
Expand Down
113 changes: 113 additions & 0 deletions backend/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Shared fixtures, markers, and configuration for the DREF Assist LLM test suite.

This conftest provides:
- Azure OpenAI client fixture (session-scoped, real API calls)
- API key validation (fails fast with pytest.fail if credentials are missing)
- Custom pytest markers for blocker/tier1/security tests
- Path setup matching the backend module structure
"""

import os
import sys
from pathlib import Path

import pytest
from dotenv import load_dotenv

# Load .env from backend root so tests pick up the same Azure credentials
# the application itself uses.
_backend = Path(__file__).parent.parent
load_dotenv(_backend / ".env")

# Add backend paths so imports resolve identically to how app.py does it.
# NOTE(review): "media-processor" contains a hyphen, so it cannot be imported
# as a package name; presumably modules inside that directory are imported
# directly by module name via this path entry — verify against app.py.
sys.path.insert(0, str(_backend))
sys.path.insert(0, str(_backend / "llm_handler"))
sys.path.insert(0, str(_backend / "conflict_resolver"))
sys.path.insert(0, str(_backend / "media-processor"))
sys.path.insert(0, str(_backend / "services"))


def pytest_configure(config):
    """Register the custom markers used by the LLM test suite."""
    marker_definitions = (
        "blocker: critical safety test — failure is urgent",
        "tier1: Tier 1 hard-coded assertion test",
        "security: security-related test (injection, etc.)",
    )
    for definition in marker_definitions:
        config.addinivalue_line("markers", definition)


@pytest.fixture(scope="session")
def azure_client():
    """Build the real AzureOpenAI client shared by the whole test session.

    Missing credentials abort immediately with an actionable message rather
    than surfacing as confusing errors mid-run.

    NOTE(review): the module docstring describes credential validation as
    *skipping* the session, but this fixture calls pytest.fail — confirm
    whether fail or skip is the intended behaviour.
    """
    from openai import AzureOpenAI

    # Required environment variables, paired with the exact failure message
    # to emit when each one is absent. Checked in declaration order.
    required = (
        (
            "AZURE_OPENAI_API_KEY",
            "AZURE_OPENAI_API_KEY environment variable is not set. "
            "LLM tests require a real Azure OpenAI API key. "
            "Set it in backend/.env or export it in your shell.",
        ),
        (
            "AZURE_OPENAI_ENDPOINT",
            "AZURE_OPENAI_ENDPOINT environment variable is not set. "
            "Set it in backend/.env or export it in your shell.",
        ),
    )

    settings = {}
    for var_name, missing_msg in required:
        settings[var_name] = os.getenv(var_name)
        if not settings[var_name]:
            pytest.fail(missing_msg)

    return AzureOpenAI(
        api_key=settings["AZURE_OPENAI_API_KEY"],
        azure_endpoint=settings["AZURE_OPENAI_ENDPOINT"],
        api_version=os.getenv("AZURE_OPENAI_API_VERSION") or "2024-02-15-preview",
    )


@pytest.fixture
def call_handle_message(azure_client):
    """Return a callable that invokes handle_message with the shared client.

    Usage:
        def test_something(call_handle_message):
            result = call_handle_message("Some input", form_state={})
    """
    from llm_handler.handler import handle_message

    def _invoke(user_message, form_state=None, conversation_history=None):
        # Falsy form_state (None or {}) is normalized to a fresh empty dict.
        return handle_message(
            user_message=user_message,
            current_form_state=form_state or {},
            conversation_history=conversation_history,
            client=azure_client,
        )

    return _invoke


@pytest.fixture
def call_process_user_input(azure_client):
    """Return a callable that runs process_user_input with the shared client.

    Exercises the full service layer, which the conflict-detection tests need.

    Usage:
        def test_conflict(call_process_user_input):
            result = call_process_user_input(
                "New message",
                enriched_form_state={...},
            )
    """
    from services.assistant import process_user_input

    def _invoke(user_message, enriched_form_state=None, conversation_history=None):
        # Falsy enriched_form_state (None or {}) becomes a fresh empty dict.
        return process_user_input(
            user_message=user_message,
            enriched_form_state=enriched_form_state or {},
            conversation_history=conversation_history,
            client=azure_client,
        )

    return _invoke
Empty file.
Loading
Loading