NHSDigital · bencegadanyi1-nhs · Mar 20, 2026 · Mar 20, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -58,6 +58,7 @@ jobs:
       IS_PULL_REQUEST: false
       FORWARD_CSOC_LOGS: false
       RUN_REGRESSION_TESTS: true
+      RUN_RAGAS_EVALUATION: true
     secrets:
       CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.DEV_CLOUD_FORMATION_DEPLOY_ROLE }}
       INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}
@@ -86,6 +87,7 @@ jobs:
       IS_PULL_REQUEST: false
       FORWARD_CSOC_LOGS: false
       RUN_REGRESSION_TESTS: true
+      RUN_RAGAS_EVALUATION: true
     secrets:
       CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.QA_CLOUD_FORMATION_DEPLOY_ROLE }}
       INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
@@ -169,6 +169,7 @@ jobs:
       IS_PULL_REQUEST: true
       FORWARD_CSOC_LOGS: false
       RUN_REGRESSION_TESTS: true
+      RUN_RAGAS_EVALUATION: true
     secrets:
       CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.DEV_CLOUD_FORMATION_DEPLOY_ROLE }}
       INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -35,7 +35,6 @@ jobs:
       jira_release_prefix: "epsam"
     secrets:
       EXECUTE_JIRA_LAMBDA_ROLE: ${{ secrets.DEV_CLOUD_FORMATION_EXECUTE_LAMBDA_ROLE }}
-
 
   package_code:
     needs: [tag_release, get_config_values]
@@ -62,6 +61,7 @@ jobs:
       IS_PULL_REQUEST: false
       FORWARD_CSOC_LOGS: false
       RUN_REGRESSION_TESTS: true
+      RUN_RAGAS_EVALUATION: true
     secrets:
       CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.DEV_CLOUD_FORMATION_DEPLOY_ROLE }}
       INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}
@@ -90,6 +90,7 @@ jobs:
       IS_PULL_REQUEST: false
       FORWARD_CSOC_LOGS: false
       RUN_REGRESSION_TESTS: true
+      RUN_RAGAS_EVALUATION: true
     secrets:
       CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.QA_CLOUD_FORMATION_DEPLOY_ROLE }}
       INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}

diff --git a/.github/workflows/release_all_stacks.yml b/.github/workflows/release_all_stacks.yml
@@ -30,6 +30,9 @@ on:
       RUN_REGRESSION_TESTS:
         type: boolean
         default: true
+      RUN_RAGAS_EVALUATION:
+        type: boolean
+        default: true
       IS_PULL_REQUEST:
         type: boolean
         default: false
@@ -125,7 +128,6 @@ jobs:
           IS_PULL_REQUEST: ${{ inputs.IS_PULL_REQUEST }}
           RUN_REGRESSION_TESTS: ${{ inputs.RUN_REGRESSION_TESTS }}
 
-
       - name: Deploy code for stack
         if: ${{ inputs.DEPLOY_CODE == true }}
         run: |
@@ -199,3 +201,16 @@ jobs:
       pinned_image: ${{ inputs.pinned_image }}
     secrets:
       REGRESSION_TESTS_PEM: ${{ secrets.REGRESSION_TESTS_PEM }}
+
+  ragas_evaluation:  # runs after release_all_code, gated by inputs.RUN_RAGAS_EVALUATION
+    name: Ragas Quality Evaluation
+    needs: [release_all_code]
+    if: ${{ always() && !cancelled() && !failure() && inputs.RUN_RAGAS_EVALUATION == true }}
+    uses: ./.github/workflows/run_ragas_evaluation.yml
+    secrets:
+      CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.CLOUD_FORMATION_DEPLOY_ROLE }}
+    with:
+      STACK_NAME: ${{ inputs.STACK_NAME }}
+      ENVIRONMENT: ${{ inputs.TARGET_ENVIRONMENT }}
+      VERSION_NUMBER: ${{ inputs.VERSION_NUMBER }}
+      pinned_image: ${{ inputs.pinned_image }}
diff --git a/.github/workflows/run_ragas_evaluation.yml b/.github/workflows/run_ragas_evaluation.yml
@@ -0,0 +1,119 @@
+name: Run Ragas Quality Evaluation
+
+# Reusable workflow: resolves the deployed SlackBot Lambda for a stack, runs the
+# pytest suite marked "ragas" against it, and uploads the results as an artifact.
+on:
+    workflow_call:
+        inputs:
+            ENVIRONMENT:
+                required: true
+                type: string
+            VERSION_NUMBER:
+                required: true
+                type: string
+            STACK_NAME:
+                required: true
+                type: string
+            pinned_image:
+                required: true
+                type: string
+        secrets:
+            CLOUD_FORMATION_DEPLOY_ROLE:
+                required: true
+
+jobs:
+    ragas_evaluation:
+        runs-on: ubuntu-22.04
+        container:
+            image: ${{ inputs.pinned_image }}
+            options: --user 1001:1001 --group-add 128
+        defaults:
+            run:
+                # An explicit bash shell makes GitHub run steps with -eo pipefail,
+                # so a pytest failure still fails the step when piped through tee.
+                shell: bash
+        permissions:
+            id-token: write
+            contents: read
+
+        steps:
+            - name: copy .tool-versions
+              run: |
+                  cp /home/vscode/.tool-versions "$HOME/.tool-versions"
+
+            - name: Checkout repository
+              uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+              with:
+                  ref: ${{ github.ref }}
+                  fetch-depth: 0
+
+            - name: Configure AWS Credentials
+              uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7
+              with:
+                  aws-region: eu-west-2
+                  role-to-assume: ${{ secrets.CLOUD_FORMATION_DEPLOY_ROLE }}
+                  role-session-name: eps-assist-me-ragas-evaluation
+
+            - name: Resolve Lambda function name
+              id: resolve_lambda
+              env:
+                  # Inputs are handed to the script as environment variables rather than
+                  # expanded into its text, so their values are never parsed as shell code.
+                  STACK_PREFIX: ${{ inputs.STACK_NAME }}
+                  TARGET_ENVIRONMENT: ${{ inputs.ENVIRONMENT }}
+              run: |
+                  # Discover the SlackBot Lambda name from CloudFormation stack outputs
+                  STACK_NAME="${STACK_PREFIX}-${TARGET_ENVIRONMENT}"
+                  FUNCTION_NAME=$(aws cloudformation describe-stacks \
+                    --stack-name "$STACK_NAME" \
+                    --query "Stacks[0].Outputs[?OutputKey=='SlackBotFunctionName'].OutputValue" \
+                    --output text 2>/dev/null || echo "")
+
+                  if [[ -z "$FUNCTION_NAME" || "$FUNCTION_NAME" == "None" ]]; then
+                    # Fallback: list Lambda functions matching the expected naming pattern
+                    # NOTE(review): a substring match can also hit functions of other stacks
+                    # whose names contain this prefix; [0] then picks one of them - confirm.
+                    FUNCTION_NAME=$(aws lambda list-functions \
+                      --query "Functions[?contains(FunctionName, '${STACK_PREFIX}') && contains(FunctionName, 'SlackBot')].FunctionName | [0]" \
+                      --output text 2>/dev/null || echo "")
+                  fi
+
+                  if [[ -z "$FUNCTION_NAME" || "$FUNCTION_NAME" == "None" ]]; then
+                    echo "::error::Could not resolve SlackBot Lambda function name for stack $STACK_NAME"
+                    exit 1
+                  fi
+
+                  echo "lambda_function_name=$FUNCTION_NAME" >> "$GITHUB_OUTPUT"
+                  echo "Resolved Lambda function: $FUNCTION_NAME"
+
+            - name: Install dependencies
+              run: |
+                  poetry install --with ragasEvaluation,dev
+
+            - name: Run Ragas evaluation
+              working-directory: packages/ragasEvaluation
+              env:
+                  RAGAS_LAMBDA_FUNCTION_NAME: ${{ steps.resolve_lambda.outputs.lambda_function_name }}
+                  RAGAS_AWS_REGION: eu-west-2
+                  RAGAS_EVALUATOR_MODEL_ID: eu.anthropic.claude-3-5-sonnet-20241022-v2:0
+                  RAGAS_RESULTS_DIR: results
+              run: |
+                  echo "Running Ragas evaluation against: $RAGAS_LAMBDA_FUNCTION_NAME"
+                  # The results directory is git-ignored, so it is absent after checkout;
+                  # create it before tee tries to open the log file inside it.
+                  mkdir -p "$RAGAS_RESULTS_DIR"
+                  PYTHONPATH=. poetry run python -m pytest tests/ \
+                    -m ragas \
+                    -v \
+                    --tb=short \
+                    --junitxml="$RAGAS_RESULTS_DIR/ragas_junit.xml" \
+                    2>&1 | tee "$RAGAS_RESULTS_DIR/ragas_output.log"
+
+            - name: Upload evaluation results
+              if: always()
+              uses: actions/upload-artifact@v4
+              with:
+                  name: ragas-evaluation-${{ inputs.ENVIRONMENT }}-${{ inputs.VERSION_NUMBER }}
+                  path: |
+                      packages/ragasEvaluation/results/
+                  retention-days: 30
diff --git a/.gitignore b/.gitignore
@@ -36,3 +36,5 @@ cdk.out
 .dependencies/
 .poetry/
 .trivy_out/
+# Ragas evaluation results (generated at runtime)
+**/results/
diff --git a/Makefile b/Makefile
@@ -42,6 +42,9 @@ test:
 	cd packages/preprocessingFunction && PYTHONPATH=. COVERAGE_FILE=coverage/.coverage poetry run python -m pytest
 	cd packages/bedrockLoggingConfigFunction && PYTHONPATH=. COVERAGE_FILE=coverage/.coverage poetry run python -m pytest
 
+ragas-eval: # Run the pytest tests marked "ragas" from inside packages/ragasEvaluation
+	cd packages/ragasEvaluation && PYTHONPATH=. poetry run python -m pytest tests/ -m ragas -v --tb=short
+
 clean:
 	rm -rf packages/cdk/coverage
 	rm -rf packages/cdk/lib

diff --git a/packages/ragasEvaluation/README.md b/packages/ragasEvaluation/README.md
@@ -0,0 +1,54 @@
+# Ragas Quality Evaluation for EPS Assist Me
+
+Automated post-deployment quality evaluation of the EPS Assist Me AI bot using the [Ragas](https://docs.ragas.io/) framework.
+
+## Overview
+
+After each deployment, this evaluation suite invokes the deployed Slack Bot Lambda directly (bypassing Slack) and evaluates the AI responses against a curated dataset of EPS onboarding questions using LLM-as-a-judge metrics.
+
+## Metrics
+
+| Metric | Description |
+|--------|-------------|
+| **Faithfulness** | Is the response grounded in the retrieved knowledge base source context? |
+| **Answer Relevancy** | Does the response actually answer the question that was asked? |
+| **Semantic Similarity** | How close is the response to the expected reference answer? |
+| **Answer Correctness** | Is the response factually correct? (combines factual agreement with the reference answer + semantic similarity) |
+
+## Running Locally
+
+```bash
+# Requires AWS credentials with Lambda invoke permissions
+export RAGAS_LAMBDA_FUNCTION_NAME="epsam-dev-SlackBotFunction"
+export RAGAS_AWS_REGION="eu-west-2"
+export RAGAS_EVALUATOR_MODEL_ID="eu.anthropic.claude-3-5-sonnet-20241022-v2:0"
+
+cd /workspaces/eps-assist-me
+make ragas-eval  # runs pytest from packages/ragasEvaluation with PYTHONPATH=. (same invocation as CI)
+```
+
+## Test Dataset
+
+The evaluation dataset is defined in `evaluation/test_dataset.py` and contains representative EPS onboarding questions covering:
+
+- Prescription ID generation and structure
+- CIS2 authentication requirements
+- RBAC controls for prescribers
+- Repeat dispensing cancellation scenarios
+- Controlled drug prescribing rules
+- FHIR API schema requirements
+- Nomination management
+- Error handling guidance
+
+## CI/CD Integration
+
+The evaluation runs automatically in the `release_all_stacks.yml` workflow after successful deployment, gated by the `RUN_RAGAS_EVALUATION` input.
+
+## Thresholds
+
+The evaluation enforces minimum score thresholds (defaults set in `evaluation/config.py`, overridable via the `RAGAS_THRESHOLD_*` environment variables):
+
+- Faithfulness: >= 0.7
+- Answer Relevancy: >= 0.7
+- Semantic Similarity: >= 0.7
+- Answer Correctness: >= 0.7
diff --git a/packages/ragasEvaluation/__init__.py b/packages/ragasEvaluation/__init__.py
diff --git a/packages/ragasEvaluation/evaluation/__init__.py b/packages/ragasEvaluation/evaluation/__init__.py
diff --git a/packages/ragasEvaluation/evaluation/config.py b/packages/ragasEvaluation/evaluation/config.py
@@ -0,0 +1,30 @@
+"""
+Settings for the Ragas quality evaluation suite.
+
+Every value below is read from a RAGAS_* environment variable at import time,
+falling back to the default given here when that variable is not set.
+"""
+
+import os
+
+# --- AWS / Lambda Configuration ---
+# Name of the Lambda to invoke; empty when RAGAS_LAMBDA_FUNCTION_NAME is unset.
+LAMBDA_FUNCTION_NAME = os.getenv("RAGAS_LAMBDA_FUNCTION_NAME", "")
+AWS_REGION = os.getenv("RAGAS_AWS_REGION", "eu-west-2")
+
+# --- Evaluator LLM ---
+# Model Ragas uses to judge responses (distinct from the bot's own model)
+EVALUATOR_MODEL_ID = os.getenv("RAGAS_EVALUATOR_MODEL_ID", "eu.anthropic.claude-3-5-sonnet-20241022-v2:0")
+EVALUATOR_EMBEDDING_MODEL_ID = os.getenv("RAGAS_EVALUATOR_EMBEDDING_MODEL_ID", "amazon.titan-embed-text-v2:0")
+
+# --- Score Thresholds ---
+# Per-metric minimum for the aggregate (mean) score; a lower mean fails the test.
+THRESHOLDS = {
+    "faithfulness": float(os.getenv("RAGAS_THRESHOLD_FAITHFULNESS", "0.7")),
+    "answer_relevancy": float(os.getenv("RAGAS_THRESHOLD_ANSWER_RELEVANCY", "0.7")),
+    "semantic_similarity": float(os.getenv("RAGAS_THRESHOLD_SEMANTIC_SIMILARITY", "0.7")),
+    "answer_correctness": float(os.getenv("RAGAS_THRESHOLD_ANSWER_CORRECTNESS", "0.7")),
+}
+
+# --- Output ---
+RESULTS_OUTPUT_DIR = os.getenv("RAGAS_RESULTS_DIR", "packages/ragasEvaluation/results")
diff --git a/packages/ragasEvaluation/evaluation/lambda_invoker.py b/packages/ragasEvaluation/evaluation/lambda_invoker.py
@@ -0,0 +1,73 @@
+"""
+Lambda invoker for Ragas evaluation.
+
+Calls the deployed Slack Bot Lambda via direct invocation (bypasses Slack)
+and returns the AI response with citations for evaluation.
+"""
+
+import json
+import logging
+
+import boto3
+
+from evaluation.config import LAMBDA_FUNCTION_NAME, AWS_REGION
+
+logger = logging.getLogger(__name__)
+
+
+def invoke_bot(query: str, session_id: str | None = None) -> dict:
+    """
+    Invoke the deployed EPS Assist Me Lambda with a direct query.
+
+    Args:
+        query: The user question to send to the bot.
+        session_id: Optional session ID for conversation continuity.
+
+    Returns:
+        dict with keys: text, session_id, citations
+
+    Raises:
+        RuntimeError: If the function reports an error, or its payload is not
+            a JSON object carrying statusCode 200 and a "response" entry.
+    """
+    client = boto3.client("lambda", region_name=AWS_REGION)
+
+    payload = {
+        "invocation_type": "direct",
+        "query": query,
+    }
+    if session_id:
+        payload["session_id"] = session_id
+
+    # Only the first 80 characters of the query are logged.
+    logger.info("Invoking Lambda %s with query: %s", LAMBDA_FUNCTION_NAME, query[:80])
+
+    response = client.invoke(
+        FunctionName=LAMBDA_FUNCTION_NAME,
+        InvocationType="RequestResponse",
+        Payload=json.dumps(payload),
+    )
+
+    response_payload = json.loads(response["Payload"].read())
+
+    # An exception raised inside the function is flagged through FunctionError
+    # on the invoke response; the payload then holds the error details.
+    if "FunctionError" in response:
+        raise RuntimeError(f"Lambda invocation failed ({response['FunctionError']}): {response_payload}")
+
+    # A payload that is not a JSON object (for example null) has no .get();
+    # report it as a failed invocation instead of raising AttributeError.
+    if not isinstance(response_payload, dict) or response_payload.get("statusCode") != 200:
+        raise RuntimeError(f"Lambda invocation failed: {response_payload}")
+
+    data = response_payload.get("response")
+    if not isinstance(data, dict) or "text" not in data:
+        raise RuntimeError(f"Lambda response is missing the bot reply: {response_payload}")
+
+    logger.info("Got response (session=%s, %d citations)", data.get("session_id"), len(data.get("citations", [])))
+
+    return {
+        "text": data["text"],
+        "session_id": data.get("session_id"),
+        "citations": data.get("citations", []),
+    }