Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ jobs:
IS_PULL_REQUEST: false
FORWARD_CSOC_LOGS: false
RUN_REGRESSION_TESTS: true
RUN_RAGAS_EVALUATION: true
secrets:
CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.DEV_CLOUD_FORMATION_DEPLOY_ROLE }}
INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}
Expand Down Expand Up @@ -86,6 +87,7 @@ jobs:
IS_PULL_REQUEST: false
FORWARD_CSOC_LOGS: false
RUN_REGRESSION_TESTS: true
RUN_RAGAS_EVALUATION: true
secrets:
CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.QA_CLOUD_FORMATION_DEPLOY_ROLE }}
INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/pull_request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ jobs:
IS_PULL_REQUEST: true
FORWARD_CSOC_LOGS: false
RUN_REGRESSION_TESTS: true
RUN_RAGAS_EVALUATION: true
secrets:
CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.DEV_CLOUD_FORMATION_DEPLOY_ROLE }}
INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ jobs:
jira_release_prefix: "epsam"
secrets:
EXECUTE_JIRA_LAMBDA_ROLE: ${{ secrets.DEV_CLOUD_FORMATION_EXECUTE_LAMBDA_ROLE }}


package_code:
needs: [tag_release, get_config_values]
Expand All @@ -62,6 +61,7 @@ jobs:
IS_PULL_REQUEST: false
FORWARD_CSOC_LOGS: false
RUN_REGRESSION_TESTS: true
RUN_RAGAS_EVALUATION: true
secrets:
CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.DEV_CLOUD_FORMATION_DEPLOY_ROLE }}
INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}
Expand Down Expand Up @@ -90,6 +90,7 @@ jobs:
IS_PULL_REQUEST: false
FORWARD_CSOC_LOGS: false
RUN_REGRESSION_TESTS: true
RUN_RAGAS_EVALUATION: true
secrets:
CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.QA_CLOUD_FORMATION_DEPLOY_ROLE }}
INT_ASSIST_ME_DOCUMENT_SYNC_ROLE: ${{ secrets.INT_ASSIST_ME_DOCUMENT_SYNC_ROLE }}
Expand Down
17 changes: 16 additions & 1 deletion .github/workflows/release_all_stacks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ on:
RUN_REGRESSION_TESTS:
type: boolean
default: true
RUN_RAGAS_EVALUATION:
type: boolean
default: true
IS_PULL_REQUEST:
type: boolean
default: false
Expand Down Expand Up @@ -125,7 +128,6 @@ jobs:
IS_PULL_REQUEST: ${{ inputs.IS_PULL_REQUEST }}
RUN_REGRESSION_TESTS: ${{ inputs.RUN_REGRESSION_TESTS }}


- name: Deploy code for stack
if: ${{ inputs.DEPLOY_CODE == true }}
run: |
Expand Down Expand Up @@ -199,3 +201,16 @@ jobs:
pinned_image: ${{ inputs.pinned_image }}
secrets:
REGRESSION_TESTS_PEM: ${{ secrets.REGRESSION_TESTS_PEM }}

# Post-deployment Ragas quality evaluation, delegated to the reusable
# run_ragas_evaluation.yml workflow and passed the same target environment,
# version, stack name and pinned image that this workflow received as inputs.
ragas_evaluation:
name: Ragas Quality Evaluation
uses: ./.github/workflows/run_ragas_evaluation.yml
# always() overrides the default behaviour of skipping this job when a needed
# job was skipped; the remaining terms still prevent it from running after a
# failure or cancellation, or when the RUN_RAGAS_EVALUATION input is not true.
if: ${{ always() && !failure() && !cancelled() && inputs.RUN_RAGAS_EVALUATION == true }}
needs: [release_all_code]
with:
ENVIRONMENT: ${{ inputs.TARGET_ENVIRONMENT }}
VERSION_NUMBER: ${{ inputs.VERSION_NUMBER }}
STACK_NAME: ${{ inputs.STACK_NAME }}
pinned_image: ${{ inputs.pinned_image }}
secrets:
# Role the called workflow assumes to configure its AWS credentials.
CLOUD_FORMATION_DEPLOY_ROLE: ${{ secrets.CLOUD_FORMATION_DEPLOY_ROLE }}
105 changes: 105 additions & 0 deletions .github/workflows/run_ragas_evaluation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Reusable workflow: resolves the deployed SlackBot Lambda for a stack, runs the
# pytest suite marked `ragas` against it, and uploads the results as an artifact.
name: Run Ragas Quality Evaluation

on:
  workflow_call:
    inputs:
      ENVIRONMENT:
        required: true
        type: string
      VERSION_NUMBER:
        required: true
        type: string
      STACK_NAME:
        required: true
        type: string
      pinned_image:
        type: string
        required: true
    secrets:
      CLOUD_FORMATION_DEPLOY_ROLE:
        required: true

jobs:
  ragas_evaluation:
    runs-on: ubuntu-22.04
    container:
      image: ${{ inputs.pinned_image }}
      options: --user 1001:1001 --group-add 128
    defaults:
      run:
        # Explicit `bash` makes GitHub run steps with `-eo pipefail`, so a failing
        # command on the left of a pipe (e.g. pytest | tee) still fails the step.
        shell: bash
    permissions:
      # id-token is needed for the OIDC role assumption in "Configure AWS Credentials".
      id-token: write
      contents: read

    steps:
      - name: copy .tool-versions
        run: |
          cp /home/vscode/.tool-versions "$HOME/.tool-versions"

      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          ref: ${{ github.ref }}
          fetch-depth: 0

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7
        with:
          aws-region: eu-west-2
          role-to-assume: ${{ secrets.CLOUD_FORMATION_DEPLOY_ROLE }}
          role-session-name: eps-assist-me-ragas-evaluation

      - name: Resolve Lambda function name
        id: resolve_lambda
        env:
          # Inputs are handed to the script as environment variables rather than
          # being expanded into the script text, so their contents can never be
          # interpreted as shell syntax.
          STACK_NAME_INPUT: ${{ inputs.STACK_NAME }}
          ENVIRONMENT_INPUT: ${{ inputs.ENVIRONMENT }}
        run: |
          # Discover the SlackBot Lambda name from CloudFormation stack outputs
          STACK_NAME="${STACK_NAME_INPUT}-${ENVIRONMENT_INPUT}"
          FUNCTION_NAME=$(aws cloudformation describe-stacks \
            --stack-name "$STACK_NAME" \
            --query "Stacks[0].Outputs[?OutputKey=='SlackBotFunctionName'].OutputValue" \
            --output text 2>/dev/null || echo "")

          if [[ -z "$FUNCTION_NAME" || "$FUNCTION_NAME" == "None" ]]; then
            # Fallback: list Lambda functions matching the expected naming pattern
            # NOTE(review): with --output text the CLI may apply --query per result
            # page; confirm this still yields a single name in large accounts.
            FUNCTION_NAME=$(aws lambda list-functions \
              --query "Functions[?contains(FunctionName, '${STACK_NAME_INPUT}') && contains(FunctionName, 'SlackBot')].FunctionName | [0]" \
              --output text 2>/dev/null || echo "")
          fi

          if [[ -z "$FUNCTION_NAME" || "$FUNCTION_NAME" == "None" ]]; then
            echo "::error::Could not resolve SlackBot Lambda function name for stack $STACK_NAME"
            exit 1
          fi

          echo "lambda_function_name=$FUNCTION_NAME" >> "$GITHUB_OUTPUT"
          echo "Resolved Lambda function: $FUNCTION_NAME"

      - name: Install dependencies
        run: |
          poetry install --with ragasEvaluation,dev

      - name: Run Ragas evaluation
        working-directory: packages/ragasEvaluation
        env:
          RAGAS_LAMBDA_FUNCTION_NAME: ${{ steps.resolve_lambda.outputs.lambda_function_name }}
          RAGAS_AWS_REGION: eu-west-2
          RAGAS_EVALUATOR_MODEL_ID: eu.anthropic.claude-3-5-sonnet-20241022-v2:0
          RAGAS_RESULTS_DIR: results
        run: |
          echo "Running Ragas evaluation against: $RAGAS_LAMBDA_FUNCTION_NAME"
          # The results directory is git-ignored, so it does not exist after
          # checkout. tee opens its log file as soon as the pipeline starts, so the
          # directory must be created up front or the step fails before pytest runs.
          mkdir -p "$RAGAS_RESULTS_DIR"
          PYTHONPATH=. poetry run python -m pytest tests/ \
            -m ragas \
            -v \
            --tb=short \
            --junitxml="$RAGAS_RESULTS_DIR/ragas_junit.xml" \
            2>&1 | tee "$RAGAS_RESULTS_DIR/ragas_output.log"

      - name: Upload evaluation results
        # Upload even when the evaluation step failed, so the logs can be inspected.
        if: always()
        # NOTE(review): the other actions in this workflow are pinned to a commit
        # SHA; consider pinning this one the same way.
        uses: actions/upload-artifact@v4
        with:
          name: ragas-evaluation-${{ inputs.ENVIRONMENT }}-${{ inputs.VERSION_NUMBER }}
          path: |
            packages/ragasEvaluation/results/
          retention-days: 30
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ cdk.out
.dependencies/
.poetry/
.trivy_out/
# Ragas evaluation results (generated at runtime)
**/results/
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ test:
cd packages/preprocessingFunction && PYTHONPATH=. COVERAGE_FILE=coverage/.coverage poetry run python -m pytest
cd packages/bedrockLoggingConfigFunction && PYTHONPATH=. COVERAGE_FILE=coverage/.coverage poetry run python -m pytest

# Run the pytest tests marked `ragas` under packages/ragasEvaluation/tests.
ragas-eval:
cd packages/ragasEvaluation && PYTHONPATH=. poetry run python -m pytest tests/ -m ragas -v --tb=short

clean:
rm -rf packages/cdk/coverage
rm -rf packages/cdk/lib
Expand Down
54 changes: 54 additions & 0 deletions packages/ragasEvaluation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Ragas Quality Evaluation for EPS Assist Me

Automated post-deployment quality evaluation of the EPS Assist Me AI bot using the [Ragas](https://docs.ragas.io/) framework.

## Overview

After each deployment, this evaluation suite invokes the deployed Slack Bot Lambda directly (bypassing Slack) and evaluates the AI responses against a curated dataset of EPS onboarding questions using LLM-as-a-judge metrics.

## Metrics

| Metric | Description |
|--------|-------------|
| **Faithfulness** | Is the response grounded in the retrieved knowledge base source context? |
| **Answer Relevancy** | Does the response actually answer the question that was asked? |
| **Semantic Similarity** | How close is the response to the expected reference answer? |
| **Answer Correctness** | Is the response factually correct? (combines factual agreement with the reference answer + semantic similarity) |

## Running Locally

```bash
# Requires AWS credentials with Lambda invoke permissions
export RAGAS_LAMBDA_FUNCTION_NAME="epsam-dev-SlackBotFunction"
export RAGAS_AWS_REGION="eu-west-2"
export RAGAS_EVALUATOR_MODEL_ID="eu.anthropic.claude-3-5-sonnet-20241022-v2:0"

cd /workspaces/eps-assist-me
poetry run pytest packages/ragasEvaluation -m ragas -v
```

## Test Dataset

The evaluation dataset is defined in `evaluation/test_dataset.py` and contains representative EPS onboarding questions covering:

- Prescription ID generation and structure
- CIS2 authentication requirements
- RBAC controls for prescribers
- Repeat dispensing cancellation scenarios
- Controlled drug prescribing rules
- FHIR API schema requirements
- Nomination management
- Error handling guidance

## CI/CD Integration

The evaluation runs automatically in the `release_all_stacks.yml` workflow after successful deployment, gated by the `RUN_RAGAS_EVALUATION` input.

## Thresholds

The evaluation enforces minimum score thresholds (configurable in `evaluation/config.py`):

- Faithfulness: >= 0.7
- Answer Relevancy: >= 0.7
- Semantic Similarity: >= 0.7
- Answer Correctness: >= 0.7
Empty file.
Empty file.
30 changes: 30 additions & 0 deletions packages/ragasEvaluation/evaluation/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
Configuration for Ragas quality evaluation.

All settings can be overridden via environment variables.
"""

import os


# --- AWS / Lambda Configuration ---
# Name of the Lambda to invoke; empty string when the variable is unset.
LAMBDA_FUNCTION_NAME = os.getenv("RAGAS_LAMBDA_FUNCTION_NAME", "")
AWS_REGION = os.getenv("RAGAS_AWS_REGION", "eu-west-2")

# --- Evaluator LLM ---
# Models Ragas uses to judge responses (separate from the bot's own model).
EVALUATOR_MODEL_ID = os.getenv(
    "RAGAS_EVALUATOR_MODEL_ID",
    "eu.anthropic.claude-3-5-sonnet-20241022-v2:0",
)
EVALUATOR_EMBEDDING_MODEL_ID = os.getenv(
    "RAGAS_EVALUATOR_EMBEDDING_MODEL_ID",
    "amazon.titan-embed-text-v2:0",
)


def _threshold(env_var: str, fallback: str = "0.7") -> float:
    """Return the minimum score read from ``env_var`` (``fallback`` when unset) as a float."""
    return float(os.getenv(env_var, fallback))


# --- Score Thresholds ---
# Lowest acceptable value per metric. A test fails when the aggregate (mean)
# score of any metric drops below the value listed here.
THRESHOLDS = {
    "faithfulness": _threshold("RAGAS_THRESHOLD_FAITHFULNESS"),
    "answer_relevancy": _threshold("RAGAS_THRESHOLD_ANSWER_RELEVANCY"),
    "semantic_similarity": _threshold("RAGAS_THRESHOLD_SEMANTIC_SIMILARITY"),
    "answer_correctness": _threshold("RAGAS_THRESHOLD_ANSWER_CORRECTNESS"),
}

# --- Output ---
# Directory for evaluation results; overridable through RAGAS_RESULTS_DIR.
RESULTS_OUTPUT_DIR = os.getenv("RAGAS_RESULTS_DIR", "packages/ragasEvaluation/results")
58 changes: 58 additions & 0 deletions packages/ragasEvaluation/evaluation/lambda_invoker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
Lambda invoker for Ragas evaluation.

Calls the deployed Slack Bot Lambda via direct invocation (bypasses Slack)
and returns the AI response with citations for evaluation.
"""

import json
import logging

import boto3

from evaluation.config import LAMBDA_FUNCTION_NAME, AWS_REGION

logger = logging.getLogger(__name__)


def invoke_bot(query: str, session_id: str | None = None) -> dict:
    """
    Invoke the deployed EPS Assist Me Lambda with a direct query.

    Sends a synchronous (``RequestResponse``) invocation whose payload carries
    ``invocation_type="direct"`` and the query, plus ``session_id`` when one is
    supplied.

    Args:
        query: The user question to send to the bot.
        session_id: Optional session ID for conversation continuity. Omitted
            from the payload when falsy.

    Returns:
        dict with keys: text, session_id, citations. ``citations`` is always a
        list (empty when the Lambda returned none or ``null``).

    Raises:
        RuntimeError: If the decoded payload is not a JSON object or its
            ``statusCode`` is not 200.
        KeyError: If a 200 payload lacks ``response`` or ``response["text"]``.
    """
    client = boto3.client("lambda", region_name=AWS_REGION)

    payload = {
        "invocation_type": "direct",
        "query": query,
    }
    if session_id:
        payload["session_id"] = session_id

    # Only the first 80 characters of the query are logged to keep log lines short.
    logger.info("Invoking Lambda %s with query: %s", LAMBDA_FUNCTION_NAME, query[:80])

    response = client.invoke(
        FunctionName=LAMBDA_FUNCTION_NAME,
        InvocationType="RequestResponse",
        Payload=json.dumps(payload),
    )

    response_payload = json.loads(response["Payload"].read())

    # A non-object payload (e.g. null or a bare string) is reported through the
    # same RuntimeError as a non-200 status instead of an AttributeError on .get().
    if not isinstance(response_payload, dict) or response_payload.get("statusCode") != 200:
        raise RuntimeError(f"Lambda invocation failed: {response_payload}")

    data = response_payload["response"]
    # Treat an explicit null the same as a missing key so len() below cannot fail
    # and callers always receive a list.
    citations = data.get("citations") or []
    logger.info("Got response (session=%s, %d citations)", data.get("session_id"), len(citations))

    return {
        "text": data["text"],
        "session_id": data.get("session_id"),
        "citations": citations,
    }
Loading
Loading