Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ slack_sdk==3.27.2
tqdm==4.66.4
translator-testing-model==0.5.0
reasoner-validator==4.2.5
bmt>=1.4
21 changes: 21 additions & 0 deletions test_harness/regression_checks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Regression checks run alongside acceptance/pathfinder pass-fail analysis."""

from test_harness.regression_checks.base import (
CHECKS,
RegressionCheck,
RegressionCheckResult,
RegressionStatus,
run_all,
)
from test_harness.regression_checks.edge_predicate import EdgePredicateMatchCheck

# Register the edge-predicate check when this package is imported, so it is
# present in the shared CHECKS list without further setup by the caller.
CHECKS.append(EdgePredicateMatchCheck())

# Names re-exported as this package's public API.
__all__ = [
"CHECKS",
"RegressionCheck",
"RegressionCheckResult",
"RegressionStatus",
"EdgePredicateMatchCheck",
"run_all",
]
56 changes: 56 additions & 0 deletions test_harness/regression_checks/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Pluggable regression checks for TRAPI responses."""

from dataclasses import dataclass, field
from enum import Enum
import logging
from typing import Any, Dict, List, Optional, Protocol


class RegressionStatus(str, Enum):
    """Possible outcomes of a single regression check.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    # The check ran and found nothing wrong.
    PASSED = "PASSED"
    # The check ran and found a problem.
    FAILED = "FAILED"
    # The check did not produce a pass/fail verdict.
    SKIPPED = "SKIPPED"


@dataclass
class RegressionCheckResult:
    """Record describing the outcome of one regression check."""

    # Identifier of the check that produced this result.
    name: str
    # Verdict of the check.
    status: RegressionStatus
    # Optional human-readable explanation of the verdict.
    message: Optional[str] = None
    # Optional structured payload accompanying the verdict.
    details: Optional[Dict[str, Any]] = None


class RegressionCheck(Protocol):
    """Structural interface for a regression check.

    Any object with a ``name`` attribute and a compatible ``run`` method
    satisfies this protocol; no inheritance is required.
    """

    name: str

    def run(
        self,
        message: Dict[str, Any],
        query_graph: Dict[str, Any],
    ) -> RegressionCheckResult:
        """Evaluate the check for a response message and its query graph."""
        ...


# Module-level registry of regression checks; run_all() iterates it in order.
CHECKS: List[RegressionCheck] = []


def run_all(
    message: Dict[str, Any],
    query_graph: Dict[str, Any],
    logger: Optional[logging.Logger] = None,
) -> List[RegressionCheckResult]:
    """Run every registered regression check, isolating failures per-check.

    Args:
        message: Response message handed to each check's ``run``.
        query_graph: Query graph handed to each check's ``run``.
        logger: Optional logger; when given, a warning is emitted for every
            check that raises.

    Returns:
        One result per entry in ``CHECKS``, in registration order. A check
        that raises contributes a ``SKIPPED`` result describing the exception
        instead of aborting the remaining checks.
    """
    outcomes: List[RegressionCheckResult] = []
    for check in CHECKS:
        try:
            outcome = check.run(message, query_graph)
        except Exception as e:
            # Fall back to the class name for objects lacking a ``name``.
            check_name = getattr(check, "name", type(check).__name__)
            if logger is not None:
                logger.warning(f"Regression check {check_name} crashed: {e}")
            outcome = RegressionCheckResult(
                name=check_name,
                status=RegressionStatus.SKIPPED,
                message=f"Check crashed: {type(e).__name__}: {e}",
            )
        outcomes.append(outcome)
    return outcomes
122 changes: 122 additions & 0 deletions test_harness/regression_checks/edge_predicate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Edge predicate regression check.

Verifies that every predicate returned in the knowledge graph (and bound to a
query-graph edge) is the queried predicate itself or a biolink descendant of
it. A returned predicate that is an ancestor of the queried predicate (less
specific) or otherwise unrelated counts as a regression.
"""

from typing import Any, Dict, List, Optional, Set

from test_harness.regression_checks.base import (
RegressionCheckResult,
RegressionStatus,
)


class EdgePredicateMatchCheck:
    """Check that bound knowledge-graph edges use a predicate the query allows.

    For every query-graph edge that lists ``predicates``, each knowledge-graph
    edge bound to it under ``results[*].analyses[*].edge_bindings`` must carry
    a predicate that is one of the queried predicates or a biolink descendant
    of one. A binding whose id is absent from ``knowledge_graph.edges`` is
    reported as a mismatch too.
    """

    name = "edge_predicate_match"

    def __init__(self) -> None:
        # Biolink toolkit instance; built on first use because it is slow.
        self._toolkit = None
        # Set once toolkit construction has failed, so it is never retried.
        self._toolkit_init_error: Optional[str] = None
        # Queried predicate -> {predicate, *its descendants}.
        self._descendants_cache: Dict[str, Set[str]] = {}

    def _get_toolkit(self):
        """Return the lazily built ``bmt.Toolkit``, or ``None`` if unavailable.

        Construction is attempted at most once: on failure the error text is
        stored in ``_toolkit_init_error`` and later calls return ``None``
        without retrying.
        """
        if self._toolkit is not None or self._toolkit_init_error is not None:
            return self._toolkit
        try:
            import bmt  # imported lazily; bmt.Toolkit() is slow

            self._toolkit = bmt.Toolkit()
        except Exception as e:
            self._toolkit_init_error = f"{type(e).__name__}: {e}"
        return self._toolkit

    def _allowed_predicates(self, predicate: str) -> Set[str]:
        """Return ``predicate`` plus its descendants, memoised per predicate.

        Returns an empty set (and caches nothing) when the toolkit is
        unavailable.
        """
        cached = self._descendants_cache.get(predicate)
        if cached is not None:
            return cached
        toolkit = self._get_toolkit()
        if toolkit is None:
            return set()
        descendants = toolkit.get_descendants(
            predicate, reflexive=False, formatted=True
        ) or []
        allowed = {predicate, *descendants}
        self._descendants_cache[predicate] = allowed
        return allowed

    def _find_mismatches(
        self,
        message: Dict[str, Any],
        allowed_by_edge: Dict[str, Set[str]],
    ) -> List[Dict[str, Any]]:
        """Collect one record per edge binding that violates the allowed set."""
        kg_edges = (message.get("knowledge_graph") or {}).get("edges") or {}
        # Sorted allowed predicates per query edge, computed on first use
        # instead of once per mismatch.
        expected_by_edge: Dict[str, List[str]] = {}
        mismatches: List[Dict[str, Any]] = []

        for result_idx, result in enumerate(message.get("results") or []):
            analyses = result.get("analyses") or []
            for analysis_idx, analysis in enumerate(analyses):
                edge_bindings = analysis.get("edge_bindings") or {}
                for qg_edge_id, bindings in edge_bindings.items():
                    allowed = allowed_by_edge.get(qg_edge_id)
                    if allowed is None:
                        # Query edge carries no predicate constraint.
                        continue
                    for binding in bindings or []:
                        kg_edge_id = binding.get("id")
                        record: Dict[str, Any] = {
                            "result_index": result_idx,
                            "analysis_index": analysis_idx,
                            "qg_edge_id": qg_edge_id,
                            "kg_edge_id": kg_edge_id,
                        }
                        kg_edge = kg_edges.get(kg_edge_id)
                        if kg_edge is None:
                            record["reason"] = "kg_edge_missing"
                            mismatches.append(record)
                            continue
                        predicate = kg_edge.get("predicate")
                        if predicate in allowed:
                            continue
                        if qg_edge_id not in expected_by_edge:
                            expected_by_edge[qg_edge_id] = sorted(allowed)
                        record["predicate"] = predicate
                        # Copy so that records never share one list object.
                        record["expected_predicates"] = list(
                            expected_by_edge[qg_edge_id]
                        )
                        record["reason"] = "predicate_not_allowed"
                        mismatches.append(record)
        return mismatches

    def run(
        self, message: Dict[str, Any], query_graph: Dict[str, Any]
    ) -> RegressionCheckResult:
        """Compare bound knowledge-graph edge predicates with the query graph.

        Args:
            message: Response message; ``None`` is treated as empty.
            query_graph: Query graph; ``None`` is treated as empty.

        Returns:
            ``SKIPPED`` when no query edge lists predicates or the biolink
            toolkit cannot be created, ``FAILED`` with the mismatch records
            in ``details`` when any binding violates the query, otherwise
            ``PASSED``.
        """
        qg_edges = (query_graph or {}).get("edges") or {}
        # Null or non-dict edge entries carry no predicate constraint.
        qg_edges_with_predicates = {
            edge_id: edge
            for edge_id, edge in qg_edges.items()
            if isinstance(edge, dict) and edge.get("predicates")
        }
        if not qg_edges_with_predicates:
            return RegressionCheckResult(
                name=self.name,
                status=RegressionStatus.SKIPPED,
                message="No query predicates to check.",
            )

        if self._get_toolkit() is None:
            return RegressionCheckResult(
                name=self.name,
                status=RegressionStatus.SKIPPED,
                message=f"biolink toolkit unavailable: {self._toolkit_init_error}",
            )

        allowed_by_edge: Dict[str, Set[str]] = {}
        for edge_id, edge in qg_edges_with_predicates.items():
            allowed: Set[str] = set()
            for predicate in edge["predicates"]:
                allowed |= self._allowed_predicates(predicate)
            allowed_by_edge[edge_id] = allowed

        # Guard ``message`` the same way ``query_graph`` is guarded above.
        mismatches = self._find_mismatches(message or {}, allowed_by_edge)

        if mismatches:
            return RegressionCheckResult(
                name=self.name,
                status=RegressionStatus.FAILED,
                message=f"{len(mismatches)} edge(s) returned predicates not compatible with the query graph.",
                details={"mismatches": mismatches, "count": len(mismatches)},
            )
        return RegressionCheckResult(
            name=self.name,
            status=RegressionStatus.PASSED,
        )
1 change: 1 addition & 0 deletions test_harness/result_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ def dump_result_summary(self):
> Acceptance Test Results:
> Passed: {self.acceptance_report['PASSED']},
> Failed: {self.acceptance_report['FAILED']},
> Regression: {self.acceptance_report['REGRESSION']},
> Skipped: {self.acceptance_report['SKIPPED']}
> No Results: {self.acceptance_report['NO_RESULTS']}
> Errors: {self.acceptance_report['ERROR']}
Expand Down
30 changes: 30 additions & 0 deletions test_harness/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
from test_harness.acceptance_test_runner import run_acceptance_pass_fail_analysis
from test_harness.pathfinder_test_runner import pathfinder_pass_fail_analysis
from test_harness.performance_test_runner import run_performance_test
from test_harness.regression_checks import (
RegressionStatus,
run_all as run_regression_checks,
)
from test_harness.reporter import Reporter
from test_harness.result_collector import ResultCollector
from test_harness.runner.generate_query import generate_query
Expand Down Expand Up @@ -171,6 +175,32 @@ def run_tests(
agent_report.status = AgentStatus.FAILED
agent_report.message = "Test Error"

try:
check_results = run_regression_checks(
response["response"]["message"],
test_query["query"]["message"].get("query_graph") or {},
logger,
)
agent_report.regression_checks.extend(check_results)
failed_checks = [
r for r in check_results
if r.status == RegressionStatus.FAILED
]
if failed_checks and agent_report.status == AgentStatus.PASSED:
agent_report.status = AgentStatus.REGRESSION
summary = "; ".join(
f"{r.name}: {r.message or 'failed'}"
for r in failed_checks
)
agent_report.message = (
f"{agent_report.message + ' | ' if agent_report.message else ''}"
f"Regression: {summary}"
)
except Exception as e:
logger.warning(
f"Regression check infrastructure failed on {agent}: {e}"
)

# grab only ars result if it exists, otherwise default to failed
if "ars" not in report.result:
status = AgentStatus.SKIPPED
Expand Down
11 changes: 8 additions & 3 deletions test_harness/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""General utilities for the Test Harness."""

from dataclasses import dataclass
from dataclasses import dataclass, field
from enum import Enum
import logging
from typing import Dict, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

if TYPE_CHECKING:
from test_harness.regression_checks.base import RegressionCheckResult

import httpx
from translator_testing_model.datamodel.pydanticmodel import (
Expand All @@ -27,6 +30,7 @@ class AgentStatus(str, Enum):
NO_RESULTS = "NO_RESULTS"
SKIPPED = "SKIPPED"
ERROR = "ERROR"
REGRESSION = "REGRESSION"


@dataclass
Expand All @@ -36,13 +40,14 @@ class AgentReport:
status: AgentStatus
message: Optional[str]
actual_output: Optional[dict[str, Optional[int]]]
regression_checks: List["RegressionCheckResult"] = field(default_factory=list)


@dataclass
class PathfinderReport(AgentReport):
"""Dictionary for single Pathfinder agent report."""

expected_nodes_found: str
expected_nodes_found: str = ""


@dataclass
Expand Down
Loading
Loading