From 6b11767d30fb08969146d4bb58ac8570cc20c34f Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Tue, 19 May 2026 17:54:34 -0700 Subject: [PATCH 1/2] feat(eval): add evaluator type schemas for classification evaluators Generates BinaryClassificationEvaluator.json and MulticlassClassificationEvaluator.json from the new evaluators added in #1397 so external tooling (Flow UI evaluator picker, `uip maestro flow eval`) can read the config / criteria / justification schemas. Files produced by `python -m uipath.eval.evaluators_types.generate_types`, restricted to the two new evaluator types. A companion PR refreshes the other 11 stale schemas in evaluators_types/. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../BinaryClassificationEvaluator.json | 121 ++++++++++++++++ .../MulticlassClassificationEvaluator.json | 133 ++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json create mode 100644 packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json diff --git a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json new file mode 100644 index 000000000..9f7351865 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json @@ -0,0 +1,121 @@ +{ + "evaluatorTypeId": "uipath-binary-classification", + "evaluatorConfigSchema": { + "$defs": { + "BinaryClassificationEvaluationCriteria": { + "description": "Per-datapoint criteria: which class this sample should belong to.", + "properties": { + "expected_class": { + "title": "Expected Class", + "type": "string" + } + }, + "required": [ + "expected_class" + ], + "title": "BinaryClassificationEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the binary classification evaluator.", + "properties": { + "name": { + "default": "BinaryClassificationEvaluator", + "title": "Name", + "type": "string" + }, + "description": { + "default": "", + "description": "The description of the evaluator", + "title": "Description", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/BinaryClassificationEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + }, + "line_by_line_evaluator": { + "default": false, + "description": "If True, split output by delimiter and evaluate each line separately", + "title": "Line By Line Evaluator", + "type": "boolean" + }, + "line_delimiter": { + "default": "\n", + "description": "Delimiter to split output when line_by_line_evaluator is True", + "title": "Line Delimiter", + "type": "string" + }, + "positive_class": { + "title": "Positive Class", + "type": "string" + }, + "metric_type": { + "default": "precision", + "enum": [ + "precision", + "recall", + "f-score" + ], + "title": "Metric Type", + "type": "string" + }, + "f_value": { + "default": 1.0, + "title": "F Value", + "type": "number" + } + }, + "required": [ + "positive_class" + ], + "title": "BinaryClassificationEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Per-datapoint criteria: which class this sample should belong to.", + "properties": { + "expected_class": { + "title": "Expected Class", + "type": "string" + } + }, + "required": [ + "expected_class" + ], + "title": "BinaryClassificationEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "description": "Base class for all evaluator justifications.", + "properties": { + "expected": { + "title": "Expected", + "type": "string" + }, + "actual": { + "title": "Actual", + "type": "string" + } + }, + "required": [ + "expected", + "actual" + ], + "title": "BaseEvaluatorJustification", + "type": "object" + } +} \ No newline at end of file diff --git a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json new file mode 100644 index 000000000..72262ba92 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json @@ -0,0 +1,133 @@ +{ + "evaluatorTypeId": "uipath-multiclass-classification", + "evaluatorConfigSchema": { + "$defs": { + "MulticlassClassificationEvaluationCriteria": { + "description": "Per-datapoint criteria: which class this sample should belong to.", + "properties": { + "expected_class": { + "title": "Expected Class", + "type": "string" + } + }, + "required": [ + "expected_class" + ], + "title": "MulticlassClassificationEvaluationCriteria", + "type": "object" + } + }, + "description": "Configuration for the multiclass classification evaluator.", + "properties": { + "name": { + "default": "MulticlassClassificationEvaluator", + "title": "Name", + "type": "string" + }, + "description": { + "default": "", + "description": "The description of the evaluator", + "title": "Description", + "type": "string" + }, + "default_evaluation_criteria": { + "anyOf": [ + { + "$ref": "#/$defs/MulticlassClassificationEvaluationCriteria" + }, + { + "type": "null" + } + ], + "default": null + }, + "target_output_key": { + "default": "*", + "description": "Key to extract output from agent execution", + "title": "Target Output Key", + "type": "string" + }, + "line_by_line_evaluator": { + "default": false, + "description": "If True, split output by delimiter and evaluate each line separately", + "title": "Line By Line Evaluator", + "type": "boolean" + }, + "line_delimiter": { + "default": "\n", + "description": "Delimiter to split output when line_by_line_evaluator is True", + "title": "Line Delimiter", + "type": "string" + }, + "classes": { + "items": { + "type": "string" + }, + "title": "Classes", + "type": "array" + }, + "metric_type": { + "default": "f-score", + "enum": [ + "precision", + "recall", + "f-score" + ], + "title": "Metric Type", + "type": "string" + }, + "averaging": { + "default": "macro", + "enum": [ + "micro", + "macro" + ], + "title": "Averaging", + "type": "string" + }, + "f_value": { + "default": 1.0, + "title": "F Value", + "type": "number" + } + }, + "required": [ + "classes" + ], + "title": "MulticlassClassificationEvaluatorConfig", + "type": "object" + }, + "evaluationCriteriaSchema": { + "description": "Per-datapoint criteria: which class this sample should belong to.", + "properties": { + "expected_class": { + "title": "Expected Class", + "type": "string" + } + }, + "required": [ + "expected_class" + ], + "title": "MulticlassClassificationEvaluationCriteria", + "type": "object" + }, + "justificationSchema": { + "description": "Base class for all evaluator justifications.", + "properties": { + "expected": { + "title": "Expected", + "type": "string" + }, + "actual": { + "title": "Actual", + "type": "string" + } + }, + "required": [ + "expected", + "actual" + ], + "title": "BaseEvaluatorJustification", + "type": "object" + } +} \ No newline at end of file From 037b60cdb6e721c494b2b4fd173e6bf1bdb450ed Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Tue, 19 May 2026 18:27:58 -0700 Subject: [PATCH 2/2] test(eval): add e2e tests + sample projects for classification evaluators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two sample projects under packages/uipath/samples/ that double as end-to-end test fixtures for the binary and multiclass classification evaluators added in #1397: - binary_classification_agent — rule-based spam/ham classifier wired up to the binary classification evaluator with metric_type=precision. Eval set is designed so 4/5 datapoints pass but precision is 2/3 because of one deliberate false positive. - multiclass_classification_simple — rule-based 3-class router (payments / support / spam) wired up to the multiclass classification evaluator with macro-averaged F1. Eval set forces a misroute that hurts both payments precision and support recall, giving macro F1 = 26/30. Adds tests/cli/eval/test_classification_samples_e2e.py which loads each sample's eval-sets/default.json, wires its main.py into a stand-in runtime, calls evaluate(), and asserts both the per-row scores and the aggregated metric produced by reduce_scores. Locks in the dataset-level math, not just per-row correct/incorrect. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../binary_classification_agent/bindings.json | 4 + .../evaluations/eval-sets/default.json | 63 ++++++ .../evaluators/binary-classification.json | 16 ++ .../binary_classification_agent/main.py | 39 ++++ .../pyproject.toml | 9 + .../binary_classification_agent/uipath.json | 5 + .../bindings.json | 4 + .../evaluations/eval-sets/default.json | 85 ++++++++ .../evaluators/multiclass-classification.json | 17 ++ .../multiclass_classification_simple/main.py | 51 +++++ .../pyproject.toml | 9 + .../uipath.json | 5 + .../eval/test_classification_samples_e2e.py | 193 ++++++++++++++++++ 13 files changed, 500 insertions(+) create mode 100644 packages/uipath/samples/binary_classification_agent/bindings.json create mode 100644 packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json create mode 100644 packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json create mode 100644 packages/uipath/samples/binary_classification_agent/main.py create mode 100644 packages/uipath/samples/binary_classification_agent/pyproject.toml create mode 100644 packages/uipath/samples/binary_classification_agent/uipath.json create mode 100644 packages/uipath/samples/multiclass_classification_simple/bindings.json create mode 100644 packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json create mode 100644 packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json create mode 100644 packages/uipath/samples/multiclass_classification_simple/main.py create mode 100644 packages/uipath/samples/multiclass_classification_simple/pyproject.toml create mode 100644 packages/uipath/samples/multiclass_classification_simple/uipath.json create mode 100644 packages/uipath/tests/cli/eval/test_classification_samples_e2e.py diff --git a/packages/uipath/samples/binary_classification_agent/bindings.json b/packages/uipath/samples/binary_classification_agent/bindings.json new file mode 100644 index 000000000..5e9beeb01 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/bindings.json @@ -0,0 +1,4 @@ +{ + "version": "2.0", + "resources": [] +} diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json new file mode 100644 index 000000000..f47cd25b8 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json @@ -0,0 +1,63 @@ +{ + "version": "1.0", + "id": "SpamBinaryEval", + "name": "Binary spam classifier — precision", + "evaluatorRefs": ["BinarySpamPrecision"], + "evaluations": [ + { + "id": "spam-prize", + "name": "Spam: prize giveaway", + "inputs": { + "email_subject": "You won a FREE iPhone!!!", + "email_body": "Congratulations! Click here to claim your prize now." + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "spam" } + } + }, + { + "id": "spam-promo", + "name": "Spam: unsolicited promo", + "inputs": { + "email_subject": "Winner of the monthly drawing", + "email_body": "You've been selected. Click here to redeem." + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "spam" } + } + }, + { + "id": "ham-invoice", + "name": "Ham: legitimate invoice", + "inputs": { + "email_subject": "Your March invoice is ready", + "email_body": "Your monthly invoice of $45.99 is attached. Payment is due March 15." + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "ham" } + } + }, + { + "id": "ham-meeting", + "name": "Ham: meeting request", + "inputs": { + "email_subject": "Sync on Q2 planning", + "email_body": "Can we meet Wednesday at 2pm to align on next quarter's roadmap?" + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "ham" } + } + }, + { + "id": "ham-mislabeled", + "name": "Ham mislabeled as spam (forces a false positive)", + "inputs": { + "email_subject": "Free coffee in the break room!!!", + "email_body": "Just a heads up — the new espresso machine is set up." + }, + "evaluationCriterias": { + "BinarySpamPrecision": { "expectedClass": "ham" } + } + } + ] +} diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json new file mode 100644 index 000000000..21f7d6850 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json @@ -0,0 +1,16 @@ +{ + "version": "1.0", + "id": "BinarySpamPrecision", + "description": "Precision on the 'spam' positive class", + "evaluatorTypeId": "uipath-binary-classification", + "evaluatorConfig": { + "name": "BinarySpamPrecision", + "targetOutputKey": "category", + "positiveClass": "spam", + "metricType": "precision", + "fValue": 1.0, + "defaultEvaluationCriteria": { + "expectedClass": "ham" + } + } +} diff --git a/packages/uipath/samples/binary_classification_agent/main.py b/packages/uipath/samples/binary_classification_agent/main.py new file mode 100644 index 000000000..1df5dea15 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/main.py @@ -0,0 +1,39 @@ +"""Rule-based spam/ham classifier demonstrating the binary classification evaluator.""" + +from dataclasses import dataclass + +from uipath.tracing import traced + +SPAMMY_TOKENS = { + "free", + "winner", + "congratulations", + "click here", + "prize", + "!!!", +} + + +@dataclass +class EmailInput: + email_subject: str + email_body: str + + +@dataclass +class Classification: + category: str + + +@traced(name="classify_email", span_type="tool") +def classify_email(subject: str, body: str) -> str: + """Return 'spam' if any spam-indicator token appears in the subject or body.""" + text = f"{subject} {body}".lower() + return "spam" if any(token in text for token in SPAMMY_TOKENS) else "ham" + + +@traced() +async def main(input: EmailInput) -> Classification: + """Classify an email as 'spam' or 'ham'.""" + category = classify_email(input.email_subject, input.email_body) + return Classification(category=category) diff --git a/packages/uipath/samples/binary_classification_agent/pyproject.toml b/packages/uipath/samples/binary_classification_agent/pyproject.toml new file mode 100644 index 000000000..7d81d251a --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "binary-classification-agent" +version = "0.0.1" +description = "Rule-based spam/ham classifier demonstrating the binary classification evaluator" +requires-python = ">=3.11" +dependencies = ["uipath"] + +[dependency-groups] +dev = ["uipath-dev"] diff --git a/packages/uipath/samples/binary_classification_agent/uipath.json b/packages/uipath/samples/binary_classification_agent/uipath.json new file mode 100644 index 000000000..9b02c2654 --- /dev/null +++ b/packages/uipath/samples/binary_classification_agent/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "main.py:main" + } +} diff --git a/packages/uipath/samples/multiclass_classification_simple/bindings.json b/packages/uipath/samples/multiclass_classification_simple/bindings.json new file mode 100644 index 000000000..5e9beeb01 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/bindings.json @@ -0,0 +1,4 @@ +{ + "version": "2.0", + "resources": [] +} diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json new file mode 100644 index 000000000..27e66c25d --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json @@ -0,0 +1,85 @@ +{ + "version": "1.0", + "id": "EmailMulticlassEval", + "name": "3-class email router — macro F1", + "evaluatorRefs": ["EmailMulticlassFScore"], + "evaluations": [ + { + "id": "pay-invoice", + "name": "Payments: invoice reminder", + "inputs": { + "email_subject": "Your March invoice is ready", + "email_body": "Your monthly invoice of $45.99 is now available. Payment is due March 15." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "payments" } + } + }, + { + "id": "pay-refund", + "name": "Payments: refund request", + "inputs": { + "email_subject": "Refund for last month's charge", + "email_body": "I was charged twice for the same service. Please process a refund." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "payments" } + } + }, + { + "id": "support-broken", + "name": "Support: feature broken", + "inputs": { + "email_subject": "Login is broken", + "email_body": "I'm getting an error when trying to sign in. Need help." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "support" } + } + }, + { + "id": "support-question", + "name": "Support: how-to question", + "inputs": { + "email_subject": "How do I export my data?", + "email_body": "Can you help me figure out where the export button is?" + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "support" } + } + }, + { + "id": "spam-prize", + "name": "Spam: prize giveaway", + "inputs": { + "email_subject": "You won a FREE iPhone!!!", + "email_body": "Congratulations! Click here to claim your prize." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "spam" } + } + }, + { + "id": "spam-promo", + "name": "Spam: marketing winner", + "inputs": { + "email_subject": "Winner of the monthly drawing", + "email_body": "Congratulations, click here to redeem your reward." + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "spam" } + } + }, + { + "id": "support-misrouted-by-payment-word", + "name": "Support email accidentally routed to payments (forces an FP for payments)", + "inputs": { + "email_subject": "Question about my billing portal access", + "email_body": "I cannot log into the billing portal. The page just spins. Can you help?" + }, + "evaluationCriterias": { + "EmailMulticlassFScore": { "expectedClass": "support" } + } + } + ] +} diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json new file mode 100644 index 000000000..859a18562 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json @@ -0,0 +1,17 @@ +{ + "version": "1.0", + "id": "EmailMulticlassFScore", + "description": "Macro-averaged F1 across payments / support / spam", + "evaluatorTypeId": "uipath-multiclass-classification", + "evaluatorConfig": { + "name": "EmailMulticlassFScore", + "targetOutputKey": "category", + "classes": ["payments", "support", "spam"], + "metricType": "f-score", + "averaging": "macro", + "fValue": 1.0, + "defaultEvaluationCriteria": { + "expectedClass": "support" + } + } +} diff --git a/packages/uipath/samples/multiclass_classification_simple/main.py b/packages/uipath/samples/multiclass_classification_simple/main.py new file mode 100644 index 000000000..3ab684298 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/main.py @@ -0,0 +1,51 @@ +"""Rule-based 3-class email router demonstrating the multiclass classification evaluator.""" + +from dataclasses import dataclass + +from uipath.tracing import traced + +SPAM_TOKENS = {"free", "winner", "congratulations", "click here", "prize", "!!!"} +PAYMENT_TOKENS = {"invoice", "payment", "refund", "charge", "billing", "$"} +SUPPORT_TOKENS = { + "help", + "support", + "issue", + "error", + "ticket", + "broken", + "not working", +} + + +@dataclass +class EmailInput: + email_subject: str + email_body: str + + +@dataclass +class Classification: + category: str + + +@traced(name="classify_email", span_type="tool") +def classify_email(subject: str, body: str) -> str: + """Classify into 'spam', 'payments', or 'support' using priority rules. + + Spam is checked first so promos with billing-flavored words still route to spam. + Payments is checked before support because it is the more specific intent. + Support is the catch-all default. + """ + text = f"{subject} {body}".lower() + if any(token in text for token in SPAM_TOKENS): + return "spam" + if any(token in text for token in PAYMENT_TOKENS): + return "payments" + return "support" + + +@traced() +async def main(input: EmailInput) -> Classification: + """Route an email to one of three queues.""" + category = classify_email(input.email_subject, input.email_body) + return Classification(category=category) diff --git a/packages/uipath/samples/multiclass_classification_simple/pyproject.toml b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml new file mode 100644 index 000000000..e803a2a43 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "multiclass-classification-simple" +version = "0.0.1" +description = "Rule-based 3-class email router demonstrating the multiclass classification evaluator with macro-averaged F1" +requires-python = ">=3.11" +dependencies = ["uipath"] + +[dependency-groups] +dev = ["uipath-dev"] diff --git a/packages/uipath/samples/multiclass_classification_simple/uipath.json b/packages/uipath/samples/multiclass_classification_simple/uipath.json new file mode 100644 index 000000000..9b02c2654 --- /dev/null +++ b/packages/uipath/samples/multiclass_classification_simple/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "main.py:main" + } +} diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py new file mode 100644 index 000000000..202363221 --- /dev/null +++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py @@ -0,0 +1,193 @@ +"""End-to-end tests that run the classification sample projects through evaluate(). + +These tests double as integration coverage for the binary and multiclass +classification evaluators added in #1397 — they wire each sample's main.py +into a stand-in runtime, run the full eval set, and assert the per-row scores +plus the aggregated metric produced by `reduce_scores`. +""" + +import importlib.util +import uuid +from pathlib import Path +from types import ModuleType +from typing import Any, AsyncGenerator + +import pytest + +from uipath.core.events import EventBus +from uipath.core.tracing import UiPathTraceManager +from uipath.eval.helpers import EvalHelpers +from uipath.eval.runtime import UiPathEvalContext, evaluate +from uipath.eval.runtime._types import UiPathEvalOutput +from uipath.eval.runtime.runtime import compute_evaluator_scores +from uipath.runtime import ( + UiPathExecuteOptions, + UiPathRuntimeEvent, + UiPathRuntimeFactorySettings, + UiPathRuntimeProtocol, + UiPathRuntimeResult, + UiPathRuntimeStatus, + UiPathRuntimeStorageProtocol, + UiPathStreamOptions, +) +from uipath.runtime.schema import UiPathRuntimeSchema + +SAMPLES_DIR = Path(__file__).resolve().parents[3] / "samples" + + +def _load_sample_main(sample_dir: Path) -> ModuleType: + """Import a sample's main.py as an isolated module.""" + module_name = f"_eval_sample_{sample_dir.name}" + spec = importlib.util.spec_from_file_location(module_name, sample_dir / "main.py") + assert spec is not None and spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +class _SampleRuntime: + """Runtime that delegates execution to the sample's `main` function.""" + + def __init__(self, sample_main: Any) -> None: + self._sample_main = sample_main + + async def execute( + self, + input: dict[str, Any] | None = None, + options: UiPathExecuteOptions | None = None, + ) -> UiPathRuntimeResult: + input_model = self._sample_main.EmailInput(**(input or {})) + output = await self._sample_main.main(input_model) + return UiPathRuntimeResult( + output={"category": output.category}, + status=UiPathRuntimeStatus.SUCCESSFUL, + ) + + async def stream( + self, + input: dict[str, Any] | None = None, + options: UiPathStreamOptions | None = None, + ) -> AsyncGenerator[UiPathRuntimeEvent, None]: + yield await self.execute(input, None) + + async def get_schema(self) -> UiPathRuntimeSchema: + return UiPathRuntimeSchema( + filePath="main.py", + uniqueId="main", + type="agent", + input={ + "type": "object", + "properties": { + "email_subject": {"type": "string"}, + "email_body": {"type": "string"}, + }, + }, + output={ + "type": "object", + "properties": {"category": {"type": "string"}}, + }, + ) + + async def dispose(self) -> None: + pass + + +class _SampleFactory: + def __init__(self, sample_main: Any) -> None: + self._sample_main = sample_main + + def discover_entrypoints(self) -> list[str]: + return ["main"] + + async def get_storage(self) -> UiPathRuntimeStorageProtocol | None: + return None + + async def get_settings(self) -> UiPathRuntimeFactorySettings | None: + return None + + async def new_runtime( + self, entrypoint: str, runtime_id: str, **kwargs: Any + ) -> UiPathRuntimeProtocol: + return _SampleRuntime(self._sample_main) + + async def dispose(self) -> None: + pass + + +async def _run_sample(sample_dir: Path) -> tuple[UiPathEvalOutput, dict[str, float]]: + """Run the sample's eval set and return (per-row output, evaluator_averages).""" + sample_main = _load_sample_main(sample_dir) + factory = _SampleFactory(sample_main) + + eval_set_path = str(sample_dir / "evaluations" / "eval-sets" / "default.json") + evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path) + evaluators = await EvalHelpers.load_evaluators( + eval_set_path, evaluation_set, agent_model=None + ) + + runtime = await factory.new_runtime("main", "test-runtime-id") + runtime_schema = await runtime.get_schema() + + context = UiPathEvalContext() + context.execution_id = str(uuid.uuid4()) + context.evaluation_set = evaluation_set + context.runtime_schema = runtime_schema + context.evaluators = evaluators + + result = await evaluate( + factory, + UiPathTraceManager(), + context, + EventBus(), + ) + + eval_output = UiPathEvalOutput.model_validate(result.output) + _, evaluator_averages = compute_evaluator_scores( + eval_output.evaluation_set_results, evaluators + ) + return eval_output, evaluator_averages + + +def _per_row_scores(output: UiPathEvalOutput) -> dict[str, float]: + return { + row.evaluation_name: row.evaluation_run_results[0].result.score + for row in output.evaluation_set_results + } + + +async def test_binary_classification_sample_end_to_end(): + """Binary spam classifier: 4/5 datapoints correct, but precision is 2/3 because of one FP.""" + output, averages = await _run_sample(SAMPLES_DIR / "binary_classification_agent") + + per_row = _per_row_scores(output) + assert per_row == { + "Spam: prize giveaway": 1.0, + "Spam: unsolicited promo": 1.0, + "Ham: legitimate invoice": 1.0, + "Ham: meeting request": 1.0, + "Ham mislabeled as spam (forces a false positive)": 0.0, + } + # Precision = TP / (TP + FP) = 2 / (2 + 1) = 0.6666... + assert averages["BinarySpamPrecision"] == pytest.approx(2 / 3, rel=1e-6) + + +async def test_multiclass_classification_sample_end_to_end(): + """Multiclass router: 6/7 correct, macro F1 = (0.8 + 0.8 + 1.0) / 3 = 0.8666...""" + output, averages = await _run_sample( + SAMPLES_DIR / "multiclass_classification_simple" + ) + + per_row = _per_row_scores(output) + assert per_row == { + "Payments: invoice reminder": 1.0, + "Payments: refund request": 1.0, + "Support: feature broken": 1.0, + "Support: how-to question": 1.0, + "Spam: prize giveaway": 1.0, + "Spam: marketing winner": 1.0, + "Support email accidentally routed to payments " + "(forces an FP for payments)": 0.0, + } + # payments F1=0.8 (P=2/3, R=1), support F1=0.8 (P=1, R=2/3), spam F1=1.0 + # macro = mean = 2.6 / 3 + assert averages["EmailMulticlassFScore"] == pytest.approx(2.6 / 3, rel=1e-6)