diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml index 8e9c9f581..81c597d68 100644 --- a/packages/uipath/pyproject.toml +++ b/packages/uipath/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.10.70" +version = "2.10.72" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/packages/uipath/samples/classifier_demo/README.md b/packages/uipath/samples/classifier_demo/README.md new file mode 100644 index 000000000..638765ab1 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/README.md @@ -0,0 +1,139 @@ +# Classifier evaluator end-to-end demo + +A minimal intent-classification agent that exercises the new +`ClassifierEvaluator` end-to-end. Use this as the test fixture for both +SDK-only validation (Path A below) and Studio Web full-stack validation +(Path B). + +## What's here + +``` +classifier_demo/ +├── main.py # 3-class keyword classifier +├── uipath.json +├── pyproject.toml +├── bindings.json +└── evaluations/ + ├── eval-sets/ + │ └── main.json # 9 datapoints, 3 per class, some intentionally wrong + └── evaluators/ + ├── intent_match.json # per-datapoint ExactMatch on agent_output.intent + └── intent_classifier.json # the new uipath-classifier (pure metadata) +``` + +The eval set is wired so that for every datapoint both evaluators run: +- `intent_match` produces a 1.0/0.0 score with `{"expected": "...", "actual": "..."}` justification. +- `intent_classifier` produces a sentinel 0.0 score with `{"classes": [...], "source_evaluator": "intent_match"}` justification. + +Downstream (the C# layer in Studio Web) reads both to compute precision / +recall / F-score across the dataset. + +> Heads-up — every datapoint must have an entry for the classifier in +> `evaluationCriterias` (even an empty `{}`). 
The runtime currently skips +> evaluators that aren't keyed in `evaluationCriterias` for a datapoint, so +> omitting them silently drops the classifier results. + +## Path A — SDK only (real run, ~30 seconds) + +```bash +cd packages/uipath +uv sync --all-extras + +cd samples/classifier_demo +uv run --project ../.. uipath eval main main.json --no-report --output-file /tmp/out.json +``` + +Expected: a results table with two columns (`intent_classifier`, `intent_match`). +`intent_match` averages to ≈ 0.667 (6/9 correct). `intent_classifier` shows 0.0 per +row by design — its real work is to ship the classes list to the backend. + +To see the metadata payload that lands in the backend's +`CodedEvaluatorScore.Justification`: + +```bash +python3 -c " +import json +with open('/tmp/out.json') as f: d = json.load(f) +for r in d['evaluationSetResults'][0]['evaluationRunResults']: + print(r['evaluatorName'], r['result'].get('details')) +" +``` + +You should see something like: + +``` +intent_classifier {'expected': '', 'actual': '', 'classes': ['book', 'cancel', 'reschedule'], 'source_evaluator': 'intent_match'} +intent_match {'expected': 'book', 'actual': 'book'} +``` + +## Path B — Full Studio Web stack (real UI, click Run, see panel) + +Not yet validated end to end: the environment this path requires was not +available locally when this sample was written. The pieces: + +### Prereqs (per `Agents/LOCAL_DEVELOPMENT.md`) +- Docker installed and running +- `make` available +- Azure CLI authenticated session (`az login`) +- Azure DevOps PAT exported as `AZURE_DEVOPS_PAT` +- GitHub NPM registry token exported as `GH_NPM_REGISTRY_TOKEN` +- Azure access token exported as `AZURE_ACCESS_TOKEN` (for the python worker build) +- `cloud-provider-kind` binary (used for the local KinD cluster) + +### Steps + +1. **Point python-eval-worker at the local SDK branch.** The published + `uipath` package on PyPI doesn't yet have `ClassifierEvaluator`. 
Edit + `Agents/python-eval-worker/pyproject.toml`: + + ```toml + [tool.uv.sources] + uipath = { path = "../../uipath-python/packages/uipath", editable = true } + ``` + + Then `cd python-eval-worker && uv lock && uv sync`. + +2. **Bring up the local KinD cluster** (from `Agents/`): + ```bash + make create-kind-cluster + kubectl get nodes + sudo ./bin/cloud-provider-kind & # in a separate shell or background + make up + make deploy + ``` + +3. **Build the backend with the classifier changes:** + ```bash + git checkout feat/eval-classifier-backend # in Agents repo + # Re-trigger the helm/skaffold deploy for the backend + make deploy + ``` + +4. **Build the frontend with the UI changes:** + ```bash + git checkout feat/eval-dataset-evaluators-ui # in Agents repo + # Same deploy command rebuilds frontend image + ``` + +5. **Open Studio Web** (URL surfaced by the deploy output), create an agent + project, upload the eval-set + evaluator JSONs from this directory (or + author them in the UI — the picker now shows a "Classifier" entry under + the AGGREGATION section), and click Run. + +6. **Verify** the Aggregations panel renders between the run header and the + datapoint table, with the confusion matrix matching what Path A's Python + shim computes (macro F1 ≈ 0.667 on this fixture). + +### Open questions for the team owning local dev + +- Does the existing PAT / token set get refreshed automatically by the dev tooling, or do contributors need to rotate them periodically? +- Is there a simpler "local-only" path that bypasses the KinD cluster (e.g. docker-compose) for changes that don't touch K8s manifests? +- What's the standard pattern for pointing the python worker at a non-PyPI uipath build? The `[tool.uv.sources]` override above is the standard uv path — confirm there's no Helm/skaffold complication. 
+ +## Companion PRs + +| Repo | Branch | PR | What | +|---|---|---|---| +| uipath-python | `feat/eval-classifier-evaluator` | [#1674](https://github.com/UiPath/uipath-python/pull/1674) | SDK `ClassifierEvaluator` | +| Agents | `feat/eval-classifier-backend` | [#5313](https://github.com/UiPath/Agents/pull/5313) | C# math + activity + envelope storage | +| Agents | `feat/eval-dataset-evaluators-ui` | [#5306](https://github.com/UiPath/Agents/pull/5306) | Frontend picker + Aggregations panel | diff --git a/packages/uipath/samples/classifier_demo/bindings.json b/packages/uipath/samples/classifier_demo/bindings.json new file mode 100644 index 000000000..5e9beeb01 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/bindings.json @@ -0,0 +1,4 @@ +{ + "version": "2.0", + "resources": [] +} diff --git a/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json b/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json new file mode 100644 index 000000000..117e9e240 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json @@ -0,0 +1,173 @@ +{ + "version": "1.0", + "id": "classifier-demo-eval-set", + "name": "Classifier demo eval set", + "evaluatorRefs": [ + "intent_match", + "intent_classifier" + ], + "evaluations": [ + { + "id": "book-1", + "name": "book \u2014 straightforward", + "inputs": { + "utterance": "I want to book a table for two" + }, + "expectedOutput": { + "intent": "book" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "book" + } + }, + "intent_classifier": {} + } + }, + { + "id": "book-2", + "name": "book \u2014 schedule keyword", + "inputs": { + "utterance": "Please schedule an appointment" + }, + "expectedOutput": { + "intent": "book" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "book" + } + }, + "intent_classifier": {} + } + }, + { + "id": "book-3", + "name": "book \u2014 agent misclassifies (utterance 
triggers cancel keyword)", + "inputs": { + "utterance": "I had to cancel my last attempt but I want to reserve a slot now" + }, + "expectedOutput": { + "intent": "book" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "book" + } + }, + "intent_classifier": {} + } + }, + { + "id": "cancel-1", + "name": "cancel \u2014 straightforward", + "inputs": { + "utterance": "Please cancel my reservation" + }, + "expectedOutput": { + "intent": "cancel" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "cancel" + } + }, + "intent_classifier": {} + } + }, + { + "id": "cancel-2", + "name": "cancel \u2014 void synonym", + "inputs": { + "utterance": "I want to void the order" + }, + "expectedOutput": { + "intent": "cancel" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "cancel" + } + }, + "intent_classifier": {} + } + }, + { + "id": "cancel-3", + "name": "cancel \u2014 agent misclassifies (utterance has 'move' which triggers reschedule)", + "inputs": { + "utterance": "I need to move past this and cancel everything" + }, + "expectedOutput": { + "intent": "cancel" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "cancel" + } + }, + "intent_classifier": {} + } + }, + { + "id": "reschedule-1", + "name": "reschedule \u2014 straightforward", + "inputs": { + "utterance": "I want to reschedule the meeting" + }, + "expectedOutput": { + "intent": "reschedule" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "reschedule" + } + }, + "intent_classifier": {} + } + }, + { + "id": "reschedule-2", + "name": "reschedule \u2014 move synonym", + "inputs": { + "utterance": "Can we move the slot to tomorrow" + }, + "expectedOutput": { + "intent": "reschedule" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "reschedule" + } + }, + "intent_classifier": {} + } + }, + { + "id": 
"reschedule-3", + "name": "reschedule \u2014 agent misclassifies (falls through to default 'book')", + "inputs": { + "utterance": "Different timing please" + }, + "expectedOutput": { + "intent": "reschedule" + }, + "evaluationCriterias": { + "intent_match": { + "expectedOutput": { + "intent": "reschedule" + } + }, + "intent_classifier": {} + } + } + ] +} \ No newline at end of file diff --git a/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json new file mode 100644 index 000000000..ace8cb712 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json @@ -0,0 +1,11 @@ +{ + "version": "1.0", + "id": "intent_classifier", + "description": "Classification aggregator. Pure metadata — carries the classes list + source evaluator name to downstream consumers (the C# backend computes precision/recall/F-score over the dataset). Per-datapoint result is a no-op carrying the metadata.", + "evaluatorTypeId": "uipath-classifier", + "evaluatorConfig": { + "name": "intent_classifier", + "classes": ["book", "cancel", "reschedule"], + "sourceEvaluator": "intent_match" + } +} diff --git a/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json new file mode 100644 index 000000000..552c7220f --- /dev/null +++ b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json @@ -0,0 +1,15 @@ +{ + "version": "1.0", + "id": "intent_match", + "description": "Per-datapoint ExactMatch on the agent's `intent` output. 
Produces expected/actual justification that the ClassifierEvaluator pipeline reads.", + "evaluatorTypeId": "uipath-exact-match", + "evaluatorConfig": { + "name": "intent_match", + "targetOutputKey": "intent", + "caseSensitive": false, + "negated": false, + "defaultEvaluationCriteria": { + "expectedOutput": "book" + } + } +} diff --git a/packages/uipath/samples/classifier_demo/main.py b/packages/uipath/samples/classifier_demo/main.py new file mode 100644 index 000000000..b6e1eea48 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/main.py @@ -0,0 +1,42 @@ +"""Tiny intent-classification agent for the ClassifierEvaluator demo. + +Given an utterance, returns the intent label. Three intents, matched on whole words: + - book (utterance has the word "book" / "reserve" / "schedule") + - cancel (utterance has the word "cancel" / "void") + - reschedule (utterance has the word "reschedule" / "move") + +A few datapoints are deliberately misclassified so the run-level +classification metrics (precision/recall/F-score) come out non-trivially. +""" + +from dataclasses import dataclass + + +@dataclass +class IntentInput: + utterance: str + + +@dataclass +class IntentOutput: + intent: str + + +BOOK_KEYWORDS = {"book", "reserve", "schedule"} +CANCEL_KEYWORDS = {"cancel", "void"} +RESCHEDULE_KEYWORDS = {"reschedule", "move"} + + +async def main(input: IntentInput) -> IntentOutput: + """Classify the utterance into book / cancel / reschedule.""" + text = input.utterance.lower() + tokens = set(text.split()) + + if tokens & RESCHEDULE_KEYWORDS: + return IntentOutput(intent="reschedule") + if tokens & CANCEL_KEYWORDS: + return IntentOutput(intent="cancel") + if tokens & BOOK_KEYWORDS: + return IntentOutput(intent="book") + # Fallback to "book" — deliberately wrong-ish so the matrix is interesting. 
+ return IntentOutput(intent="book") diff --git a/packages/uipath/samples/classifier_demo/pyproject.toml b/packages/uipath/samples/classifier_demo/pyproject.toml new file mode 100644 index 000000000..307e3778c --- /dev/null +++ b/packages/uipath/samples/classifier_demo/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "classifier-demo" +version = "0.0.1" +description = "Tiny intent-classification agent that exercises the new ClassifierEvaluator end-to-end via `uipath eval`." +requires-python = ">=3.11" +dependencies = ["uipath"] + +[dependency-groups] +dev = ["uipath-dev"] diff --git a/packages/uipath/samples/classifier_demo/uipath.json b/packages/uipath/samples/classifier_demo/uipath.json new file mode 100644 index 000000000..9b02c2654 --- /dev/null +++ b/packages/uipath/samples/classifier_demo/uipath.json @@ -0,0 +1,5 @@ +{ + "functions": { + "main": "main.py:main" + } +} diff --git a/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py b/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py index fd4849076..792f929be 100644 --- a/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py +++ b/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py @@ -960,7 +960,14 @@ def _build_assertion_properties( def _build_evaluator_snapshot( evaluator: BaseEvaluator[Any, Any, Any], ) -> dict[str, Any]: - """Build evaluatorSnapshot dict with prompt and model if available.""" + """Build evaluatorSnapshot dict with prompt, model, and aggregators if available. + + `aggregators` is a list of run-level aggregator specs attached to the + evaluator's config (e.g. an ExactMatch with `aggregators=[{name: + "classification", classes: [...]}]`). It's emitted here so the C# + post-pass can read aggregator configs without consulting the original + evaluator definition file. 
+ """ snapshot: dict[str, Any] = {} config = getattr(evaluator, "evaluator_config", None) if config is not None: @@ -968,6 +975,14 @@ def _build_evaluator_snapshot( snapshot["prompt"] = config.prompt if hasattr(config, "model") and isinstance(config.model, str): snapshot["model"] = config.model + aggregators = getattr(config, "aggregators", None) + if aggregators: + # Serialize Pydantic models to plain dicts so the wire shape is + # readable from C# without referencing our Python types. + snapshot["aggregators"] = [ + spec.model_dump(by_alias=True) if hasattr(spec, "model_dump") else spec + for spec in aggregators + ] return snapshot def _collect_results( diff --git a/packages/uipath/src/uipath/eval/evaluators/__init__.py b/packages/uipath/src/uipath/eval/evaluators/__init__.py index 03a4bf63b..c68271b11 100644 --- a/packages/uipath/src/uipath/eval/evaluators/__init__.py +++ b/packages/uipath/src/uipath/eval/evaluators/__init__.py @@ -9,6 +9,7 @@ BaseEvaluatorConfig, BaseEvaluatorJustification, ) +from ._aggregators import AggregatorSpec, ClassificationAggregatorSpec from .base_legacy_evaluator import BaseLegacyEvaluator from .binary_classification_evaluator import BinaryClassificationEvaluator @@ -71,6 +72,9 @@ "BinaryClassificationEvaluator", "MulticlassClassificationEvaluator", "ContainsEvaluator", + # Aggregator specs (config metadata attached to per-datapoint evaluators) + "AggregatorSpec", + "ClassificationAggregatorSpec", "ExactMatchEvaluator", "JsonSimilarityEvaluator", "BaseLLMOutputEvaluator", diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregators.py b/packages/uipath/src/uipath/eval/evaluators/_aggregators.py new file mode 100644 index 000000000..968974546 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators/_aggregators.py @@ -0,0 +1,35 @@ +"""Aggregator specs attached to per-datapoint evaluator configs. + +An aggregator is run-level — it consumes the per-datapoint results of an +evaluator after the eval set finishes. 
The aggregator itself does not run in +the Python runtime; this module just defines the config shape so the downstream +consumer (the C# backend) can pick it up via the evaluator's stored config. + +Today the only aggregator is `classification`, which compares each datapoint's +expected vs. predicted class to build a confusion matrix and precision/recall/ +F-score metrics. +""" + +from typing import Literal + +from pydantic import BaseModel, ConfigDict +from pydantic.alias_generators import to_camel + + +class ClassificationAggregatorSpec(BaseModel): + """Configuration for a classification aggregator. + + Attached to a per-datapoint evaluator (e.g. ExactMatch) to mark that the + evaluator's results should be aggregated into classification metrics. The + classes list defines the exhaustive label space; the C# layer scans each + datapoint's expected output for the first class that matches. + """ + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + name: Literal["classification"] = "classification" + classes: list[str] + + +# Union of all supported aggregator specs. Add new variants here. +AggregatorSpec = ClassificationAggregatorSpec diff --git a/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py index 0f1b3e8e8..d8e5c45ba 100644 --- a/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py @@ -6,6 +6,7 @@ EvaluatorType, NumericEvaluationResult, ) +from ._aggregators import AggregatorSpec from .base_evaluator import BaseEvaluatorJustification from .output_evaluator import ( OutputEvaluationCriteria, @@ -15,11 +16,18 @@ class ExactMatchEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]): - """Configuration for the exact match evaluator.""" + """Configuration for the exact match evaluator. 
+ + The optional `aggregators` field attaches run-level aggregators (e.g. a + classification aggregator with a fixed class set) that the downstream + backend will compute after the eval set finishes. The Python runtime + itself ignores `aggregators` — it's pure metadata for the C# consumer. + """ name: str = "ExactMatchEvaluator" case_sensitive: bool = False negated: bool = False + aggregators: list[AggregatorSpec] | None = None class ExactMatchEvaluator( diff --git a/packages/uipath/tests/evaluators/test_classifier_evaluator.py b/packages/uipath/tests/evaluators/test_classifier_evaluator.py new file mode 100644 index 000000000..182a2dac5 --- /dev/null +++ b/packages/uipath/tests/evaluators/test_classifier_evaluator.py @@ -0,0 +1,179 @@ +"""Tests for the pure-metadata ClassifierEvaluator. + +This evaluator carries a `classes` list to downstream consumers (the C# layer +in Studio Web). Its per-datapoint evaluate is a no-op that emits the classes +list as a justification payload. The tests below pin that contract. +""" + +import json + +import pytest + +from uipath.eval.evaluators import ( + ClassifierEvaluator, + ClassifierJustification, +) +from uipath.eval.evaluators.base_evaluator import BaseEvaluationCriteria +from uipath.eval.evaluators.evaluator_factory import EvaluatorFactory +from uipath.eval.models import AgentExecution, EvaluatorType, NumericEvaluationResult +from uipath.eval.models.models import UiPathEvaluationError + + +def _build_evaluator( + classes: list[str] | None = None, source_evaluator: str = "intent_match" +) -> ClassifierEvaluator: + # Construct via the factory to match how real eval-set runs build evaluators. 
+ data = { + "version": "1.0", + "id": "intent_classifier", + "name": "intent_classifier", + "evaluatorTypeId": EvaluatorType.CLASSIFIER.value, + "evaluatorConfig": { + "name": "intent_classifier", + "classes": classes + if classes is not None + else ["book", "cancel", "reschedule"], + "sourceEvaluator": source_evaluator, + }, + } + evaluator = EvaluatorFactory.create_evaluator(data) + assert isinstance(evaluator, ClassifierEvaluator) + return evaluator + + +def _agent_execution(output: dict[str, str] | str | None = None) -> AgentExecution: + return AgentExecution( + agent_input={"text": "hello"}, + agent_output=output if output is not None else {"intent": "book"}, + agent_trace=[], + ) + + +class TestClassifierEvaluator: + async def test_evaluate_returns_zero_score_with_classifier_justification( + self, + ) -> None: + evaluator = _build_evaluator() + result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria()) + + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + assert isinstance(result.details, ClassifierJustification) + assert result.details.classes == ["book", "cancel", "reschedule"] + assert result.details.source_evaluator == "intent_match" + # expected / actual are not meaningful for this evaluator + assert result.details.expected == "" + assert result.details.actual == "" + + async def test_classes_list_is_independent_copy(self) -> None: + # If a caller mutates the result's classes list, it shouldn't leak into the config. 
+ evaluator = _build_evaluator(classes=["a", "b"]) + result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria()) + assert isinstance(result.details, ClassifierJustification) + result.details.classes.append("c") + assert evaluator.evaluator_config.classes == ["a", "b"] + + async def test_score_is_zero_regardless_of_agent_output(self) -> None: + evaluator = _build_evaluator() + for output in ( + None, + {}, + {"intent": "book"}, + {"intent": "totally-unrelated"}, + "free text output mentioning cancel", + ): + result = await evaluator.evaluate( + _agent_execution(output), BaseEvaluationCriteria() + ) + assert result.score == 0.0 + + async def test_evaluate_does_not_error_on_missing_criteria(self) -> None: + # The runtime's validate_and_evaluate_criteria falls back to + # default_evaluation_criteria when None is passed. Confirm the config's + # default_evaluation_criteria covers that case. + evaluator = _build_evaluator() + result = await evaluator.validate_and_evaluate_criteria( + _agent_execution(), None + ) + assert result.score == 0.0 + assert isinstance(result.details, ClassifierJustification) + assert result.details.classes == ["book", "cancel", "reschedule"] + + +class TestClassifierJustificationWireShape: + """Pin the JSON shape that flows from CLI → C# via _serialize_justification.""" + + async def test_model_dump_carries_all_config_metadata(self) -> None: + evaluator = _build_evaluator() + result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria()) + assert isinstance(result.details, ClassifierJustification) + + dumped = result.details.model_dump() + # The CLI ships this via json.dumps(model_dump()) — the resulting string + # is what lands in CodedEvaluatorScore.Justification in the backend. 
+ wire = json.loads(json.dumps(dumped)) + assert wire["classes"] == ["book", "cancel", "reschedule"] + assert wire["source_evaluator"] == "intent_match" + assert wire["expected"] == "" + assert wire["actual"] == "" + + async def test_wire_payload_can_be_round_tripped_back_to_model(self) -> None: + evaluator = _build_evaluator() + result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria()) + assert isinstance(result.details, ClassifierJustification) + + wire_string = json.dumps(result.details.model_dump()) + parsed = ClassifierJustification.model_validate_json(wire_string) + assert parsed.classes == ["book", "cancel", "reschedule"] + assert parsed.source_evaluator == "intent_match" + + +class TestFactoryIntegration: + def test_factory_builds_classifier_from_v1_config(self) -> None: + data = { + "version": "1.0", + "id": "intent_classifier", + "name": "intent_classifier", + "evaluatorTypeId": EvaluatorType.CLASSIFIER.value, + "evaluatorConfig": { + "name": "intent_classifier", + "classes": ["book", "cancel", "reschedule"], + "sourceEvaluator": "intent_match", + }, + } + evaluator = EvaluatorFactory.create_evaluator(data) + assert isinstance(evaluator, ClassifierEvaluator) + assert evaluator.evaluator_config.classes == ["book", "cancel", "reschedule"] + assert evaluator.evaluator_config.source_evaluator == "intent_match" + assert evaluator.id == "intent_classifier" + + def test_factory_accepts_snake_case_aliases(self) -> None: + data = { + "version": "1.0", + "id": "intent_classifier", + "name": "intent_classifier", + "evaluatorTypeId": EvaluatorType.CLASSIFIER.value, + "evaluatorConfig": { + "name": "intent_classifier", + "classes": ["yes", "no"], + "source_evaluator": "yes_no_match", + }, + } + evaluator = EvaluatorFactory.create_evaluator(data) + assert isinstance(evaluator, ClassifierEvaluator) + assert evaluator.evaluator_config.source_evaluator == "yes_no_match" + + def test_factory_rejects_config_missing_classes(self) -> None: + data = { + 
"version": "1.0", + "id": "intent_classifier", + "name": "intent_classifier", + "evaluatorTypeId": EvaluatorType.CLASSIFIER.value, + "evaluatorConfig": { + "name": "intent_classifier", + "sourceEvaluator": "intent_match", + # classes missing + }, + } + with pytest.raises(UiPathEvaluationError): + EvaluatorFactory.create_evaluator(data)