From 749da30675ca4dfd601dcd39f317cce64f1c6f5b Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 21 May 2026 09:51:30 -0700
Subject: [PATCH 1/4] feat(eval): add ClassifierEvaluator (pure-metadata
 aggregator)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a new evaluator type whose role is to carry a `classes` list and a
`source_evaluator` name to downstream consumers. It does not compute
classification metrics per datapoint — that work moves to the Studio Web
C# backend, which reads each datapoint's agent output and the source
evaluator's expected label after the per-datapoint loop finishes, scans
the output for each configured class, and builds the confusion matrix.

The per-datapoint evaluate() returns score=0.0 with a
ClassifierJustification(classes, source_evaluator) details payload. This
payload survives the existing CLI -> backend wire path via
StudioWebProgressReporter._serialize_justification (json.dumps of the
model_dump), arriving in the backend as a JSON string inside
CodedEvaluatorScore.Justification where the C# layer can read it.

Replaces the design in earlier draft PRs #1669 and #5307: the SDK no
longer owns the dataset-level computation. The pure-config approach is
~50 LOC instead of ~1500 LOC of dataset-evaluator framework + worker
workflow + factory + child workflow plumbing.

Files:
  src/uipath/eval/evaluators/classifier_evaluator.py  new (~90 LOC)
  src/uipath/eval/evaluators/__init__.py              re-export + EVALUATORS list
  src/uipath/eval/evaluators/evaluator.py             discriminator + Union entry
  src/uipath/eval/models/models.py                    EvaluatorType.CLASSIFIER
  tests/evaluators/test_classifier_evaluator.py       9 unit tests, all passing

Verified:
  pytest tests/evaluators tests/cli/eval --no-cov  -> 824 passed
  ruff check / ruff format / mypy                  -> clean

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/uipath/eval/evaluators/__init__.py    |   9 +
 .../eval/evaluators/classifier_evaluator.py   | 101 ++++++++++
 .../src/uipath/eval/evaluators/evaluator.py   |   7 +
 .../uipath/src/uipath/eval/models/models.py   |   1 +
 .../evaluators/test_classifier_evaluator.py   | 179 ++++++++++++++++++
 5 files changed, 297 insertions(+)
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py
 create mode 100644 packages/uipath/tests/evaluators/test_classifier_evaluator.py

diff --git a/packages/uipath/src/uipath/eval/evaluators/__init__.py b/packages/uipath/src/uipath/eval/evaluators/__init__.py
index 03a4bf63b..1908b6b1e 100644
--- a/packages/uipath/src/uipath/eval/evaluators/__init__.py
+++ b/packages/uipath/src/uipath/eval/evaluators/__init__.py
@@ -11,6 +11,11 @@
 )
 from .base_legacy_evaluator import BaseLegacyEvaluator
 from .binary_classification_evaluator import BinaryClassificationEvaluator
+from .classifier_evaluator import (
+    ClassifierEvaluator,
+    ClassifierEvaluatorConfig,
+    ClassifierJustification,
+)
 
 # Legacy evaluators
 from .contains_evaluator import ContainsEvaluator
@@ -46,6 +51,7 @@
     ContainsEvaluator,
     BinaryClassificationEvaluator,
     MulticlassClassificationEvaluator,
+    ClassifierEvaluator,
     JsonSimilarityEvaluator,
     LLMJudgeOutputEvaluator,
     LLMJudgeStrictJSONSimilarityOutputEvaluator,
@@ -70,6 +76,9 @@
     "BaseEvaluator",
     "BinaryClassificationEvaluator",
     "MulticlassClassificationEvaluator",
+    "ClassifierEvaluator",
+    "ClassifierEvaluatorConfig",
+    "ClassifierJustification",
     "ContainsEvaluator",
     "ExactMatchEvaluator",
     "JsonSimilarityEvaluator",
diff --git a/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py
new file mode 100644
index 000000000..9247dbd76
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py
@@ -0,0 +1,101 @@
+"""Classification aggregator evaluator.
+
+Pure-metadata evaluator: it carries a `classes` list and a `source_evaluator`
+name, but does NOT compute classification metrics per datapoint. The actual
+TP/TN/FP/FN tallying happens downstream (the C# layer in Studio Web reads the
+agent output and the source evaluator's expected label, scans the output for
+each configured class, and computes precision/recall/F-score after the
+per-datapoint loop completes).
+
+The per-datapoint `evaluate(...)` returns a sentinel score of 0.0 with a
+ClassifierJustification payload. The payload survives the existing CLI →
+backend wire path (via `_serialize_justification`) as a JSON-stringified
+object, where the C# layer reads `classes` and `source_evaluator` from the
+first per-datapoint result.
+"""
+
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .base_evaluator import (
+    BaseEvaluationCriteria,
+    BaseEvaluator,
+    BaseEvaluatorConfig,
+    BaseEvaluatorJustification,
+)
+
+
+class ClassifierJustification(BaseEvaluatorJustification):
+    """Metadata payload shipped per datapoint so the backend can read the classes list.
+
+    Extends BaseEvaluatorJustification so the framework's J generic bound
+    (`Union[str, BaseEvaluatorJustification]`) is satisfied; expected/actual
+    are not meaningful for this evaluator and default to empty strings.
+    """
+
+    expected: str = ""
+    actual: str = ""
+    classes: list[str]
+    source_evaluator: str
+
+
+class ClassifierEvaluatorConfig(BaseEvaluatorConfig[BaseEvaluationCriteria]):
+    """Configuration for the classification aggregator evaluator."""
+
+    name: str = "ClassifierEvaluator"
+    classes: list[str]
+    source_evaluator: str
+    # Default criteria is an empty BaseEvaluationCriteria so the runtime's
+    # validate_and_evaluate_criteria fallback doesn't trip when an eval item
+    # has no per-datapoint criteria for this evaluator (the common case —
+    # the classifier doesn't need per-datapoint config).
+    default_evaluation_criteria: BaseEvaluationCriteria = BaseEvaluationCriteria()
+
+
+class ClassifierEvaluator(
+    BaseEvaluator[
+        BaseEvaluationCriteria, ClassifierEvaluatorConfig, ClassifierJustification
+    ]
+):
+    """Carries the classes list to the backend; does no per-datapoint scoring.
+
+    Add this to an evaluation set alongside the per-datapoint evaluator (e.g.
+    ExactMatch) that produces expected/actual labels. The backend uses the
+    classes list, the per-datapoint outputs, and the source evaluator's
+    expected labels to build a confusion matrix + per-class metrics after the
+    set finishes.
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Identifier matching the evaluatorTypeId discriminator on configs."""
+        return EvaluatorType.CLASSIFIER.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: BaseEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Return a sentinel per-datapoint result carrying the classes metadata.
+
+        The score is fixed at 0.0 because this evaluator has no per-datapoint
+        notion of pass/fail. Downstream code (the C# layer) ignores the score
+        and reads `details.classes` + `details.source_evaluator` to drive the
+        run-level classification math.
+        """
+        # agent_execution and evaluation_criteria intentionally unused; the
+        # value of this evaluator is the config it carries, not any per-
+        # datapoint computation. Touch them so linters don't flag.
+        _ = agent_execution
+        _ = evaluation_criteria
+
+        return NumericEvaluationResult(
+            score=0.0,
+            details=ClassifierJustification(
+                classes=list(self.evaluator_config.classes),
+                source_evaluator=self.evaluator_config.source_evaluator,
+            ),
+        )
diff --git a/packages/uipath/src/uipath/eval/evaluators/evaluator.py b/packages/uipath/src/uipath/eval/evaluators/evaluator.py
index b9b818847..2f4b56dd7 100644
--- a/packages/uipath/src/uipath/eval/evaluators/evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/evaluator.py
@@ -31,6 +31,7 @@
     ToolCallOutputEvaluator,
 )
 from .base_evaluator import BaseEvaluator, BaseEvaluatorConfig
+from .classifier_evaluator import ClassifierEvaluator
 
 
 class UnknownLegacyEvaluator(BaseLegacyEvaluator[Any]):
@@ -141,6 +142,8 @@ def coded_evaluator_discriminator(data: Any) -> str:
                 return "BinaryClassificationEvaluator"
             case EvaluatorType.MULTICLASS_CLASSIFICATION:
                 return "MulticlassClassificationEvaluator"
+            case EvaluatorType.CLASSIFIER:
+                return "ClassifierEvaluator"
             case _:
                 return "UnknownEvaluator"
     else:
@@ -201,6 +204,10 @@ def coded_evaluator_discriminator(data: Any) -> str:
             MulticlassClassificationEvaluator,
             Tag("MulticlassClassificationEvaluator"),
         ],
+        Annotated[
+            ClassifierEvaluator,
+            Tag("ClassifierEvaluator"),
+        ],
         Annotated[
             UnknownCodedEvaluator,
             Tag("UnknownEvaluator"),
diff --git a/packages/uipath/src/uipath/eval/models/models.py b/packages/uipath/src/uipath/eval/models/models.py
index d2dc26df9..309cbe85a 100644
--- a/packages/uipath/src/uipath/eval/models/models.py
+++ b/packages/uipath/src/uipath/eval/models/models.py
@@ -300,6 +300,7 @@ class EvaluatorType(str, Enum):
     TOOL_CALL_OUTPUT = "uipath-tool-call-output"
     BINARY_CLASSIFICATION = "uipath-binary-classification"
     MULTICLASS_CLASSIFICATION = "uipath-multiclass-classification"
+    CLASSIFIER = "uipath-classifier"
 
 
 class ToolCall(BaseModel):
diff --git a/packages/uipath/tests/evaluators/test_classifier_evaluator.py b/packages/uipath/tests/evaluators/test_classifier_evaluator.py
new file mode 100644
index 000000000..182a2dac5
--- /dev/null
+++ b/packages/uipath/tests/evaluators/test_classifier_evaluator.py
@@ -0,0 +1,179 @@
+"""Tests for the pure-metadata ClassifierEvaluator.
+
+This evaluator carries a `classes` list to downstream consumers (the C# layer
+in Studio Web). Its per-datapoint evaluate is a no-op that emits the classes
+list as a justification payload. The tests below pin that contract.
+"""
+
+import json
+
+import pytest
+
+from uipath.eval.evaluators import (
+    ClassifierEvaluator,
+    ClassifierJustification,
+)
+from uipath.eval.evaluators.base_evaluator import BaseEvaluationCriteria
+from uipath.eval.evaluators.evaluator_factory import EvaluatorFactory
+from uipath.eval.models import AgentExecution, EvaluatorType, NumericEvaluationResult
+from uipath.eval.models.models import UiPathEvaluationError
+
+
+def _build_evaluator(
+    classes: list[str] | None = None, source_evaluator: str = "intent_match"
+) -> ClassifierEvaluator:
+    # Construct via the factory to match how real eval-set runs build evaluators.
+    data = {
+        "version": "1.0",
+        "id": "intent_classifier",
+        "name": "intent_classifier",
+        "evaluatorTypeId": EvaluatorType.CLASSIFIER.value,
+        "evaluatorConfig": {
+            "name": "intent_classifier",
+            "classes": classes
+            if classes is not None
+            else ["book", "cancel", "reschedule"],
+            "sourceEvaluator": source_evaluator,
+        },
+    }
+    evaluator = EvaluatorFactory.create_evaluator(data)
+    assert isinstance(evaluator, ClassifierEvaluator)
+    return evaluator
+
+
+def _agent_execution(output: dict[str, str] | str | None = None) -> AgentExecution:
+    return AgentExecution(
+        agent_input={"text": "hello"},
+        agent_output=output if output is not None else {"intent": "book"},
+        agent_trace=[],
+    )
+
+
+class TestClassifierEvaluator:
+    async def test_evaluate_returns_zero_score_with_classifier_justification(
+        self,
+    ) -> None:
+        evaluator = _build_evaluator()
+        result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria())
+
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 0.0
+        assert isinstance(result.details, ClassifierJustification)
+        assert result.details.classes == ["book", "cancel", "reschedule"]
+        assert result.details.source_evaluator == "intent_match"
+        # expected / actual are not meaningful for this evaluator
+        assert result.details.expected == ""
+        assert result.details.actual == ""
+
+    async def test_classes_list_is_independent_copy(self) -> None:
+        # If a caller mutates the result's classes list, it shouldn't leak into the config.
+        evaluator = _build_evaluator(classes=["a", "b"])
+        result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria())
+        assert isinstance(result.details, ClassifierJustification)
+        result.details.classes.append("c")
+        assert evaluator.evaluator_config.classes == ["a", "b"]
+
+    async def test_score_is_zero_regardless_of_agent_output(self) -> None:
+        evaluator = _build_evaluator()
+        for output in (
+            None,
+            {},
+            {"intent": "book"},
+            {"intent": "totally-unrelated"},
+            "free text output mentioning cancel",
+        ):
+            result = await evaluator.evaluate(
+                _agent_execution(output), BaseEvaluationCriteria()
+            )
+            assert result.score == 0.0
+
+    async def test_evaluate_does_not_error_on_missing_criteria(self) -> None:
+        # The runtime's validate_and_evaluate_criteria falls back to
+        # default_evaluation_criteria when None is passed. Confirm the config's
+        # default_evaluation_criteria covers that case.
+        evaluator = _build_evaluator()
+        result = await evaluator.validate_and_evaluate_criteria(
+            _agent_execution(), None
+        )
+        assert result.score == 0.0
+        assert isinstance(result.details, ClassifierJustification)
+        assert result.details.classes == ["book", "cancel", "reschedule"]
+
+
+class TestClassifierJustificationWireShape:
+    """Pin the JSON shape that flows from CLI → C# via _serialize_justification."""
+
+    async def test_model_dump_carries_all_config_metadata(self) -> None:
+        evaluator = _build_evaluator()
+        result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria())
+        assert isinstance(result.details, ClassifierJustification)
+
+        dumped = result.details.model_dump()
+        # The CLI ships this via json.dumps(model_dump()) — the resulting string
+        # is what lands in CodedEvaluatorScore.Justification in the backend.
+        wire = json.loads(json.dumps(dumped))
+        assert wire["classes"] == ["book", "cancel", "reschedule"]
+        assert wire["source_evaluator"] == "intent_match"
+        assert wire["expected"] == ""
+        assert wire["actual"] == ""
+
+    async def test_wire_payload_can_be_round_tripped_back_to_model(self) -> None:
+        evaluator = _build_evaluator()
+        result = await evaluator.evaluate(_agent_execution(), BaseEvaluationCriteria())
+        assert isinstance(result.details, ClassifierJustification)
+
+        wire_string = json.dumps(result.details.model_dump())
+        parsed = ClassifierJustification.model_validate_json(wire_string)
+        assert parsed.classes == ["book", "cancel", "reschedule"]
+        assert parsed.source_evaluator == "intent_match"
+
+
+class TestFactoryIntegration:
+    def test_factory_builds_classifier_from_v1_config(self) -> None:
+        data = {
+            "version": "1.0",
+            "id": "intent_classifier",
+            "name": "intent_classifier",
+            "evaluatorTypeId": EvaluatorType.CLASSIFIER.value,
+            "evaluatorConfig": {
+                "name": "intent_classifier",
+                "classes": ["book", "cancel", "reschedule"],
+                "sourceEvaluator": "intent_match",
+            },
+        }
+        evaluator = EvaluatorFactory.create_evaluator(data)
+        assert isinstance(evaluator, ClassifierEvaluator)
+        assert evaluator.evaluator_config.classes == ["book", "cancel", "reschedule"]
+        assert evaluator.evaluator_config.source_evaluator == "intent_match"
+        assert evaluator.id == "intent_classifier"
+
+    def test_factory_accepts_snake_case_aliases(self) -> None:
+        data = {
+            "version": "1.0",
+            "id": "intent_classifier",
+            "name": "intent_classifier",
+            "evaluatorTypeId": EvaluatorType.CLASSIFIER.value,
+            "evaluatorConfig": {
+                "name": "intent_classifier",
+                "classes": ["yes", "no"],
+                "source_evaluator": "yes_no_match",
+            },
+        }
+        evaluator = EvaluatorFactory.create_evaluator(data)
+        assert isinstance(evaluator, ClassifierEvaluator)
+        assert evaluator.evaluator_config.source_evaluator == "yes_no_match"
+
+    def test_factory_rejects_config_missing_classes(self) -> None:
+        data = {
+            "version": "1.0",
+            "id": "intent_classifier",
+            "name": "intent_classifier",
+            "evaluatorTypeId": EvaluatorType.CLASSIFIER.value,
+            "evaluatorConfig": {
+                "name": "intent_classifier",
+                "sourceEvaluator": "intent_match",
+                # classes missing
+            },
+        }
+        with pytest.raises(UiPathEvaluationError):
+            EvaluatorFactory.create_evaluator(data)

From 4067b5a16c8bed1b9413d6eb3c685ce0b7a1442b Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 21 May 2026 10:17:18 -0700
Subject: [PATCH 2/4] test(eval): add classifier_demo fixture for end-to-end
 SDK validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A minimal 3-class intent classification agent (book / cancel / reschedule)
that exercises the new ClassifierEvaluator end-to-end via `uipath eval`.
Mirrors the wire shape Studio Web will see once the C# backend and frontend
PRs land, so SDK changes can be validated standalone before the full stack
is brought up.

Layout:
  main.py             — keyword classifier returning {"intent": "..."}
  evaluations/
    eval-sets/main.json
    evaluators/
      intent_match.json       per-datapoint ExactMatch on .intent
      intent_classifier.json  new uipath-classifier with classes + sourceEvaluator
  README.md           — Path A (SDK CLI) + Path B (Studio Web) instructions

Each datapoint has `evaluationCriterias.intent_classifier: {}` (the runtime
skips evaluators that aren't keyed there). 6/9 datapoints are correctly
classified by design. The resulting (expected, actual) pairs travel in the
intent_match justification, and the classifier's justification payload
carries the classes/source_evaluator metadata; both use the existing
CLI -> backend wire path.

Verified live:
  - ExactMatch averages to ~0.67 (6/9 correct).
  - ClassifierEvaluator emits {"expected":"","actual":"","classes":[...],
    "source_evaluator":"intent_match"} per datapoint.
  - Plugging the (expected, actual) pairs from the resulting output into the
    same confusion-matrix math the C# helper implements yields macro F1 of
    0.667 on this fixture — the number Studio Web's Aggregations panel
    would render once the backend pipeline is live.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../uipath/samples/classifier_demo/README.md  | 139 ++++++++++++++
 .../samples/classifier_demo/bindings.json     |   4 +
 .../evaluations/eval-sets/main.json           | 173 ++++++++++++++++++
 .../evaluators/intent_classifier.json         |  11 ++
 .../evaluations/evaluators/intent_match.json  |  15 ++
 .../uipath/samples/classifier_demo/main.py    |  42 +++++
 .../samples/classifier_demo/pyproject.toml    |   9 +
 .../samples/classifier_demo/uipath.json       |   5 +
 8 files changed, 398 insertions(+)
 create mode 100644 packages/uipath/samples/classifier_demo/README.md
 create mode 100644 packages/uipath/samples/classifier_demo/bindings.json
 create mode 100644 packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json
 create mode 100644 packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json
 create mode 100644 packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json
 create mode 100644 packages/uipath/samples/classifier_demo/main.py
 create mode 100644 packages/uipath/samples/classifier_demo/pyproject.toml
 create mode 100644 packages/uipath/samples/classifier_demo/uipath.json

diff --git a/packages/uipath/samples/classifier_demo/README.md b/packages/uipath/samples/classifier_demo/README.md
new file mode 100644
index 000000000..638765ab1
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/README.md
@@ -0,0 +1,139 @@
+# Classifier evaluator end-to-end demo
+
+A minimal intent-classification agent that exercises the new
+`ClassifierEvaluator` end-to-end. Use this as the test fixture for both
+SDK-only validation (Path A below) and Studio Web full-stack validation
+(Path B).
+
+## What's here
+
+```
+classifier_demo/
+├── main.py                       # 3-class keyword classifier
+├── uipath.json
+├── pyproject.toml
+├── bindings.json
+└── evaluations/
+    ├── eval-sets/
+    │   └── main.json             # 9 datapoints, 3 per class, some intentionally wrong
+    └── evaluators/
+        ├── intent_match.json     # per-datapoint ExactMatch on agent_output.intent
+        └── intent_classifier.json # the new uipath-classifier (pure metadata)
+```
+
+The eval set is wired so that for every datapoint both evaluators run:
+- `intent_match` produces a 1.0/0.0 score with `{"expected": "...", "actual": "..."}` justification.
+- `intent_classifier` produces a sentinel 0.0 score with `{"classes": [...], "source_evaluator": "intent_match"}` justification.
+
+Downstream (the C# layer in Studio Web) reads both to compute precision /
+recall / F-score across the dataset.
+
+> Heads-up — every datapoint must have an entry for the classifier in
+> `evaluationCriterias` (even an empty `{}`). The runtime currently skips
+> evaluators that aren't keyed in `evaluationCriterias` for a datapoint, so
+> omitting them silently drops the classifier results.
+
+## Path A — SDK only (real run, ~30 seconds)
+
+```bash
+cd packages/uipath
+uv sync --all-extras
+
+cd samples/classifier_demo
+uv run --project ../.. uipath eval main main.json --no-report --output-file /tmp/out.json
+```
+
+Expected: a results table with two columns (`intent_classifier`, `intent_match`).
+`intent_match` averages to ≈ 0.67 (6/9 correct). `intent_classifier` shows 0.0 per
+row by design — its real work is to ship the classes list to the backend.
+
+To see the metadata payload that lands in the backend's
+`CodedEvaluatorScore.Justification`:
+
+```bash
+python3 -c "
+import json
+with open('/tmp/out.json') as f: d = json.load(f)
+for r in d['evaluationSetResults'][0]['evaluationRunResults']:
+    print(r['evaluatorName'], r['result'].get('details'))
+"
+```
+
+You should see something like:
+
+```
+intent_classifier  {'expected': '', 'actual': '', 'classes': ['book', 'cancel', 'reschedule'], 'source_evaluator': 'intent_match'}
+intent_match       {'expected': 'book', 'actual': 'book'}
+```
+
+## Path B — Full Studio Web stack (real UI, click Run, see panel)
+
+This path has not been verified end-to-end yet: the environment it requires
+was not available locally when this fixture was written. The pieces:
+
+### Prereqs (per `Agents/LOCAL_DEVELOPMENT.md`)
+- Docker installed and running
+- `make` available
+- Azure CLI authenticated session (`az login`)
+- Azure DevOps PAT exported as `AZURE_DEVOPS_PAT`
+- GitHub NPM registry token exported as `GH_NPM_REGISTRY_TOKEN`
+- Azure access token exported as `AZURE_ACCESS_TOKEN` (for the python worker build)
+- `cloud-provider-kind` binary (used for the local KinD cluster)
+
+### Steps
+
+1. **Point python-eval-worker at the local SDK branch.** The published
+   `uipath` package on PyPI doesn't yet have `ClassifierEvaluator`. Edit
+   `Agents/python-eval-worker/pyproject.toml`:
+
+   ```toml
+   [tool.uv.sources]
+   uipath = { path = "../../uipath-python/packages/uipath", editable = true }
+   ```
+
+   Then `cd python-eval-worker && uv lock && uv sync`.
+
+2. **Bring up the local KinD cluster** (from `Agents/`):
+   ```bash
+   make create-kind-cluster
+   kubectl get nodes
+   sudo ./bin/cloud-provider-kind &      # in a separate shell or background
+   make up
+   make deploy
+   ```
+
+3. **Build the backend with the classifier changes:**
+   ```bash
+   git checkout feat/eval-classifier-backend       # in Agents repo
+   # Re-trigger the helm/skaffold deploy for the backend
+   make deploy
+   ```
+
+4. **Build the frontend with the UI changes:**
+   ```bash
+   git checkout feat/eval-dataset-evaluators-ui    # in Agents repo
+   make deploy    # same deploy command rebuilds the frontend image
+   ```
+
+5. **Open Studio Web** (URL surfaced by the deploy output), create an agent
+   project, upload the eval-set + evaluator JSONs from this directory (or
+   author them in the UI — the picker now shows a "Classifier" entry under
+   the AGGREGATION section), and click Run.
+
+6. **Verify** the Aggregations panel renders between the run header and the
+   datapoint table, with a confusion matrix consistent with the (expected,
+   actual) pairs from Path A's output (macro F1 ≈ 0.667 on this fixture).
+
+### Open questions for the team owning local dev
+
+- Does the existing PAT / token set get refreshed automatically by the dev tooling, or do contributors need to rotate them periodically?
+- Is there a simpler "local-only" path that bypasses the KinD cluster (e.g. docker-compose) for changes that don't touch K8s manifests?
+- What's the standard pattern for pointing the python worker at a non-PyPI uipath build? The `[tool.uv.sources]` override above is the standard uv path — confirm there's no Helm/skaffold complication.
+
+## Companion PRs
+
+| Repo | Branch | PR | What |
+|---|---|---|---|
+| uipath-python | `feat/eval-classifier-evaluator` | [#1674](https://github.com/UiPath/uipath-python/pull/1674) | SDK `ClassifierEvaluator` |
+| Agents | `feat/eval-classifier-backend` | [#5313](https://github.com/UiPath/Agents/pull/5313) | C# math + activity + envelope storage |
+| Agents | `feat/eval-dataset-evaluators-ui` | [#5306](https://github.com/UiPath/Agents/pull/5306) | Frontend picker + Aggregations panel |
diff --git a/packages/uipath/samples/classifier_demo/bindings.json b/packages/uipath/samples/classifier_demo/bindings.json
new file mode 100644
index 000000000..5e9beeb01
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json b/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json
new file mode 100644
index 000000000..117e9e240
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/evaluations/eval-sets/main.json
@@ -0,0 +1,173 @@
+{
+  "version": "1.0",
+  "id": "classifier-demo-eval-set",
+  "name": "Classifier demo eval set",
+  "evaluatorRefs": [
+    "intent_match",
+    "intent_classifier"
+  ],
+  "evaluations": [
+    {
+      "id": "book-1",
+      "name": "book \u2014 straightforward",
+      "inputs": {
+        "utterance": "I want to book a table for two"
+      },
+      "expectedOutput": {
+        "intent": "book"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "book"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "book-2",
+      "name": "book \u2014 schedule keyword",
+      "inputs": {
+        "utterance": "Please schedule an appointment"
+      },
+      "expectedOutput": {
+        "intent": "book"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "book"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "book-3",
+      "name": "book \u2014 agent misclassifies (utterance triggers cancel keyword)",
+      "inputs": {
+        "utterance": "I had to cancel my last attempt but I want to reserve a slot now"
+      },
+      "expectedOutput": {
+        "intent": "book"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "book"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "cancel-1",
+      "name": "cancel \u2014 straightforward",
+      "inputs": {
+        "utterance": "Please cancel my reservation"
+      },
+      "expectedOutput": {
+        "intent": "cancel"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "cancel"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "cancel-2",
+      "name": "cancel \u2014 void synonym",
+      "inputs": {
+        "utterance": "I want to void the order"
+      },
+      "expectedOutput": {
+        "intent": "cancel"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "cancel"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "cancel-3",
+      "name": "cancel \u2014 agent misclassifies (utterance has 'move' which triggers reschedule)",
+      "inputs": {
+        "utterance": "I need to move past this and cancel everything"
+      },
+      "expectedOutput": {
+        "intent": "cancel"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "cancel"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "reschedule-1",
+      "name": "reschedule \u2014 straightforward",
+      "inputs": {
+        "utterance": "I want to reschedule the meeting"
+      },
+      "expectedOutput": {
+        "intent": "reschedule"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "reschedule"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "reschedule-2",
+      "name": "reschedule \u2014 move synonym",
+      "inputs": {
+        "utterance": "Can we move the slot to tomorrow"
+      },
+      "expectedOutput": {
+        "intent": "reschedule"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "reschedule"
+          }
+        },
+        "intent_classifier": {}
+      }
+    },
+    {
+      "id": "reschedule-3",
+      "name": "reschedule \u2014 agent misclassifies (falls through to default 'book')",
+      "inputs": {
+        "utterance": "Different timing please"
+      },
+      "expectedOutput": {
+        "intent": "reschedule"
+      },
+      "evaluationCriterias": {
+        "intent_match": {
+          "expectedOutput": {
+            "intent": "reschedule"
+          }
+        },
+        "intent_classifier": {}
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json
new file mode 100644
index 000000000..ace8cb712
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_classifier.json
@@ -0,0 +1,11 @@
+{
+  "version": "1.0",
+  "id": "intent_classifier",
+  "description": "Classification aggregator. Pure metadata — carries the classes list + source evaluator name to downstream consumers (the C# backend computes precision/recall/F-score over the dataset). Per-datapoint result is a no-op carrying the metadata.",
+  "evaluatorTypeId": "uipath-classifier",
+  "evaluatorConfig": {
+    "name": "intent_classifier",
+    "classes": ["book", "cancel", "reschedule"],
+    "sourceEvaluator": "intent_match"
+  }
+}
diff --git a/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json
new file mode 100644
index 000000000..552c7220f
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/evaluations/evaluators/intent_match.json
@@ -0,0 +1,15 @@
+{
+  "version": "1.0",
+  "id": "intent_match",
+  "description": "Per-datapoint ExactMatch on the agent's `intent` output. Produces expected/actual justification that the ClassifierEvaluator pipeline reads.",
+  "evaluatorTypeId": "uipath-exact-match",
+  "evaluatorConfig": {
+    "name": "intent_match",
+    "targetOutputKey": "intent",
+    "caseSensitive": false,
+    "negated": false,
+    "defaultEvaluationCriteria": {
+      "expectedOutput": "book"
+    }
+  }
+}
diff --git a/packages/uipath/samples/classifier_demo/main.py b/packages/uipath/samples/classifier_demo/main.py
new file mode 100644
index 000000000..b6e1eea48
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/main.py
@@ -0,0 +1,42 @@
+"""Tiny intent-classification agent for the ClassifierEvaluator demo.
+
+Given an utterance, returns the intent label. Three intents (word match):
+  - book        (any lowercased word is "book" / "reserve" / "schedule")
+  - cancel      (any lowercased word is "cancel" / "void")
+  - reschedule  (any lowercased word is "reschedule" / "move")
+
+A few dataset utterances are worded so this agent misclassifies them, which
+keeps the run-level metrics (precision/recall/F-score) non-trivial.
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class IntentInput:
+    utterance: str  # raw user text; main() lowercases and whitespace-splits it
+
+
+@dataclass
+class IntentOutput:
+    intent: str  # main() only ever sets "book", "cancel" or "reschedule"
+
+
+BOOK_KEYWORDS = {"book", "reserve", "schedule"}  # checked last in main()
+CANCEL_KEYWORDS = {"cancel", "void"}  # checked second in main()
+RESCHEDULE_KEYWORDS = {"reschedule", "move"}  # checked first in main()
+
+
+async def main(input: IntentInput) -> IntentOutput:
+    """Classify the utterance as reschedule, cancel or book (checked in that order)."""
+    text = input.utterance.lower()
+    tokens = set(text.split())
+
+    if tokens & RESCHEDULE_KEYWORDS:
+        return IntentOutput(intent="reschedule")
+    if tokens & CANCEL_KEYWORDS:
+        return IntentOutput(intent="cancel")
+    if tokens & BOOK_KEYWORDS:  # explicit for symmetry; same label as the fallback
+        return IntentOutput(intent="book")
+    # No keyword matched: default to "book" so some cases are deliberately wrong.
+    return IntentOutput(intent="book")
diff --git a/packages/uipath/samples/classifier_demo/pyproject.toml b/packages/uipath/samples/classifier_demo/pyproject.toml
new file mode 100644
index 000000000..307e3778c
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "classifier-demo"
+version = "0.0.1"
+description = "Tiny intent-classification agent that exercises the new ClassifierEvaluator end-to-end via `uipath eval`."
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]
diff --git a/packages/uipath/samples/classifier_demo/uipath.json b/packages/uipath/samples/classifier_demo/uipath.json
new file mode 100644
index 000000000..9b02c2654
--- /dev/null
+++ b/packages/uipath/samples/classifier_demo/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "main.py:main"
+  }
+}

From b2c32bde8bd0300fbc296e5c1ac96eb573697c37 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Thu, 21 May 2026 20:21:25 -0700
Subject: [PATCH 3/4] fix(eval): give ClassifierEvaluator a concrete
 EvaluationCriteria type

Pydantic's generic resolution leaves T = typing.Any when a TypeVar is
parameterized with its own bound (BaseEvaluationCriteria here), so
BaseEvaluator[BaseEvaluationCriteria, ...] tripped the runtime's
"X must be a subclass of BaseEvaluationCriteria" guard at load time:

  Failed to create evaluator from file 'evaluations/evaluators/classifier-*.json':
  typing.Any must be a subclass of BaseEvaluationCriteria.

Introduce an empty ClassifierEvaluationCriteria(BaseEvaluationCriteria)
subclass and parameterize Config + Evaluator with it. Mirrors how every
other built-in evaluator (ExactMatch via OutputEvaluationCriteria, etc.)
provides a concrete criteria type even when no per-datapoint fields are
needed.
---
 .../eval/evaluators/classifier_evaluator.py   | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py
index 9247dbd76..1b743e294 100644
--- a/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py
@@ -28,6 +28,17 @@
 )
 
 
+class ClassifierEvaluationCriteria(BaseEvaluationCriteria):
+    """Empty per-datapoint criteria for the classifier aggregator.
+
+    The classifier has no per-datapoint config; this concrete subclass exists
+    only because Pydantic's generic resolution requires a concrete (non-bound)
+    type — using `BaseEvaluationCriteria` directly leaves T as `Any`.
+    """
+
+    pass
+
+
 class ClassifierJustification(BaseEvaluatorJustification):
     """Metadata payload shipped per datapoint so the backend can read the classes list.
 
@@ -42,22 +53,22 @@ class ClassifierJustification(BaseEvaluatorJustification):
     source_evaluator: str
 
 
-class ClassifierEvaluatorConfig(BaseEvaluatorConfig[BaseEvaluationCriteria]):
+class ClassifierEvaluatorConfig(BaseEvaluatorConfig[ClassifierEvaluationCriteria]):
     """Configuration for the classification aggregator evaluator."""
 
     name: str = "ClassifierEvaluator"
     classes: list[str]
     source_evaluator: str
-    # Default criteria is an empty BaseEvaluationCriteria so the runtime's
-    # validate_and_evaluate_criteria fallback doesn't trip when an eval item
-    # has no per-datapoint criteria for this evaluator (the common case —
-    # the classifier doesn't need per-datapoint config).
-    default_evaluation_criteria: BaseEvaluationCriteria = BaseEvaluationCriteria()
+    default_evaluation_criteria: ClassifierEvaluationCriteria = (
+        ClassifierEvaluationCriteria()
+    )
 
 
 class ClassifierEvaluator(
     BaseEvaluator[
-        BaseEvaluationCriteria, ClassifierEvaluatorConfig, ClassifierJustification
+        ClassifierEvaluationCriteria,
+        ClassifierEvaluatorConfig,
+        ClassifierJustification,
     ]
 ):
     """Carries the classes list to the backend; does no per-datapoint scoring.
@@ -77,7 +88,7 @@ def get_evaluator_id(cls) -> str:
     async def evaluate(
         self,
         agent_execution: AgentExecution,
-        evaluation_criteria: BaseEvaluationCriteria,
+        evaluation_criteria: ClassifierEvaluationCriteria,
     ) -> EvaluationResult:
         """Return a sentinel per-datapoint result carrying the classes metadata.
 

From e92e734ca83a2e9b44e8a73f8f02d2aa1f7c7608 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Sat, 23 May 2026 19:18:02 -0700
Subject: [PATCH 4/4] feat(eval): collapse standalone Classifier into
 ExactMatch.aggregators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the standalone ClassifierEvaluator with an `aggregators` config
field on per-datapoint evaluators (ExactMatch first). Run-level classification
metrics are now driven by the host evaluator's config, not by a separate
evaluator with a source-evaluator ID reference.

Design rationale (see Confluence "Design for Precision and Recall" §5.2):
the standalone evaluator forced users to add TWO evaluators and copy an
opaque ID between them. Moving aggregator config onto the evaluator that
already emits the labels keeps the source of truth in one place and makes
the JSON file portable across conversions (e.g. low-code -> coded).

- New module `_aggregators.py` with AggregatorSpec / ClassificationAggregatorSpec
- ExactMatchEvaluatorConfig gains optional `aggregators: list[AggregatorSpec] | None`.
  The Python runtime ignores the field; it's metadata for the downstream
  C# aggregation pass.
- `_progress_reporter.py:_build_evaluator_snapshot` now also emits `aggregators`
  so the field flows into EvaluatorRun.EvaluatorSnapshot and the C# layer can
  discover it without consulting the eval set definition file separately.
  Bug fix: previously the builder only emitted prompt+model (LLM-judge only),
  so for ExactMatch the dict was empty and the snapshot ended up null in
  the wire payload.
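- ClassifierEvaluator, ClassifierEvaluationCriteria, ClassifierJustification,
  ClassifierEvaluatorConfig: all deleted.
  NOTE: this patch does not touch samples/classifier_demo (its
  intent_classifier.json still uses evaluatorTypeId "uipath-classifier") or
  tests/evaluators/test_classifier_evaluator.py; both need a follow-up.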
- EvaluatorType.CLASSIFIER enum value removed.
- Discriminator union in evaluator.py drops the Classifier branch.

Version bump 2.10.70 -> 2.10.72 (the previous .71 was an unused dev cache-bust).
The new ExactMatch.aggregators field is a public API change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/uipath/pyproject.toml                |   2 +-
 .../uipath/_cli/_evals/_progress_reporter.py  |  17 ++-
 .../src/uipath/eval/evaluators/__init__.py    |  13 +-
 .../uipath/eval/evaluators/_aggregators.py    |  35 ++++++
 .../eval/evaluators/classifier_evaluator.py   | 112 ------------------
 .../src/uipath/eval/evaluators/evaluator.py   |   7 --
 .../eval/evaluators/exact_match_evaluator.py  |  10 +-
 .../uipath/src/uipath/eval/models/models.py   |   1 -
 8 files changed, 65 insertions(+), 132 deletions(-)
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/_aggregators.py
 delete mode 100644 packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py

diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
index 8e9c9f581..81c597d68 100644
--- a/packages/uipath/pyproject.toml
+++ b/packages/uipath/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.10.70"
+version = "2.10.72"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py b/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py
index fd4849076..792f929be 100644
--- a/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py
+++ b/packages/uipath/src/uipath/_cli/_evals/_progress_reporter.py
@@ -960,7 +960,14 @@ def _build_assertion_properties(
     def _build_evaluator_snapshot(
         evaluator: BaseEvaluator[Any, Any, Any],
     ) -> dict[str, Any]:
-        """Build evaluatorSnapshot dict with prompt and model if available."""
+        """Build evaluatorSnapshot dict with prompt, model, and aggregators if available.
+
+        `aggregators` is a list of run-level aggregator specs attached to the
+        evaluator's config (e.g. an ExactMatch with `aggregators=[{name:
+        "classification", classes: [...]}]`). It's emitted here so the C#
+        post-pass can read aggregator configs without consulting the original
+        evaluator definition file.
+        """
         snapshot: dict[str, Any] = {}
         config = getattr(evaluator, "evaluator_config", None)
         if config is not None:
@@ -968,6 +975,14 @@ def _build_evaluator_snapshot(
                 snapshot["prompt"] = config.prompt
             if hasattr(config, "model") and isinstance(config.model, str):
                 snapshot["model"] = config.model
+            aggregators = getattr(config, "aggregators", None)
+            if aggregators:
+                # Serialize Pydantic models to plain dicts so the wire shape is
+                # readable from C# without referencing our Python types.
+                snapshot["aggregators"] = [
+                    spec.model_dump(by_alias=True) if hasattr(spec, "model_dump") else spec
+                    for spec in aggregators
+                ]
         return snapshot
 
     def _collect_results(
diff --git a/packages/uipath/src/uipath/eval/evaluators/__init__.py b/packages/uipath/src/uipath/eval/evaluators/__init__.py
index 1908b6b1e..c68271b11 100644
--- a/packages/uipath/src/uipath/eval/evaluators/__init__.py
+++ b/packages/uipath/src/uipath/eval/evaluators/__init__.py
@@ -9,13 +9,9 @@
     BaseEvaluatorConfig,
     BaseEvaluatorJustification,
 )
+from ._aggregators import AggregatorSpec, ClassificationAggregatorSpec
 from .base_legacy_evaluator import BaseLegacyEvaluator
 from .binary_classification_evaluator import BinaryClassificationEvaluator
-from .classifier_evaluator import (
-    ClassifierEvaluator,
-    ClassifierEvaluatorConfig,
-    ClassifierJustification,
-)
 
 # Legacy evaluators
 from .contains_evaluator import ContainsEvaluator
@@ -51,7 +47,6 @@
     ContainsEvaluator,
     BinaryClassificationEvaluator,
     MulticlassClassificationEvaluator,
-    ClassifierEvaluator,
     JsonSimilarityEvaluator,
     LLMJudgeOutputEvaluator,
     LLMJudgeStrictJSONSimilarityOutputEvaluator,
@@ -76,10 +71,10 @@
     "BaseEvaluator",
     "BinaryClassificationEvaluator",
     "MulticlassClassificationEvaluator",
-    "ClassifierEvaluator",
-    "ClassifierEvaluatorConfig",
-    "ClassifierJustification",
     "ContainsEvaluator",
+    # Aggregator specs (config metadata attached to per-datapoint evaluators)
+    "AggregatorSpec",
+    "ClassificationAggregatorSpec",
     "ExactMatchEvaluator",
     "JsonSimilarityEvaluator",
     "BaseLLMOutputEvaluator",
diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregators.py b/packages/uipath/src/uipath/eval/evaluators/_aggregators.py
new file mode 100644
index 000000000..968974546
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/_aggregators.py
@@ -0,0 +1,35 @@
+"""Aggregator specs attached to per-datapoint evaluator configs.
+
+An aggregator is run-level — it consumes the per-datapoint results of an
+evaluator after the eval set finishes. The aggregator itself does not run in
+the Python runtime; this module just defines the config shape so the downstream
+consumer (the C# backend) can pick it up via the evaluator's stored config.
+
+Today the only aggregator is `classification`, which compares each datapoint's
+expected vs. predicted class to build a confusion matrix and precision/recall/
+F-score metrics.
+"""
+
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict
+from pydantic.alias_generators import to_camel
+
+
+class ClassificationAggregatorSpec(BaseModel):
+    """Configuration for a classification aggregator.
+
+    Attached to a per-datapoint evaluator (e.g. ExactMatch) to mark that the
+    evaluator's results should be aggregated into classification metrics. The
+    classes list defines the exhaustive label space; the C# layer scans each
+    datapoint's expected output for the first class that matches.
+    """
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    name: Literal["classification"] = "classification"  # only accepted value
+    classes: list[str]  # required; not checked here for emptiness or duplicates
+
+
+# Alias for the supported aggregator spec(s); make this a Union when adding variants.
+AggregatorSpec = ClassificationAggregatorSpec
diff --git a/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py
deleted file mode 100644
index 1b743e294..000000000
--- a/packages/uipath/src/uipath/eval/evaluators/classifier_evaluator.py
+++ /dev/null
@@ -1,112 +0,0 @@
-"""Classification aggregator evaluator.
-
-Pure-metadata evaluator: it carries a `classes` list and a `source_evaluator`
-name, but does NOT compute classification metrics per datapoint. The actual
-TP/TN/FP/FN tallying happens downstream (the C# layer in Studio Web reads the
-agent output and the source evaluator's expected label, scans the output for
-each configured class, and computes precision/recall/F-score after the
-per-datapoint loop completes).
-
-The per-datapoint `evaluate(...)` returns a sentinel score of 0.0 with a
-ClassifierJustification payload. The payload survives the existing CLI →
-backend wire path (via `_serialize_justification`) as a JSON-stringified
-object, where the C# layer reads `classes` and `sourceEvaluator` from the
-first per-datapoint result.
-"""
-
-from ..models import (
-    AgentExecution,
-    EvaluationResult,
-    EvaluatorType,
-    NumericEvaluationResult,
-)
-from .base_evaluator import (
-    BaseEvaluationCriteria,
-    BaseEvaluator,
-    BaseEvaluatorConfig,
-    BaseEvaluatorJustification,
-)
-
-
-class ClassifierEvaluationCriteria(BaseEvaluationCriteria):
-    """Empty per-datapoint criteria for the classifier aggregator.
-
-    The classifier has no per-datapoint config; this concrete subclass exists
-    only because Pydantic's generic resolution requires a concrete (non-bound)
-    type — using `BaseEvaluationCriteria` directly leaves T as `Any`.
-    """
-
-    pass
-
-
-class ClassifierJustification(BaseEvaluatorJustification):
-    """Metadata payload shipped per datapoint so the backend can read the classes list.
-
-    Extends BaseEvaluatorJustification so the framework's J generic bound
-    (`Union[str, BaseEvaluatorJustification]`) is satisfied; expected/actual
-    are not meaningful for this evaluator and default to empty strings.
-    """
-
-    expected: str = ""
-    actual: str = ""
-    classes: list[str]
-    source_evaluator: str
-
-
-class ClassifierEvaluatorConfig(BaseEvaluatorConfig[ClassifierEvaluationCriteria]):
-    """Configuration for the classification aggregator evaluator."""
-
-    name: str = "ClassifierEvaluator"
-    classes: list[str]
-    source_evaluator: str
-    default_evaluation_criteria: ClassifierEvaluationCriteria = (
-        ClassifierEvaluationCriteria()
-    )
-
-
-class ClassifierEvaluator(
-    BaseEvaluator[
-        ClassifierEvaluationCriteria,
-        ClassifierEvaluatorConfig,
-        ClassifierJustification,
-    ]
-):
-    """Carries the classes list to the backend; does no per-datapoint scoring.
-
-    Add this to an evaluation set alongside the per-datapoint evaluator (e.g.
-    ExactMatch) that produces expected/actual labels. The backend uses the
-    classes list, the per-datapoint outputs, and the source evaluator's
-    expected labels to build a confusion matrix + per-class metrics after the
-    set finishes.
-    """
-
-    @classmethod
-    def get_evaluator_id(cls) -> str:
-        """Identifier matching the evaluatorTypeId discriminator on configs."""
-        return EvaluatorType.CLASSIFIER.value
-
-    async def evaluate(
-        self,
-        agent_execution: AgentExecution,
-        evaluation_criteria: ClassifierEvaluationCriteria,
-    ) -> EvaluationResult:
-        """Return a sentinel per-datapoint result carrying the classes metadata.
-
-        The score is fixed at 0.0 because this evaluator has no per-datapoint
-        notion of pass/fail. Downstream code (the C# layer) ignores the score
-        and reads `details.classes` + `details.source_evaluator` to drive the
-        run-level classification math.
-        """
-        # agent_execution and evaluation_criteria intentionally unused; the
-        # value of this evaluator is the config it carries, not any per-
-        # datapoint computation. Touch them so linters don't flag.
-        _ = agent_execution
-        _ = evaluation_criteria
-
-        return NumericEvaluationResult(
-            score=0.0,
-            details=ClassifierJustification(
-                classes=list(self.evaluator_config.classes),
-                source_evaluator=self.evaluator_config.source_evaluator,
-            ),
-        )
diff --git a/packages/uipath/src/uipath/eval/evaluators/evaluator.py b/packages/uipath/src/uipath/eval/evaluators/evaluator.py
index 2f4b56dd7..b9b818847 100644
--- a/packages/uipath/src/uipath/eval/evaluators/evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/evaluator.py
@@ -31,7 +31,6 @@
     ToolCallOutputEvaluator,
 )
 from .base_evaluator import BaseEvaluator, BaseEvaluatorConfig
-from .classifier_evaluator import ClassifierEvaluator
 
 
 class UnknownLegacyEvaluator(BaseLegacyEvaluator[Any]):
@@ -142,8 +141,6 @@ def coded_evaluator_discriminator(data: Any) -> str:
                 return "BinaryClassificationEvaluator"
             case EvaluatorType.MULTICLASS_CLASSIFICATION:
                 return "MulticlassClassificationEvaluator"
-            case EvaluatorType.CLASSIFIER:
-                return "ClassifierEvaluator"
             case _:
                 return "UnknownEvaluator"
     else:
@@ -204,10 +201,6 @@ def coded_evaluator_discriminator(data: Any) -> str:
             MulticlassClassificationEvaluator,
             Tag("MulticlassClassificationEvaluator"),
         ],
-        Annotated[
-            ClassifierEvaluator,
-            Tag("ClassifierEvaluator"),
-        ],
         Annotated[
             UnknownCodedEvaluator,
             Tag("UnknownEvaluator"),
diff --git a/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py
index 0f1b3e8e8..d8e5c45ba 100644
--- a/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/exact_match_evaluator.py
@@ -6,6 +6,7 @@
     EvaluatorType,
     NumericEvaluationResult,
 )
+from ._aggregators import AggregatorSpec
 from .base_evaluator import BaseEvaluatorJustification
 from .output_evaluator import (
     OutputEvaluationCriteria,
@@ -15,11 +16,18 @@
 
 
 class ExactMatchEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
-    """Configuration for the exact match evaluator."""
+    """Configuration for the exact match evaluator.
+
+    The optional `aggregators` field attaches run-level aggregators (e.g. a
+    classification aggregator with a fixed class set) that the downstream
+    backend will compute after the eval set finishes. The Python runtime
+    itself ignores `aggregators` — it's pure metadata for the C# consumer.
+    """
 
     name: str = "ExactMatchEvaluator"
     case_sensitive: bool = False
     negated: bool = False
+    aggregators: list[AggregatorSpec] | None = None
 
 
 class ExactMatchEvaluator(
diff --git a/packages/uipath/src/uipath/eval/models/models.py b/packages/uipath/src/uipath/eval/models/models.py
index 309cbe85a..d2dc26df9 100644
--- a/packages/uipath/src/uipath/eval/models/models.py
+++ b/packages/uipath/src/uipath/eval/models/models.py
@@ -300,7 +300,6 @@ class EvaluatorType(str, Enum):
     TOOL_CALL_OUTPUT = "uipath-tool-call-output"
     BINARY_CLASSIFICATION = "uipath-binary-classification"
     MULTICLASS_CLASSIFICATION = "uipath-multiclass-classification"
-    CLASSIFIER = "uipath-classifier"
 
 
 class ToolCall(BaseModel):