From 5e574f1895feccb314fd929d57e15dd69580c5f0 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Wed, 20 May 2026 14:05:44 -0700
Subject: [PATCH 1/2] feat(eval): add dataset-level evaluator framework with
 precision/recall/f-score
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces a new BaseDatasetEvaluator concept that runs once per evaluation
set after all per-datapoint evaluators complete. It consumes per-datapoint
EvaluationResultDto values from a named source evaluator and emits a single
run-level EvaluationResult.

Includes three starter evaluators for multiclass classification metrics:

- PrecisionDatasetEvaluator
- RecallDatasetEvaluator
- FScoreDatasetEvaluator (configurable beta)

Each takes a required classes list (populated from the UI), supports micro
or macro averaging, and emits per-class TP/TN/FP/FN plus the confusion
matrix in details. Binary is the 2-class case — no separate binary path.

Architecture: BaseDatasetEvaluator is a parallel hierarchy to
GenericBaseEvaluator (not a subclass) so the per-datapoint dispatch loop
cannot accidentally pick up a dataset evaluator. Each dataset evaluator
declares a single source_evaluator by name; the runtime groups
per-datapoint results by evaluator name and routes the right list to each
dataset evaluator. Configs load from <eval_set_dir>/../dataset_evaluators/*.json
mirroring the evaluators directory layout.

Patch version bumped: 2.10.68 -> 2.10.69.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 packages/uipath/pyproject.toml                |   2 +-
 packages/uipath/src/uipath/_cli/cli_eval.py   |   7 +
 .../eval/evaluators/base_dataset_evaluator.py |  75 ++++
 .../classification_dataset_evaluators.py      | 311 +++++++++++++
 .../evaluators/dataset_evaluator_factory.py   |  52 +++
 packages/uipath/src/uipath/eval/helpers.py    |  88 ++++
 .../src/uipath/eval/models/evaluation_set.py  |   3 +
 .../uipath/src/uipath/eval/models/models.py   |   3 +
 .../uipath/src/uipath/eval/runtime/_types.py  |   5 +-
 .../uipath/src/uipath/eval/runtime/context.py |   2 +
 .../uipath/src/uipath/eval/runtime/runtime.py |  50 +++
 .../test_dataset_classification_evaluators.py | 411 ++++++++++++++++++
 12 files changed, 1007 insertions(+), 2 deletions(-)
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
 create mode 100644 packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
 create mode 100644 packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py

diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml
index 36550f54d..0d70cb383 100644
--- a/packages/uipath/pyproject.toml
+++ b/packages/uipath/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "uipath"
-version = "2.10.68"
+version = "2.10.69"
 description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.11"
diff --git a/packages/uipath/src/uipath/_cli/cli_eval.py b/packages/uipath/src/uipath/_cli/cli_eval.py
index e101717d6..2e35db849 100644
--- a/packages/uipath/src/uipath/_cli/cli_eval.py
+++ b/packages/uipath/src/uipath/_cli/cli_eval.py
@@ -412,6 +412,13 @@ async def execute_eval():
                             get_agent_model(eval_context.runtime_schema),
                         )
 
+                        eval_context.dataset_evaluators = (
+                            await EvalHelpers.load_dataset_evaluators(
+                                resolved_eval_set_path,
+                                eval_context.evaluation_set,
+                            )
+                        )
+
                         # Runtime is not required anymore.
                         await runtime.dispose()
 
diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
new file mode 100644
index 000000000..ae818a421
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
@@ -0,0 +1,75 @@
+"""Base abstractions for dataset-level evaluators.
+
+A dataset-level evaluator runs once per evaluation set, after all per-datapoint
+evaluators have produced their results. It consumes the per-datapoint
+EvaluationResultDto values from one named source evaluator and emits a single
+EvaluationResult that summarizes the dataset.
+
+Concretely distinct from GenericBaseEvaluator: different evaluate() signature,
+different lifecycle. Kept as a parallel hierarchy rather than a subclass so
+the runtime cannot accidentally dispatch a dataset evaluator through the
+per-datapoint loop.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+from ..models.models import EvaluationResult, EvaluationResultDto
+
+
+class BaseDatasetEvaluatorConfig(BaseModel):
+    """Configuration shared by all dataset-level evaluators."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    id: str  # compared with the eval set's dataset evaluator refs at load time
+    name: str  # exposed as BaseDatasetEvaluator.name
+    type: str  # selects the config/evaluator classes in the factory registries
+    source_evaluator: str = Field(
+        ...,
+        description=(
+            "Name of the per-datapoint evaluator whose EvaluationResultDto values "
+            "this dataset evaluator consumes."
+        ),
+    )
+
+
+ConfigT = TypeVar("ConfigT", bound=BaseDatasetEvaluatorConfig)
+
+
+class BaseDatasetEvaluator(ABC, Generic[ConfigT]):
+    """Abstract base for dataset-level evaluators, generic over the config type.
+
+    Subclasses implement ``get_evaluator_id`` and ``evaluate``; ``evaluate``
+    receives the per-datapoint results produced by ``config.source_evaluator``.
+    """
+
+    config: ConfigT
+
+    def __init__(self, config: ConfigT) -> None:
+        """Store the evaluator's configuration."""
+        self.config = config
+
+    @property
+    def name(self) -> str:
+        """Logical name of this evaluator instance (used as result-dict key)."""
+        return self.config.name
+
+    @property
+    def source_evaluator(self) -> str:
+        """Name of the upstream evaluator whose results this one consumes."""
+        return self.config.source_evaluator
+
+    @classmethod
+    @abstractmethod
+    def get_evaluator_id(cls) -> str:
+        """Stable identifier matching the ``type`` discriminator on configs."""
+
+    @abstractmethod
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Reduce per-datapoint results into a single run-level EvaluationResult."""
diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
new file mode 100644
index 000000000..272541e21
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -0,0 +1,311 @@
+"""Dataset-level classification evaluators: Precision, Recall, F-score.
+
+All three share the same internal machinery — a k x k confusion matrix built
+from each per-datapoint result's BaseEvaluatorJustification (expected, actual)
+strings. They differ only in the final formula and (for F-score) the beta
+parameter. The headline ``score`` is the micro or macro average per config;
+``details`` carries the full per-class breakdown plus the confusion matrix.
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+from ..models.models import (
+    EvaluationResult,
+    EvaluationResultDto,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .base_dataset_evaluator import BaseDatasetEvaluator, BaseDatasetEvaluatorConfig
+from .base_evaluator import BaseEvaluatorJustification
+
+
+def _coerce_justification(details: object) -> tuple[str, str] | None:
+    """Return the (expected, actual) pair carried by ``details``, else None."""
+    if isinstance(details, dict):
+        # Dict payloads (e.g. a model_dump()) are validated into the model first.
+        try:
+            details = BaseEvaluatorJustification.model_validate(details)
+        except Exception:
+            return None
+    if isinstance(details, BaseEvaluatorJustification):
+        return details.expected, details.actual
+    return None
+
+
+class PerClassMetrics(BaseModel):
+    """Per-class confusion counts plus the metric the evaluator computed."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    tp: int  # actual == expected == this class
+    tn: int  # scored datapoints not counted in tp, fp or fn
+    fp: int  # actual is this class, expected is another
+    fn: int  # expected is this class, actual is another
+    support: int  # tp + fn, as filled in by _build_details
+    value: float  # the metric (precision, recall or F-score) for this class
+
+
+class ClassificationDetails(BaseModel):
+    """Structured details payload emitted by every classification evaluator."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    metric: str  # "precision", "recall" or "f_score"
+    average: str  # the config's averaging mode that selected the headline
+    classes: list[str]  # normalized labels, in matrix index order
+    confusion_matrix: list[list[int]]  # counts indexed [actual][expected]
+    per_class: dict[str, PerClassMetrics]  # keyed by normalized class label
+    micro: float  # metric over counts pooled across classes
+    macro: float  # unweighted mean of the per-class values
+    n_total: int  # results received
+    n_scored: int  # results counted in the matrix
+    n_skipped: int  # results left out (unparseable details or unknown label)
+
+
+class _ConfusionData:
+    """Internal: confusion matrix plus scored/skipped tallies for one result set."""
+
+    __slots__ = ("classes", "matrix", "n_total", "n_scored", "n_skipped")
+
+    def __init__(
+        self,
+        classes: list[str],
+        matrix: list[list[int]],
+        n_total: int,
+        n_scored: int,
+        n_skipped: int,
+    ) -> None:
+        self.classes = classes
+        self.matrix = matrix
+        self.n_total = n_total
+        self.n_scored = n_scored
+        self.n_skipped = n_skipped
+
+    def counts_for(self, class_index: int) -> tuple[int, int, int, int]:
+        """Return (tp, fp, fn, tn) for a class index."""
+        size = len(self.classes)
+        hits = self.matrix[class_index][class_index]
+        row_total = sum(self.matrix[class_index][col] for col in range(size))
+        col_total = sum(self.matrix[row][class_index] for row in range(size))
+        remainder = self.n_scored - row_total - col_total + hits
+        return hits, row_total - hits, col_total - hits, remainder
+
+
+def _build_confusion(
+    results: list[EvaluationResultDto],
+    classes: list[str],
+    case_sensitive: bool,
+) -> _ConfusionData:
+    """Build a confusion matrix from per-datapoint results.
+
+    ``matrix[a][e]`` counts results whose actual label is class ``a`` and whose
+    expected label is class ``e``. Results without a parseable justification,
+    or whose expected/actual label isn't in ``classes``, go to ``n_skipped``.
+    """
+
+    def norm(label: str) -> str:
+        return label if case_sensitive else label.lower()
+
+    # Drop labels that collide after normalization (e.g. "Yes"/"yes") so each
+    # class owns exactly one row/column and is counted once in macro averages.
+    canonical_classes = list(dict.fromkeys(norm(c) for c in classes))
+    index_of = {c: i for i, c in enumerate(canonical_classes)}
+    k = len(canonical_classes)
+    matrix = [[0] * k for _ in range(k)]
+
+    n_scored = 0
+    for r in results:
+        j = _coerce_justification(r.details)
+        if j is None:
+            # Unparseable details: leave the datapoint out of the matrix.
+            continue
+        exp = norm(j[0])
+        act = norm(j[1])
+        if exp not in index_of or act not in index_of:
+            # Out-of-vocabulary label on either side: skipped as well.
+            continue
+        matrix[index_of[act]][index_of[exp]] += 1
+        n_scored += 1
+
+    n_total = len(results)
+    return _ConfusionData(
+        classes=canonical_classes,
+        matrix=matrix,
+        n_total=n_total,
+        n_scored=n_scored,
+        n_skipped=n_total - n_scored,
+    )
+
+
+def _precision_of(tp: int, fp: int, _fn: int, _tn: int) -> float:
+    return 0.0 if (tp + fp) <= 0 else tp / (tp + fp)
+
+
+def _recall_of(tp: int, _fp: int, fn: int, _tn: int) -> float:
+    return 0.0 if (tp + fn) <= 0 else tp / (tp + fn)
+
+
+def _f_score_of(beta: float):
+    weight = beta * beta
+
+    def compute(tp: int, fp: int, fn: int, _tn: int) -> float:
+        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        bottom = weight * precision + recall
+        return (1 + weight) * precision * recall / bottom if bottom > 0 else 0.0
+
+    return compute
+
+
+def _build_details(
+    confusion: _ConfusionData,
+    metric_name: str,
+    average: str,
+    per_class_fn,
+) -> tuple[ClassificationDetails, float]:
+    """Assemble the per-class report together with its micro and macro averages.
+
+    Returns (details, headline_score), where ``headline_score`` is ``micro``
+    when ``average == "micro"`` and ``macro`` for any other value.
+    """
+    labels = confusion.classes
+    counts = [confusion.counts_for(idx) for idx in range(len(labels))]
+
+    breakdown: dict[str, PerClassMetrics] = {}
+    for label, (tp, fp, fn, tn) in zip(labels, counts, strict=True):
+        breakdown[label] = PerClassMetrics(
+            tp=tp,
+            tn=tn,
+            fp=fp,
+            fn=fn,
+            support=tp + fn,
+            value=per_class_fn(tp, fp, fn, tn),
+        )
+
+    # Micro: pool the counts over all classes, then apply the formula once.
+    pooled_tp = sum(row[0] for row in counts)
+    pooled_fp = sum(row[1] for row in counts)
+    pooled_fn = sum(row[2] for row in counts)
+    micro = per_class_fn(pooled_tp, pooled_fp, pooled_fn, 0)
+
+    # Macro: unweighted mean of the per-class values.
+    if labels:
+        macro = sum(breakdown[label].value for label in labels) / len(labels)
+    else:
+        macro = 0.0
+
+    details = ClassificationDetails(
+        metric=metric_name,
+        average=average,
+        classes=labels,
+        confusion_matrix=confusion.matrix,
+        per_class=breakdown,
+        micro=micro,
+        macro=macro,
+        n_total=confusion.n_total,
+        n_scored=confusion.n_scored,
+        n_skipped=confusion.n_skipped,
+    )
+    return details, (micro if average == "micro" else macro)
+
+
+# ─── configs ──────────────────────────────────────────────────────────────────
+
+
+class _BaseClassificationConfig(BaseDatasetEvaluatorConfig):
+    """Shared config for the three classification evaluators."""
+
+    classes: list[str] = Field(
+        ...,
+        min_length=1,
+        description="Class labels expected in the upstream evaluator's justifications.",
+    )
+    average: Literal["micro", "macro"] = "macro"  # which average becomes the score
+    case_sensitive: bool = False  # when False, labels are lowercased before matching
+
+
+class PrecisionDatasetEvaluatorConfig(_BaseClassificationConfig):
+    """Configuration for the dataset-level precision evaluator."""
+
+    type: str = EvaluatorType.DATASET_PRECISION.value
+
+
+class RecallDatasetEvaluatorConfig(_BaseClassificationConfig):
+    """Configuration for the dataset-level recall evaluator."""
+
+    type: str = EvaluatorType.DATASET_RECALL.value
+
+
+class FScoreDatasetEvaluatorConfig(_BaseClassificationConfig):
+    """Configuration for the dataset-level F-score evaluator."""
+
+    type: str = EvaluatorType.DATASET_F_SCORE.value
+    f_value: float = Field(default=1.0, gt=0, description="Beta value for F_beta.")
+
+
+# ─── evaluators ───────────────────────────────────────────────────────────────
+
+
+class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionDatasetEvaluatorConfig]):
+    """Multiclass precision over a whole evaluation set, micro or macro averaged."""
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Return the ``type`` string that selects this evaluator."""
+        return EvaluatorType.DATASET_PRECISION.value
+
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Score ``results`` by precision; the full report travels in ``details``."""
+        cfg = self.config
+        # Tally (actual, expected) label pairs, then reduce them with the
+        # precision formula; ``score`` is the micro or macro average chosen
+        # by ``cfg.average``.
+        grid = _build_confusion(results, cfg.classes, cfg.case_sensitive)
+        report, score = _build_details(grid, "precision", cfg.average, _precision_of)
+        return NumericEvaluationResult(score=score, details=report)
+
+
+class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallDatasetEvaluatorConfig]):
+    """Multiclass recall over a whole evaluation set, micro or macro averaged."""
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Return the ``type`` string that selects this evaluator."""
+        return EvaluatorType.DATASET_RECALL.value
+
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Score ``results`` by recall; the full report travels in ``details``."""
+        cfg = self.config
+        # Tally (actual, expected) label pairs, then reduce them with the
+        # recall formula; ``score`` is the micro or macro average chosen by
+        # ``cfg.average``.
+        grid = _build_confusion(results, cfg.classes, cfg.case_sensitive)
+        report, score = _build_details(grid, "recall", cfg.average, _recall_of)
+        return NumericEvaluationResult(score=score, details=report)
+
+
+class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreDatasetEvaluatorConfig]):
+    """Multiclass F-beta over a whole evaluation set, micro or macro averaged."""
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Return the ``type`` string that selects this evaluator."""
+        return EvaluatorType.DATASET_F_SCORE.value
+
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Score ``results`` by F-beta; the full report travels in ``details``."""
+        cfg = self.config
+        # Tally (actual, expected) label pairs, then reduce them with the
+        # F-beta formula; ``score`` is the micro or macro average chosen by
+        # ``cfg.average``.
+        grid = _build_confusion(results, cfg.classes, cfg.case_sensitive)
+        # ``f_value`` from the config is the beta of F-beta; the scorer built
+        # from it has the same (tp, fp, fn, tn) signature as the other metrics.
+        scorer = _f_score_of(cfg.f_value)
+        report, score = _build_details(grid, "f_score", cfg.average, scorer)
+        return NumericEvaluationResult(score=score, details=report)
diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
new file mode 100644
index 000000000..8ba0dbe62
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
@@ -0,0 +1,52 @@
+"""Factory that instantiates dataset-level evaluators from configuration."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from ..models.models import EvaluatorType
+from .base_dataset_evaluator import BaseDatasetEvaluator
+from .classification_dataset_evaluators import (
+    FScoreDatasetEvaluator,
+    FScoreDatasetEvaluatorConfig,
+    PrecisionDatasetEvaluator,
+    PrecisionDatasetEvaluatorConfig,
+    RecallDatasetEvaluator,
+    RecallDatasetEvaluatorConfig,
+)
+
+_EVALUATOR_REGISTRY: dict[str, type[BaseDatasetEvaluator[Any]]] = {
+    EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluator,
+    EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluator,
+    EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluator,
+}
+
+_CONFIG_REGISTRY: dict[str, type[Any]] = {
+    EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluatorConfig,
+    EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluatorConfig,
+    EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluatorConfig,
+}
+
+
+def build_dataset_evaluator(
+    config_data: dict[str, Any],
+) -> BaseDatasetEvaluator[Any]:
+    """Instantiate the dataset evaluator selected by ``config_data["type"]``.
+
+    Raises:
+        ValueError: If ``type`` is missing or unknown.
+    """
+    evaluator_type = config_data.get("type")
+    if not evaluator_type:
+        raise ValueError("Dataset evaluator config is missing required field 'type'")
+
+    # Both registries are keyed by the same ``type`` strings.
+    model = _CONFIG_REGISTRY.get(evaluator_type)
+    factory = _EVALUATOR_REGISTRY.get(evaluator_type)
+    if model is not None and factory is not None:
+        return factory(model.model_validate(config_data))
+
+    known = sorted(_EVALUATOR_REGISTRY.keys())
+    raise ValueError(
+        f"Unknown dataset evaluator type '{evaluator_type}'. Known types: {known}"
+    )
diff --git a/packages/uipath/src/uipath/eval/helpers.py b/packages/uipath/src/uipath/eval/helpers.py
index 8405e4a7a..fbe210a93 100644
--- a/packages/uipath/src/uipath/eval/helpers.py
+++ b/packages/uipath/src/uipath/eval/helpers.py
@@ -9,7 +9,9 @@
 
 from uipath.runtime.schema import UiPathRuntimeSchema
 
+from .evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from .evaluators.base_evaluator import GenericBaseEvaluator
+from .evaluators.dataset_evaluator_factory import build_dataset_evaluator
 from .evaluators.evaluator_factory import EvaluatorFactory
 from .mocks._types import InputMockingStrategy, LLMMockingStrategy
 from .models._conversational_utils import UiPathLegacyEvalChatMessagesMapper
@@ -280,6 +282,92 @@ async def load_evaluators(
 
         return evaluators
 
+    @staticmethod
+    async def load_dataset_evaluators(
+        eval_set_path: str,
+        evaluation_set: EvaluationSet,
+    ) -> list[BaseDatasetEvaluator[Any]]:
+        """Load dataset-level evaluators referenced by the evaluation set.
+
+        Dataset evaluator config JSON files are expected to live under
+        ``<eval_set_dir>/../dataset_evaluators/``, mirroring the evaluators
+        layout. Each config is matched to a reference by its top-level ``id``.
+
+        Raises ValueError for a missing directory, invalid JSON, a config that
+        fails to build, an undeclared ``source_evaluator``, or an unresolved ref.
+        """
+        if evaluation_set is None:
+            raise ValueError("eval_set cannot be None")
+
+        dataset_ref_ids = {
+            ref.ref for ref in evaluation_set.dataset_evaluator_refs
+        }
+        if not dataset_ref_ids:
+            return []
+
+        dataset_dir = Path(eval_set_path).parent.parent / "dataset_evaluators"
+        if not dataset_dir.exists():
+            raise ValueError(
+                f"Dataset evaluators directory not found at '{dataset_dir}', "
+                f"but evaluation set references dataset evaluators: "
+                f"{sorted(dataset_ref_ids)}"
+            )
+
+        # Evaluator refs declared on the eval set, used to validate source_evaluator
+        # up front. NOTE(review): confirm these refs equal result evaluator names.
+        if evaluation_set.evaluator_configs:
+            known_evaluator_names = {
+                ref.ref for ref in evaluation_set.evaluator_configs
+            }
+        else:
+            known_evaluator_names = set(evaluation_set.evaluator_refs)
+
+        dataset_evaluators: list[BaseDatasetEvaluator[Any]] = []
+        found_ids: set[str] = set()
+
+        for file in dataset_dir.glob("*.json"):
+            try:
+                with open(file, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+            except json.JSONDecodeError as e:
+                raise ValueError(
+                    f"Invalid JSON in dataset evaluator file '{file}': {str(e)}."
+                ) from e
+
+            evaluator_id = data.get("id") if isinstance(data, dict) else None
+            if not isinstance(evaluator_id, str) or evaluator_id not in dataset_ref_ids:
+                continue
+
+            try:
+                evaluator = build_dataset_evaluator(data)
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to create dataset evaluator from file '{file}': "
+                    f"{str(e)}."
+                ) from e
+
+            if (
+                known_evaluator_names
+                and evaluator.source_evaluator not in known_evaluator_names
+            ):
+                raise ValueError(
+                    f"Dataset evaluator '{evaluator.name}' references "
+                    f"source_evaluator='{evaluator.source_evaluator}' which is "
+                    f"not declared in this evaluation set. Known evaluators: "
+                    f"{sorted(known_evaluator_names)}"
+                )
+
+            dataset_evaluators.append(evaluator)
+            found_ids.add(evaluator_id)
+
+        missing = dataset_ref_ids - found_ids
+        if missing:
+            raise ValueError(
+                f"Could not find the following dataset evaluators: {sorted(missing)}"
+            )
+
+        return dataset_evaluators
+
 
 def get_agent_model(schema: UiPathRuntimeSchema) -> str | None:
     """Get agent model from the runtime schema metadata.
diff --git a/packages/uipath/src/uipath/eval/models/evaluation_set.py b/packages/uipath/src/uipath/eval/models/evaluation_set.py
index 22e6ce244..711fedeb9 100644
--- a/packages/uipath/src/uipath/eval/models/evaluation_set.py
+++ b/packages/uipath/src/uipath/eval/models/evaluation_set.py
@@ -145,6 +145,9 @@ class EvaluationSet(BaseModel):
     evaluator_configs: list[EvaluatorReference] = Field(
         default_factory=list, alias="evaluatorConfigs"
     )
+    dataset_evaluator_refs: list[EvaluatorReference] = Field(
+        default_factory=list, alias="datasetEvaluatorRefs"
+    )
     evaluations: list[EvaluationItem] = Field(default_factory=list)
     model_settings: list[EvaluationSetModelSettings] = Field(
         default_factory=list, alias="modelSettings"
diff --git a/packages/uipath/src/uipath/eval/models/models.py b/packages/uipath/src/uipath/eval/models/models.py
index d2dc26df9..f3c9b57e1 100644
--- a/packages/uipath/src/uipath/eval/models/models.py
+++ b/packages/uipath/src/uipath/eval/models/models.py
@@ -300,6 +300,9 @@ class EvaluatorType(str, Enum):
     TOOL_CALL_OUTPUT = "uipath-tool-call-output"
     BINARY_CLASSIFICATION = "uipath-binary-classification"
     MULTICLASS_CLASSIFICATION = "uipath-multiclass-classification"
+    DATASET_PRECISION = "uipath-dataset-precision"
+    DATASET_RECALL = "uipath-dataset-recall"
+    DATASET_F_SCORE = "uipath-dataset-f-score"
 
 
 class ToolCall(BaseModel):
diff --git a/packages/uipath/src/uipath/eval/runtime/_types.py b/packages/uipath/src/uipath/eval/runtime/_types.py
index 2aee5e599..fa84f0d9e 100644
--- a/packages/uipath/src/uipath/eval/runtime/_types.py
+++ b/packages/uipath/src/uipath/eval/runtime/_types.py
@@ -1,7 +1,7 @@
 import logging
 
 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 from pydantic.alias_generators import to_camel
 
 from uipath.runtime import UiPathRuntimeResult
@@ -78,6 +78,9 @@ class UiPathEvalOutput(BaseModel):
 
     evaluation_set_name: str
     evaluation_set_results: list[UiPathEvalRunResult]
+    dataset_evaluator_results: dict[str, EvaluationResultDto] = Field(
+        default_factory=dict
+    )
 
     @property
     def score(self) -> float:
diff --git a/packages/uipath/src/uipath/eval/runtime/context.py b/packages/uipath/src/uipath/eval/runtime/context.py
index b8224718c..f3b713320 100644
--- a/packages/uipath/src/uipath/eval/runtime/context.py
+++ b/packages/uipath/src/uipath/eval/runtime/context.py
@@ -4,6 +4,7 @@
 
 from uipath.runtime.schema import UiPathRuntimeSchema
 
+from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from ..evaluators.base_evaluator import GenericBaseEvaluator
 from ..models.evaluation_set import EvaluationSet
 
@@ -27,3 +28,4 @@ class UiPathEvalContext:
     input_overrides: dict[str, Any] | None = None
     resume: bool = False
     job_id: str | None = None
+    dataset_evaluators: list[BaseDatasetEvaluator[Any]] | None = None
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index 7f7614446..5cadcc527 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -45,6 +45,7 @@
 from uipath.runtime.schema import UiPathRuntimeSchema
 
 from .._execution_context import ExecutionSpanCollector
+from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator
 from ..evaluators.base_evaluator import GenericBaseEvaluator
 from ..evaluators.output_evaluator import OutputEvaluationCriteria
 from ..helpers import get_agent_model
@@ -202,6 +203,43 @@ def compute_evaluator_scores(
     return final_score, agg_metrics_per_evaluator
 
 
+def compute_dataset_evaluator_results(
+    evaluation_set_results: list[UiPathEvalRunResult],
+    dataset_evaluators: Iterable[BaseDatasetEvaluator[Any]],
+) -> dict[str, EvaluationResultDto]:
+    """Feed each dataset evaluator the results of its source evaluator.
+
+    Args:
+        evaluation_set_results: Per-datapoint results from the run.
+        dataset_evaluators: Dataset-level evaluator instances; each receives
+            the results recorded under its ``source_evaluator`` name.
+
+    Returns:
+        Dict keyed by dataset evaluator name holding the EvaluationResultDto
+        built from its result. An evaluator whose source produced nothing is
+        still called, with an empty list.
+    """
+    grouped: dict[str, list[EvaluationResultDto]] = {}
+    for run in evaluation_set_results:
+        for entry in run.evaluation_run_results:
+            # Entries flagged ``is_line_result`` are not fed to dataset evaluators.
+            if entry.is_line_result:
+                continue
+            bucket = grouped.setdefault(entry.evaluator_name, [])
+            bucket.append(entry.result)
+
+    # Evaluators that share a name overwrite one another in the returned dict.
+    outcome: dict[str, EvaluationResultDto] = {}
+    for dataset_evaluator in dataset_evaluators:
+        # A source with no recorded results yields an empty list, not a skip.
+        inputs = grouped.get(dataset_evaluator.source_evaluator, [])
+        produced = dataset_evaluator.evaluate(inputs)
+        outcome[dataset_evaluator.name] = EvaluationResultDto.from_evaluation_result(
+            produced
+        )
+    return outcome
+
+
 class UiPathEvalRuntime:
     """Specialized runtime for evaluation runs, with access to the factory."""
 
@@ -381,6 +419,18 @@ async def execute(self) -> UiPathRuntimeResult:
                         evaluators,
                     )
 
+                    # Run any dataset-level evaluators configured on the eval
+                    # set. Each consumes the per-datapoint results from one
+                    # named source evaluator and emits a single run-level
+                    # EvaluationResultDto stored on UiPathEvalOutput.
+                    if self.context.dataset_evaluators:
+                        results.dataset_evaluator_results = (
+                            compute_dataset_evaluator_results(
+                                results.evaluation_set_results,
+                                self.context.dataset_evaluators,
+                            )
+                        )
+
                     # Configure span with output and metadata
                     await configure_eval_set_run_span(
                         span=span,
diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
new file mode 100644
index 000000000..08d81818d
--- /dev/null
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -0,0 +1,411 @@
+"""Tests for dataset-level classification evaluators (Precision, Recall, FScore).
+
+Covers the math (2-class, 3-class, micro vs macro, F-beta), edge cases
+(empty input, out-of-vocab labels, malformed details), and runtime-level
+routing where compute_dataset_evaluator_results selects results by name.
+"""
+
+import uuid
+
+import pytest
+
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDetails,
+    FScoreDatasetEvaluator,
+    FScoreDatasetEvaluatorConfig,
+    PrecisionDatasetEvaluator,
+    PrecisionDatasetEvaluatorConfig,
+    RecallDatasetEvaluator,
+    RecallDatasetEvaluatorConfig,
+)
+from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from uipath.eval.models.models import (
+    EvaluationResultDto,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from uipath.eval.runtime._types import (
+    UiPathEvalRunResult,
+    UiPathEvalRunResultDto,
+)
+from uipath.eval.runtime.runtime import compute_dataset_evaluator_results
+
+
+def _result(
+    expected: str, actual: str, score: float | None = None
+) -> EvaluationResultDto:
+    """Build an EvaluationResultDto carrying an expected/actual justification."""
+    if score is None:
+        score = 1.0 if expected.lower() == actual.lower() else 0.0
+    justification = BaseEvaluatorJustification(expected=expected, actual=actual)
+    return EvaluationResultDto(
+        score=score,
+        details=justification.model_dump(),
+    )
+
+
+def _precision(classes: list[str], average: str = "macro") -> PrecisionDatasetEvaluator:
+    return PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="p1",
+            name="precision",
+            source_evaluator="intent_match",
+            classes=classes,
+            average=average,  # type: ignore[arg-type]
+        )
+    )
+
+
+def _recall(classes: list[str], average: str = "macro") -> RecallDatasetEvaluator:
+    return RecallDatasetEvaluator(
+        RecallDatasetEvaluatorConfig(
+            id="r1",
+            name="recall",
+            source_evaluator="intent_match",
+            classes=classes,
+            average=average,  # type: ignore[arg-type]
+        )
+    )
+
+
+def _fscore(
+    classes: list[str], average: str = "macro", f_value: float = 1.0
+) -> FScoreDatasetEvaluator:
+    return FScoreDatasetEvaluator(
+        FScoreDatasetEvaluatorConfig(
+            id="f1",
+            name="fscore",
+            source_evaluator="intent_match",
+            classes=classes,
+            average=average,  # type: ignore[arg-type]
+            f_value=f_value,
+        )
+    )
+
+
+def _details(result: NumericEvaluationResult) -> ClassificationDetails:
+    """Type-narrowing helper for asserting on details."""
+    assert isinstance(result.details, ClassificationDetails)
+    return result.details
+
+
+class TestPrecisionEvaluator:
+    def test_empty_input_returns_zeroed_result(self) -> None:
+        result = _precision(["cat", "dog"]).evaluate([])
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 0.0
+        d = _details(result)
+        assert d.n_total == 0 and d.n_scored == 0
+        assert d.confusion_matrix == [[0, 0], [0, 0]]
+        assert d.per_class["cat"].tp == 0
+        assert d.per_class["cat"].tn == 0
+
+    def test_two_class_macro(self) -> None:
+        # 4 datapoints: 2 TP_yes, 1 FN_yes (predicted no), 1 FP_yes (predicted yes when expected no).
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),  # FN for yes, FP for no
+            _result("no", "yes"),  # FP for yes, FN for no
+        ]
+        result = _precision(["yes", "no"], average="macro").evaluate(results)
+        d = _details(result)
+        # precision_yes = 2 / (2 + 1) = 2/3
+        # precision_no  = 0 / (0 + 1) = 0
+        # macro = (2/3 + 0) / 2 = 1/3
+        assert d.per_class["yes"].value == pytest.approx(2 / 3)
+        assert d.per_class["no"].value == pytest.approx(0.0)
+        assert d.macro == pytest.approx((2 / 3 + 0.0) / 2)
+        assert result.score == pytest.approx(d.macro)
+
+    def test_two_class_micro_equals_accuracy(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        result = _precision(["yes", "no"], average="micro").evaluate(results)
+        d = _details(result)
+        # micro precision = sum(TP) / sum(TP + FP)
+        # sum(TP) = 2 (yes diag) + 0 (no diag) = 2
+        # sum(FP) = 1 (yes off-diag row) + 1 (no off-diag row) = 2
+        # micro = 2 / (2 + 2) = 0.5 — equals accuracy 2/4 in the 2-class case
+        assert d.micro == pytest.approx(0.5)
+        assert result.score == pytest.approx(0.5)
+
+    def test_three_class_macro(self) -> None:
+        # Each class gets 2 TP, 1 FP, 1 FN — symmetric setup
+        pairs = [
+            ("cat", "cat"),
+            ("cat", "cat"),
+            ("cat", "dog"),  # FN_cat, FP_dog
+            ("dog", "dog"),
+            ("dog", "dog"),
+            ("dog", "bird"),  # FN_dog, FP_bird
+            ("bird", "bird"),
+            ("bird", "bird"),
+            ("bird", "cat"),  # FN_bird, FP_cat
+        ]
+        result = _precision(["cat", "dog", "bird"], average="macro").evaluate(
+            [_result(e, a) for e, a in pairs]
+        )
+        d = _details(result)
+        # per-class precision = 2 / (2 + 1) = 2/3 for all three
+        for label in ("cat", "dog", "bird"):
+            m = d.per_class[label]
+            assert m.tp == 2 and m.fp == 1 and m.fn == 1 and m.tn == 5
+            assert m.value == pytest.approx(2 / 3)
+        assert d.macro == pytest.approx(2 / 3)
+        assert result.score == pytest.approx(2 / 3)
+
+
+class TestRecallEvaluator:
+    def test_two_class_macro(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        result = _recall(["yes", "no"], average="macro").evaluate(results)
+        d = _details(result)
+        # recall_yes = TP / (TP + FN) = 2 / (2 + 1) = 2/3
+        # recall_no  = 0 / (0 + 1) = 0
+        # macro = 1/3
+        assert d.per_class["yes"].value == pytest.approx(2 / 3)
+        assert d.per_class["no"].value == pytest.approx(0.0)
+        assert result.score == pytest.approx(1 / 3)
+
+    def test_recall_differs_from_precision(self) -> None:
+        # Asymmetric example so precision != recall.
+        results = [
+            _result("yes", "yes"),  # TP
+            _result("yes", "yes"),  # TP
+            _result("no", "yes"),  # FP for yes
+            _result("no", "yes"),  # FP for yes
+            _result("no", "no"),  # TP for no
+        ]
+        p = _details(_precision(["yes", "no"], average="macro").evaluate(results))
+        r = _details(_recall(["yes", "no"], average="macro").evaluate(results))
+        # precision_yes = 2/(2+2)=0.5, precision_no = 1/(1+0)=1.0
+        assert p.per_class["yes"].value == pytest.approx(0.5)
+        assert p.per_class["no"].value == pytest.approx(1.0)
+        # recall_yes = 2/(2+0)=1.0, recall_no = 1/(1+2)=1/3
+        assert r.per_class["yes"].value == pytest.approx(1.0)
+        assert r.per_class["no"].value == pytest.approx(1 / 3)
+
+
+class TestFScoreEvaluator:
+    def test_f1_equals_harmonic_mean_of_p_and_r(self) -> None:
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        f = _details(
+            _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results)
+        )
+        # precision_yes = 2/3, recall_yes = 2/3 -> F1_yes = 2/3
+        # precision_no  = 0,   recall_no  = 0    -> F1_no  = 0
+        assert f.per_class["yes"].value == pytest.approx(2 / 3)
+        assert f.per_class["no"].value == pytest.approx(0.0)
+        assert f.macro == pytest.approx((2 / 3 + 0.0) / 2)
+
+    def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None:
+        # Asymmetric setup: precision_yes = 0.5, recall_yes = 1.0.
+        results = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("no", "yes"),
+            _result("no", "yes"),
+            _result("no", "no"),
+        ]
+        f1 = _details(
+            _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results)
+        )
+        f2 = _details(
+            _fscore(["yes", "no"], average="macro", f_value=2.0).evaluate(results)
+        )
+        # F_beta with beta>1 weighs recall higher. Since recall_yes > precision_yes,
+        # F2_yes should be > F1_yes.
+        assert f2.per_class["yes"].value > f1.per_class["yes"].value
+
+    def test_three_class_micro_pools_across_classes(self) -> None:
+        # Same symmetric setup as the precision macro test.
+        pairs = [
+            ("cat", "cat"),
+            ("cat", "cat"),
+            ("cat", "dog"),
+            ("dog", "dog"),
+            ("dog", "dog"),
+            ("dog", "bird"),
+            ("bird", "bird"),
+            ("bird", "bird"),
+            ("bird", "cat"),
+        ]
+        d = _details(
+            _fscore(["cat", "dog", "bird"], average="micro", f_value=1.0).evaluate(
+                [_result(e, a) for e, a in pairs]
+            )
+        )
+        # micro precision == micro recall == 6/9 (accuracy when each off-diag
+        # contributes once to FP and once to FN globally). micro F1 = 6/9.
+        assert d.micro == pytest.approx(6 / 9)
+
+
+class TestSkippingAndEdgeCases:
+    def test_out_of_vocab_labels_are_skipped(self) -> None:
+        results = [
+            _result("cat", "cat"),
+            _result("cat", "platypus"),  # actual not in classes
+            _result("zebra", "dog"),  # expected not in classes
+        ]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2
+
+    def test_results_without_justification_are_skipped(self) -> None:
+        results = [
+            _result("cat", "cat"),
+            EvaluationResultDto(score=1.0, details="just a string"),
+            EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+        ]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2
+
+    def test_case_insensitive_by_default(self) -> None:
+        results = [_result("Cat", "CAT"), _result("DOG", "dog")]
+        d = _details(_precision(["cat", "dog"]).evaluate(results))
+        assert d.per_class["cat"].tp == 1
+        assert d.per_class["dog"].tp == 1
+
+
+class TestFactory:
+    def test_builds_evaluator_from_dict(self) -> None:
+        config_data = {
+            "id": "precision_intent",
+            "name": "precision_intent",
+            "type": EvaluatorType.DATASET_PRECISION.value,
+            "sourceEvaluator": "intent_match",
+            "classes": ["yes", "no"],
+            "average": "macro",
+        }
+        evaluator = build_dataset_evaluator(config_data)
+        assert isinstance(evaluator, PrecisionDatasetEvaluator)
+        assert evaluator.source_evaluator == "intent_match"
+        assert evaluator.name == "precision_intent"
+
+    def test_unknown_type_raises(self) -> None:
+        with pytest.raises(ValueError, match="Unknown dataset evaluator type"):
+            build_dataset_evaluator(
+                {
+                    "id": "x",
+                    "name": "x",
+                    "type": "uipath-not-a-thing",
+                    "sourceEvaluator": "intent_match",
+                    "classes": ["yes", "no"],
+                }
+            )
+
+    def test_missing_type_raises(self) -> None:
+        with pytest.raises(ValueError, match="missing required field 'type'"):
+            build_dataset_evaluator(
+                {
+                    "id": "x",
+                    "name": "x",
+                    "sourceEvaluator": "intent_match",
+                    "classes": ["yes", "no"],
+                }
+            )
+
+
+class TestComputeDatasetEvaluatorResults:
+    """End-to-end: dataset evaluator picks results by source_evaluator name."""
+
+    def test_routes_to_correct_source_and_ignores_others(self) -> None:
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                    UiPathEvalRunResultDto(
+                        evaluator_name="some_other_evaluator",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=EvaluationResultDto(score=0.5),
+                    ),
+                ],
+            ),
+            UiPathEvalRunResult(
+                evaluation_name="dp2",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "no"),
+                    ),
+                ],
+            ),
+        ]
+
+        out = compute_dataset_evaluator_results(
+            eval_results, [_precision(["yes", "no"], average="macro")]
+        )
+        assert set(out) == {"precision"}
+        dto = out["precision"]
+        assert isinstance(dto, EvaluationResultDto)
+        # The unrelated 0.5 score from some_other_evaluator must NOT be in the
+        # matrix — only the two intent_match results count.
+        assert isinstance(dto.details, dict)
+        assert dto.details["n_scored"] == 2
+
+    def test_line_by_line_subresults_are_excluded(self) -> None:
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                        is_line_result=True,
+                    ),
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("no", "no"),
+                    ),
+                ],
+            ),
+        ]
+        out = compute_dataset_evaluator_results(
+            eval_results, [_precision(["yes", "no"])]
+        )
+        assert isinstance(out["precision"].details, dict)
+        assert out["precision"].details["n_scored"] == 1
+
+    def test_source_with_no_results_produces_zeroed_report(self) -> None:
+        eval_results = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="some_other_evaluator",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=EvaluationResultDto(score=1.0),
+                    ),
+                ],
+            ),
+        ]
+        out = compute_dataset_evaluator_results(
+            eval_results, [_precision(["yes", "no"])]
+        )
+        dto = out["precision"]
+        assert dto.score == 0.0
+        assert isinstance(dto.details, dict)
+        assert dto.details["n_scored"] == 0

From d6b7ab5566d07a9e34611358a4b7539912982936 Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Wed, 20 May 2026 16:14:00 -0700
Subject: [PATCH 2/2] docs(eval): add runnable dataset evaluator demo + bump
 uv.lock for 2.10.69
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

examples/dataset_evaluators_demo.py walks the new dataset-level evaluators
(Precision / Recall / F-score) through five scenarios that exercise the
math end-to-end at the SDK layer:

  1. Balanced 3-class — symmetric confusion matrix, macro == micro
  2. Imbalanced 2-class — shows where macro and micro diverge
  3. Same data, four metrics (Precision, Recall, F1, F2) — proves the
     F-beta knob actually moves per-class numbers
  4. Out-of-vocab + malformed details — n_skipped surfaces, no silent drops
  5. Realistic 4-class intent classifier — uneven per-class performance

Each scenario prints the confusion matrix as a table, the per-class
TP/TN/FP/FN + the metric, and a snippet of the wire JSON that AutoMapper
will surface to the frontend.

Run::

    cd packages/uipath && uv run python examples/dataset_evaluators_demo.py

uv.lock reflects the pyproject.toml version bump (2.10.68 -> 2.10.69)
already in this PR.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../examples/dataset_evaluators_demo.py       | 359 ++++++++++++++++++
 packages/uipath/uv.lock                       |   4 +-
 2 files changed, 361 insertions(+), 2 deletions(-)
 create mode 100644 packages/uipath/examples/dataset_evaluators_demo.py

diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py
new file mode 100644
index 000000000..a8f80858d
--- /dev/null
+++ b/packages/uipath/examples/dataset_evaluators_demo.py
@@ -0,0 +1,359 @@
+"""Runnable proof that the dataset-level evaluators work on realistic data.
+
+Five scenarios exercise the framework end-to-end at the SDK layer (no
+worker, no backend). Each prints the headline score plus a confusion
+matrix table, so the math is inspectable rather than a passing-test
+binary signal.
+
+Run::
+
+    cd packages/uipath
+    uv run python examples/dataset_evaluators_demo.py
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Iterable
+
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDetails,
+    FScoreDatasetEvaluator,
+    FScoreDatasetEvaluatorConfig,
+    PrecisionDatasetEvaluator,
+    PrecisionDatasetEvaluatorConfig,
+    RecallDatasetEvaluator,
+    RecallDatasetEvaluatorConfig,
+)
+from uipath.eval.models.models import EvaluationResultDto, NumericEvaluationResult
+
+
+# ─── helpers ──────────────────────────────────────────────────────────────────
+
+
+def make_result(expected: str, actual: str) -> EvaluationResultDto:
+    """Build a single per-datapoint EvaluationResultDto.
+
+    Models what an upstream ExactMatch evaluator would produce after running
+    on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with
+    the expected/actual labels carried in the justification.
+    """
+    score = 1.0 if expected.lower() == actual.lower() else 0.0
+    justification = BaseEvaluatorJustification(expected=expected, actual=actual)
+    return EvaluationResultDto(score=score, details=justification.model_dump())
+
+
+def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]:
+    return [make_result(e, a) for e, a in pairs]
+
+
+def print_header(title: str) -> None:
+    print()
+    print("═" * 78)
+    print(f" {title}")
+    print("═" * 78)
+
+
+def print_confusion(details: ClassificationDetails) -> None:
+    """Pretty-print the confusion matrix as a table."""
+    classes = details.classes
+    cell_width = max(7, max(len(c) for c in classes) + 1)
+    header = " " * cell_width + " │ " + " │ ".join(c.center(cell_width) for c in classes) + " │  ← expected"
+    print(header)
+    print("─" * len(header))
+    for predicted_idx, predicted_label in enumerate(classes):
+        row_cells = [
+            str(details.confusion_matrix[predicted_idx][expected_idx]).rjust(cell_width)
+            for expected_idx in range(len(classes))
+        ]
+        print(predicted_label.ljust(cell_width) + " │ " + " │ ".join(row_cells) + " │")
+    print(" " * cell_width + "↑ predicted")
+
+
+def print_per_class(details: ClassificationDetails) -> None:
+    """One-row-per-class table of TP/TN/FP/FN + the metric."""
+    label_w = max(len("class"), max(len(c) for c in details.classes))
+    metric = details.metric
+    header = f"  {'class'.ljust(label_w)}  │  TP  TN  FP  FN  support  {metric}"
+    print(header)
+    print("  " + "─" * (len(header) - 2))
+    for cls, m in details.per_class.items():
+        print(
+            f"  {cls.ljust(label_w)}  │  "
+            f"{m.tp:>2}  {m.tn:>2}  {m.fp:>2}  {m.fn:>2}  {m.support:>7}  "
+            f"{m.value:.3f}"
+        )
+
+
+def report(
+    title: str,
+    result: NumericEvaluationResult,
+    *,
+    show_json_tail: bool = False,
+) -> None:
+    """Render one scenario's result block."""
+    print_header(title)
+    assert isinstance(result.details, ClassificationDetails)
+    d = result.details
+    print(
+        f"  metric = {d.metric}   average = {d.average}   "
+        f"score (headline) = {result.score:.4f}"
+    )
+    print(
+        f"  micro = {d.micro:.4f}   macro = {d.macro:.4f}   "
+        f"scored = {d.n_scored}/{d.n_total}   skipped = {d.n_skipped}"
+    )
+    print()
+    print_confusion(d)
+    print()
+    print_per_class(d)
+    if show_json_tail:
+        print()
+        print("  ── wire JSON (matches frontend zod schema) ──")
+        # Just show a snippet to keep output focused.
+        payload = d.model_dump(by_alias=True)
+        print(
+            "  "
+            + json.dumps(
+                {k: payload[k] for k in ("metric", "average", "micro", "macro")},
+                indent=2,
+            ).replace("\n", "\n  ")
+        )
+
+
+# ─── scenarios ────────────────────────────────────────────────────────────────
+
+
+def scenario_1_balanced_three_class() -> None:
+    """Intent recognition over book/cancel/reschedule. Every class gets 2 right, 1 wrong."""
+    pairs = [
+        ("book", "book"),
+        ("book", "book"),
+        ("book", "cancel"),  # FN_book, FP_cancel
+        ("cancel", "cancel"),
+        ("cancel", "cancel"),
+        ("cancel", "reschedule"),  # FN_cancel, FP_reschedule
+        ("reschedule", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "book"),  # FN_reschedule, FP_book
+    ]
+    results = materialize_pairs(pairs)
+    evaluator = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="precision_intent",
+            name="precision_intent",
+            source_evaluator="intent_match",
+            classes=["book", "cancel", "reschedule"],
+            average="macro",
+        )
+    )
+    report(
+        "Scenario 1 — Balanced 3-class (intent recognition)\n"
+        "  Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3.",
+        evaluator.evaluate(results),
+        show_json_tail=True,
+    )
+
+
+def scenario_2_imbalanced_two_class() -> None:
+    """Rare-positive case — why macro vs micro matters.
+
+    20 datapoints. Only 4 are actually positive (the rare class). A weak
+    classifier could trivially get high accuracy by predicting "negative"
+    everywhere — micro precision masks that, macro doesn't.
+    """
+    pairs: list[tuple[str, str]] = []
+    # 13 true negatives (of 16 actual negatives): classifier said "negative".
+    pairs += [("negative", "negative")] * 13
+    # 3 false positives — classifier hallucinated "positive" on actual negatives.
+    pairs += [("negative", "positive")] * 3
+    # 2 true positives.
+    pairs += [("positive", "positive")] * 2
+    # 2 false negatives — classifier missed real positives.
+    pairs += [("positive", "negative")] * 2
+
+    results = materialize_pairs(pairs)
+    classes = ["positive", "negative"]
+
+    macro = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="p_macro",
+            name="precision (macro)",
+            source_evaluator="positive_match",
+            classes=classes,
+            average="macro",
+        )
+    )
+    micro = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="p_micro",
+            name="precision (micro)",
+            source_evaluator="positive_match",
+            classes=classes,
+            average="micro",
+        )
+    )
+    report(
+        "Scenario 2a — Imbalanced 2-class, MACRO precision\n"
+        "  Rare positive class. Macro averages per-class, so the rare class\n"
+        "  having precision = 2/(2+3) = 0.40 drags the score down.",
+        macro.evaluate(results),
+    )
+    report(
+        "Scenario 2b — Same data, MICRO precision\n"
+        "  Pools TP/FP across classes. In a 2-class case this equals accuracy.\n"
+        "  Notice macro << micro — that's the bias you'd miss with micro alone.",
+        micro.evaluate(results),
+    )
+
+
+def scenario_3_precision_vs_recall_vs_f() -> None:
+    """Same dataset, four metrics (P, R, F1, F2) — per-class values diverge."""
+    pairs = [
+        ("yes", "yes"),  # TP for yes
+        ("yes", "yes"),  # TP for yes
+        ("no", "yes"),  # FP for yes
+        ("no", "yes"),  # FP for yes -> precision_yes = 2 / (2 + 2) = 0.5
+        ("no", "no"),
+        ("no", "no"),
+        ("yes", "no"),  # FN for yes -> recall_yes = 2 / (2 + 1) = 2/3
+    ]
+    results = materialize_pairs(pairs)
+    classes = ["yes", "no"]
+
+    p = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="p",
+            name="precision",
+            source_evaluator="yes_match",
+            classes=classes,
+            average="macro",
+        )
+    )
+    r = RecallDatasetEvaluator(
+        RecallDatasetEvaluatorConfig(
+            id="r",
+            name="recall",
+            source_evaluator="yes_match",
+            classes=classes,
+            average="macro",
+        )
+    )
+    f1 = FScoreDatasetEvaluator(
+        FScoreDatasetEvaluatorConfig(
+            id="f1",
+            name="f1",
+            source_evaluator="yes_match",
+            classes=classes,
+            average="macro",
+            f_value=1.0,
+        )
+    )
+    f2 = FScoreDatasetEvaluator(
+        FScoreDatasetEvaluatorConfig(
+            id="f2",
+            name="f2",
+            source_evaluator="yes_match",
+            classes=classes,
+            average="macro",
+            f_value=2.0,
+        )
+    )
+    report(
+        "Scenario 3a — Precision on a recall-favourable dataset",
+        p.evaluate(results),
+    )
+    report(
+        "Scenario 3b — Recall (same data — 'yes' recall is 2/3, precision 0.5)",
+        r.evaluate(results),
+    )
+    report(
+        "Scenario 3c — F1 (harmonic mean of P and R)",
+        f1.evaluate(results),
+    )
+    report(
+        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)",
+        f2.evaluate(results),
+    )
+
+
+def scenario_4_skipped_datapoints() -> None:
+    """Show how malformed / out-of-vocab data is reported, not silently dropped."""
+    results = [
+        make_result("cat", "cat"),
+        make_result("dog", "dog"),
+        make_result("cat", "platypus"),  # actual not in classes → skipped
+        make_result("zebra", "cat"),  # expected not in classes → skipped
+        EvaluationResultDto(score=1.0, details="bare string — no justification"),
+        EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+    ]
+    evaluator = PrecisionDatasetEvaluator(
+        PrecisionDatasetEvaluatorConfig(
+            id="precision_robustness",
+            name="precision_robustness",
+            source_evaluator="any_match",
+            classes=["cat", "dog"],
+            average="macro",
+        )
+    )
+    report(
+        "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n"
+        "  6 datapoints in, 2 scored, 4 skipped. Skip counts surface in the\n"
+        "  report so you can tell whether a low score is a real signal or\n"
+        "  just sparse data.",
+        evaluator.evaluate(results),
+    )
+
+
+def scenario_5_realistic_intent_classifier() -> None:
+    """A larger, more interesting 4-class dataset — uneven per-class performance."""
+    pairs = [
+        # 'book' is easy: classifier handles it well
+        *[("book", "book")] * 10,
+        ("book", "cancel"),
+        # 'cancel' is medium: a few errors
+        *[("cancel", "cancel")] * 6,
+        ("cancel", "book"),
+        ("cancel", "modify"),
+        # 'reschedule' is hard: classifier confuses it with 'modify'
+        ("reschedule", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "modify"),
+        ("reschedule", "modify"),
+        # 'modify' is rare: only 2 cases, classifier gets one
+        ("modify", "modify"),
+        ("modify", "reschedule"),
+    ]
+    results = materialize_pairs(pairs)
+    classes = ["book", "cancel", "reschedule", "modify"]
+    macro_f1 = FScoreDatasetEvaluator(
+        FScoreDatasetEvaluatorConfig(
+            id="f1_4class",
+            name="f1_4class",
+            source_evaluator="intent_match",
+            classes=classes,
+            average="macro",
+            f_value=1.0,
+        )
+    )
+    report(
+        "Scenario 5 — Realistic 4-class intent classifier\n"
+        "  Uneven per-class performance. Macro F1 surfaces 'reschedule' and\n"
+        "  'modify' weakness; micro F1 would have hidden it under 'book' wins.",
+        macro_f1.evaluate(results),
+    )
+
+
+def main() -> None:
+    scenario_1_balanced_three_class()
+    scenario_2_imbalanced_two_class()
+    scenario_3_precision_vs_recall_vs_f()
+    scenario_4_skipped_datapoints()
+    scenario_5_realistic_intent_classifier()
+    print()
+    print("Done. All scenarios computed from real evaluator code.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock
index 41ae12119..19b0d047b 100644
--- a/packages/uipath/uv.lock
+++ b/packages/uipath/uv.lock
@@ -3,7 +3,7 @@ revision = 3
 requires-python = ">=3.11"
 
 [options]
-exclude-newer = "2026-05-17T17:25:34.9197064Z"
+exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values.
 exclude-newer-span = "P2D"
 
 [options.exclude-newer-package]
@@ -2552,7 +2552,7 @@ wheels = [
 
 [[package]]
 name = "uipath"
-version = "2.10.68"
+version = "2.10.69"
 source = { editable = "." }
 dependencies = [
     { name = "applicationinsights" },