From 5e574f1895feccb314fd929d57e15dd69580c5f0 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Wed, 20 May 2026 14:05:44 -0700 Subject: [PATCH 1/2] feat(eval): add dataset-level evaluator framework with precision/recall/f-score MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a new BaseDatasetEvaluator concept that runs once per evaluation set after all per-datapoint evaluators complete. It consumes per-datapoint EvaluationResultDto values from a named source evaluator and emits a single run-level EvaluationResult. Includes three starter evaluators for multiclass classification metrics: - PrecisionDatasetEvaluator - RecallDatasetEvaluator - FScoreDatasetEvaluator (configurable beta) Each takes a required classes list (populated from the UI), supports micro or macro averaging, and emits per-class TP/TN/FP/FN plus the confusion matrix in details. Binary is the 2-class case — no separate binary path. Architecture: BaseDatasetEvaluator is a parallel hierarchy to GenericBaseEvaluator (not a subclass) so the per-datapoint dispatch loop cannot accidentally pick up a dataset evaluator. Each dataset evaluator declares a single source_evaluator by name; the runtime groups per-datapoint results by evaluator name and routes the right list to each dataset evaluator. Configs load from dataset_evaluators/*.json, a sibling of the directory that contains the evaluation set file (that directory's ../dataset_evaluators/), mirroring the evaluators directory layout. Patch version bumped: 2.10.68 -> 2.10.69. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/uipath/pyproject.toml | 2 +- packages/uipath/src/uipath/_cli/cli_eval.py | 7 + .../eval/evaluators/base_dataset_evaluator.py | 75 ++++ .../classification_dataset_evaluators.py | 311 +++++++++++++ .../evaluators/dataset_evaluator_factory.py | 52 +++ packages/uipath/src/uipath/eval/helpers.py | 88 ++++ .../src/uipath/eval/models/evaluation_set.py | 3 + .../uipath/src/uipath/eval/models/models.py | 3 + .../uipath/src/uipath/eval/runtime/_types.py | 5 +- .../uipath/src/uipath/eval/runtime/context.py | 2 + .../uipath/src/uipath/eval/runtime/runtime.py | 50 +++ .../test_dataset_classification_evaluators.py | 411 ++++++++++++++++++ 12 files changed, 1007 insertions(+), 2 deletions(-) create mode 100644 packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py create mode 100644 packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py create mode 100644 packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py create mode 100644 packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py diff --git a/packages/uipath/pyproject.toml b/packages/uipath/pyproject.toml index 36550f54d..0d70cb383 100644 --- a/packages/uipath/pyproject.toml +++ b/packages/uipath/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.10.68" +version = "2.10.69" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." 
readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/packages/uipath/src/uipath/_cli/cli_eval.py b/packages/uipath/src/uipath/_cli/cli_eval.py index e101717d6..2e35db849 100644 --- a/packages/uipath/src/uipath/_cli/cli_eval.py +++ b/packages/uipath/src/uipath/_cli/cli_eval.py @@ -412,6 +412,13 @@ async def execute_eval(): get_agent_model(eval_context.runtime_schema), ) + eval_context.dataset_evaluators = ( + await EvalHelpers.load_dataset_evaluators( + resolved_eval_set_path, + eval_context.evaluation_set, + ) + ) + # Runtime is not required anymore. await runtime.dispose() diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py new file mode 100644 index 000000000..ae818a421 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py @@ -0,0 +1,75 @@ +"""Base abstractions for dataset-level evaluators. + +A dataset-level evaluator runs once per evaluation set, after all per-datapoint +evaluators have produced their results. It consumes the per-datapoint +EvaluationResultDto values from one named source evaluator and emits a single +EvaluationResult that summarizes the dataset. + +Concretely distinct from GenericBaseEvaluator: different evaluate() signature, +different lifecycle. Kept as a parallel hierarchy rather than a subclass so +the runtime cannot accidentally dispatch a dataset evaluator through the +per-datapoint loop. 
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Generic, TypeVar + +from pydantic import BaseModel, ConfigDict, Field +from pydantic.alias_generators import to_camel + +from ..models.models import EvaluationResult, EvaluationResultDto + + +class BaseDatasetEvaluatorConfig(BaseModel): + """Configuration shared by all dataset-level evaluators.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + id: str + name: str + type: str + source_evaluator: str = Field( + ..., + description=( + "Name of the per-datapoint evaluator whose EvaluationResultDto values " + "this dataset evaluator consumes." + ), + ) + + +ConfigT = TypeVar("ConfigT", bound=BaseDatasetEvaluatorConfig) + + +class BaseDatasetEvaluator(ABC, Generic[ConfigT]): + """Abstract base for dataset-level evaluators. + + Subclasses implement ``evaluate`` over the per-datapoint EvaluationResultDto + values produced by ``config.source_evaluator``. + """ + + config: ConfigT + + def __init__(self, config: ConfigT) -> None: + """Store the evaluator's configuration.""" + self.config = config + + @property + def name(self) -> str: + """Logical name of this evaluator instance (used as result-dict key).""" + return self.config.name + + @property + def source_evaluator(self) -> str: + """Name of the upstream evaluator whose results this one consumes.""" + return self.config.source_evaluator + + @classmethod + @abstractmethod + def get_evaluator_id(cls) -> str: + """Stable identifier matching the ``type`` discriminator on configs.""" + + @abstractmethod + def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: + """Reduce per-datapoint results into a single run-level EvaluationResult.""" diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py new file mode 100644 index 000000000..272541e21 --- /dev/null 
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py @@ -0,0 +1,311 @@ +"""Dataset-level classification evaluators: Precision, Recall, F-score. + +All three share the same internal machinery — a k x k confusion matrix built +from each per-datapoint result's BaseEvaluatorJustification (expected, actual) +strings. They differ only in the final formula and (for F-score) the beta +parameter. The headline ``score`` is the micro or macro average per config; +``details`` carries the full per-class breakdown plus the confusion matrix. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field +from pydantic.alias_generators import to_camel + +from ..models.models import ( + EvaluationResult, + EvaluationResultDto, + EvaluatorType, + NumericEvaluationResult, +) +from .base_dataset_evaluator import BaseDatasetEvaluator, BaseDatasetEvaluatorConfig +from .base_evaluator import BaseEvaluatorJustification + + +def _coerce_justification(details: object) -> tuple[str, str] | None: + """Extract (expected, actual) from an EvaluationResultDto.details payload.""" + if isinstance(details, BaseEvaluatorJustification): + return details.expected, details.actual + if isinstance(details, dict): + try: + j = BaseEvaluatorJustification.model_validate(details) + except Exception: + return None + return j.expected, j.actual + return None + + +class PerClassMetrics(BaseModel): + """Per-class confusion counts plus the metric the evaluator computed.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + tp: int + tn: int + fp: int + fn: int + support: int + value: float + + +class ClassificationDetails(BaseModel): + """Structured details payload emitted by every classification evaluator.""" + + model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) + + metric: str + average: str + classes: list[str] + confusion_matrix: list[list[int]] + per_class: 
dict[str, PerClassMetrics] + micro: float + macro: float + n_total: int + n_scored: int + n_skipped: int + + +class _ConfusionData: + """Internal: confusion matrix and per-class counts derived from results.""" + + __slots__ = ("classes", "matrix", "n_total", "n_scored", "n_skipped") + + def __init__( + self, + classes: list[str], + matrix: list[list[int]], + n_total: int, + n_scored: int, + n_skipped: int, + ) -> None: + self.classes = classes + self.matrix = matrix + self.n_total = n_total + self.n_scored = n_scored + self.n_skipped = n_skipped + + def counts_for(self, class_index: int) -> tuple[int, int, int, int]: + """Return (tp, fp, fn, tn) for a class index.""" + k = len(self.classes) + tp = self.matrix[class_index][class_index] + fp = sum(self.matrix[class_index][j] for j in range(k)) - tp + fn = sum(self.matrix[j][class_index] for j in range(k)) - tp + tn = self.n_scored - tp - fp - fn + return tp, fp, fn, tn + + +def _build_confusion( + results: list[EvaluationResultDto], + classes: list[str], + case_sensitive: bool, +) -> _ConfusionData: + """Build a confusion matrix from per-datapoint results. + + Results without a parseable justification are counted in ``n_skipped`` and + omitted from the matrix. Pairs whose expected or actual label isn't in + ``classes`` are also skipped. 
+ """ + + def norm(label: str) -> str: + return label if case_sensitive else label.lower() + + canonical_classes = [norm(c) for c in classes] + index_of = {c: i for i, c in enumerate(canonical_classes)} + k = len(canonical_classes) + matrix = [[0] * k for _ in range(k)] + + n_total = len(results) + n_scored = 0 + n_skipped = 0 + + for r in results: + j = _coerce_justification(r.details) + if j is None: + n_skipped += 1 + continue + exp = norm(j[0]) + act = norm(j[1]) + if exp not in index_of or act not in index_of: + n_skipped += 1 + continue + matrix[index_of[act]][index_of[exp]] += 1 + n_scored += 1 + + return _ConfusionData( + classes=canonical_classes, + matrix=matrix, + n_total=n_total, + n_scored=n_scored, + n_skipped=n_skipped, + ) + + +def _precision_of(tp: int, fp: int, _fn: int, _tn: int) -> float: + return tp / (tp + fp) if (tp + fp) > 0 else 0.0 + + +def _recall_of(tp: int, _fp: int, fn: int, _tn: int) -> float: + return tp / (tp + fn) if (tp + fn) > 0 else 0.0 + + +def _f_score_of(beta: float): + beta_sq = beta * beta + + def compute(tp: int, fp: int, fn: int, _tn: int) -> float: + p = tp / (tp + fp) if (tp + fp) > 0 else 0.0 + r = tp / (tp + fn) if (tp + fn) > 0 else 0.0 + denom = beta_sq * p + r + return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0 + + return compute + + +def _build_details( + confusion: _ConfusionData, + metric_name: str, + average: str, + per_class_fn, +) -> tuple[ClassificationDetails, float]: + """Compute per-class values, micro, macro, and pick the headline. + + Returns (details, headline_score). ``headline_score`` is the micro or macro + average per the evaluator's ``average`` setting. 
+ """ + per_class: dict[str, PerClassMetrics] = {} + total_tp = 0 + total_fp = 0 + total_fn = 0 + + for c, label in enumerate(confusion.classes): + tp, fp, fn, tn = confusion.counts_for(c) + total_tp += tp + total_fp += fp + total_fn += fn + per_class[label] = PerClassMetrics( + tp=tp, + tn=tn, + fp=fp, + fn=fn, + support=tp + fn, + value=per_class_fn(tp, fp, fn, tn), + ) + + micro = per_class_fn(total_tp, total_fp, total_fn, 0) + + k = len(confusion.classes) + macro = sum(per_class[c].value for c in confusion.classes) / k if k > 0 else 0.0 + + details = ClassificationDetails( + metric=metric_name, + average=average, + classes=confusion.classes, + confusion_matrix=confusion.matrix, + per_class=per_class, + micro=micro, + macro=macro, + n_total=confusion.n_total, + n_scored=confusion.n_scored, + n_skipped=confusion.n_skipped, + ) + + headline = micro if average == "micro" else macro + return details, headline + + +# ─── configs ────────────────────────────────────────────────────────────────── + + +class _BaseClassificationConfig(BaseDatasetEvaluatorConfig): + """Shared config for the three classification evaluators.""" + + classes: list[str] = Field( + ..., + min_length=1, + description="Class labels expected in the upstream evaluator's justifications.", + ) + average: Literal["micro", "macro"] = "macro" + case_sensitive: bool = False + + +class PrecisionDatasetEvaluatorConfig(_BaseClassificationConfig): + """Configuration for the dataset-level precision evaluator.""" + + type: str = EvaluatorType.DATASET_PRECISION.value + + +class RecallDatasetEvaluatorConfig(_BaseClassificationConfig): + """Configuration for the dataset-level recall evaluator.""" + + type: str = EvaluatorType.DATASET_RECALL.value + + +class FScoreDatasetEvaluatorConfig(_BaseClassificationConfig): + """Configuration for the dataset-level F-score evaluator.""" + + type: str = EvaluatorType.DATASET_F_SCORE.value + f_value: float = Field(default=1.0, gt=0, description="Beta value for F_beta.") + + +# 
─── evaluators ─────────────────────────────────────────────────────────────── + + +class PrecisionDatasetEvaluator(BaseDatasetEvaluator[PrecisionDatasetEvaluatorConfig]): + """Dataset-level precision evaluator (multiclass, micro or macro averaged).""" + + @classmethod + def get_evaluator_id(cls) -> str: + """Identifier matching the type discriminator on configs.""" + return EvaluatorType.DATASET_PRECISION.value + + def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: + """Compute the precision report and return the headline as score.""" + confusion = _build_confusion( + results, self.config.classes, self.config.case_sensitive + ) + details, headline = _build_details( + confusion, "precision", self.config.average, _precision_of + ) + return NumericEvaluationResult(score=headline, details=details) + + +class RecallDatasetEvaluator(BaseDatasetEvaluator[RecallDatasetEvaluatorConfig]): + """Dataset-level recall evaluator (multiclass, micro or macro averaged).""" + + @classmethod + def get_evaluator_id(cls) -> str: + """Identifier matching the type discriminator on configs.""" + return EvaluatorType.DATASET_RECALL.value + + def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: + """Compute the recall report and return the headline as score.""" + confusion = _build_confusion( + results, self.config.classes, self.config.case_sensitive + ) + details, headline = _build_details( + confusion, "recall", self.config.average, _recall_of + ) + return NumericEvaluationResult(score=headline, details=details) + + +class FScoreDatasetEvaluator(BaseDatasetEvaluator[FScoreDatasetEvaluatorConfig]): + """Dataset-level F-beta evaluator (multiclass, micro or macro averaged).""" + + @classmethod + def get_evaluator_id(cls) -> str: + """Identifier matching the type discriminator on configs.""" + return EvaluatorType.DATASET_F_SCORE.value + + def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult: + """Compute the F-beta report 
and return the headline as score.""" + confusion = _build_confusion( + results, self.config.classes, self.config.case_sensitive + ) + details, headline = _build_details( + confusion, + "f_score", + self.config.average, + _f_score_of(self.config.f_value), + ) + return NumericEvaluationResult(score=headline, details=details) diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py new file mode 100644 index 000000000..8ba0dbe62 --- /dev/null +++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py @@ -0,0 +1,52 @@ +"""Factory that instantiates dataset-level evaluators from configuration.""" + +from __future__ import annotations + +from typing import Any + +from ..models.models import EvaluatorType +from .base_dataset_evaluator import BaseDatasetEvaluator +from .classification_dataset_evaluators import ( + FScoreDatasetEvaluator, + FScoreDatasetEvaluatorConfig, + PrecisionDatasetEvaluator, + PrecisionDatasetEvaluatorConfig, + RecallDatasetEvaluator, + RecallDatasetEvaluatorConfig, +) + +_EVALUATOR_REGISTRY: dict[str, type[BaseDatasetEvaluator[Any]]] = { + EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluator, + EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluator, + EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluator, +} + +_CONFIG_REGISTRY: dict[str, type[Any]] = { + EvaluatorType.DATASET_PRECISION.value: PrecisionDatasetEvaluatorConfig, + EvaluatorType.DATASET_RECALL.value: RecallDatasetEvaluatorConfig, + EvaluatorType.DATASET_F_SCORE.value: FScoreDatasetEvaluatorConfig, +} + + +def build_dataset_evaluator( + config_data: dict[str, Any], +) -> BaseDatasetEvaluator[Any]: + """Build a dataset evaluator instance from a parsed JSON config dict. + + Raises: + ValueError: If ``type`` is missing or unknown. 
+ """ + evaluator_type = config_data.get("type") + if not evaluator_type: + raise ValueError("Dataset evaluator config is missing required field 'type'") + + config_cls = _CONFIG_REGISTRY.get(evaluator_type) + evaluator_cls = _EVALUATOR_REGISTRY.get(evaluator_type) + if config_cls is None or evaluator_cls is None: + known = sorted(_EVALUATOR_REGISTRY.keys()) + raise ValueError( + f"Unknown dataset evaluator type '{evaluator_type}'. Known types: {known}" + ) + + config = config_cls.model_validate(config_data) + return evaluator_cls(config) diff --git a/packages/uipath/src/uipath/eval/helpers.py b/packages/uipath/src/uipath/eval/helpers.py index 8405e4a7a..fbe210a93 100644 --- a/packages/uipath/src/uipath/eval/helpers.py +++ b/packages/uipath/src/uipath/eval/helpers.py @@ -9,7 +9,9 @@ from uipath.runtime.schema import UiPathRuntimeSchema +from .evaluators.base_dataset_evaluator import BaseDatasetEvaluator from .evaluators.base_evaluator import GenericBaseEvaluator +from .evaluators.dataset_evaluator_factory import build_dataset_evaluator from .evaluators.evaluator_factory import EvaluatorFactory from .mocks._types import InputMockingStrategy, LLMMockingStrategy from .models._conversational_utils import UiPathLegacyEvalChatMessagesMapper @@ -280,6 +282,92 @@ async def load_evaluators( return evaluators + @staticmethod + async def load_dataset_evaluators( + eval_set_path: str, + evaluation_set: EvaluationSet, + ) -> list[BaseDatasetEvaluator[Any]]: + """Load dataset-level evaluators referenced by the evaluation set. + + Dataset evaluator config JSON files are expected to live under + ``/../dataset_evaluators/``, mirroring the evaluators + layout. Each config is matched to a reference by its top-level ``id``. + + Validates that every dataset evaluator's ``source_evaluator`` is one of + the per-datapoint evaluators declared on the eval set; raises if not. 
+ """ + if evaluation_set is None: + raise ValueError("eval_set cannot be None") + + dataset_ref_ids = { + ref.ref for ref in evaluation_set.dataset_evaluator_refs + } + if not dataset_ref_ids: + return [] + + dataset_dir = Path(eval_set_path).parent.parent / "dataset_evaluators" + if not dataset_dir.exists(): + raise ValueError( + f"Dataset evaluators directory not found at '{dataset_dir}', " + f"but evaluation set references dataset evaluators: " + f"{sorted(dataset_ref_ids)}" + ) + + # Build the set of per-datapoint evaluator names so we can validate + # source_evaluator references up front. + if evaluation_set.evaluator_configs: + known_evaluator_names = { + ref.ref for ref in evaluation_set.evaluator_configs + } + else: + known_evaluator_names = set(evaluation_set.evaluator_refs) + + dataset_evaluators: list[BaseDatasetEvaluator[Any]] = [] + found_ids: set[str] = set() + + for file in dataset_dir.glob("*.json"): + try: + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + except json.JSONDecodeError as e: + raise ValueError( + f"Invalid JSON in dataset evaluator file '{file}': {str(e)}." + ) from e + + evaluator_id = data.get("id") + if evaluator_id not in dataset_ref_ids: + continue + + try: + evaluator = build_dataset_evaluator(data) + except Exception as e: + raise ValueError( + f"Failed to create dataset evaluator from file '{file}': " + f"{str(e)}." + ) from e + + if ( + known_evaluator_names + and evaluator.source_evaluator not in known_evaluator_names + ): + raise ValueError( + f"Dataset evaluator '{evaluator.name}' references " + f"source_evaluator='{evaluator.source_evaluator}' which is " + f"not declared in this evaluation set. 
Known evaluators: " + f"{sorted(known_evaluator_names)}" + ) + + dataset_evaluators.append(evaluator) + found_ids.add(evaluator_id) + + missing = dataset_ref_ids - found_ids + if missing: + raise ValueError( + f"Could not find the following dataset evaluators: {missing}" + ) + + return dataset_evaluators + def get_agent_model(schema: UiPathRuntimeSchema) -> str | None: """Get agent model from the runtime schema metadata. diff --git a/packages/uipath/src/uipath/eval/models/evaluation_set.py b/packages/uipath/src/uipath/eval/models/evaluation_set.py index 22e6ce244..711fedeb9 100644 --- a/packages/uipath/src/uipath/eval/models/evaluation_set.py +++ b/packages/uipath/src/uipath/eval/models/evaluation_set.py @@ -145,6 +145,9 @@ class EvaluationSet(BaseModel): evaluator_configs: list[EvaluatorReference] = Field( default_factory=list, alias="evaluatorConfigs" ) + dataset_evaluator_refs: list[EvaluatorReference] = Field( + default_factory=list, alias="datasetEvaluatorRefs" + ) evaluations: list[EvaluationItem] = Field(default_factory=list) model_settings: list[EvaluationSetModelSettings] = Field( default_factory=list, alias="modelSettings" diff --git a/packages/uipath/src/uipath/eval/models/models.py b/packages/uipath/src/uipath/eval/models/models.py index d2dc26df9..f3c9b57e1 100644 --- a/packages/uipath/src/uipath/eval/models/models.py +++ b/packages/uipath/src/uipath/eval/models/models.py @@ -300,6 +300,9 @@ class EvaluatorType(str, Enum): TOOL_CALL_OUTPUT = "uipath-tool-call-output" BINARY_CLASSIFICATION = "uipath-binary-classification" MULTICLASS_CLASSIFICATION = "uipath-multiclass-classification" + DATASET_PRECISION = "uipath-dataset-precision" + DATASET_RECALL = "uipath-dataset-recall" + DATASET_F_SCORE = "uipath-dataset-f-score" class ToolCall(BaseModel): diff --git a/packages/uipath/src/uipath/eval/runtime/_types.py b/packages/uipath/src/uipath/eval/runtime/_types.py index 2aee5e599..fa84f0d9e 100644 --- a/packages/uipath/src/uipath/eval/runtime/_types.py +++ 
b/packages/uipath/src/uipath/eval/runtime/_types.py @@ -1,7 +1,7 @@ import logging from opentelemetry.sdk.trace import ReadableSpan -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from pydantic.alias_generators import to_camel from uipath.runtime import UiPathRuntimeResult @@ -78,6 +78,9 @@ class UiPathEvalOutput(BaseModel): evaluation_set_name: str evaluation_set_results: list[UiPathEvalRunResult] + dataset_evaluator_results: dict[str, EvaluationResultDto] = Field( + default_factory=dict + ) @property def score(self) -> float: diff --git a/packages/uipath/src/uipath/eval/runtime/context.py b/packages/uipath/src/uipath/eval/runtime/context.py index b8224718c..f3b713320 100644 --- a/packages/uipath/src/uipath/eval/runtime/context.py +++ b/packages/uipath/src/uipath/eval/runtime/context.py @@ -4,6 +4,7 @@ from uipath.runtime.schema import UiPathRuntimeSchema +from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator from ..evaluators.base_evaluator import GenericBaseEvaluator from ..models.evaluation_set import EvaluationSet @@ -27,3 +28,4 @@ class UiPathEvalContext: input_overrides: dict[str, Any] | None = None resume: bool = False job_id: str | None = None + dataset_evaluators: list[BaseDatasetEvaluator[Any]] | None = None diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py index 7f7614446..5cadcc527 100644 --- a/packages/uipath/src/uipath/eval/runtime/runtime.py +++ b/packages/uipath/src/uipath/eval/runtime/runtime.py @@ -45,6 +45,7 @@ from uipath.runtime.schema import UiPathRuntimeSchema from .._execution_context import ExecutionSpanCollector +from ..evaluators.base_dataset_evaluator import BaseDatasetEvaluator from ..evaluators.base_evaluator import GenericBaseEvaluator from ..evaluators.output_evaluator import OutputEvaluationCriteria from ..helpers import get_agent_model @@ -202,6 +203,43 @@ def compute_evaluator_scores( return 
final_score, agg_metrics_per_evaluator +def compute_dataset_evaluator_results( + evaluation_set_results: list[UiPathEvalRunResult], + dataset_evaluators: Iterable[BaseDatasetEvaluator[Any]], +) -> dict[str, EvaluationResultDto]: + """Run each dataset evaluator over its source evaluator's per-datapoint results. + + Args: + evaluation_set_results: Per-datapoint results from the run. + dataset_evaluators: Dataset-level evaluator instances. Each is routed to + the per-datapoint results from ``evaluator.source_evaluator``. + + Returns: + Dict mapping dataset evaluator name to its serialized EvaluationResultDto. + Dataset evaluators whose source produced no results are still invoked + with an empty list so they can emit a zeroed result. + """ + results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict( + list + ) + for eval_run_result in evaluation_set_results: + for eval_run_result_dto in eval_run_result.evaluation_run_results: + if eval_run_result_dto.is_line_result: + continue + results_by_evaluator[eval_run_result_dto.evaluator_name].append( + eval_run_result_dto.result + ) + + dataset_results: dict[str, EvaluationResultDto] = {} + for evaluator in dataset_evaluators: + source = evaluator.source_evaluator + evaluation_result = evaluator.evaluate(results_by_evaluator.get(source, [])) + dataset_results[evaluator.name] = EvaluationResultDto.from_evaluation_result( + evaluation_result + ) + return dataset_results + + class UiPathEvalRuntime: """Specialized runtime for evaluation runs, with access to the factory.""" @@ -381,6 +419,18 @@ async def execute(self) -> UiPathRuntimeResult: evaluators, ) + # Run any dataset-level evaluators configured on the eval + # set. Each consumes the per-datapoint results from one + # named source evaluator and emits a single run-level + # EvaluationResultDto stored on UiPathEvalOutput. 
+ if self.context.dataset_evaluators: + results.dataset_evaluator_results = ( + compute_dataset_evaluator_results( + results.evaluation_set_results, + self.context.dataset_evaluators, + ) + ) + # Configure span with output and metadata await configure_eval_set_run_span( span=span, diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py new file mode 100644 index 000000000..08d81818d --- /dev/null +++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py @@ -0,0 +1,411 @@ +"""Tests for dataset-level classification evaluators (Precision, Recall, FScore). + +Covers the math (2-class, 3-class, micro vs macro, F-beta), edge cases +(empty input, out-of-vocab labels, malformed details), and runtime-level +routing where compute_dataset_evaluator_results selects results by name. +""" + +import uuid + +import pytest + +from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification +from uipath.eval.evaluators.classification_dataset_evaluators import ( + ClassificationDetails, + FScoreDatasetEvaluator, + FScoreDatasetEvaluatorConfig, + PrecisionDatasetEvaluator, + PrecisionDatasetEvaluatorConfig, + RecallDatasetEvaluator, + RecallDatasetEvaluatorConfig, +) +from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator +from uipath.eval.models.models import ( + EvaluationResultDto, + EvaluatorType, + NumericEvaluationResult, +) +from uipath.eval.runtime._types import ( + UiPathEvalRunResult, + UiPathEvalRunResultDto, +) +from uipath.eval.runtime.runtime import compute_dataset_evaluator_results + + +def _result( + expected: str, actual: str, score: float | None = None +) -> EvaluationResultDto: + """Build an EvaluationResultDto carrying an expected/actual justification.""" + if score is None: + score = 1.0 if expected.lower() == actual.lower() else 0.0 + justification = 
BaseEvaluatorJustification(expected=expected, actual=actual) + return EvaluationResultDto( + score=score, + details=justification.model_dump(), + ) + + +def _precision(classes: list[str], average: str = "macro") -> PrecisionDatasetEvaluator: + return PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="p1", + name="precision", + source_evaluator="intent_match", + classes=classes, + average=average, # type: ignore[arg-type] + ) + ) + + +def _recall(classes: list[str], average: str = "macro") -> RecallDatasetEvaluator: + return RecallDatasetEvaluator( + RecallDatasetEvaluatorConfig( + id="r1", + name="recall", + source_evaluator="intent_match", + classes=classes, + average=average, # type: ignore[arg-type] + ) + ) + + +def _fscore( + classes: list[str], average: str = "macro", f_value: float = 1.0 +) -> FScoreDatasetEvaluator: + return FScoreDatasetEvaluator( + FScoreDatasetEvaluatorConfig( + id="f1", + name="fscore", + source_evaluator="intent_match", + classes=classes, + average=average, # type: ignore[arg-type] + f_value=f_value, + ) + ) + + +def _details(result: NumericEvaluationResult) -> ClassificationDetails: + """Type-narrowing helper for asserting on details.""" + assert isinstance(result.details, ClassificationDetails) + return result.details + + +class TestPrecisionEvaluator: + def test_empty_input_returns_zeroed_result(self) -> None: + result = _precision(["cat", "dog"]).evaluate([]) + assert isinstance(result, NumericEvaluationResult) + assert result.score == 0.0 + d = _details(result) + assert d.n_total == 0 and d.n_scored == 0 + assert d.confusion_matrix == [[0, 0], [0, 0]] + assert d.per_class["cat"].tp == 0 + assert d.per_class["cat"].tn == 0 + + def test_two_class_macro(self) -> None: + # 4 datapoints: 2 TP_yes, 1 FN_yes (predicted no), 1 FP_yes (predicted yes when expected no). 
+ results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("yes", "no"), # FN for yes, FP for no + _result("no", "yes"), # FP for yes, FN for no + ] + result = _precision(["yes", "no"], average="macro").evaluate(results) + d = _details(result) + # precision_yes = 2 / (2 + 1) = 2/3 + # precision_no = 0 / (0 + 1) = 0 + # macro = (2/3 + 0) / 2 = 1/3 + assert d.per_class["yes"].value == pytest.approx(2 / 3) + assert d.per_class["no"].value == pytest.approx(0.0) + assert d.macro == pytest.approx((2 / 3 + 0.0) / 2) + assert result.score == pytest.approx(d.macro) + + def test_two_class_micro_equals_accuracy(self) -> None: + results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("yes", "no"), + _result("no", "yes"), + ] + result = _precision(["yes", "no"], average="micro").evaluate(results) + d = _details(result) + # micro precision = sum(TP) / sum(TP + FP) + # sum(TP) = 2 (yes diag) + 0 (no diag) = 2 + # sum(FP) = 1 (yes off-diag row) + 1 (no off-diag row) = 2 + # micro = 2 / (2 + 2) = 0.5 — equals accuracy 2/4 in the 2-class case + assert d.micro == pytest.approx(0.5) + assert result.score == pytest.approx(0.5) + + def test_three_class_macro(self) -> None: + # Each class gets 2 TP, 1 FP, 1 FN — symmetric setup + pairs = [ + ("cat", "cat"), + ("cat", "cat"), + ("cat", "dog"), # FN_cat, FP_dog + ("dog", "dog"), + ("dog", "dog"), + ("dog", "bird"), # FN_dog, FP_bird + ("bird", "bird"), + ("bird", "bird"), + ("bird", "cat"), # FN_bird, FP_cat + ] + result = _precision(["cat", "dog", "bird"], average="macro").evaluate( + [_result(e, a) for e, a in pairs] + ) + d = _details(result) + # per-class precision = 2 / (2 + 1) = 2/3 for all three + for label in ("cat", "dog", "bird"): + m = d.per_class[label] + assert m.tp == 2 and m.fp == 1 and m.fn == 1 and m.tn == 5 + assert m.value == pytest.approx(2 / 3) + assert d.macro == pytest.approx(2 / 3) + assert result.score == pytest.approx(2 / 3) + + +class TestRecallEvaluator: + def 
test_two_class_macro(self) -> None: + results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("yes", "no"), + _result("no", "yes"), + ] + result = _recall(["yes", "no"], average="macro").evaluate(results) + d = _details(result) + # recall_yes = TP / (TP + FN) = 2 / (2 + 1) = 2/3 + # recall_no = 0 / (0 + 1) = 0 + # macro = 1/3 + assert d.per_class["yes"].value == pytest.approx(2 / 3) + assert d.per_class["no"].value == pytest.approx(0.0) + assert result.score == pytest.approx(1 / 3) + + def test_recall_differs_from_precision(self) -> None: + # Asymmetric example so precision != recall. + results = [ + _result("yes", "yes"), # TP + _result("yes", "yes"), # TP + _result("no", "yes"), # FP for yes + _result("no", "yes"), # FP for yes + _result("no", "no"), # TP for no + ] + p = _details(_precision(["yes", "no"], average="macro").evaluate(results)) + r = _details(_recall(["yes", "no"], average="macro").evaluate(results)) + # precision_yes = 2/(2+2)=0.5, precision_no = 1/(1+0)=1.0 + assert p.per_class["yes"].value == pytest.approx(0.5) + assert p.per_class["no"].value == pytest.approx(1.0) + # recall_yes = 2/(2+0)=1.0, recall_no = 1/(1+2)=1/3 + assert r.per_class["yes"].value == pytest.approx(1.0) + assert r.per_class["no"].value == pytest.approx(1 / 3) + + +class TestFScoreEvaluator: + def test_f1_equals_harmonic_mean_of_p_and_r(self) -> None: + results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("yes", "no"), + _result("no", "yes"), + ] + f = _details( + _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results) + ) + # precision_yes = 2/3, recall_yes = 2/3 -> F1_yes = 2/3 + # precision_no = 0, recall_no = 0 -> F1_no = 0 + assert f.per_class["yes"].value == pytest.approx(2 / 3) + assert f.per_class["no"].value == pytest.approx(0.0) + assert f.macro == pytest.approx((2 / 3 + 0.0) / 2) + + def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None: + # Asymmetric setup: precision_yes = 0.5, recall_yes = 1.0. 
+ results = [ + _result("yes", "yes"), + _result("yes", "yes"), + _result("no", "yes"), + _result("no", "yes"), + _result("no", "no"), + ] + f1 = _details( + _fscore(["yes", "no"], average="macro", f_value=1.0).evaluate(results) + ) + f2 = _details( + _fscore(["yes", "no"], average="macro", f_value=2.0).evaluate(results) + ) + # F_beta with beta>1 weighs recall higher. Since recall_yes > precision_yes, + # F2_yes should be > F1_yes. + assert f2.per_class["yes"].value > f1.per_class["yes"].value + + def test_three_class_micro_pools_across_classes(self) -> None: + # Same symmetric setup as the precision macro test. + pairs = [ + ("cat", "cat"), + ("cat", "cat"), + ("cat", "dog"), + ("dog", "dog"), + ("dog", "dog"), + ("dog", "bird"), + ("bird", "bird"), + ("bird", "bird"), + ("bird", "cat"), + ] + d = _details( + _fscore(["cat", "dog", "bird"], average="micro", f_value=1.0).evaluate( + [_result(e, a) for e, a in pairs] + ) + ) + # micro precision == micro recall == 6/9 (accuracy when each off-diag + # contributes once to FP and once to FN globally). micro F1 = 6/9. 
+ assert d.micro == pytest.approx(6 / 9) + + +class TestSkippingAndEdgeCases: + def test_out_of_vocab_labels_are_skipped(self) -> None: + results = [ + _result("cat", "cat"), + _result("cat", "platypus"), # actual not in classes + _result("zebra", "dog"), # expected not in classes + ] + d = _details(_precision(["cat", "dog"]).evaluate(results)) + assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2 + + def test_results_without_justification_are_skipped(self) -> None: + results = [ + _result("cat", "cat"), + EvaluationResultDto(score=1.0, details="just a string"), + EvaluationResultDto(score=0.0, details={"unrelated": "shape"}), + ] + d = _details(_precision(["cat", "dog"]).evaluate(results)) + assert d.n_total == 3 and d.n_scored == 1 and d.n_skipped == 2 + + def test_case_insensitive_by_default(self) -> None: + results = [_result("Cat", "CAT"), _result("DOG", "dog")] + d = _details(_precision(["cat", "dog"]).evaluate(results)) + assert d.per_class["cat"].tp == 1 + assert d.per_class["dog"].tp == 1 + + +class TestFactory: + def test_builds_evaluator_from_dict(self) -> None: + config_data = { + "id": "precision_intent", + "name": "precision_intent", + "type": EvaluatorType.DATASET_PRECISION.value, + "sourceEvaluator": "intent_match", + "classes": ["yes", "no"], + "average": "macro", + } + evaluator = build_dataset_evaluator(config_data) + assert isinstance(evaluator, PrecisionDatasetEvaluator) + assert evaluator.source_evaluator == "intent_match" + assert evaluator.name == "precision_intent" + + def test_unknown_type_raises(self) -> None: + with pytest.raises(ValueError, match="Unknown dataset evaluator type"): + build_dataset_evaluator( + { + "id": "x", + "name": "x", + "type": "uipath-not-a-thing", + "sourceEvaluator": "intent_match", + "classes": ["yes", "no"], + } + ) + + def test_missing_type_raises(self) -> None: + with pytest.raises(ValueError, match="missing required field 'type'"): + build_dataset_evaluator( + { + "id": "x", + "name": "x", + 
"sourceEvaluator": "intent_match", + "classes": ["yes", "no"], + } + ) + + +class TestComputeDatasetEvaluatorResults: + """End-to-end: dataset evaluator picks results by source_evaluator name.""" + + def test_routes_to_correct_source_and_ignores_others(self) -> None: + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "yes"), + ), + UiPathEvalRunResultDto( + evaluator_name="some_other_evaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=0.5), + ), + ], + ), + UiPathEvalRunResult( + evaluation_name="dp2", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "no"), + ), + ], + ), + ] + + out = compute_dataset_evaluator_results( + eval_results, [_precision(["yes", "no"], average="macro")] + ) + assert set(out) == {"precision"} + dto = out["precision"] + assert isinstance(dto, EvaluationResultDto) + # The unrelated 0.5 score from some_other_evaluator must NOT be in the + # matrix — only the two intent_match results count. 
+ assert isinstance(dto.details, dict) + assert dto.details["n_scored"] == 2 + + def test_line_by_line_subresults_are_excluded(self) -> None: + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("yes", "yes"), + is_line_result=True, + ), + UiPathEvalRunResultDto( + evaluator_name="intent_match", + evaluator_id=str(uuid.uuid4()), + result=_result("no", "no"), + ), + ], + ), + ] + out = compute_dataset_evaluator_results( + eval_results, [_precision(["yes", "no"])] + ) + assert isinstance(out["precision"].details, dict) + assert out["precision"].details["n_scored"] == 1 + + def test_source_with_no_results_produces_zeroed_report(self) -> None: + eval_results = [ + UiPathEvalRunResult( + evaluation_name="dp1", + evaluation_run_results=[ + UiPathEvalRunResultDto( + evaluator_name="some_other_evaluator", + evaluator_id=str(uuid.uuid4()), + result=EvaluationResultDto(score=1.0), + ), + ], + ), + ] + out = compute_dataset_evaluator_results( + eval_results, [_precision(["yes", "no"])] + ) + dto = out["precision"] + assert dto.score == 0.0 + assert isinstance(dto.details, dict) + assert dto.details["n_scored"] == 0 From d6b7ab5566d07a9e34611358a4b7539912982936 Mon Sep 17 00:00:00 2001 From: ajay-kesavan Date: Wed, 20 May 2026 16:14:00 -0700 Subject: [PATCH 2/2] docs(eval): add runnable dataset evaluator demo + bump uv.lock for 2.10.69 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit examples/dataset_evaluators_demo.py walks the new dataset-level evaluators (Precision / Recall / F-score) through five scenarios that exercise the math end-to-end at the SDK layer: 1. Balanced 3-class — symmetric confusion matrix, macro == micro 2. Imbalanced 2-class — shows where macro and micro diverge 3. 
Same data, four metrics (Precision, Recall, F1, F2) — proves the F-beta knob actually moves per-class numbers 4. Out-of-vocab + malformed details — n_skipped surfaces, no silent drops 5. Realistic 4-class intent classifier — uneven per-class performance Each scenario prints the confusion matrix as a table and the per-class TP/TN/FP/FN + the metric; Scenario 1 additionally prints a snippet of the wire JSON that AutoMapper will surface to the frontend. Run:: cd packages/uipath && uv run python examples/dataset_evaluators_demo.py uv.lock reflects the pyproject.toml version bump (2.10.68 -> 2.10.69) already in this PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../examples/dataset_evaluators_demo.py | 359 ++++++++++++++++++ packages/uipath/uv.lock | 4 +- 2 files changed, 361 insertions(+), 2 deletions(-) create mode 100644 packages/uipath/examples/dataset_evaluators_demo.py diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py new file mode 100644 index 000000000..a8f80858d --- /dev/null +++ b/packages/uipath/examples/dataset_evaluators_demo.py @@ -0,0 +1,359 @@ +"""Runnable proof that the dataset-level evaluators work on realistic data. + +Five scenarios exercise the framework end-to-end at the SDK layer (no +worker, no backend). Each prints the headline score plus a confusion +matrix table, so the math is inspectable rather than a passing-test +binary signal. 
+ +Run:: + + cd packages/uipath + uv run python examples/dataset_evaluators_demo.py +""" + +from __future__ import annotations + +import json +from typing import Iterable + +from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification +from uipath.eval.evaluators.classification_dataset_evaluators import ( + ClassificationDetails, + FScoreDatasetEvaluator, + FScoreDatasetEvaluatorConfig, + PrecisionDatasetEvaluator, + PrecisionDatasetEvaluatorConfig, + RecallDatasetEvaluator, + RecallDatasetEvaluatorConfig, +) +from uipath.eval.models.models import EvaluationResultDto, NumericEvaluationResult + + +# ─── helpers ────────────────────────────────────────────────────────────────── + + +def make_result(expected: str, actual: str) -> EvaluationResultDto: + """Build a single per-datapoint EvaluationResultDto. + + Models what an upstream ExactMatch evaluator would produce after running + on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with + the expected/actual labels carried in the justification. 
+ """ + score = 1.0 if expected.lower() == actual.lower() else 0.0 + justification = BaseEvaluatorJustification(expected=expected, actual=actual) + return EvaluationResultDto(score=score, details=justification.model_dump()) + + +def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]: + return [make_result(e, a) for e, a in pairs] + + +def print_header(title: str) -> None: + print() + print("═" * 78) + print(f" {title}") + print("═" * 78) + + +def print_confusion(details: ClassificationDetails) -> None: + """Pretty-print the confusion matrix as a table.""" + classes = details.classes + cell_width = max(7, max(len(c) for c in classes) + 1) + header = " " * cell_width + " │ " + " │ ".join(c.center(cell_width) for c in classes) + " │ ← expected" + print(header) + print("─" * len(header)) + for predicted_idx, predicted_label in enumerate(classes): + row_cells = [ + str(details.confusion_matrix[predicted_idx][expected_idx]).rjust(cell_width) + for expected_idx in range(len(classes)) + ] + print(predicted_label.ljust(cell_width) + " │ " + " │ ".join(row_cells) + " │") + print(" " * cell_width + "↑ predicted") + + +def print_per_class(details: ClassificationDetails) -> None: + """One-row-per-class table of TP/TN/FP/FN + the metric.""" + label_w = max(len("class"), max(len(c) for c in details.classes)) + metric = details.metric + header = f" {'class'.ljust(label_w)} │ TP TN FP FN support {metric}" + print(header) + print(" " + "─" * (len(header) - 2)) + for cls, m in details.per_class.items(): + print( + f" {cls.ljust(label_w)} │ " + f"{m.tp:>2} {m.tn:>2} {m.fp:>2} {m.fn:>2} {m.support:>7} " + f"{m.value:.3f}" + ) + + +def report( + title: str, + result: NumericEvaluationResult, + *, + show_json_tail: bool = False, +) -> None: + """Render one scenario's result block.""" + print_header(title) + assert isinstance(result.details, ClassificationDetails) + d = result.details + print( + f" metric = {d.metric} average = {d.average} " + f"score 
(headline) = {result.score:.4f}" + ) + print( + f" micro = {d.micro:.4f} macro = {d.macro:.4f} " + f"scored = {d.n_scored}/{d.n_total} skipped = {d.n_skipped}" + ) + print() + print_confusion(d) + print() + print_per_class(d) + if show_json_tail: + print() + print(" ── wire JSON (matches frontend zod schema) ──") + # Just show a snippet to keep output focused. + payload = d.model_dump(by_alias=True) + print( + " " + + json.dumps( + {k: payload[k] for k in ("metric", "average", "micro", "macro")}, + indent=2, + ).replace("\n", "\n ") + ) + + +# ─── scenarios ──────────────────────────────────────────────────────────────── + + +def scenario_1_balanced_three_class() -> None: + """Intent recognition over book/cancel/reschedule. Every class gets 2 right, 1 wrong.""" + pairs = [ + ("book", "book"), + ("book", "book"), + ("book", "cancel"), # FN_book, FP_cancel + ("cancel", "cancel"), + ("cancel", "cancel"), + ("cancel", "reschedule"), # FN_cancel, FP_reschedule + ("reschedule", "reschedule"), + ("reschedule", "reschedule"), + ("reschedule", "book"), # FN_reschedule, FP_book + ] + results = materialize_pairs(pairs) + evaluator = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="precision_intent", + name="precision_intent", + source_evaluator="intent_match", + classes=["book", "cancel", "reschedule"], + average="macro", + ) + ) + report( + "Scenario 1 — Balanced 3-class (intent recognition)\n" + " Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3.", + evaluator.evaluate(results), + show_json_tail=True, + ) + + +def scenario_2_imbalanced_two_class() -> None: + """Rare-positive case — why macro vs micro matters. + + 20 datapoints. Only 4 are actually positive (the rare class). A weak + classifier could trivially get high accuracy by predicting "negative" + everywhere — micro precision masks that, macro doesn't. + """ + pairs: list[tuple[str, str]] = [] + # 13 true negatives where the classifier said "negative" (correct). 
+ pairs += [("negative", "negative")] * 13 + # 3 false positives — classifier hallucinated "positive" on actual negatives. + pairs += [("negative", "positive")] * 3 + # 2 true positives. + pairs += [("positive", "positive")] * 2 + # 2 false negatives — classifier missed real positives. + pairs += [("positive", "negative")] * 2 + + results = materialize_pairs(pairs) + classes = ["positive", "negative"] + + macro = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="p_macro", + name="precision (macro)", + source_evaluator="positive_match", + classes=classes, + average="macro", + ) + ) + micro = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="p_micro", + name="precision (micro)", + source_evaluator="positive_match", + classes=classes, + average="micro", + ) + ) + report( + "Scenario 2a — Imbalanced 2-class, MACRO precision\n" + " Rare positive class. Macro averages per-class, so the rare class\n" + " having precision = 2/(2+3) = 0.40 drags the score down.", + macro.evaluate(results), + ) + report( + "Scenario 2b — Same data, MICRO precision\n" + " Pools TP/FP across classes. 
In a 2-class case this equals accuracy.\n" + " Notice macro << micro — that's the bias you'd miss with micro alone.", + micro.evaluate(results), + ) + + +def scenario_3_precision_vs_recall_vs_f() -> None: + """Same dataset, three different metrics — show they diverge on asymmetric data.""" + pairs = [ + ("yes", "yes"), + ("yes", "yes"), + ("no", "yes"), # FP for yes + ("no", "yes"), # FP for yes + ("no", "no"), + ("no", "no"), + ("yes", "no"), # FN for yes + ] + results = materialize_pairs(pairs) + classes = ["yes", "no"] + + p = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="p", + name="precision", + source_evaluator="yes_match", + classes=classes, + average="macro", + ) + ) + r = RecallDatasetEvaluator( + RecallDatasetEvaluatorConfig( + id="r", + name="recall", + source_evaluator="yes_match", + classes=classes, + average="macro", + ) + ) + f1 = FScoreDatasetEvaluator( + FScoreDatasetEvaluatorConfig( + id="f1", + name="f1", + source_evaluator="yes_match", + classes=classes, + average="macro", + f_value=1.0, + ) + ) + f2 = FScoreDatasetEvaluator( + FScoreDatasetEvaluatorConfig( + id="f2", + name="f2", + source_evaluator="yes_match", + classes=classes, + average="macro", + f_value=2.0, + ) + ) + report( + "Scenario 3a — Precision on a recall-favourable dataset", + p.evaluate(results), + ) + report( + "Scenario 3b — Recall (same data — note 'yes' recall is 1.0)", + r.evaluate(results), + ) + report( + "Scenario 3c — F1 (harmonic mean of P and R)", + f1.evaluate(results), + ) + report( + "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)", + f2.evaluate(results), + ) + + +def scenario_4_skipped_datapoints() -> None: + """Show how malformed / out-of-vocab data is reported, not silently dropped.""" + results = [ + make_result("cat", "cat"), + make_result("dog", "dog"), + make_result("cat", "platypus"), # actual not in classes → skipped + make_result("zebra", "cat"), # expected not in classes → skipped + 
EvaluationResultDto(score=1.0, details="bare string — no justification"), + EvaluationResultDto(score=0.0, details={"unrelated": "shape"}), + ] + evaluator = PrecisionDatasetEvaluator( + PrecisionDatasetEvaluatorConfig( + id="precision_robustness", + name="precision_robustness", + source_evaluator="any_match", + classes=["cat", "dog"], + average="macro", + ) + ) + report( + "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n" + " 6 datapoints in, 2 scored, 4 skipped. Skip counts surface in the\n" + " report so you can tell whether a low score is a real signal or\n" + " just sparse data.", + evaluator.evaluate(results), + ) + + +def scenario_5_realistic_intent_classifier() -> None: + """A larger, more interesting 4-class dataset — uneven per-class performance.""" + pairs = [ + # 'book' is easy: classifier handles it well + *[("book", "book")] * 10, + ("book", "cancel"), + # 'cancel' is medium: a few errors + *[("cancel", "cancel")] * 6, + ("cancel", "book"), + ("cancel", "modify"), + # 'reschedule' is hard: classifier confuses it with 'modify' + ("reschedule", "reschedule"), + ("reschedule", "reschedule"), + ("reschedule", "modify"), + ("reschedule", "modify"), + # 'modify' is rare: only 2 cases, classifier gets one + ("modify", "modify"), + ("modify", "reschedule"), + ] + results = materialize_pairs(pairs) + classes = ["book", "cancel", "reschedule", "modify"] + macro_f1 = FScoreDatasetEvaluator( + FScoreDatasetEvaluatorConfig( + id="f1_4class", + name="f1_4class", + source_evaluator="intent_match", + classes=classes, + average="macro", + f_value=1.0, + ) + ) + report( + "Scenario 5 — Realistic 4-class intent classifier\n" + " Uneven per-class performance. 
Macro F1 surfaces 'reschedule' and\n" + " 'modify' weakness; micro F1 would have hidden it under 'book' wins.", + macro_f1.evaluate(results), + ) + + +def main() -> None: + scenario_1_balanced_three_class() + scenario_2_imbalanced_two_class() + scenario_3_precision_vs_recall_vs_f() + scenario_4_skipped_datapoints() + scenario_5_realistic_intent_classifier() + print() + print("Done. All scenarios computed from real evaluator code.") + + +if __name__ == "__main__": + main() diff --git a/packages/uipath/uv.lock b/packages/uipath/uv.lock index 41ae12119..19b0d047b 100644 --- a/packages/uipath/uv.lock +++ b/packages/uipath/uv.lock @@ -3,7 +3,7 @@ revision = 3 requires-python = ">=3.11" [options] -exclude-newer = "2026-05-17T17:25:34.9197064Z" +exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. exclude-newer-span = "P2D" [options.exclude-newer-package] @@ -2552,7 +2552,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.10.68" +version = "2.10.69" source = { editable = "." } dependencies = [ { name = "applicationinsights" },