From 6b11767d30fb08969146d4bb58ac8570cc20c34f Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Tue, 19 May 2026 17:54:34 -0700
Subject: [PATCH 1/2] feat(eval): add evaluator type schemas for classification
 evaluators

Generates BinaryClassificationEvaluator.json and MulticlassClassificationEvaluator.json
from the new evaluators added in #1397 so external tooling (Flow UI evaluator
picker, `uip maestro flow eval`) can read the config / criteria / justification
schemas.

Files produced by `python -m uipath.eval.evaluators_types.generate_types`,
restricted to the two new evaluator types. A companion PR refreshes the other
11 stale schemas in evaluators_types/.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../BinaryClassificationEvaluator.json        | 121 ++++++++++++++++
 .../MulticlassClassificationEvaluator.json    | 133 ++++++++++++++++++
 2 files changed, 254 insertions(+)
 create mode 100644 packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
 create mode 100644 packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json

diff --git a/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
new file mode 100644
index 000000000..9f7351865
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators_types/BinaryClassificationEvaluator.json
@@ -0,0 +1,121 @@
+{
+  "evaluatorTypeId": "uipath-binary-classification",
+  "evaluatorConfigSchema": {
+    "$defs": {
+      "BinaryClassificationEvaluationCriteria": {
+        "description": "Per-datapoint criteria: which class this sample should belong to.",
+        "properties": {
+          "expected_class": {
+            "title": "Expected Class",
+            "type": "string"
+          }
+        },
+        "required": [
+          "expected_class"
+        ],
+        "title": "BinaryClassificationEvaluationCriteria",
+        "type": "object"
+      }
+    },
+    "description": "Configuration for the binary classification evaluator.",
+    "properties": {
+      "name": {
+        "default": "BinaryClassificationEvaluator",
+        "title": "Name",
+        "type": "string"
+      },
+      "description": {
+        "default": "",
+        "description": "The description of the evaluator",
+        "title": "Description",
+        "type": "string"
+      },
+      "default_evaluation_criteria": {
+        "anyOf": [
+          {
+            "$ref": "#/$defs/BinaryClassificationEvaluationCriteria"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null
+      },
+      "target_output_key": {
+        "default": "*",
+        "description": "Key to extract output from agent execution",
+        "title": "Target Output Key",
+        "type": "string"
+      },
+      "line_by_line_evaluator": {
+        "default": false,
+        "description": "If True, split output by delimiter and evaluate each line separately",
+        "title": "Line By Line Evaluator",
+        "type": "boolean"
+      },
+      "line_delimiter": {
+        "default": "\n",
+        "description": "Delimiter to split output when line_by_line_evaluator is True",
+        "title": "Line Delimiter",
+        "type": "string"
+      },
+      "positive_class": {
+        "title": "Positive Class",
+        "type": "string"
+      },
+      "metric_type": {
+        "default": "precision",
+        "enum": [
+          "precision",
+          "recall",
+          "f-score"
+        ],
+        "title": "Metric Type",
+        "type": "string"
+      },
+      "f_value": {
+        "default": 1.0,
+        "title": "F Value",
+        "type": "number"
+      }
+    },
+    "required": [
+      "positive_class"
+    ],
+    "title": "BinaryClassificationEvaluatorConfig",
+    "type": "object"
+  },
+  "evaluationCriteriaSchema": {
+    "description": "Per-datapoint criteria: which class this sample should belong to.",
+    "properties": {
+      "expected_class": {
+        "title": "Expected Class",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected_class"
+    ],
+    "title": "BinaryClassificationEvaluationCriteria",
+    "type": "object"
+  },
+  "justificationSchema": {
+    "description": "Base class for all evaluator justifications.",
+    "properties": {
+      "expected": {
+        "title": "Expected",
+        "type": "string"
+      },
+      "actual": {
+        "title": "Actual",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected",
+      "actual"
+    ],
+    "title": "BaseEvaluatorJustification",
+    "type": "object"
+  }
+}
\ No newline at end of file
diff --git a/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
new file mode 100644
index 000000000..72262ba92
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators_types/MulticlassClassificationEvaluator.json
@@ -0,0 +1,133 @@
+{
+  "evaluatorTypeId": "uipath-multiclass-classification",
+  "evaluatorConfigSchema": {
+    "$defs": {
+      "MulticlassClassificationEvaluationCriteria": {
+        "description": "Per-datapoint criteria: which class this sample should belong to.",
+        "properties": {
+          "expected_class": {
+            "title": "Expected Class",
+            "type": "string"
+          }
+        },
+        "required": [
+          "expected_class"
+        ],
+        "title": "MulticlassClassificationEvaluationCriteria",
+        "type": "object"
+      }
+    },
+    "description": "Configuration for the multiclass classification evaluator.",
+    "properties": {
+      "name": {
+        "default": "MulticlassClassificationEvaluator",
+        "title": "Name",
+        "type": "string"
+      },
+      "description": {
+        "default": "",
+        "description": "The description of the evaluator",
+        "title": "Description",
+        "type": "string"
+      },
+      "default_evaluation_criteria": {
+        "anyOf": [
+          {
+            "$ref": "#/$defs/MulticlassClassificationEvaluationCriteria"
+          },
+          {
+            "type": "null"
+          }
+        ],
+        "default": null
+      },
+      "target_output_key": {
+        "default": "*",
+        "description": "Key to extract output from agent execution",
+        "title": "Target Output Key",
+        "type": "string"
+      },
+      "line_by_line_evaluator": {
+        "default": false,
+        "description": "If True, split output by delimiter and evaluate each line separately",
+        "title": "Line By Line Evaluator",
+        "type": "boolean"
+      },
+      "line_delimiter": {
+        "default": "\n",
+        "description": "Delimiter to split output when line_by_line_evaluator is True",
+        "title": "Line Delimiter",
+        "type": "string"
+      },
+      "classes": {
+        "items": {
+          "type": "string"
+        },
+        "title": "Classes",
+        "type": "array"
+      },
+      "metric_type": {
+        "default": "f-score",
+        "enum": [
+          "precision",
+          "recall",
+          "f-score"
+        ],
+        "title": "Metric Type",
+        "type": "string"
+      },
+      "averaging": {
+        "default": "macro",
+        "enum": [
+          "micro",
+          "macro"
+        ],
+        "title": "Averaging",
+        "type": "string"
+      },
+      "f_value": {
+        "default": 1.0,
+        "title": "F Value",
+        "type": "number"
+      }
+    },
+    "required": [
+      "classes"
+    ],
+    "title": "MulticlassClassificationEvaluatorConfig",
+    "type": "object"
+  },
+  "evaluationCriteriaSchema": {
+    "description": "Per-datapoint criteria: which class this sample should belong to.",
+    "properties": {
+      "expected_class": {
+        "title": "Expected Class",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected_class"
+    ],
+    "title": "MulticlassClassificationEvaluationCriteria",
+    "type": "object"
+  },
+  "justificationSchema": {
+    "description": "Base class for all evaluator justifications.",
+    "properties": {
+      "expected": {
+        "title": "Expected",
+        "type": "string"
+      },
+      "actual": {
+        "title": "Actual",
+        "type": "string"
+      }
+    },
+    "required": [
+      "expected",
+      "actual"
+    ],
+    "title": "BaseEvaluatorJustification",
+    "type": "object"
+  }
+}
\ No newline at end of file

From 037b60cdb6e721c494b2b4fd173e6bf1bdb450ed Mon Sep 17 00:00:00 2001
From: ajay-kesavan <ajay.kesavan@uipath.com>
Date: Tue, 19 May 2026 18:27:58 -0700
Subject: [PATCH 2/2] test(eval): add e2e tests + sample projects for
 classification evaluators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two sample projects under packages/uipath/samples/ that double as
end-to-end test fixtures for the binary and multiclass classification
evaluators added in #1397:

- binary_classification_agent — rule-based spam/ham classifier wired up
  to the binary classification evaluator with metric_type=precision.
  Eval set is designed so 4/5 datapoints pass but precision is 2/3
  because of one deliberate false positive.
- multiclass_classification_simple — rule-based 3-class router (payments
  / support / spam) wired up to the multiclass classification evaluator
  with macro-averaged F1. Eval set forces a misroute that hurts both
  payments precision and support recall, giving macro F1 = 26/30.

Adds tests/cli/eval/test_classification_samples_e2e.py which loads each
sample's eval-sets/default.json, wires its main.py into a stand-in runtime,
calls evaluate(), and asserts both the per-row scores and the aggregated
metric produced by reduce_scores. Locks in the dataset-level math, not just
per-row correct/incorrect.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../binary_classification_agent/bindings.json |   4 +
 .../evaluations/eval-sets/default.json        |  63 ++++++
 .../evaluators/binary-classification.json     |  16 ++
 .../binary_classification_agent/main.py       |  39 ++++
 .../pyproject.toml                            |   9 +
 .../binary_classification_agent/uipath.json   |   5 +
 .../bindings.json                             |   4 +
 .../evaluations/eval-sets/default.json        |  85 ++++++++
 .../evaluators/multiclass-classification.json |  17 ++
 .../multiclass_classification_simple/main.py  |  51 +++++
 .../pyproject.toml                            |   9 +
 .../uipath.json                               |   5 +
 .../eval/test_classification_samples_e2e.py   | 193 ++++++++++++++++++
 13 files changed, 500 insertions(+)
 create mode 100644 packages/uipath/samples/binary_classification_agent/bindings.json
 create mode 100644 packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
 create mode 100644 packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
 create mode 100644 packages/uipath/samples/binary_classification_agent/main.py
 create mode 100644 packages/uipath/samples/binary_classification_agent/pyproject.toml
 create mode 100644 packages/uipath/samples/binary_classification_agent/uipath.json
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/bindings.json
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/main.py
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/pyproject.toml
 create mode 100644 packages/uipath/samples/multiclass_classification_simple/uipath.json
 create mode 100644 packages/uipath/tests/cli/eval/test_classification_samples_e2e.py

diff --git a/packages/uipath/samples/binary_classification_agent/bindings.json b/packages/uipath/samples/binary_classification_agent/bindings.json
new file mode 100644
index 000000000..5e9beeb01
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
new file mode 100644
index 000000000..f47cd25b8
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/evaluations/eval-sets/default.json
@@ -0,0 +1,63 @@
+{
+  "version": "1.0",
+  "id": "SpamBinaryEval",
+  "name": "Binary spam classifier — precision",
+  "evaluatorRefs": ["BinarySpamPrecision"],
+  "evaluations": [
+    {
+      "id": "spam-prize",
+      "name": "Spam: prize giveaway",
+      "inputs": {
+        "email_subject": "You won a FREE iPhone!!!",
+        "email_body": "Congratulations! Click here to claim your prize now."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "spam-promo",
+      "name": "Spam: unsolicited promo",
+      "inputs": {
+        "email_subject": "Winner of the monthly drawing",
+        "email_body": "You've been selected. Click here to redeem."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "ham-invoice",
+      "name": "Ham: legitimate invoice",
+      "inputs": {
+        "email_subject": "Your March invoice is ready",
+        "email_body": "Your monthly invoice of $45.99 is attached. Payment is due March 15."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    },
+    {
+      "id": "ham-meeting",
+      "name": "Ham: meeting request",
+      "inputs": {
+        "email_subject": "Sync on Q2 planning",
+        "email_body": "Can we meet Wednesday at 2pm to align on next quarter's roadmap?"
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    },
+    {
+      "id": "ham-mislabeled",
+      "name": "Ham mislabeled as spam (forces a false positive)",
+      "inputs": {
+        "email_subject": "Free coffee in the break room!!!",
+        "email_body": "Just a heads up — the new espresso machine is set up."
+      },
+      "evaluationCriterias": {
+        "BinarySpamPrecision": { "expectedClass": "ham" }
+      }
+    }
+  ]
+}
diff --git a/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
new file mode 100644
index 000000000..21f7d6850
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/evaluations/evaluators/binary-classification.json
@@ -0,0 +1,16 @@
+{
+  "version": "1.0",
+  "id": "BinarySpamPrecision",
+  "description": "Precision on the 'spam' positive class",
+  "evaluatorTypeId": "uipath-binary-classification",
+  "evaluatorConfig": {
+    "name": "BinarySpamPrecision",
+    "targetOutputKey": "category",
+    "positiveClass": "spam",
+    "metricType": "precision",
+    "fValue": 1.0,
+    "defaultEvaluationCriteria": {
+      "expectedClass": "ham"
+    }
+  }
+}
diff --git a/packages/uipath/samples/binary_classification_agent/main.py b/packages/uipath/samples/binary_classification_agent/main.py
new file mode 100644
index 000000000..1df5dea15
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/main.py
@@ -0,0 +1,39 @@
+"""Rule-based spam/ham classifier demonstrating the binary classification evaluator."""
+
+from dataclasses import dataclass
+
+from uipath.tracing import traced
+
+SPAMMY_TOKENS = {
+    "free",
+    "winner",
+    "congratulations",
+    "click here",
+    "prize",
+    "!!!",
+}
+
+
+@dataclass
+class EmailInput:
+    email_subject: str
+    email_body: str
+
+
+@dataclass
+class Classification:
+    category: str
+
+
+@traced(name="classify_email", span_type="tool")
+def classify_email(subject: str, body: str) -> str:
+    """Return 'spam' if any spam-indicator token appears in the subject or body."""
+    text = f"{subject} {body}".lower()
+    return "spam" if any(token in text for token in SPAMMY_TOKENS) else "ham"
+
+
+@traced()
+async def main(input: EmailInput) -> Classification:
+    """Classify an email as 'spam' or 'ham'."""
+    category = classify_email(input.email_subject, input.email_body)
+    return Classification(category=category)
diff --git a/packages/uipath/samples/binary_classification_agent/pyproject.toml b/packages/uipath/samples/binary_classification_agent/pyproject.toml
new file mode 100644
index 000000000..7d81d251a
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "binary-classification-agent"
+version = "0.0.1"
+description = "Rule-based spam/ham classifier demonstrating the binary classification evaluator"
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]
diff --git a/packages/uipath/samples/binary_classification_agent/uipath.json b/packages/uipath/samples/binary_classification_agent/uipath.json
new file mode 100644
index 000000000..9b02c2654
--- /dev/null
+++ b/packages/uipath/samples/binary_classification_agent/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "main.py:main"
+  }
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/bindings.json b/packages/uipath/samples/multiclass_classification_simple/bindings.json
new file mode 100644
index 000000000..5e9beeb01
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/bindings.json
@@ -0,0 +1,4 @@
+{
+  "version": "2.0",
+  "resources": []
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json
new file mode 100644
index 000000000..27e66c25d
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/eval-sets/default.json
@@ -0,0 +1,85 @@
+{
+  "version": "1.0",
+  "id": "EmailMulticlassEval",
+  "name": "3-class email router — macro F1",
+  "evaluatorRefs": ["EmailMulticlassFScore"],
+  "evaluations": [
+    {
+      "id": "pay-invoice",
+      "name": "Payments: invoice reminder",
+      "inputs": {
+        "email_subject": "Your March invoice is ready",
+        "email_body": "Your monthly invoice of $45.99 is now available. Payment is due March 15."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "payments" }
+      }
+    },
+    {
+      "id": "pay-refund",
+      "name": "Payments: refund request",
+      "inputs": {
+        "email_subject": "Refund for last month's charge",
+        "email_body": "I was charged twice for the same service. Please process a refund."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "payments" }
+      }
+    },
+    {
+      "id": "support-broken",
+      "name": "Support: feature broken",
+      "inputs": {
+        "email_subject": "Login is broken",
+        "email_body": "I'm getting an error when trying to sign in. Need help."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    },
+    {
+      "id": "support-question",
+      "name": "Support: how-to question",
+      "inputs": {
+        "email_subject": "How do I export my data?",
+        "email_body": "Can you help me figure out where the export button is?"
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    },
+    {
+      "id": "spam-prize",
+      "name": "Spam: prize giveaway",
+      "inputs": {
+        "email_subject": "You won a FREE iPhone!!!",
+        "email_body": "Congratulations! Click here to claim your prize."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "spam-promo",
+      "name": "Spam: marketing winner",
+      "inputs": {
+        "email_subject": "Winner of the monthly drawing",
+        "email_body": "Congratulations, click here to redeem your reward."
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "spam" }
+      }
+    },
+    {
+      "id": "support-misrouted-by-payment-word",
+      "name": "Support email accidentally routed to payments (forces an FP for payments)",
+      "inputs": {
+        "email_subject": "Question about my billing portal access",
+        "email_body": "I cannot log into the billing portal. The page just spins. Can you help?"
+      },
+      "evaluationCriterias": {
+        "EmailMulticlassFScore": { "expectedClass": "support" }
+      }
+    }
+  ]
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
new file mode 100644
index 000000000..859a18562
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/evaluations/evaluators/multiclass-classification.json
@@ -0,0 +1,17 @@
+{
+  "version": "1.0",
+  "id": "EmailMulticlassFScore",
+  "description": "Macro-averaged F1 across payments / support / spam",
+  "evaluatorTypeId": "uipath-multiclass-classification",
+  "evaluatorConfig": {
+    "name": "EmailMulticlassFScore",
+    "targetOutputKey": "category",
+    "classes": ["payments", "support", "spam"],
+    "metricType": "f-score",
+    "averaging": "macro",
+    "fValue": 1.0,
+    "defaultEvaluationCriteria": {
+      "expectedClass": "support"
+    }
+  }
+}
diff --git a/packages/uipath/samples/multiclass_classification_simple/main.py b/packages/uipath/samples/multiclass_classification_simple/main.py
new file mode 100644
index 000000000..3ab684298
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/main.py
@@ -0,0 +1,51 @@
+"""Rule-based 3-class email router demonstrating the multiclass classification evaluator."""
+
+from dataclasses import dataclass
+
+from uipath.tracing import traced
+
+SPAM_TOKENS = {"free", "winner", "congratulations", "click here", "prize", "!!!"}
+PAYMENT_TOKENS = {"invoice", "payment", "refund", "charge", "billing", "$"}
+SUPPORT_TOKENS = {
+    "help",
+    "support",
+    "issue",
+    "error",
+    "ticket",
+    "broken",
+    "not working",
+}
+
+
+@dataclass
+class EmailInput:
+    email_subject: str
+    email_body: str
+
+
+@dataclass
+class Classification:
+    category: str
+
+
+@traced(name="classify_email", span_type="tool")
+def classify_email(subject: str, body: str) -> str:
+    """Classify into 'spam', 'payments', or 'support' using priority rules.
+
+    Spam is checked first so promos with billing-flavored words still route to spam.
+    Payments is checked next because it is the more specific intent.
+    Support is the catch-all default, so SUPPORT_TOKENS is never consulted.
+    """
+    text = f"{subject} {body}".lower()
+    if any(token in text for token in SPAM_TOKENS):
+        return "spam"
+    if any(token in text for token in PAYMENT_TOKENS):
+        return "payments"
+    return "support"
+
+
+@traced()
+async def main(input: EmailInput) -> Classification:
+    """Route an email to one of three queues."""
+    category = classify_email(input.email_subject, input.email_body)
+    return Classification(category=category)
diff --git a/packages/uipath/samples/multiclass_classification_simple/pyproject.toml b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml
new file mode 100644
index 000000000..e803a2a43
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "multiclass-classification-simple"
+version = "0.0.1"
+description = "Rule-based 3-class email router demonstrating the multiclass classification evaluator with macro-averaged F1"
+requires-python = ">=3.11"
+dependencies = ["uipath"]
+
+[dependency-groups]
+dev = ["uipath-dev"]
diff --git a/packages/uipath/samples/multiclass_classification_simple/uipath.json b/packages/uipath/samples/multiclass_classification_simple/uipath.json
new file mode 100644
index 000000000..9b02c2654
--- /dev/null
+++ b/packages/uipath/samples/multiclass_classification_simple/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "main.py:main"
+  }
+}
diff --git a/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
new file mode 100644
index 000000000..202363221
--- /dev/null
+++ b/packages/uipath/tests/cli/eval/test_classification_samples_e2e.py
@@ -0,0 +1,193 @@
+"""End-to-end tests that run the classification sample projects through evaluate().
+
+These tests double as integration coverage for the binary and multiclass
+classification evaluators added in #1397 — they wire each sample's main.py
+into a stand-in runtime, run the full eval set, and assert the per-row scores
+plus the `reduce_scores` aggregate as returned by `compute_evaluator_scores`.
+"""
+
+import importlib.util
+import uuid
+from pathlib import Path
+from types import ModuleType
+from typing import Any, AsyncGenerator
+
+import pytest
+
+from uipath.core.events import EventBus
+from uipath.core.tracing import UiPathTraceManager
+from uipath.eval.helpers import EvalHelpers
+from uipath.eval.runtime import UiPathEvalContext, evaluate
+from uipath.eval.runtime._types import UiPathEvalOutput
+from uipath.eval.runtime.runtime import compute_evaluator_scores
+from uipath.runtime import (
+    UiPathExecuteOptions,
+    UiPathRuntimeEvent,
+    UiPathRuntimeFactorySettings,
+    UiPathRuntimeProtocol,
+    UiPathRuntimeResult,
+    UiPathRuntimeStatus,
+    UiPathRuntimeStorageProtocol,
+    UiPathStreamOptions,
+)
+from uipath.runtime.schema import UiPathRuntimeSchema
+
+SAMPLES_DIR = Path(__file__).resolve().parents[3] / "samples"
+
+
+def _load_sample_main(sample_dir: Path) -> ModuleType:
+    """Import a sample's main.py as an isolated module."""
+    module_name = f"_eval_sample_{sample_dir.name}"
+    spec = importlib.util.spec_from_file_location(module_name, sample_dir / "main.py")
+    assert spec is not None and spec.loader is not None
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+class _SampleRuntime:
+    """Runtime that delegates execution to the sample's `main` function."""
+
+    def __init__(self, sample_main: Any) -> None:
+        self._sample_main = sample_main
+
+    async def execute(
+        self,
+        input: dict[str, Any] | None = None,
+        options: UiPathExecuteOptions | None = None,
+    ) -> UiPathRuntimeResult:
+        input_model = self._sample_main.EmailInput(**(input or {}))
+        output = await self._sample_main.main(input_model)
+        return UiPathRuntimeResult(
+            output={"category": output.category},
+            status=UiPathRuntimeStatus.SUCCESSFUL,
+        )
+
+    async def stream(
+        self,
+        input: dict[str, Any] | None = None,
+        options: UiPathStreamOptions | None = None,
+    ) -> AsyncGenerator[UiPathRuntimeEvent, None]:
+        yield await self.execute(input, None)
+
+    async def get_schema(self) -> UiPathRuntimeSchema:
+        return UiPathRuntimeSchema(
+            filePath="main.py",
+            uniqueId="main",
+            type="agent",
+            input={
+                "type": "object",
+                "properties": {
+                    "email_subject": {"type": "string"},
+                    "email_body": {"type": "string"},
+                },
+            },
+            output={
+                "type": "object",
+                "properties": {"category": {"type": "string"}},
+            },
+        )
+
+    async def dispose(self) -> None:
+        pass
+
+
+class _SampleFactory:
+    def __init__(self, sample_main: Any) -> None:
+        self._sample_main = sample_main
+
+    def discover_entrypoints(self) -> list[str]:
+        return ["main"]
+
+    async def get_storage(self) -> UiPathRuntimeStorageProtocol | None:
+        return None
+
+    async def get_settings(self) -> UiPathRuntimeFactorySettings | None:
+        return None
+
+    async def new_runtime(
+        self, entrypoint: str, runtime_id: str, **kwargs: Any
+    ) -> UiPathRuntimeProtocol:
+        return _SampleRuntime(self._sample_main)
+
+    async def dispose(self) -> None:
+        pass
+
+
+async def _run_sample(sample_dir: Path) -> tuple[UiPathEvalOutput, dict[str, float]]:
+    """Run the sample's eval set and return (per-row output, evaluator_averages)."""
+    sample_main = _load_sample_main(sample_dir)
+    factory = _SampleFactory(sample_main)
+
+    eval_set_path = str(sample_dir / "evaluations" / "eval-sets" / "default.json")
+    evaluation_set, _ = EvalHelpers.load_eval_set(eval_set_path)
+    evaluators = await EvalHelpers.load_evaluators(
+        eval_set_path, evaluation_set, agent_model=None
+    )
+
+    runtime = await factory.new_runtime("main", "test-runtime-id")
+    runtime_schema = await runtime.get_schema()
+
+    context = UiPathEvalContext()
+    context.execution_id = str(uuid.uuid4())
+    context.evaluation_set = evaluation_set
+    context.runtime_schema = runtime_schema
+    context.evaluators = evaluators
+
+    result = await evaluate(
+        factory,
+        UiPathTraceManager(),
+        context,
+        EventBus(),
+    )
+
+    eval_output = UiPathEvalOutput.model_validate(result.output)
+    _, evaluator_averages = compute_evaluator_scores(
+        eval_output.evaluation_set_results, evaluators
+    )
+    return eval_output, evaluator_averages
+
+
+def _per_row_scores(output: UiPathEvalOutput) -> dict[str, float]:
+    return {
+        row.evaluation_name: row.evaluation_run_results[0].result.score
+        for row in output.evaluation_set_results
+    }
+
+
+async def test_binary_classification_sample_end_to_end():
+    """Binary spam classifier: 4/5 datapoints correct, but precision is 2/3 because of one FP."""
+    output, averages = await _run_sample(SAMPLES_DIR / "binary_classification_agent")
+
+    per_row = _per_row_scores(output)
+    assert per_row == {
+        "Spam: prize giveaway": 1.0,
+        "Spam: unsolicited promo": 1.0,
+        "Ham: legitimate invoice": 1.0,
+        "Ham: meeting request": 1.0,
+        "Ham mislabeled as spam (forces a false positive)": 0.0,
+    }
+    # Precision = TP / (TP + FP) = 2 / (2 + 1) = 0.6666...
+    assert averages["BinarySpamPrecision"] == pytest.approx(2 / 3, rel=1e-6)
+
+
+async def test_multiclass_classification_sample_end_to_end():
+    """Multiclass router: 6/7 correct, macro F1 = (0.8 + 0.8 + 1.0) / 3 = 0.8666..."""
+    output, averages = await _run_sample(
+        SAMPLES_DIR / "multiclass_classification_simple"
+    )
+
+    per_row = _per_row_scores(output)
+    assert per_row == {
+        "Payments: invoice reminder": 1.0,
+        "Payments: refund request": 1.0,
+        "Support: feature broken": 1.0,
+        "Support: how-to question": 1.0,
+        "Spam: prize giveaway": 1.0,
+        "Spam: marketing winner": 1.0,
+        "Support email accidentally routed to payments "
+        "(forces an FP for payments)": 0.0,
+    }
+    # payments F1=0.8 (P=2/3, R=1), support F1=0.8 (P=1, R=2/3), spam F1=1.0
+    # macro = mean = 2.6 / 3
+    assert averages["EmailMulticlassFScore"] == pytest.approx(2.6 / 3, rel=1e-6)