facebook · saitcakmak · Feb 25, 2026
diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py
@@ -25,7 +25,7 @@
 from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import replace
 from datetime import datetime
-from itertools import product
+from itertools import accumulate, product
 from logging import Logger, WARNING
 from time import time
 
@@ -439,6 +439,8 @@ def get_benchmark_result_from_experiment_and_gs(
         trial_completion_order = [{i} for i in range(len(experiment.trials))]
         cost_trace = 1.0 + np.arange(len(experiment.trials), dtype=float)
 
+    num_trials = list(accumulate(len(trials) for trials in trial_completion_order))
+
     # {trial_index: {arm_name: params}}
     dict_of_dict_of_params = {
         new_trial_index: {
@@ -524,6 +526,7 @@ def get_benchmark_result_from_experiment_and_gs(
         is_feasible_trace=is_feasible_trace.tolist(),
         score_trace=score_trace.tolist(),
         cost_trace=cost_trace.tolist(),
+        num_trials=num_trials,
         fit_time=fit_time,
         gen_time=gen_time,
     )
@@ -869,6 +872,7 @@ def get_benchmark_result_with_cumulative_steps(
         result,
         optimization_trace=opt_trace,
         cost_trace=np.arange(1, len(opt_trace) + 1, dtype=int),
+        num_trials=list(range(1, len(opt_trace) + 1)),
         # Empty
         oracle_trace=np.full(len(opt_trace), np.nan),
         inference_trace=np.full(len(opt_trace), np.nan),

diff --git a/ax/benchmark/benchmark_result.py b/ax/benchmark/benchmark_result.py
@@ -32,6 +32,8 @@ class BenchmarkResult(Base):
         name: Name of the benchmark. Should make it possible to determine the
             problem and the method.
         seed: Seed used for determinism.
+        fit_time: Total time spent fitting models.
+        gen_time: Total time spent generating candidates.
         oracle_trace: For single-objective problems, the oracle trace is the
             best oracle objective value seen on completed trials up to that
             point. For multi-objective problems, it is the cumulative
@@ -75,10 +77,6 @@ class BenchmarkResult(Base):
             `report_inference_value`. Having `optimization_trace` specified
             separately is useful when we need just one value to evaluate how
             well the benchmark went.
-        is_feasible_trace: Whether a trial was feasible or not. Differently from
-            the `inference_trace` and `oracle_trace`, the `is_feasible_trace` is
-            not cumulative. For problems with no constraints all elements of
-            `is_feasible_trace` will be True.
         score_trace: The scores associated with the problem, typically either
             the optimization_trace or inference_value_trace normalized to a
             0-100 scale for comparability between problems.
@@ -89,8 +87,16 @@ class BenchmarkResult(Base):
             not produce Data with a "step" column have a cost of 1, and trials
             that produce Data with a "step" column have a cost equal to the
             number of steps in the Data.
-        fit_time: Total time spent fitting models.
-        gen_time: Total time spent generating candidates.
+        num_trials: The cumulative number of trials that have completed or been
+            early stopped at each completion event. Like the other traces, it
+            has one element per completion event. In the synchronous case, this
+            is simply ``[1, 2, 3, ..., n]``. In the asynchronous case, it can
+            increase by more than 1 at a step if multiple trials complete at
+            the same time, e.g., ``[2, 4, 5, ...]``.
+        is_feasible_trace: Whether a trial was feasible or not. Unlike
+            the `inference_trace` and `oracle_trace`, the `is_feasible_trace` is
+            not cumulative. For problems with no constraints all elements of
+            `is_feasible_trace` will be True.
         experiment: If not ``None``, the Ax experiment associated with the
             optimization that generated this data. Either ``experiment`` or
             ``experiment_storage_id`` must be provided.
@@ -108,6 +114,7 @@ class BenchmarkResult(Base):
     optimization_trace: list[float]
     score_trace: list[float]
     cost_trace: list[float]
+    num_trials: list[int] | None = None  # optional for backwards compatibility
     is_feasible_trace: list[bool] | None = None  # optional for backwards compatibility
 
     experiment: Experiment | None = None
@@ -130,6 +137,14 @@ def __post_init__(self) -> None:
 class AggregatedBenchmarkResult(Base):
     """The result of a benchmark test, or series of replications. Scalar data present
     in the BenchmarkResult is here represented as (mean, sem) pairs.
+
+    The ``optimization_trace`` and ``score_trace`` DataFrames have columns
+    ``["mean", "sem", "P25", "P50", "P75"]`` for the trace values. If
+    ``num_trials`` is available on all underlying ``BenchmarkResult`` objects,
+    a ``"num_trials"`` column is also included, representing the mean number
+    of completed or early-stopped trials across replications at each step.
+    This value may be fractional because different replications can have
+    different trial-to-step groupings in the asynchronous case.
     """
 
     name: str
@@ -168,13 +183,33 @@ def from_benchmark_results(
             stats = _get_stats(step_data=step_data, percentiles=PERCENTILES)
             trace_stats[name] = stats
 
+        # Compute mean num_trials at each step and add as a column to each
+        # trace DataFrame if available on all results
+        num_trials_mean = None
+        if all(res.num_trials is not None for res in results):
+            num_trials_step_data = zip(
+                *(
+                    # pyre-ignore[16]: already checked for None above
+                    res.num_trials
+                    for res in results
+                )
+            )
+            num_trials_mean = [nanmean(step_vals) for step_vals in num_trials_step_data]
+
+        trace_dfs = {}
+        for name, stats in trace_stats.items():
+            df = DataFrame(stats)
+            if num_trials_mean is not None:
+                df["num_trials"] = num_trials_mean
+            trace_dfs[name] = df
+
         # Return aggregated results
         return cls(
             name=results[0].name,
             results=results,
             fit_time=fit_time,
             gen_time=gen_time,
-            **{name: DataFrame(stats) for name, stats in trace_stats.items()},
+            **trace_dfs,
         )
 
 

diff --git a/ax/benchmark/testing/benchmark_stubs.py b/ax/benchmark/testing/benchmark_stubs.py
@@ -179,6 +179,7 @@ def get_benchmark_result(seed: int = 0) -> BenchmarkResult:
         inference_trace=[1.0, 1.0, 1.0, 1.0],
         oracle_trace=[0.0, 0.0, 0.0, 0.0],
         cost_trace=[0.0, 0.0, 0.0, 0.0],
+        num_trials=[1, 2, 3, 4],
         optimization_trace=[3.0, 2.0, 1.0, 0.1],
         score_trace=[3.0, 2.0, 1.0, 0.1],
         is_feasible_trace=[True, True, True, True],

diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py
@@ -368,6 +368,12 @@ def _test_replication_async(self, map_data: bool) -> None:
             "Trials complete at same time": [1, 2],
             "Complete out of order": [1, 2, 3, 4],
         }
+        expected_num_trials = {
+            "All complete at different times": [1, 2, 3, 4],
+            "Trials complete immediately": [2, 4],
+            "Trials complete at same time": [2, 4],
+            "Complete out of order": [1, 2, 3, 4],
+        }
         expected_backend_simulator_time = {
             "All complete at different times": 12,
             "Trials complete immediately": 2,
@@ -465,6 +471,11 @@ def _test_replication_async(self, map_data: bool) -> None:
                     expected_costs[case_name],
                     msg=case_name,
                 )
+                self.assertEqual(
+                    result.num_trials,
+                    expected_num_trials[case_name],
+                    msg=case_name,
+                )
                 if map_data:
                     data = experiment.lookup_data()
                     self.assertEqual(len(data.df), 4, msg=case_name)
@@ -840,6 +851,8 @@ def test_replication_moo_sobol(self) -> None:
 
         self.assertTrue(np.all(np.array(res.score_trace) <= 100))
         self.assertEqual(len(res.cost_trace), problem.num_trials)
+        self.assertEqual(len(res.num_trials), problem.num_trials)
+        self.assertEqual(res.num_trials, list(range(1, problem.num_trials + 1)))
         self.assertEqual(len(res.inference_trace), problem.num_trials)
         # since inference trace is not supported for MOO, it should be all NaN
         self.assertTrue(np.isnan(res.inference_trace).all())
@@ -866,6 +879,10 @@ def test_benchmark_one_method_problem(self) -> None:
 
         for col in ["mean", "P25", "P50", "P75"]:
             self.assertTrue((agg.score_trace[col] <= 100).all())
+        self.assertIn("num_trials", agg.optimization_trace.columns)
+        self.assertIn("num_trials", agg.score_trace.columns)
+        self.assertTrue((agg.optimization_trace["num_trials"] > 0).all())
+        self.assertTrue((agg.score_trace["num_trials"] > 0).all())
 
     @mock_botorch_optimize
     def test_benchmark_multiple_problems_methods(self) -> None:

diff --git a/ax/benchmark/tests/test_benchmark_result.py b/ax/benchmark/tests/test_benchmark_result.py
@@ -26,6 +26,7 @@ def test_benchmark_result_invalid_inputs(self) -> None:
                 score_trace=[],
                 is_feasible_trace=[],
                 cost_trace=[],
+                num_trials=[],
                 fit_time=0.0,
                 gen_time=0.0,
                 experiment=get_experiment(),
@@ -44,6 +45,7 @@ def test_benchmark_result_invalid_inputs(self) -> None:
                 score_trace=[],
                 is_feasible_trace=[],
                 cost_trace=[],
+                num_trials=[],
                 fit_time=0.0,
                 gen_time=0.0,
             )