Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion ax/benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from collections.abc import Iterable, Mapping, Sequence
from dataclasses import replace
from datetime import datetime
from itertools import product
from itertools import accumulate, product
from logging import Logger, WARNING
from time import time

Expand Down Expand Up @@ -439,6 +439,8 @@ def get_benchmark_result_from_experiment_and_gs(
trial_completion_order = [{i} for i in range(len(experiment.trials))]
cost_trace = 1.0 + np.arange(len(experiment.trials), dtype=float)

num_trials = list(accumulate(len(trials) for trials in trial_completion_order))

# {trial_index: {arm_name: params}}
dict_of_dict_of_params = {
new_trial_index: {
Expand Down Expand Up @@ -524,6 +526,7 @@ def get_benchmark_result_from_experiment_and_gs(
is_feasible_trace=is_feasible_trace.tolist(),
score_trace=score_trace.tolist(),
cost_trace=cost_trace.tolist(),
num_trials=num_trials,
fit_time=fit_time,
gen_time=gen_time,
)
Expand Down Expand Up @@ -869,6 +872,7 @@ def get_benchmark_result_with_cumulative_steps(
result,
optimization_trace=opt_trace,
cost_trace=np.arange(1, len(opt_trace) + 1, dtype=int),
num_trials=list(range(1, len(opt_trace) + 1)),
# Empty
oracle_trace=np.full(len(opt_trace), np.nan),
inference_trace=np.full(len(opt_trace), np.nan),
Expand Down
49 changes: 42 additions & 7 deletions ax/benchmark/benchmark_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class BenchmarkResult(Base):
name: Name of the benchmark. Should make it possible to determine the
problem and the method.
seed: Seed used for determinism.
fit_time: Total time spent fitting models.
gen_time: Total time spent generating candidates.
oracle_trace: For single-objective problems, the oracle trace is the
best oracle objective value seen on completed trials up to that
point. For multi-objective problems, it is the cumulative
Expand Down Expand Up @@ -75,10 +77,6 @@ class BenchmarkResult(Base):
`report_inference_value`. Having `optimization_trace` specified
separately is useful when we need just one value to evaluate how
well the benchmark went.
is_feasible_trace: Whether a trial was feasible or not. Unlike
the `inference_trace` and `oracle_trace`, the `is_feasible_trace` is
not cumulative. For problems with no constraints, all elements of
`is_feasible_trace` will be True.
score_trace: The scores associated with the problem, typically either
the optimization_trace or inference_value_trace normalized to a
0-100 scale for comparability between problems.
Expand All @@ -89,8 +87,16 @@ class BenchmarkResult(Base):
not produce Data with a "step" column have a cost of 1, and trials
that produce Data with a "step" column have a cost equal to the
number of steps in the Data.
fit_time: Total time spent fitting models.
gen_time: Total time spent generating candidates.
num_trials: The cumulative number of trials that have completed or been
early stopped at each completion event. Like the other traces, it
has one element per completion event. In the synchronous case, this
is simply ``[1, 2, 3, ..., n]``. In the asynchronous case, it can
increase by more than 1 at a step if multiple trials complete at
the same time, e.g., ``[2, 4, 5, ...]``.
is_feasible_trace: Whether a trial was feasible or not. Unlike
the `inference_trace` and `oracle_trace`, the `is_feasible_trace` is
not cumulative. For problems with no constraints, all elements of
`is_feasible_trace` will be True.
experiment: If not ``None``, the Ax experiment associated with the
optimization that generated this data. Either ``experiment`` or
``experiment_storage_id`` must be provided.
Expand All @@ -108,6 +114,7 @@ class BenchmarkResult(Base):
optimization_trace: list[float]
score_trace: list[float]
cost_trace: list[float]
num_trials: list[int] | None = None # optional for backwards compatibility
is_feasible_trace: list[bool] | None = None # optional for backwards compatibility

experiment: Experiment | None = None
Expand All @@ -130,6 +137,14 @@ def __post_init__(self) -> None:
class AggregatedBenchmarkResult(Base):
"""The result of a benchmark test, or series of replications. Scalar data present
in the BenchmarkResult is here represented as (mean, sem) pairs.

The ``optimization_trace`` and ``score_trace`` DataFrames have columns
``["mean", "sem", "P25", "P50", "P75"]`` for the trace values. If
``num_trials`` is available on all underlying ``BenchmarkResult`` objects,
a ``"num_trials"`` column is also included, representing the mean number
of completed or early-stopped trials across replications at each step.
This value may be fractional because different replications can have
different trial-to-step groupings in the asynchronous case.
"""

name: str
Expand Down Expand Up @@ -168,13 +183,33 @@ def from_benchmark_results(
stats = _get_stats(step_data=step_data, percentiles=PERCENTILES)
trace_stats[name] = stats

# Compute mean num_trials at each step and add as a column to each
# trace DataFrame if available on all results
num_trials_mean = None
if all(res.num_trials is not None for res in results):
num_trials_step_data = zip(
*(
# pyre-ignore[16]: already checked for None above
res.num_trials
for res in results
)
)
num_trials_mean = [nanmean(step_vals) for step_vals in num_trials_step_data]

trace_dfs = {}
for name, stats in trace_stats.items():
df = DataFrame(stats)
if num_trials_mean is not None:
df["num_trials"] = num_trials_mean
trace_dfs[name] = df

# Return aggregated results
return cls(
name=results[0].name,
results=results,
fit_time=fit_time,
gen_time=gen_time,
**{name: DataFrame(stats) for name, stats in trace_stats.items()},
**trace_dfs,
)


Expand Down
1 change: 1 addition & 0 deletions ax/benchmark/testing/benchmark_stubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ def get_benchmark_result(seed: int = 0) -> BenchmarkResult:
inference_trace=[1.0, 1.0, 1.0, 1.0],
oracle_trace=[0.0, 0.0, 0.0, 0.0],
cost_trace=[0.0, 0.0, 0.0, 0.0],
num_trials=[1, 2, 3, 4],
optimization_trace=[3.0, 2.0, 1.0, 0.1],
score_trace=[3.0, 2.0, 1.0, 0.1],
is_feasible_trace=[True, True, True, True],
Expand Down
17 changes: 17 additions & 0 deletions ax/benchmark/tests/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,12 @@ def _test_replication_async(self, map_data: bool) -> None:
"Trials complete at same time": [1, 2],
"Complete out of order": [1, 2, 3, 4],
}
expected_num_trials = {
"All complete at different times": [1, 2, 3, 4],
"Trials complete immediately": [2, 4],
"Trials complete at same time": [2, 4],
"Complete out of order": [1, 2, 3, 4],
}
expected_backend_simulator_time = {
"All complete at different times": 12,
"Trials complete immediately": 2,
Expand Down Expand Up @@ -465,6 +471,11 @@ def _test_replication_async(self, map_data: bool) -> None:
expected_costs[case_name],
msg=case_name,
)
self.assertEqual(
result.num_trials,
expected_num_trials[case_name],
msg=case_name,
)
if map_data:
data = experiment.lookup_data()
self.assertEqual(len(data.df), 4, msg=case_name)
Expand Down Expand Up @@ -840,6 +851,8 @@ def test_replication_moo_sobol(self) -> None:

self.assertTrue(np.all(np.array(res.score_trace) <= 100))
self.assertEqual(len(res.cost_trace), problem.num_trials)
self.assertEqual(len(res.num_trials), problem.num_trials)
self.assertEqual(res.num_trials, list(range(1, problem.num_trials + 1)))
self.assertEqual(len(res.inference_trace), problem.num_trials)
# since inference trace is not supported for MOO, it should be all NaN
self.assertTrue(np.isnan(res.inference_trace).all())
Expand All @@ -866,6 +879,10 @@ def test_benchmark_one_method_problem(self) -> None:

for col in ["mean", "P25", "P50", "P75"]:
self.assertTrue((agg.score_trace[col] <= 100).all())
self.assertIn("num_trials", agg.optimization_trace.columns)
self.assertIn("num_trials", agg.score_trace.columns)
self.assertTrue((agg.optimization_trace["num_trials"] > 0).all())
self.assertTrue((agg.score_trace["num_trials"] > 0).all())

@mock_botorch_optimize
def test_benchmark_multiple_problems_methods(self) -> None:
Expand Down
2 changes: 2 additions & 0 deletions ax/benchmark/tests/test_benchmark_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def test_benchmark_result_invalid_inputs(self) -> None:
score_trace=[],
is_feasible_trace=[],
cost_trace=[],
num_trials=[],
fit_time=0.0,
gen_time=0.0,
experiment=get_experiment(),
Expand All @@ -44,6 +45,7 @@ def test_benchmark_result_invalid_inputs(self) -> None:
score_trace=[],
is_feasible_trace=[],
cost_trace=[],
num_trials=[],
fit_time=0.0,
gen_time=0.0,
)