diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..d33546aec --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,337 @@ +# CLAUDE.md - PyTorch Benchmark Repository Guide + +This file provides guidance for AI assistants working on the `pytorch-benchmark` repository. + +## Repository Overview + +This is the **PyTorch Benchmark Suite** — a comprehensive performance testing framework for PyTorch that includes 105+ models across computer vision, NLP, and specialized domains. It is used for continuous performance monitoring, regression detection, and bisection analysis. + +## Repository Structure + +``` +pytorch-benchmark/ +├── torchbenchmark/ # Core benchmark library +│ ├── models/ # 105+ benchmark models (each in its own directory) +│ ├── canary_models/ # Experimental/canary models +│ ├── e2e_models/ # End-to-end benchmark models +│ ├── util/ # Core utilities (model loading, profiling, benchmarking) +│ └── _components/ # Component architecture (tasks, worker processes) +├── userbenchmark/ # User-customizable benchmark suites +│ ├── dynamo/ # TorchDynamo compilation testing +│ ├── optim/ # Optimizer performance +│ ├── torch-nightly/ # Nightly PyTorch tracking +│ ├── torchao/ # TorchAO quantization +│ └── ... # Other benchmark suites +├── .github/ +│ ├── workflows/ # GitHub Actions CI/CD +│ └── scripts/ # Helper scripts (bisection, A/B testing, analysis) +├── .ci/torchbench/ # Local CI scripts (install.sh, test.sh) +├── scripts/ # Utility scripts (conda, batch sizing, scribe uploads) +├── utils/ # Build utilities, CUDA, GitHub, S3, version checking +├── docker/ # Docker configuration for nightly builds +├── submodules/ # External project submodules (FAMBench, lit-llama) +├── test.py # Unittest sanity checks for all models +├── test_bench.py # pytest-benchmark driver for performance measurements +├── run.py # CLI for debugging/profiling individual models +├── run_benchmark.py # Router for userbenchmarks +├── run_e2e.py # End-to-end model execution +├── install.py # Main installation orchestrator +├── regression_detector.py # A/B test result comparison and regression detection +├── bisection.py # Automated binary search for regression sources +├── conftest.py # pytest configuration +├── requirements.txt # Core Python dependencies +├── setup.py # Package setup +└── pyproject.toml # Build system and code style config +``` + +## Development Workflows + +### Installation + +```bash +# Install all models +python3 install.py + +# Install specific models +python3 install.py --models BERT_pytorch densenet121 + +# Skip certain models during install +python3 install.py --skip MODEL_NAME + +# Install a userbenchmark +python3 install.py --userbenchmark dynamo + +# Check if models are installed (no install) +python3 install.py --check-only +``` + +### Running Benchmarks + +```bash +# Unittest sanity checks (accuracy validation) +python3 test.py -k "test_BERT_pytorch_train_cpu" +python3 test.py -k "cuda" # Run all CUDA tests + +# Performance benchmarking with pytest-benchmark +pytest test_bench.py +pytest test_bench.py -k "BERT" --benchmark-autosave +pytest test_bench.py --cpu_only +pytest test_bench.py --cuda_only + +# Debug/profile a single model +python3 run.py BERT_pytorch -d cuda -t train +python3 run.py densenet121 -d cpu -t eval --profile +python3 run.py alexnet -d cuda -t eval --bs 32 + +# End-to-end models +python3 run_e2e.py -t eval --bs 16 + +# Run userbenchmarks +python run_benchmark.py dynamo [benchmark-args] +python run_benchmark.py optim [benchmark-args] +``` + +### CI Execution + 
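+To reproduce the CI run locally, use the scripts under `.ci/torchbench/`: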
+```bash +bash .ci/torchbench/install.sh +bash .ci/torchbench/test.sh +``` + +### Regression Detection and Bisection + +```bash +# Compare two benchmark result sets (A/B test) +python regression_detector.py --control --treatment [--output ] + +# Binary search over commits to find regression source +python bisection.py \ + --work-dir \ + --torch-repos-path \ + --torchbench-repo-path \ + --config \ + --output +``` + +## Key Conventions + +### Model Structure + +Every model lives in `torchbenchmark/models//` and must contain: + +``` +torchbenchmark/models/MyModel/ +├── __init__.py # Model class implementing BenchmarkModel API +├── install.py # Model-specific dependency installation +└── metadata.yaml # Device-specific batch sizes and benchmark config +``` + +Optional files: `requirements.txt`, `setup.py`, subdirectories for model code. + +### BenchmarkModel API + +All models inherit from `BenchmarkModel` and implement: + +```python +class Model(BenchmarkModel): + task = TASK_TYPE # e.g., NLP.LANGUAGE_MODELING + DEFAULT_TRAIN_BSIZE = 32 # Default training batch size + DEFAULT_EVAL_BSIZE = 32 # Default eval batch size + DEEPCOPY = False # Whether to deep copy model between runs + DISABLE_DETERMINISM = False # Whether to disable determinism checks + + def __init__(self, test: str, device: str, batch_size: Optional[int] = None, extra_args: List[str] = []): + ... + + def get_module(self): + """Return (model, example_inputs)""" + return self.model, self.example_inputs + + def train(self): + """Run one training step""" + ... + + def eval(self): + """Run one eval step""" + ... +``` + +### Userbenchmark Structure + +Each userbenchmark in `userbenchmark//` requires: + +``` +userbenchmark/my_benchmark/ +├── __init__.py # Required (can be empty) +└── run.py # Must expose: run(args: List[str]) +``` + +Optional: `install.py`, `regression_detector.py`, `ci.yaml`. + +**Output format** (written to `.userbenchmark//metrics-.json`): +```json +{ + "name": "benchmark-name", + "environ": { + "pytorch_git_version": "...", + "pytorch_version": "..." + }, + "metrics": { + "metric_name": value + } +} +``` + +### Naming Conventions + +- **Classes**: PascalCase (`BenchmarkModel`, `ModelTask`) +- **Functions/Methods**: snake_case (`get_module`, `train`, `eval`) +- **Constants**: UPPERCASE (`DEFAULT_TRAIN_BSIZE`, `DEEPCOPY`) +- **Test names**: `test___` (e.g., `test_BERT_pytorch_train_cpu`) + +### Code Style + +- **Line length**: 88 characters (Black formatter), 120 characters (flake8 linting) +- **Formatter**: Black (`pyproject.toml` configures this) +- **Linter**: flake8 (see `.flake8` for ignored rules) +- **C++ formatting**: clang-format (`.clang-format`) + +### Supported Devices + +- `cpu` - CPU execution +- `cuda` - NVIDIA GPU via CUDA +- `mps` - Apple Silicon GPU via Metal Performance Shaders +- `hpu` - Intel Gaudi (Habana) + +### Import Patterns + +```python +# Core benchmark imports +from torchbenchmark import ( + _list_model_paths, + ModelTask, + get_metadata_from_yaml, +) + +# Model-specific +from torchbenchmark.util.model import BenchmarkModel +from torchbenchmark.tasks import NLP, COMPUTER_VISION +``` + +## Adding New Models + +Refer to `torchbenchmark/models/ADDING_MODELS.md` for the complete guide. Key steps: + +1. Create directory `torchbenchmark/models//` +2. Implement `__init__.py` with the `BenchmarkModel` subclass +3. Create `install.py` for dependencies +4. Create `metadata.yaml` with device/batch-size configuration +5. Test with `python3 run.py -d cpu -t eval` +6. 
Run sanity checks: `python3 test.py -k "test_"` + +## Adding New Userbenchmarks + +Refer to `userbenchmark/ADDING_USERBENCHMARKS.md` for the complete guide. Key steps: + +1. Create directory `userbenchmark//` +2. Add `__init__.py` (can be empty) +3. Implement `run.py` with a `run(args: List[str])` function +4. Optionally add `install.py`, `regression_detector.py`, `ci.yaml` + +## CI/CD Overview + +### GitHub Actions Workflows + +- **`pr-test.yml`**: Main PR workflow + - Triggers on PRs to main, pushes to main, and manual dispatch + - Matrix builds: CPU (`linux.24xlarge`) and CUDA (`linux.aws.a100`) + - Docker-based execution with 240-minute timeout + - Uses `HUGGING_FACE_HUB_TOKEN` secret for model downloads + +- **`build-nightly-docker.yml`**: Builds nightly Docker image +- **`clean-nightly-docker.yml`**: Cleans up old nightly images + +### CI Scripts + +- `.ci/torchbench/install.sh` — Checks Python version, installs models (with skip list) +- `.ci/torchbench/test.sh` — Executes the test suite + +## Dependencies + +### Core Requirements (`requirements.txt`) + +| Dependency | Purpose | +|------------|---------| +| `transformers==4.57.3` | HuggingFace Transformers (pinned) | +| `timm==1.0.19` | PyTorch Image Models (pinned) | +| `numba>=0.57.0` | JIT compilation for some models | +| `pytest`, `pytest-benchmark` | Testing framework | +| `pandas`, `numpy`, `scipy` | Data analysis | +| `boto3` | AWS S3 integration | +| `nvidia-ml-py>=13.0.0` | NVIDIA GPU monitoring | +| `submitit` | Job submission | +| `pyyaml`, `tabulate` | YAML parsing, output formatting | + +### PyTorch Ecosystem (installed separately) + +- `torch`, `torchvision`, `torchaudio` + +## Metadata YAML Format + +Each model's `metadata.yaml` specifies device-specific configs: + +```yaml +devices: + cpu: + train_batch_size: 4 + eval_batch_size: 8 + cuda: + train_batch_size: 32 + eval_batch_size: 64 +operators: + - ... 
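+# (sketch only: consult an existing model's metadata.yaml for the exact fields
+#  expected by the benchmark harness)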
+``` + +## Common Patterns and Pitfalls + +### Memory Management + +- Models should sync CUDA operations: `torch.cuda.synchronize()` after GPU work +- The test harness checks for CUDA memory leaks between iterations +- Use `DEEPCOPY = True` if model state is mutated during `get_module()` calls + +### Determinism + +- By default, models are tested with determinism checks for accuracy validation +- Set `DISABLE_DETERMINISM = True` only when a model is inherently non-deterministic (e.g., uses random sampling) + +### Batch Size Handling + +- Always respect the `batch_size` parameter passed to `__init__` +- Fall back to `DEFAULT_TRAIN_BSIZE` / `DEFAULT_EVAL_BSIZE` if `batch_size=None` +- Device-specific defaults should be in `metadata.yaml` + +### Test Filtering + +```bash +# Filter by model name +pytest test_bench.py -k "BERT" + +# Filter by device +pytest test_bench.py --cuda_only +pytest test_bench.py --cpu_only + +# Filter unittest +python3 test.py -k "test_BERT_pytorch_train_cuda" +``` + +## Useful Files to Reference + +| File | Purpose | +|------|---------| +| `torchbenchmark/util/model.py` | `BenchmarkModel` base class definition | +| `torchbenchmark/__init__.py` | Core loading utilities, `ModelTask`, `_list_model_paths` | +| `torchbenchmark/util/env_check.py` | Environment validation helpers | +| `userbenchmark/utils.py` | Shared userbenchmark utilities | +| `conftest.py` | pytest fixtures and configuration | +| `run.py` | Reference for model invocation patterns | diff --git a/torchbenchmark/util/framework/huggingface/basic_configs.py b/torchbenchmark/util/framework/huggingface/basic_configs.py index f0941f7df..df1093a59 100644 --- a/torchbenchmark/util/framework/huggingface/basic_configs.py +++ b/torchbenchmark/util/framework/huggingface/basic_configs.py @@ -300,8 +300,9 @@ def _extract_config_cls_name(config_cls_ctor: str) -> str: return m.groups()[0] config_cls_name = _extract_config_cls_name(HUGGINGFACE_MODELS[model_name][2]) - exec(f"from transformers import {config_cls_name}") - config = eval(HUGGINGFACE_MODELS[model_name][2]) + namespace = {} + exec(f"from transformers import {config_cls_name}", namespace) + config = eval(HUGGINGFACE_MODELS[model_name][2], namespace) model_cls = getattr(transformers, HUGGINGFACE_MODELS[model_name][3]) kwargs = {} if model_name in HUGGINGFACE_MODELS_REQUIRING_TRUST_REMOTE_CODE: diff --git a/userbenchmark/whisper/__init__.py b/userbenchmark/whisper/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/userbenchmark/whisper/run.py b/userbenchmark/whisper/run.py new file mode 100644 index 000000000..e8b467d96 --- /dev/null +++ b/userbenchmark/whisper/run.py @@ -0,0 +1,408 @@ +""" +Whisper Medium Benchmark +======================== +Benchmarks OpenAI Whisper medium encoder inference using PyTorch. + +Measures: + - Average latency per inference (CPU fp32, CUDA fp16) + - Encoder FLOPs (via torch.utils.flop_counter.FlopCounterMode) + - CPU-to-GPU speedup ratio + +Input shape follows the existing hf_Whisper torchbenchmark model: + (batch=1, mel_bins=80, time_frames=3000) — 30 seconds of audio at 100 fps. 
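+
+Only the encoder (model.model.encoder) is exercised; decoding/generation is not
+benchmarked, so the numbers reflect pure encoder forward-pass latency.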
+ +Usage +----- +Via userbenchmark router: + python run_benchmark.py whisper [--no-cuda] [--warmup N] [--iters N] + +Direct execution: + python userbenchmark/whisper/run.py [--no-cuda] [--warmup N] [--iters N] +""" + +import argparse +import statistics +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional + +import torch + +# --------------------------------------------------------------------------- +# Constants — aligned with torchbenchmark/models/hf_Whisper conventions +# --------------------------------------------------------------------------- +MODEL_NAME = "openai/whisper-medium" + +# Mel-spectrogram shape: (batch, mel_bins, time_frames) +# Matches hf_Whisper's example_inputs construction: (batch_size, 80, 3000) +INPUT_SHAPE = (1, 80, 3000) + +# Benchmark iterations — reduced vs. default WARMUP_ROUNDS=10/BENCHMARK_ITERS=15 +# because Whisper medium encoder is large (~300M params) and slow on CPU. +DEFAULT_WARMUP = 5 +DEFAULT_ITERS = 10 + +NS_PER_MS = 1_000_000.0 + +BM_NAME = "whisper" + + +# --------------------------------------------------------------------------- +# Model loading +# --------------------------------------------------------------------------- + +def load_model(device: str) -> "WhisperForConditionalGeneration": + """Load Whisper medium from HuggingFace Hub and move to device. + + Mirrors hf_Whisper's approach: + - CPU → fp32 + - CUDA → fp16 (DEFAULT_EVAL_CUDA_PRECISION = "fp16" in hf_Whisper) + """ + from transformers import WhisperForConditionalGeneration + + print(f" Loading {MODEL_NAME} ...", end=" ", flush=True) + try: + model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME) + except Exception as exc: + print(f"\nFailed to load model: {exc}", file=sys.stderr) + print( + "Ensure you have internet access or a cached copy of the model.\n" + "Install dependencies: pip install transformers>=4.23.0", + file=sys.stderr, + ) + sys.exit(1) + + model.eval() + model = model.to(device) + + # Match hf_Whisper: use fp16 on CUDA to reflect real-world inference. + if device == "cuda": + model = model.half() + + print("done.") + return model + + +# --------------------------------------------------------------------------- +# Latency measurement +# --------------------------------------------------------------------------- + +def _sync(device: str) -> None: + if device == "cuda": + torch.cuda.synchronize() + + +def measure_latency( + model: "WhisperForConditionalGeneration", + input_features: torch.Tensor, + device: str, + warmup: int = DEFAULT_WARMUP, + iters: int = DEFAULT_ITERS, +) -> List[float]: + """Return per-inference latencies in milliseconds. 
+ + Follows torchbenchmark/util/experiment/metrics.py::get_latencies(): + - warmup loop (discarded) + - timed loop with synchronize() bracketing every iteration + - returns list of float latencies in ms + """ + encoder = model.model.encoder + + # Warmup + with torch.no_grad(): + for _ in range(warmup): + _sync(device) + encoder(input_features) + _sync(device) + + # Timed benchmark + latencies: List[float] = [] + with torch.no_grad(): + for _ in range(iters): + _sync(device) + t0 = time.time_ns() + encoder(input_features) + _sync(device) + t1 = time.time_ns() + latencies.append((t1 - t0) / NS_PER_MS) + + return latencies + + +# --------------------------------------------------------------------------- +# FLOPS measurement +# --------------------------------------------------------------------------- + +def measure_flops() -> Optional[int]: + """Count encoder FLOPs for one forward pass using FlopCounterMode. + + Runs on CPU in fp32 — FlopCounterMode is device/dtype-agnostic but CPU + makes the import path reliable across all PyTorch builds. + + Returns None if FlopCounterMode is unavailable (PyTorch < 2.0). + """ + try: + from torch.utils.flop_counter import FlopCounterMode + except ImportError: + print( + " Warning: torch.utils.flop_counter not available (PyTorch >= 2.0 required).", + file=sys.stderr, + ) + return None + + # FLOPs measurement always uses a fresh CPU fp32 copy so results are + # independent of the device/dtype used for latency benchmarking. + from transformers import WhisperForConditionalGeneration + + print(f" Loading CPU fp32 copy for FLOPs measurement ...", end=" ", flush=True) + try: + flop_model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME) + except Exception as exc: + print(f"\n FLOPs skipped — could not load model: {exc}", file=sys.stderr) + return None + flop_model.eval() + print("done.") + + cpu_input = torch.randn(INPUT_SHAPE, dtype=torch.float32) + + flop_counter = FlopCounterMode(display=False) + with torch.no_grad(), flop_counter: + flop_model.model.encoder(cpu_input) + + total_flops = sum(flop_counter.flop_counts["Global"].values()) + del flop_model + return int(total_flops) + + +# --------------------------------------------------------------------------- +# Output formatting +# --------------------------------------------------------------------------- + +def _fmt_ms(ms: float) -> str: + return f"{ms:>10.2f} ms" + + +def _fmt_flops(flops: int) -> str: + gflops = flops / 1e9 + return f"{flops:,} (~{gflops:.1f} GFLOPs)" + + +def print_results( + cpu_latencies: List[float], + cuda_latencies: Optional[List[float]], + flops: Optional[int], +) -> None: + """Print a human-readable benchmark summary table.""" + sep = "-" * 72 + + print() + print("=" * 72) + print(f" Whisper Medium Benchmark ({MODEL_NAME})") + print(f" Input : {INPUT_SHAPE} (batch, mel_bins, time_frames)") + print(f" Task : encoder-only inference (model.model.encoder)") + print("=" * 72) + + # Header + print(f"\n{'Device':<8} {'Dtype':<6} {'Avg':>10} {'Std':>9} {'Min':>10} {'Max':>10}") + print(sep) + + # CPU row + cpu_mean = statistics.mean(cpu_latencies) + cpu_std = statistics.stdev(cpu_latencies) if len(cpu_latencies) > 1 else 0.0 + cpu_min = min(cpu_latencies) + cpu_max = max(cpu_latencies) + print( + f"{'CPU':<8} {'fp32':<6} {cpu_mean:>10.2f} {cpu_std:>8.2f} " + f"{cpu_min:>10.2f} {cpu_max:>10.2f} ms" + ) + + # CUDA row (optional) + if cuda_latencies: + cuda_mean = statistics.mean(cuda_latencies) + cuda_std = statistics.stdev(cuda_latencies) if len(cuda_latencies) > 1 else 0.0 + 
cuda_min = min(cuda_latencies) + cuda_max = max(cuda_latencies) + print( + f"{'CUDA':<8} {'fp16':<6} {cuda_mean:>10.2f} {cuda_std:>8.2f} " + f"{cuda_min:>10.2f} {cuda_max:>10.2f} ms" + ) + + print(sep) + + # FLOPs + if flops is not None: + print(f"\n Encoder FLOPs : {_fmt_flops(flops)}") + else: + print("\n Encoder FLOPs : N/A") + + # Speedup + if cuda_latencies: + speedup = statistics.mean(cpu_latencies) / statistics.mean(cuda_latencies) + print(f" CPU→GPU Speedup : {speedup:.1f}x") + else: + print(" CPU→GPU Speedup : N/A (CUDA not available / skipped)") + + print() + + +# --------------------------------------------------------------------------- +# Metrics dict for JSON output +# --------------------------------------------------------------------------- + +def build_metrics( + cpu_latencies: List[float], + cuda_latencies: Optional[List[float]], + flops: Optional[int], +) -> Dict[str, float]: + """Assemble the flat metrics dict written to the userbenchmark JSON file.""" + metrics: Dict[str, float] = { + "cpu_latency_mean_ms": round(statistics.mean(cpu_latencies), 4), + "cpu_latency_std_ms": round( + statistics.stdev(cpu_latencies) if len(cpu_latencies) > 1 else 0.0, 4 + ), + "cpu_latency_min_ms": round(min(cpu_latencies), 4), + "cpu_latency_max_ms": round(max(cpu_latencies), 4), + } + + if flops is not None: + metrics["encoder_flops"] = float(flops) + metrics["encoder_gflops"] = round(flops / 1e9, 3) + + if cuda_latencies: + cuda_mean = statistics.mean(cuda_latencies) + metrics["cuda_latency_mean_ms"] = round(cuda_mean, 4) + metrics["cuda_latency_std_ms"] = round( + statistics.stdev(cuda_latencies) if len(cuda_latencies) > 1 else 0.0, 4 + ) + metrics["cuda_latency_min_ms"] = round(min(cuda_latencies), 4) + metrics["cuda_latency_max_ms"] = round(max(cuda_latencies), 4) + metrics["cpu_to_cuda_speedup"] = round( + statistics.mean(cpu_latencies) / cuda_mean, 2 + ) + + return metrics + + +# --------------------------------------------------------------------------- +# Entry point — required by userbenchmark spec +# --------------------------------------------------------------------------- + +def run(args: List[str] = []) -> None: + """Run the Whisper medium benchmark. + + This is the required entry point for the userbenchmark framework. + Invoked via: python run_benchmark.py whisper [args] + """ + parser = argparse.ArgumentParser( + description="Benchmark OpenAI Whisper medium encoder latency and FLOPs." 
+ ) + parser.add_argument( + "--no-cuda", + action="store_true", + help="Skip CUDA benchmark even if a GPU is available.", + ) + parser.add_argument( + "--warmup", + type=int, + default=DEFAULT_WARMUP, + metavar="N", + help=f"Warmup iterations before timing (default: {DEFAULT_WARMUP}).", + ) + parser.add_argument( + "--iters", + type=int, + default=DEFAULT_ITERS, + metavar="N", + help=f"Timed benchmark iterations (default: {DEFAULT_ITERS}).", + ) + parser.add_argument( + "--no-flops", + action="store_true", + help="Skip FLOPs measurement (saves time; FLOPs require a second model load).", + ) + parsed = parser.parse_args(args) + + cuda_available = torch.cuda.is_available() and not parsed.no_cuda + if parsed.no_cuda: + print("CUDA benchmark skipped (--no-cuda).") + elif not torch.cuda.is_available(): + print("CUDA not available — running CPU benchmark only.") + + # ------------------------------------------------------------------ + # CPU benchmark + # ------------------------------------------------------------------ + print("\n[1/3] CPU benchmark (fp32)") + cpu_model = load_model("cpu") + cpu_input = torch.randn(INPUT_SHAPE, dtype=torch.float32) + print( + f" Warmup {parsed.warmup} iters, then timing {parsed.iters} iters ...", + flush=True, + ) + cpu_latencies = measure_latency( + cpu_model, cpu_input, "cpu", warmup=parsed.warmup, iters=parsed.iters + ) + print(f" Avg latency: {statistics.mean(cpu_latencies):.2f} ms") + del cpu_model + + # ------------------------------------------------------------------ + # CUDA benchmark + # ------------------------------------------------------------------ + cuda_latencies: Optional[List[float]] = None + if cuda_available: + print("\n[2/3] CUDA benchmark (fp16)") + cuda_model = load_model("cuda") + # fp16 input — matches model dtype on CUDA + cuda_input = torch.randn(INPUT_SHAPE, dtype=torch.float16, device="cuda") + print( + f" Warmup {parsed.warmup} iters, then timing {parsed.iters} iters ...", + flush=True, + ) + cuda_latencies = measure_latency( + cuda_model, cuda_input, "cuda", warmup=parsed.warmup, iters=parsed.iters + ) + print(f" Avg latency: {statistics.mean(cuda_latencies):.2f} ms") + del cuda_model + else: + print("\n[2/3] CUDA benchmark — skipped.") + + # ------------------------------------------------------------------ + # FLOPs measurement + # ------------------------------------------------------------------ + flops: Optional[int] = None + if not parsed.no_flops: + print("\n[3/3] FLOPs measurement (CPU fp32, single forward pass)") + flops = measure_flops() + if flops is not None: + print(f" Encoder FLOPs: {_fmt_flops(flops)}") + else: + print("\n[3/3] FLOPs measurement — skipped (--no-flops).") + + # ------------------------------------------------------------------ + # Results + # ------------------------------------------------------------------ + print_results(cpu_latencies, cuda_latencies, flops) + + # ------------------------------------------------------------------ + # JSON output (userbenchmark format) + # ------------------------------------------------------------------ + # Import here so the script can also be run standalone without the + # full torchbenchmark package on sys.path. 
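+    # get_output_json() wraps the metrics dict in the standard userbenchmark
+    # envelope ("name", "environ" with PyTorch version info, "metrics");
+    # dump_output() then writes it to the .userbenchmark/whisper/ output directory.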
+ try: + _repo_root = Path(__file__).parent.parent.parent + sys.path.insert(0, str(_repo_root)) + from userbenchmark.utils import dump_output, get_output_json + + metrics = build_metrics(cpu_latencies, cuda_latencies, flops) + output = get_output_json(BM_NAME, metrics) + dump_output(BM_NAME, output) + print(f"Metrics written to .userbenchmark/{BM_NAME}/metrics-.json") + except Exception as exc: + print(f"Warning: could not write JSON output — {exc}", file=sys.stderr) + + +# --------------------------------------------------------------------------- +# Allow direct execution: python userbenchmark/whisper/run.py +# --------------------------------------------------------------------------- +if __name__ == "__main__": + run(sys.argv[1:])