From ff2c7e7297ac55df99c0eb4bebd477ec5df1e456 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Wed, 1 Apr 2026 10:33:16 -0500 Subject: [PATCH 1/9] Implement benchmarking CI for python --- .../workflows/{bench_cub.yml => bench.yml} | 84 +- .../workflows/ci-workflow-pull-request.yml | 27 +- ci-overview.md | 6 +- ci/bench.template.yaml | 25 +- ci/bench.yaml | 25 +- ci/bench/README.md | 59 +- ci/bench/{cub.sh => bench.sh} | 0 ci/bench/compare_git_refs.sh | 2 +- ci/bench/compare_paths.sh | 798 +++++++++++++----- ci/bench/parse_bench_matrix.sh | 31 +- python/cuda_cccl/pyproject.toml | 6 +- 11 files changed, 771 insertions(+), 292 deletions(-) rename .github/workflows/{bench_cub.yml => bench.yml} (79%) rename ci/bench/{cub.sh => bench.sh} (100%) diff --git a/.github/workflows/bench_cub.yml b/.github/workflows/bench.yml similarity index 79% rename from .github/workflows/bench_cub.yml rename to .github/workflows/bench.yml index 7810c858fc7..b5d54526c11 100644 --- a/.github/workflows/bench_cub.yml +++ b/.github/workflows/bench.yml @@ -1,4 +1,4 @@ -name: CUB Benchmark Compare +name: Benchmark Compare defaults: run: @@ -18,7 +18,7 @@ on: default: "--cuda 13.1 --host gcc14" type: string arch: - description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/cub.sh --arch" + description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/bench.sh --arch" required: false default: "native" type: string @@ -32,8 +32,13 @@ on: required: false default: "" type: string - filters: - description: "Filters, space-separated if multiple (ex: '^cub.bench.copy.memcpy.base$' '.*foo.bar.*')" + cub_filters: + description: "CUB filters, space-separated (ex: '^cub.bench.copy.memcpy.base$')" + required: false + default: "" + type: string + python_filters: + description: "Python filters, space-separated (ex: 'compute/reduce/sum\\.py')" required: false default: "" type: string @@ -65,7 +70,7 @@ on: default: "--cuda 13.1 --host gcc14" type: string arch: - description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/cub.sh --arch" + description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/bench.sh --arch" required: false default: "native" type: string @@ -79,8 +84,13 @@ on: required: false default: "" type: string - filters: - description: "Filters, space-separated if multiple (ex: '^cub.bench.copy.memcpy.base$' '.*foo.bar.*')" + cub_filters: + description: "CUB filters, space-separated (ex: '^cub.bench.copy.memcpy.base$')" + required: false + default: "" + type: string + python_filters: + description: "Python filters, space-separated (ex: 'compute/reduce/sum\\.py')" required: false default: "" type: string @@ -146,7 +156,8 @@ jobs: INPUT_ARCH: ${{ inputs.arch }} INPUT_BASE_REF: ${{ inputs.base_ref }} INPUT_TEST_REF: ${{ inputs.test_ref }} - INPUT_FILTERS: ${{ inputs.filters }} + INPUT_CUB_FILTERS: ${{ inputs.cub_filters }} + INPUT_PYTHON_FILTERS: ${{ inputs.python_filters }} INPUT_RAW_ARGS: ${{ inputs.raw_args }} INPUT_NVBENCH_ARGS: ${{ inputs.nvbench_args }} INPUT_NVBENCH_COMPARE_ARGS: ${{ inputs.nvbench_compare_args }} @@ -178,8 +189,12 @@ jobs: mapfile -d '' -t bench_args < "${parsed_raw_args_file}" rm -f "${parsed_raw_args_file}" else - if [[ -z "${INPUT_BASE_REF}" || -z "${INPUT_TEST_REF}" || -z "${INPUT_FILTERS}" ]]; then - echo "::error::When Raw Args is empty, Base Ref, Test Ref, and Filters must all be set." + if [[ -z "${INPUT_BASE_REF}" || -z "${INPUT_TEST_REF}" ]]; then + echo "::error::When Raw Args is empty, Base Ref and Test Ref must be set." + exit 2 + fi + if [[ -z "${INPUT_CUB_FILTERS}" && -z "${INPUT_PYTHON_FILTERS}" ]]; then + echo "::error::At least one of CUB Filters or Python Filters must be set." exit 2 fi @@ -190,23 +205,40 @@ jobs: bench_args+=(--arch "${INPUT_ARCH}") fi - declare -a parsed_filters - parsed_filters=() - parsed_filters_file="$(mktemp "${RUNNER_TEMP}/bench-filters-XXXXXX")" - if ! parse_quoted_args "${INPUT_FILTERS}" > "${parsed_filters_file}"; then - rm -f "${parsed_filters_file}" - exit 2 + # Add CUB filters as --cub-filter flags. + if [[ -n "${INPUT_CUB_FILTERS}" ]]; then + declare -a parsed_cub_filters + parsed_cub_filters=() + parsed_cub_filters_file="$(mktemp "${RUNNER_TEMP}/bench-cub-filters-XXXXXX")" + if ! parse_quoted_args "${INPUT_CUB_FILTERS}" > "${parsed_cub_filters_file}"; then + rm -f "${parsed_cub_filters_file}" + exit 2 + fi + mapfile -d '' -t parsed_cub_filters < "${parsed_cub_filters_file}" + rm -f "${parsed_cub_filters_file}" + + for cub_filter in "${parsed_cub_filters[@]}"; do + bench_args+=(--cub-filter "${cub_filter}") + done fi - mapfile -d '' -t parsed_filters < "${parsed_filters_file}" - rm -f "${parsed_filters_file}" - if [[ "${#parsed_filters[@]}" -eq 0 ]]; then - echo "::error::Filters must parse to at least one argument." - exit 2 + # Add Python filters as --python-filter flags. + if [[ -n "${INPUT_PYTHON_FILTERS}" ]]; then + declare -a parsed_py_filters + parsed_py_filters=() + parsed_py_filters_file="$(mktemp "${RUNNER_TEMP}/bench-py-filters-XXXXXX")" + if ! parse_quoted_args "${INPUT_PYTHON_FILTERS}" > "${parsed_py_filters_file}"; then + rm -f "${parsed_py_filters_file}" + exit 2 + fi + mapfile -d '' -t parsed_py_filters < "${parsed_py_filters_file}" + rm -f "${parsed_py_filters_file}" + + for py_filter in "${parsed_py_filters[@]}"; do + bench_args+=(--python-filter "${py_filter}") + done fi - bench_args+=("${parsed_filters[@]}") - if [[ -n "${INPUT_NVBENCH_ARGS}" ]]; then bench_args+=(--nvbench-args "${INPUT_NVBENCH_ARGS}") fi @@ -234,7 +266,7 @@ jobs: - name: Show resolved benchmark args run: | - echo "Resolved args passed to ci/bench/cub.sh:" + echo "Resolved args passed to ci/bench/bench.sh:" while IFS= read -r arg; do echo " ${arg}" done <<< "${{ steps.resolve-args.outputs.resolved_args }}" @@ -263,7 +295,7 @@ jobs: base_sha_short="${base_sha_full:0:8}" test_sha_short="${test_sha_full:0:8}" - artifact_name="bench-cub-${GPU_NAME}-${timestamp_utc}-${base_sha_short}-${test_sha_short}" + artifact_name="bench-${GPU_NAME}-${timestamp_utc}-${base_sha_short}-${test_sha_short}" mkdir -p "bench-artifacts" @@ -289,7 +321,7 @@ jobs: jsondiff \ tabulate - ./ci/bench/cub.sh "${bench_args[@]}" + ./ci/bench/bench.sh "${bench_args[@]}" EOF chmod +x "${ci_script}" diff --git a/.github/workflows/ci-workflow-pull-request.yml b/.github/workflows/ci-workflow-pull-request.yml index 35a7acc08a3..b94b89e337a 100644 --- a/.github/workflows/ci-workflow-pull-request.yml +++ b/.github/workflows/ci-workflow-pull-request.yml @@ -76,7 +76,7 @@ jobs: run: | echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}" echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}" - - name: Build CUB benchmark dispatch matrix + - name: Build benchmark dispatch matrix id: build-bench-matrix run: | { # Defaults: @@ -86,7 +86,7 @@ jobs: # Compare ci/bench.yaml against its template. If they match, no benchmarks requested. if diff -q "ci/bench.template.yaml" "ci/bench.yaml" > /dev/null 2>&1; then - echo "ci/bench.yaml matches template; skipping CUB benchmark dispatch matrix." + echo "ci/bench.yaml matches template; skipping benchmark dispatch." exit 0 fi @@ -395,8 +395,8 @@ jobs: print('All CPU-only import tests passed!') " - dispatch-cub-bench: - name: CUB Bench Compare (${{ matrix.gpu }}) + dispatch-bench: + name: Bench Compare (${{ matrix.gpu }}) if: >- ${{ needs.build-workflow.outputs.bench_enabled == 'true' && @@ -409,14 +409,15 @@ jobs: strategy: fail-fast: false matrix: ${{ fromJSON(needs.build-workflow.outputs.bench_matrix) }} - uses: ./.github/workflows/bench_cub.yml + uses: ./.github/workflows/bench.yml with: gpu: ${{ matrix.gpu }} launch_args: ${{ matrix.launch_args }} arch: ${{ matrix.arch }} base_ref: ${{ matrix.base_ref }} test_ref: ${{ matrix.test_ref }} - filters: ${{ matrix.filters }} + cub_filters: ${{ matrix.cub_filters }} + python_filters: ${{ matrix.python_filters }} nvbench_args: ${{ matrix.nvbench_args }} nvbench_compare_args: ${{ matrix.nvbench_compare_args }} @@ -428,7 +429,7 @@ jobs: needs.build-workflow.outputs.bench_enabled == 'true' && needs.build-workflow.outputs.pr_number != '' }} - needs: [build-workflow, dispatch-cub-bench] + needs: [build-workflow, dispatch-bench] permissions: pull-requests: write runs-on: ubuntu-latest @@ -436,7 +437,7 @@ jobs: - name: Determine outcome id: outcome run: | - result="${{ needs.dispatch-cub-bench.result }}" + result="${{ needs.dispatch-bench.result }}" case "${result}" in success) icon=":white_check_mark:"; status="completed successfully" ;; failure) icon=":x:"; status="had failures" ;; @@ -452,7 +453,7 @@ jobs: message: | ## ${{ steps.outcome.outputs.icon }} Benchmark Results - CUB benchmark comparison ${{ steps.outcome.outputs.status }}. + Benchmark comparison ${{ steps.outcome.outputs.status }}. **[Results](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})** **[Artifacts](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)** @@ -473,7 +474,7 @@ jobs: - verify-devcontainers - docs-build - test-cpu-import - - dispatch-cub-bench + - dispatch-bench runs-on: ubuntu-latest steps: - name: Check results @@ -498,12 +499,12 @@ jobs: check_result "docs-build" "success" "${{needs.docs-build.result}}" check_result "test-cpu-import" "success" "${{needs.test-cpu-import.result}}" - expected_cub_bench_result="skipped" + expected_bench_result="skipped" if [[ "${{ needs.build-workflow.outputs.bench_enabled }}" == "true" ]] \ && [[ "${{ toJSON(fromJSON(needs.build-workflow.outputs.bench_matrix).include) }}" != "[]" ]]; then - expected_cub_bench_result="success" + expected_bench_result="success" fi - check_result "dispatch-cub-bench" "${expected_cub_bench_result}" "${{needs.dispatch-cub-bench.result}}" + check_result "dispatch-bench" "${expected_bench_result}" "${{needs.dispatch-bench.result}}" bench_enabled="${{ needs.build-workflow.outputs.bench_enabled }}" if [[ "${bench_enabled}" == "true" ]]; then diff --git a/ci-overview.md b/ci-overview.md index 850e40d885f..1cd8fe2ba2a 100644 --- a/ci-overview.md +++ b/ci-overview.md @@ -71,11 +71,11 @@ CCCL's CI uses [`sccache`](https://github.com/mozilla/sccache) to cache compiler CI jobs employ the build and test scripts in the `ci/` directory to build and run tests. These scripts provide a consistent entry point for building and testing in both local and CI environments. For more information on using these scripts, see the [CONTRIBUTING.md guide](CONTRIBUTING.md#building-and-testing). -#### CUB Benchmark Comparison Workflow +#### Benchmark Comparison Workflow -The standalone CUB benchmark comparison workflow is implemented in `.github/workflows/bench_cub.yml` and uses: +The benchmark comparison workflow is implemented in `.github/workflows/bench.yml` and uses: -- `ci/bench/cub.sh` +- `ci/bench/bench.sh` - `ci/bench/compare_git_refs.sh` - `ci/bench/compare_paths.sh` diff --git a/ci/bench.template.yaml b/ci/bench.template.yaml index 0527f349a96..fbb48fc6048 100644 --- a/ci/bench.template.yaml +++ b/ci/bench.template.yaml @@ -1,8 +1,8 @@ -# # CUB PR benchmark request config. +# # CCCL PR benchmark request config. # # ## Overview: # -# This file is used to request CUB benchmark comparisons in PR CI. +# This file is used to request benchmark comparisons in PR CI. # # This file must match ci/bench.template.yaml to merge. # CI branch protections will fail if they differ. Reset before merging. @@ -17,18 +17,27 @@ # # ## Quick start: # -# 1. Add one or more benchmark regexes under benchmarks.filters. +# 1. Add one or more benchmark regexes under cub and/or python filters. # 2. Enable at least one GPU by uncommenting or adding entries in benchmarks.gpus. # 3. Push and inspect the dispatched benchmark jobs/artifacts. # 4. Remove/reset benchmark-request edits before final merge. benchmarks: - # Inclusive regex filters (required). - filters: - # Examples: - # - '^cub\.bench\.for_each\.base' - # - '^cub\.bench\.reduce\.(sum|min)\.' + # CUB C++ benchmark filters (regex matched against ninja target names). + cub: + filters: + # Examples: + # - '^cub\.bench\.for_each\.base' + # - '^cub\.bench\.reduce\.(sum|min)\.' + + # Python benchmark filters (regex matched against paths under benchmarks/). + python: + filters: + # Examples: + # - 'compute/reduce/sum\.py' + # - 'compute/transform/.*\.py' + # - 'coop/bench_warp_reduce\.py' # Select GPUs. These are limited and shared, be intentional and conservative. gpus: diff --git a/ci/bench.yaml b/ci/bench.yaml index 0527f349a96..fbb48fc6048 100644 --- a/ci/bench.yaml +++ b/ci/bench.yaml @@ -1,8 +1,8 @@ -# # CUB PR benchmark request config. +# # CCCL PR benchmark request config. # # ## Overview: # -# This file is used to request CUB benchmark comparisons in PR CI. +# This file is used to request benchmark comparisons in PR CI. # # This file must match ci/bench.template.yaml to merge. # CI branch protections will fail if they differ. Reset before merging. @@ -17,18 +17,27 @@ # # ## Quick start: # -# 1. Add one or more benchmark regexes under benchmarks.filters. +# 1. Add one or more benchmark regexes under cub and/or python filters. # 2. Enable at least one GPU by uncommenting or adding entries in benchmarks.gpus. # 3. Push and inspect the dispatched benchmark jobs/artifacts. # 4. Remove/reset benchmark-request edits before final merge. benchmarks: - # Inclusive regex filters (required). - filters: - # Examples: - # - '^cub\.bench\.for_each\.base' - # - '^cub\.bench\.reduce\.(sum|min)\.' + # CUB C++ benchmark filters (regex matched against ninja target names). + cub: + filters: + # Examples: + # - '^cub\.bench\.for_each\.base' + # - '^cub\.bench\.reduce\.(sum|min)\.' + + # Python benchmark filters (regex matched against paths under benchmarks/). + python: + filters: + # Examples: + # - 'compute/reduce/sum\.py' + # - 'compute/transform/.*\.py' + # - 'coop/bench_warp_reduce\.py' # Select GPUs. These are limited and shared, be intentional and conservative. gpus: diff --git a/ci/bench/README.md b/ci/bench/README.md index 17609e47045..990d1508134 100644 --- a/ci/bench/README.md +++ b/ci/bench/README.md @@ -1,32 +1,40 @@ -# CUB Benchmark Compare Scripts +# Benchmark Compare Scripts -This directory contains the scripts used by `.github/workflows/bench_cub.yml` to compare CUB benchmark results between two code states. +This directory contains the scripts used by `.github/workflows/bench.yml` to compare benchmark results between two code states. ## Scripts -- `ci/bench/cub.sh`: CI-oriented wrapper that calls `ci/bench/compare_git_refs.sh`. +- `ci/bench/bench.sh`: CI-oriented wrapper that calls `ci/bench/compare_git_refs.sh`. - `ci/bench/compare_git_refs.sh`: checks out `` and `` in temporary worktrees, then forwards all remaining args to `ci/bench/compare_paths.sh`. -- `ci/bench/compare_paths.sh`: configures/builds/runs common `cub.bench.*` targets in two source trees and runs `nvbench_compare.py` on produced JSON outputs. -- `ci/bench/parse_bench_matrix.sh`: parses `ci/bench.yaml` and emits a dispatch matrix JSON object for `.github/workflows/bench_cub.yml`. +- `ci/bench/compare_paths.sh`: configures/builds/runs CUB benchmarks and/or Python benchmarks in two source trees and runs comparison tools on produced JSON outputs. +- `ci/bench/parse_bench_matrix.sh`: parses `ci/bench.yaml` and emits a dispatch matrix JSON object for `.github/workflows/bench.yml`. ## Usage -Compare two refs: +Compare CUB benchmarks between two refs: ```bash -"./ci/bench/cub.sh" "origin/main" "HEAD" "^cub\\.bench\\.copy\\.memcpy\\.base$" +"./ci/bench/bench.sh" "origin/main" "HEAD" \ + --cub-filter "^cub\\.bench\\.copy\\.memcpy\\.base$" ``` -Forward additional options (parsed by `compare_paths.sh`): +Compare Python benchmarks between two refs: ```bash -"./ci/bench/cub.sh" \ +"./ci/bench/bench.sh" "origin/main" "HEAD" \ + --python-filter "compute/reduce/sum\\.py" +``` + +Run both CUB and Python benchmarks: + +```bash +"./ci/bench/bench.sh" \ "origin/main" \ "HEAD" \ --arch "native" \ --nvbench-args "..." \ - --nvbench-compare-args "..." \ - "^cub\\.bench\\.reduce\\..*$" + --cub-filter "^cub\\.bench\\.reduce\\..*$" \ + --python-filter "compute/reduce/sum\\.py" ``` Compare already checked-out trees: @@ -36,21 +44,40 @@ Compare already checked-out trees: "/path/to/base/cccl" \ "/path/to/test/cccl" \ --arch "native" \ - "^cub\\.bench\\.copy\\.memcpy\\.base$" + --cub-filter "^cub\\.bench\\.copy\\.memcpy\\.base$" \ + --python-filter "compute/transform/.*\\.py" ``` ## Workflow Inputs -In `.github/workflows/bench_cub.yml`: +In `.github/workflows/bench.yml`: -- If `raw_args` is non-empty, it is parsed and passed directly to `ci/bench/cub.sh`. -- Otherwise, args are assembled from `base_ref`, `test_ref`, `arch`, `filters`, `nvbench_args`, and `nvbench_compare_args`. +- If `raw_args` is non-empty, it is parsed and passed directly to `ci/bench/bench.sh`. +- Otherwise, args are assembled from `base_ref`, `test_ref`, `arch`, `cub_filters`, `python_filters`, `nvbench_args`, and `nvbench_compare_args`. +- CUB filters are passed as `--cub-filter` flags. Python filters are passed as `--python-filter` flags. - Malformed quoted input (for example unmatched quotes) fails the workflow step. +## Python Benchmarks + +Python benchmarks live under `python/cuda_cccl/benchmarks/` and use `cuda.bench` (the Python nvbench bindings). Each benchmark script outputs nvbench-compatible JSON. + +For Python benchmarks, `compare_paths.sh`: + +1. Creates isolated virtual environments for base and test trees. +2. Installs `cuda-cccl[bench-cuXX]` (editable, from each worktree), which pulls in `cuda-bench`, `cupy`, and all other benchmark dependencies. +3. Runs matching benchmark scripts in each venv. +4. Compares results using `nvbench-compare` (installed with `cuda-bench`). + +Python filters are regex patterns matched against relative paths under `python/cuda_cccl/benchmarks/`, for example: +- `compute/reduce/sum\.py` — single benchmark +- `compute/transform/.*\.py` — all transform benchmarks +- `coop/.*\.py` — all coop benchmarks + ## Artifacts `compare_paths.sh` writes a run directory under `${CCCL_BENCH_ARTIFACT_ROOT:-$(pwd)/bench-artifacts}` containing: - per-target JSON and markdown outputs for base/test runs, - grouped build logs (`build.base.log`, `build.test.log`), per-target run logs, and per-target compare logs (`compare..log`), -- `summary.md` with run metadata and per-target collapsible full `nvbench_compare.py` reports. +- Python venv setup logs (`py.venv.base.log`, `py.venv.test.log`), +- `summary.md` with run metadata and per-target collapsible full compare reports. diff --git a/ci/bench/cub.sh b/ci/bench/bench.sh similarity index 100% rename from ci/bench/cub.sh rename to ci/bench/bench.sh diff --git a/ci/bench/compare_git_refs.sh b/ci/bench/compare_git_refs.sh index d2c0765fdab..540f1cbdd2b 100755 --- a/ci/bench/compare_git_refs.sh +++ b/ci/bench/compare_git_refs.sh @@ -13,7 +13,7 @@ usage() { cat < [compare_paths args...] -Compare CUB benchmark performance between two git refs from the current CCCL repo. +Compare benchmark performance between two git refs from the current CCCL repo. Each ref is checked out in an isolated worktree and compared via compare_paths.sh. EOF } diff --git a/ci/bench/compare_paths.sh b/ci/bench/compare_paths.sh index ffb30c02580..4e3a221ceb2 100755 --- a/ci/bench/compare_paths.sh +++ b/ci/bench/compare_paths.sh @@ -11,17 +11,30 @@ die() { usage() { cat < [filter1 [filter2 ...]] \ +Usage: $0 \ + [--cub-filter ""] \ + [--python-filter ""] \ [--arch ""] \ [--nvbench-args ""] \ [--nvbench-compare-args ""] -Compare CUB benchmark performance between two checked-out CCCL trees. +Compare benchmark performance between two checked-out CCCL trees. + +At least one --cub-filter or --python-filter must be provided. +CUB filters are regex patterns matched against ninja target names. +Python filters are regex patterns matched against benchmark script paths +under python/cuda_cccl/benchmarks/ (e.g. compute/reduce/sum.py). Arguments: Path to baseline CCCL source tree. Path to comparison CCCL source tree. - [filterN] Optional regex filters matched against benchmark target names. + +Options: + --cub-filter CUB benchmark regex filter (repeatable). + --python-filter Python benchmark regex filter (repeatable). + --arch CMAKE_CUDA_ARCHITECTURES for CUB builds. + --nvbench-args Extra args passed to benchmark binaries/scripts. + --nvbench-compare-args Extra args passed to nvbench_compare. Environment: CCCL_BENCH_ARTIFACT_ROOT Root directory for outputs. @@ -74,11 +87,13 @@ validate_repo_path() { fi } -validate_filters() { +validate_filter_array() { + local -n _validate_filters_ref="$1" + local label="$2" local filter="" - for filter in "${FILTERS[@]}"; do + for filter in "${_validate_filters_ref[@]}"; do grep -Eq -- "${filter}" <<< "" >/dev/null 2>&1 || { - [[ "$?" -eq 1 ]] || die "Invalid regex filter: ${filter}" + [[ "$?" -eq 1 ]] || die "Invalid ${label} regex filter: ${filter}" } done } @@ -103,6 +118,10 @@ print_shell_command() { printf '\n' } +# ============================================================================ +# CUB helpers +# ============================================================================ + configure_build_tree() { local src_path="$1" local build_path="$2" @@ -165,6 +184,259 @@ resolve_compare_script() { return 1 } +run_target_for_side() { + local side="$1" + local build_path="$2" + local target="$3" + local json_path="$4" + local md_path="$5" + local run_log="$6" + local binary_path="${build_path}/bin/${target}" + local -a bench_cmd + + if [[ ! -x "${binary_path}" ]]; then + echo "Benchmark binary missing: ${binary_path}" >&2 + return 127 + fi + + bench_cmd=( + "${binary_path}" + -d 0 + "${NVBENCH_RUN_ARGS[@]}" + --json "${json_path}" + --md "${md_path}" + ) + + run_grouped_logged_command \ + "[run:${side}] ${target}" \ + "${run_log}" \ + "${bench_cmd[@]}" +} + +select_targets() { + local base_build_path="$1" + local test_build_path="$2" + local -n selected_targets_ref="$3" + local -a base_targets + local -a test_targets + local -a common_targets + local target="" + + mapfile -t base_targets < <(list_all_benchmark_targets "${base_build_path}") + mapfile -t test_targets < <(list_all_benchmark_targets "${test_build_path}") + + if [[ "${#base_targets[@]}" -eq 0 ]]; then + die "No CUB benchmark targets were found in base build tree." 1 + fi + if [[ "${#test_targets[@]}" -eq 0 ]]; then + die "No CUB benchmark targets were found in test build tree." 1 + fi + + mapfile -t common_targets < <( + comm -12 \ + <(printf "%s\n" "${base_targets[@]}" | sort -u) \ + <(printf "%s\n" "${test_targets[@]}" | sort -u) + ) + + selected_targets_ref=() + for target in "${common_targets[@]}"; do + [[ -n "${target}" ]] || continue + if target_matches_filters "${target}"; then + selected_targets_ref+=("${target}") + fi + done + + if [[ "${#selected_targets_ref[@]}" -eq 0 ]]; then + die "No CUB benchmark targets matched the supplied filters." 1 + fi +} + +# ============================================================================ +# Python helpers +# ============================================================================ + +detect_cuda_major_version() { + local cuda_major="" + if command -v nvcc >/dev/null 2>&1; then + cuda_major="$(nvcc --version 2>/dev/null | sed -n 's/.*release \([0-9]*\)\..*/\1/p')" + fi + if [[ -z "${cuda_major}" ]]; then + cuda_major="12" + fi + printf "%s" "${cuda_major}" +} + +python_path_to_target_name() { + local py_path="$1" + # compute/reduce/sum.py -> py.compute.reduce.sum + local name="${py_path%.py}" + name="${name//\//.}" + printf "py.%s" "${name}" +} + +list_all_python_benchmarks() { + local benchmarks_path="$1" + if [[ ! -d "${benchmarks_path}" ]]; then + return 0 + fi + find "${benchmarks_path}" -name '*.py' -type f \ + ! -name 'utils.py' \ + ! -name 'run_benchmarks.py' \ + ! -name 'device_side_benchmark.py' \ + ! -name '__init__.py' \ + ! -path '*/__pycache__/*' \ + -printf '%P\n' \ + | sort -u +} + +python_target_matches_filters() { + local target="$1" + local filter="" + for filter in "${PYTHON_FILTERS[@]}"; do + if grep -Eq -- "${filter}" <<< "${target}"; then + return 0 + fi + done + return 1 +} + +select_python_targets() { + local base_bench_path="$1" + local test_bench_path="$2" + local -n selected_py_targets_ref="$3" + local -a base_py_targets + local -a test_py_targets + local -a common_py_targets + local target="" + + mapfile -t base_py_targets < <(list_all_python_benchmarks "${base_bench_path}") + mapfile -t test_py_targets < <(list_all_python_benchmarks "${test_bench_path}") + + if [[ "${#base_py_targets[@]}" -eq 0 ]]; then + die "No Python benchmark scripts were found in base tree: ${base_bench_path}" 1 + fi + if [[ "${#test_py_targets[@]}" -eq 0 ]]; then + die "No Python benchmark scripts were found in test tree: ${test_bench_path}" 1 + fi + + mapfile -t common_py_targets < <( + comm -12 \ + <(printf "%s\n" "${base_py_targets[@]}" | sort -u) \ + <(printf "%s\n" "${test_py_targets[@]}" | sort -u) + ) + + selected_py_targets_ref=() + for target in "${common_py_targets[@]}"; do + [[ -n "${target}" ]] || continue + if python_target_matches_filters "${target}"; then + selected_py_targets_ref+=("${target}") + fi + done + + if [[ "${#selected_py_targets_ref[@]}" -eq 0 ]]; then + die "No Python benchmark scripts matched the supplied --python-filter patterns." 1 + fi +} + +setup_python_venv() { + local venv_path="$1" + local src_path="$2" + local side="$3" + local log_path="$4" + local cuda_major="$5" + local cuda_cccl_dir="${src_path}/python/cuda_cccl" + + if [[ ! -d "${cuda_cccl_dir}" ]]; then + die "cuda_cccl source directory not found: ${cuda_cccl_dir}" + fi + + local -a setup_cmds + setup_cmds=( + bash -c " + set -euo pipefail + python3 -m venv '${venv_path}' + '${venv_path}/bin/pip' install --upgrade pip + '${venv_path}/bin/pip' install -e '${cuda_cccl_dir}[bench-cu${cuda_major}]' + # nvbench-compare runtime deps (until cuda-bench declares them): + '${venv_path}/bin/pip' install colorama jsondiff tabulate + " + ) + + run_grouped_logged_command \ + "[py-venv:${side}]" \ + "${log_path}" \ + "${setup_cmds[@]}" +} + +run_python_target_for_side() { + local side="$1" + local venv_path="$2" + local script_path="$3" + local json_path="$4" + local md_path="$5" + local run_log="$6" + local -a bench_cmd + + if [[ ! -f "${script_path}" ]]; then + echo "Python benchmark script missing: ${script_path}" >&2 + return 127 + fi + + bench_cmd=( + "${venv_path}/bin/python" + "${script_path}" + -d 0 + "${NVBENCH_RUN_ARGS[@]}" + --json "${json_path}" + --md "${md_path}" + ) + + run_grouped_logged_command \ + "[py-run:${side}] ${script_path##*/benchmarks/}" \ + "${run_log}" \ + "${bench_cmd[@]}" +} + +run_python_compare_target() { + local target_name="$1" + local venv_path="$2" + local base_json="$3" + local test_json="$4" + local compare_out="$5" + local compare_log="$6" + + local label="[py-compare] ${target_name}" + local started_at=0 + local elapsed_s=0 + local rc=0 + local -a compare_cmd + compare_cmd=("${venv_path}/bin/nvbench-compare" "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}") + + : > "${compare_log}" + echo "::group::${label}" + print_shell_command "${compare_cmd[@]}" + started_at="${SECONDS}" + if "${compare_cmd[@]}" \ + > >(tee "${compare_out}" | tee -a "${compare_log}") \ + 2> >(tee -a "${compare_log}" >&2); then + rc=0 + else + rc=$? + fi + elapsed_s=$((SECONDS - started_at)) + echo "::endgroup::" + if [[ "${rc}" -eq 0 ]]; then + echo "${label} completed in ${elapsed_s}s" + else + echo "${label} failed in ${elapsed_s}s (rc=${rc})" + fi + return "${rc}" +} + +# ============================================================================ +# Common helpers +# ============================================================================ + run_grouped_logged_command() { local label="$1" local log_path="$2" @@ -278,72 +550,9 @@ parse_quoted_args_to_array() { rm -f "${parsed_args_file}" } -run_target_for_side() { - local side="$1" - local build_path="$2" - local target="$3" - local json_path="$4" - local md_path="$5" - local run_log="$6" - local binary_path="${build_path}/bin/${target}" - local -a bench_cmd - - if [[ ! -x "${binary_path}" ]]; then - echo "Benchmark binary missing: ${binary_path}" >&2 - return 127 - fi - - bench_cmd=( - "${binary_path}" - -d 0 - "${NVBENCH_RUN_ARGS[@]}" - --json "${json_path}" - --md "${md_path}" - ) - - run_grouped_logged_command \ - "[run:${side}] ${target}" \ - "${run_log}" \ - "${bench_cmd[@]}" -} - -select_targets() { - local base_build_path="$1" - local test_build_path="$2" - local -n selected_targets_ref="$3" - local -a base_targets - local -a test_targets - local -a common_targets - local target="" - - mapfile -t base_targets < <(list_all_benchmark_targets "${base_build_path}") - mapfile -t test_targets < <(list_all_benchmark_targets "${test_build_path}") - - if [[ "${#base_targets[@]}" -eq 0 ]]; then - die "No CUB benchmark targets were found in base build tree." 1 - fi - if [[ "${#test_targets[@]}" -eq 0 ]]; then - die "No CUB benchmark targets were found in test build tree." 1 - fi - - mapfile -t common_targets < <( - comm -12 \ - <(printf "%s\n" "${base_targets[@]}" | sort -u) \ - <(printf "%s\n" "${test_targets[@]}" | sort -u) - ) - - selected_targets_ref=() - for target in "${common_targets[@]}"; do - [[ -n "${target}" ]] || continue - if target_matches_filters "${target}"; then - selected_targets_ref+=("${target}") - fi - done - - if [[ "${#selected_targets_ref[@]}" -eq 0 ]]; then - die "No benchmark targets matched the supplied filters." 1 - fi -} +# ============================================================================ +# Summary +# ============================================================================ write_summary() { local summary_file="$1" @@ -352,7 +561,7 @@ write_summary() { local reports_emitted=0 { - echo "# CUB Benchmark Comparison Summary" + echo "# Benchmark Comparison Summary" echo echo "- Timestamp (UTC): ${timestamp}" echo "- GPU name: ${CCCL_BENCH_GPU_NAME:-not specified}" @@ -360,39 +569,77 @@ write_summary() { echo "- Test label: ${test_label_raw}" echo "- Base source path: \`${BASE_PATH}\`" echo "- Test source path: \`${TEST_PATH}\`" - echo "- Base build dir: \`${base_build_dir}\`" - echo "- Test build dir: \`${test_build_dir}\`" - echo "- Selected targets: ${#selected_targets[@]}" - echo "- Comparisons attempted: ${compares_attempted}" - echo "- Comparisons succeeded (nvbench_compare exit 0): ${compares_succeeded}" + if [[ "${#FILTERS[@]}" -gt 0 ]]; then + echo "- Base build dir: \`${base_build_dir}\`" + echo "- Test build dir: \`${test_build_dir}\`" + fi + echo "- CUB targets selected: ${#selected_targets[@]}" + echo "- CUB comparisons attempted: ${compares_attempted}" + echo "- CUB comparisons succeeded: ${compares_succeeded}" + echo "- Python targets selected: ${#selected_py_targets[@]}" + echo "- Python comparisons attempted: ${py_compares_attempted}" + echo "- Python comparisons succeeded: ${py_compares_succeeded}" echo "- Target arch: ${TARGET_ARCH:-preset-default}" echo "- Artifact directory: \`${artifact_dir}\`" echo - echo "## Filters" + if [[ "${#FILTERS[@]}" -gt 0 ]]; then + echo "## CUB Filters" for filter in "${FILTERS[@]}"; do echo "- \`${filter}\`" done - else - echo "- (none)" - fi - echo - echo "## Compare Reports" - for target in "${selected_targets[@]}"; do - compare_report_file="${artifact_dir}/compare/${target}.md" - if [[ ! -f "${compare_report_file}" ]]; then - continue - fi - reports_emitted=$((reports_emitted + 1)) - echo - echo "### \`${target}\`" - echo - echo "
Expand full compare output for \`${target}\`" echo - cat "${compare_report_file}" + fi + + if [[ "${#PYTHON_FILTERS[@]}" -gt 0 ]]; then + echo "## Python Filters" + for filter in "${PYTHON_FILTERS[@]}"; do + echo "- \`${filter}\`" + done echo - echo "
" - done + fi + + if [[ "${#selected_targets[@]}" -gt 0 ]]; then + echo "## CUB Compare Reports" + for target in "${selected_targets[@]}"; do + compare_report_file="${artifact_dir}/compare/${target}.md" + if [[ ! -f "${compare_report_file}" ]]; then + continue + fi + reports_emitted=$((reports_emitted + 1)) + echo + echo "### \`${target}\`" + echo + echo "
Expand full compare output for \`${target}\`" + echo + cat "${compare_report_file}" + echo + echo "
" + done + fi + + if [[ "${#selected_py_targets[@]}" -gt 0 ]]; then + echo "## Python Compare Reports" + local py_target_path="" + local py_target_name="" + for py_target_path in "${selected_py_targets[@]}"; do + py_target_name="$(python_path_to_target_name "${py_target_path}")" + compare_report_file="${artifact_dir}/compare/${py_target_name}.md" + if [[ ! -f "${compare_report_file}" ]]; then + continue + fi + reports_emitted=$((reports_emitted + 1)) + echo + echo "### \`${py_target_name}\` (\`${py_target_path}\`)" + echo + echo "
Expand full compare output for \`${py_target_name}\`" + echo + cat "${compare_report_file}" + echo + echo "
" + done + fi + if [[ "${reports_emitted}" -eq 0 ]]; then echo echo "_No per-target compare reports were produced._" @@ -400,6 +647,10 @@ write_summary() { } > "${summary_file}" } +# ============================================================================ +# CLI parsing +# ============================================================================ + parse_cli_args() { if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then usage @@ -418,6 +669,7 @@ parse_cli_args() { NVBENCH_COMPARE_ARGS_STRING="" TARGET_ARCH="" FILTERS=() + PYTHON_FILTERS=() while [[ "$#" -gt 0 ]]; do case "$1" in --arch) @@ -441,14 +693,26 @@ parse_cli_args() { NVBENCH_COMPARE_ARGS_STRING="$2" shift 2 ;; + --cub-filter) + if [[ "$#" -lt 2 ]]; then + die "Missing value for --cub-filter" + fi + FILTERS+=("$2") + shift 2 + ;; + --python-filter) + if [[ "$#" -lt 2 ]]; then + die "Missing value for --python-filter" + fi + PYTHON_FILTERS+=("$2") + shift 2 + ;; --) shift - FILTERS+=("$@") break ;; *) - FILTERS+=("$1") - shift + die "Unknown option: $1" ;; esac done @@ -465,7 +729,12 @@ parse_quoted_args_to_array NVBENCH_COMPARE_ARGS "${NVBENCH_COMPARE_ARGS_STRING}" validate_repo_path "${BASE_PATH}" validate_repo_path "${TEST_PATH}" -validate_filters +validate_filter_array FILTERS "CUB" +validate_filter_array PYTHON_FILTERS "Python" + +# ============================================================================ +# Common setup +# ============================================================================ timestamp="$(date -u +'%Y%m%dT%H%M%SZ')" base_label_raw="${CCCL_BENCH_BASE_LABEL:-$(resolve_repo_label "${BASE_PATH}")}" @@ -484,21 +753,6 @@ build_token="$(sanitize_label "${test_label}-${timestamp}-${base_label}")" base_build_dir="${build_root}/base-${build_token}" test_build_dir="${build_root}/test-${build_token}" -external_base_build_dir="${CCCL_BENCH_BASE_BUILD_DIR:-}" -external_test_build_dir="${CCCL_BENCH_TEST_BUILD_DIR:-}" -if [[ -n "${external_base_build_dir}" || -n "${external_test_build_dir}" ]]; then - if [[ -z "${external_base_build_dir}" || -z "${external_test_build_dir}" ]]; then - die "Both CCCL_BENCH_BASE_BUILD_DIR and CCCL_BENCH_TEST_BUILD_DIR must be set together." - fi - base_build_dir="$(realpath "${external_base_build_dir}")" - test_build_dir="$(realpath "${external_test_build_dir}")" - validate_build_dir "${base_build_dir}" "base" - validate_build_dir "${test_build_dir}" "test" - if [[ -n "${TARGET_ARCH}" ]]; then - echo "Warning: --arch is ignored when using preconfigured build directories." >&2 - fi -fi - for subdir in base compare logs meta test; do mkdir -p "${artifact_dir}/${subdir}" done @@ -511,12 +765,20 @@ fi echo "Base source: ${BASE_PATH}" echo "Test source: ${TEST_PATH}" if [[ "${#FILTERS[@]}" -gt 0 ]]; then - echo "Filters:" + echo "CUB filters:" for filter in "${FILTERS[@]}"; do echo " - ${filter}" done else - echo "Filters: (none, all benchmark targets)" + echo "CUB filters: (none)" +fi +if [[ "${#PYTHON_FILTERS[@]}" -gt 0 ]]; then + echo "Python filters:" + for filter in "${PYTHON_FILTERS[@]}"; do + echo " - ${filter}" + done +else + echo "Python filters: (none)" fi if [[ -n "${TARGET_ARCH}" ]]; then echo "Target arch: ${TARGET_ARCH}" @@ -534,113 +796,239 @@ if [[ "${#NVBENCH_COMPARE_ARGS[@]}" -gt 0 ]]; then done fi -if [[ -n "${external_base_build_dir}" ]]; then - echo "[configure:base] skipped (using existing build tree)" - echo "[configure:test] skipped (using existing build tree)" -else - configure_build_tree "${BASE_PATH}" "${base_build_dir}" "base" "${artifact_dir}/logs/configure.base.log" "${TARGET_ARCH}" - configure_build_tree "${TEST_PATH}" "${test_build_dir}" "test" "${artifact_dir}/logs/configure.test.log" "${TARGET_ARCH}" -fi +any_failures=0 +compares_attempted=0 +compares_succeeded=0 +declare -a selected_targets=() +py_compares_attempted=0 +py_compares_succeeded=0 +declare -a selected_py_targets=() -declare -a selected_targets -select_targets "${base_build_dir}" "${test_build_dir}" selected_targets +# ============================================================================ +# CUB benchmark pipeline +# ============================================================================ -printf "%s\n" "${selected_targets[@]}" > "${artifact_dir}/meta/selected_targets.txt" +if [[ "${#FILTERS[@]}" -gt 0 ]]; then + echo + echo "=== CUB Benchmark Pipeline ===" + echo + + external_base_build_dir="${CCCL_BENCH_BASE_BUILD_DIR:-}" + external_test_build_dir="${CCCL_BENCH_TEST_BUILD_DIR:-}" + if [[ -n "${external_base_build_dir}" || -n "${external_test_build_dir}" ]]; then + if [[ -z "${external_base_build_dir}" || -z "${external_test_build_dir}" ]]; then + die "Both CCCL_BENCH_BASE_BUILD_DIR and CCCL_BENCH_TEST_BUILD_DIR must be set together." + fi + base_build_dir="$(realpath "${external_base_build_dir}")" + test_build_dir="$(realpath "${external_test_build_dir}")" + validate_build_dir "${base_build_dir}" "base" + validate_build_dir "${test_build_dir}" "test" + if [[ -n "${TARGET_ARCH}" ]]; then + echo "Warning: --arch is ignored when using preconfigured build directories." >&2 + fi + fi -compare_script="$(resolve_compare_script "${test_build_dir}" || true)" -if [[ -z "${compare_script}" ]]; then - compare_script="$(resolve_compare_script "${base_build_dir}" || true)" -fi -if [[ -z "${compare_script}" ]]; then - die "Unable to locate nvbench_compare.py in build dependencies." 1 -fi -compare_script_dir="$(dirname "${compare_script}")" + if [[ -z "${external_base_build_dir:-}" ]]; then + configure_build_tree "${BASE_PATH}" "${base_build_dir}" "base" "${artifact_dir}/logs/configure.base.log" "${TARGET_ARCH}" + configure_build_tree "${TEST_PATH}" "${test_build_dir}" "test" "${artifact_dir}/logs/configure.test.log" "${TARGET_ARCH}" + else + echo "[configure:base] skipped (using existing build tree)" + echo "[configure:test] skipped (using existing build tree)" + fi -any_failures=0 -compares_attempted=0 -compares_succeeded=0 -base_build_all_rc=0 -test_build_all_rc=0 + select_targets "${base_build_dir}" "${test_build_dir}" selected_targets -if run_grouped_logged_command \ - "[build:base]" \ - "${artifact_dir}/logs/build.base.log" \ - ninja -C "${base_build_dir}" "${selected_targets[@]}"; then - base_build_all_rc=0 -else - base_build_all_rc=$? - any_failures=1 -fi + printf "%s\n" "${selected_targets[@]}" > "${artifact_dir}/meta/selected_targets.txt" -if run_grouped_logged_command \ - "[build:test]" \ - "${artifact_dir}/logs/build.test.log" \ - ninja -C "${test_build_dir}" "${selected_targets[@]}"; then + compare_script="$(resolve_compare_script "${test_build_dir}" || true)" + if [[ -z "${compare_script}" ]]; then + compare_script="$(resolve_compare_script "${base_build_dir}" || true)" + fi + if [[ -z "${compare_script}" ]]; then + die "Unable to locate nvbench_compare.py in build dependencies." 1 + fi + compare_script_dir="$(dirname "${compare_script}")" + + base_build_all_rc=0 test_build_all_rc=0 -else - test_build_all_rc=$? - any_failures=1 + + if run_grouped_logged_command \ + "[build:base]" \ + "${artifact_dir}/logs/build.base.log" \ + ninja -C "${base_build_dir}" "${selected_targets[@]}"; then + base_build_all_rc=0 + else + base_build_all_rc=$? + any_failures=1 + fi + + if run_grouped_logged_command \ + "[build:test]" \ + "${artifact_dir}/logs/build.test.log" \ + ninja -C "${test_build_dir}" "${selected_targets[@]}"; then + test_build_all_rc=0 + else + test_build_all_rc=$? + any_failures=1 + fi + + for target in "${selected_targets[@]}"; do + base_target_run_rc=125 + test_target_run_rc=125 + base_run_log="${artifact_dir}/logs/run.base.${target}.log" + test_run_log="${artifact_dir}/logs/run.test.${target}.log" + compare_report_md="${artifact_dir}/compare/${target}.md" + compare_report_log="${artifact_dir}/logs/compare.${target}.log" + + base_json="${artifact_dir}/base/${target}.json" + base_md="${artifact_dir}/base/${target}.md" + test_json="${artifact_dir}/test/${target}.json" + test_md="${artifact_dir}/test/${target}.md" + + if [[ "${base_build_all_rc}" -eq 0 ]]; then + if run_target_for_side \ + "base" \ + "${base_build_dir}" \ + "${target}" \ + "${base_json}" \ + "${base_md}" \ + "${base_run_log}"; then + base_target_run_rc=0 + else + base_target_run_rc=$? + any_failures=1 + fi + fi + + if [[ "${test_build_all_rc}" -eq 0 ]]; then + if run_target_for_side \ + "test" \ + "${test_build_dir}" \ + "${target}" \ + "${test_json}" \ + "${test_md}" \ + "${test_run_log}"; then + test_target_run_rc=0 + else + test_target_run_rc=$? + any_failures=1 + fi + fi + + if [[ "${base_target_run_rc}" -eq 0 && "${test_target_run_rc}" -eq 0 ]]; then + compares_attempted=$((compares_attempted + 1)) + if run_compare_target \ + "${target}" \ + "${compare_script}" \ + "${compare_script_dir}" \ + "${base_json}" \ + "${test_json}" \ + "${compare_report_md}" \ + "${compare_report_log}"; then + compares_succeeded=$((compares_succeeded + 1)) + else + any_failures=1 + fi + fi + done fi -for target in "${selected_targets[@]}"; do - base_target_run_rc=125 - test_target_run_rc=125 - base_run_log="${artifact_dir}/logs/run.base.${target}.log" - test_run_log="${artifact_dir}/logs/run.test.${target}.log" - compare_report_md="${artifact_dir}/compare/${target}.md" - compare_report_log="${artifact_dir}/logs/compare.${target}.log" - - base_json="${artifact_dir}/base/${target}.json" - base_md="${artifact_dir}/base/${target}.md" - test_json="${artifact_dir}/test/${target}.json" - test_md="${artifact_dir}/test/${target}.md" - - if [[ "${base_build_all_rc}" -eq 0 ]]; then - if run_target_for_side \ +# ============================================================================ +# Python benchmark pipeline +# ============================================================================ + +if [[ "${#PYTHON_FILTERS[@]}" -gt 0 ]]; then + echo + echo "=== Python Benchmark Pipeline ===" + echo + + py_benchmarks_subdir="python/cuda_cccl/benchmarks" + base_py_bench_dir="${BASE_PATH}/${py_benchmarks_subdir}" + test_py_bench_dir="${TEST_PATH}/${py_benchmarks_subdir}" + + if [[ ! -d "${base_py_bench_dir}" ]]; then + die "Python benchmarks directory not found in base tree: ${base_py_bench_dir}" + fi + if [[ ! -d "${test_py_bench_dir}" ]]; then + die "Python benchmarks directory not found in test tree: ${test_py_bench_dir}" + fi + + cuda_major="$(detect_cuda_major_version)" + echo "Detected CUDA major version: ${cuda_major}" + + base_py_venv="${build_root}/py-base-${build_token}" + test_py_venv="${build_root}/py-test-${build_token}" + + setup_python_venv "${base_py_venv}" "${BASE_PATH}" "base" "${artifact_dir}/logs/py.venv.base.log" "${cuda_major}" + setup_python_venv "${test_py_venv}" "${TEST_PATH}" "test" "${artifact_dir}/logs/py.venv.test.log" "${cuda_major}" + + select_python_targets "${base_py_bench_dir}" "${test_py_bench_dir}" selected_py_targets + + # Append Python targets to the selected targets metadata file. + for py_target_path in "${selected_py_targets[@]}"; do + echo "$(python_path_to_target_name "${py_target_path}")" >> "${artifact_dir}/meta/selected_targets.txt" + done + + for py_target_path in "${selected_py_targets[@]}"; do + py_target_name="$(python_path_to_target_name "${py_target_path}")" + base_py_target_run_rc=125 + test_py_target_run_rc=125 + + base_py_json="${artifact_dir}/base/${py_target_name}.json" + base_py_md="${artifact_dir}/base/${py_target_name}.md" + test_py_json="${artifact_dir}/test/${py_target_name}.json" + test_py_md="${artifact_dir}/test/${py_target_name}.md" + base_py_run_log="${artifact_dir}/logs/run.base.${py_target_name}.log" + test_py_run_log="${artifact_dir}/logs/run.test.${py_target_name}.log" + compare_py_report_md="${artifact_dir}/compare/${py_target_name}.md" + compare_py_report_log="${artifact_dir}/logs/compare.${py_target_name}.log" + + if run_python_target_for_side \ "base" \ - "${base_build_dir}" \ - "${target}" \ - "${base_json}" \ - "${base_md}" \ - "${base_run_log}"; then - base_target_run_rc=0 + "${base_py_venv}" \ + "${base_py_bench_dir}/${py_target_path}" \ + "${base_py_json}" \ + "${base_py_md}" \ + "${base_py_run_log}"; then + base_py_target_run_rc=0 else - base_target_run_rc=$? + base_py_target_run_rc=$? any_failures=1 fi - fi - if [[ "${test_build_all_rc}" -eq 0 ]]; then - if run_target_for_side \ + if run_python_target_for_side \ "test" \ - "${test_build_dir}" \ - "${target}" \ - "${test_json}" \ - "${test_md}" \ - "${test_run_log}"; then - test_target_run_rc=0 + "${test_py_venv}" \ + "${test_py_bench_dir}/${py_target_path}" \ + "${test_py_json}" \ + "${test_py_md}" \ + "${test_py_run_log}"; then + test_py_target_run_rc=0 else - test_target_run_rc=$? + test_py_target_run_rc=$? any_failures=1 fi - fi - if [[ "${base_target_run_rc}" -eq 0 && "${test_target_run_rc}" -eq 0 ]]; then - compares_attempted=$((compares_attempted + 1)) - if run_compare_target \ - "${target}" \ - "${compare_script}" \ - "${compare_script_dir}" \ - "${base_json}" \ - "${test_json}" \ - "${compare_report_md}" \ - "${compare_report_log}"; then - compares_succeeded=$((compares_succeeded + 1)) - else - any_failures=1 + if [[ "${base_py_target_run_rc}" -eq 0 && "${test_py_target_run_rc}" -eq 0 ]]; then + py_compares_attempted=$((py_compares_attempted + 1)) + if run_python_compare_target \ + "${py_target_name}" \ + "${test_py_venv}" \ + "${base_py_json}" \ + "${test_py_json}" \ + "${compare_py_report_md}" \ + "${compare_py_report_log}"; then + py_compares_succeeded=$((py_compares_succeeded + 1)) + else + any_failures=1 + fi fi - fi -done + done +fi + +# ============================================================================ +# Summary and exit +# ============================================================================ summary_file="${artifact_dir}/summary.md" write_summary "${summary_file}" diff --git a/ci/bench/parse_bench_matrix.sh b/ci/bench/parse_bench_matrix.sh index c745bdd0063..c13faee08b2 100755 --- a/ci/bench/parse_bench_matrix.sh +++ b/ci/bench/parse_bench_matrix.sh @@ -16,7 +16,7 @@ Usage: $0 [bench-yaml-path] Parse ci/bench.yaml and emit a GitHub Actions strategy matrix JSON object: {"include":[...]} -Each include entry maps one enabled GPU to a bench_cub.yml workflow invocation. +Each include entry maps one enabled GPU to a benchmark workflow invocation. EOF } @@ -37,17 +37,31 @@ if ! bench_cfg_json="$(yq -o=json '.benchmarks // {}' "${bench_yaml_path}" 2>&1) die "Failed to parse ${bench_yaml_path} as YAML: ${bench_cfg_json}" fi -if ! jq -e '.filters? | type == "array" and length > 0 and all(.[]; type == "string")' >/dev/null <<<"${bench_cfg_json}"; then - die "${bench_yaml_path} must define at least one string entry in benchmarks.filters." +# Extract CUB and Python filter arrays (default to empty arrays). +cub_filters_json="$(jq -c '.cub.filters // []' <<<"${bench_cfg_json}")" +python_filters_json="$(jq -c '.python.filters // []' <<<"${bench_cfg_json}")" + +has_cub_filters="$(jq -e 'type == "array" and length > 0 and all(.[]; type == "string")' <<<"${cub_filters_json}" >/dev/null 2>&1 && echo true || echo false)" +has_python_filters="$(jq -e 'type == "array" and length > 0 and all(.[]; type == "string")' <<<"${python_filters_json}" >/dev/null 2>&1 && echo true || echo false)" + +if [[ "${has_cub_filters}" != "true" && "${has_python_filters}" != "true" ]]; then + die "${bench_yaml_path} must define at least one string entry in benchmarks.cub.filters or benchmarks.python.filters." fi -filters_arg="$( - jq -r '.filters | map(@sh) | join(" ")' <<<"${bench_cfg_json}" -)" +cub_filters_arg="" +if [[ "${has_cub_filters}" == "true" ]]; then + cub_filters_arg="$(jq -r '.cub.filters | map(@sh) | join(" ")' <<<"${bench_cfg_json}")" +fi + +python_filters_arg="" +if [[ "${has_python_filters}" == "true" ]]; then + python_filters_arg="$(jq -r '.python.filters | map(@sh) | join(" ")' <<<"${bench_cfg_json}")" +fi jq -cn \ --argjson cfg "${bench_cfg_json}" \ - --arg filters "${filters_arg}" \ + --arg cub_filters "${cub_filters_arg}" \ + --arg python_filters "${python_filters_arg}" \ '{ "include": [ ($cfg.gpus // [])[] as $gpu @@ -57,7 +71,8 @@ jq -cn \ "arch": ($cfg.arch // "native"), "base_ref": ($cfg.base_ref // "origin/main"), "test_ref": ($cfg.test_ref // "HEAD"), - "filters": $filters, + "cub_filters": $cub_filters, + "python_filters": $python_filters, "nvbench_args": ($cfg.nvbench_args // ""), "nvbench_compare_args": ($cfg.nvbench_compare_args // "") } diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index e2f30759d34..07984e384bb 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -62,17 +62,15 @@ test-cu12 = [ "pytest", "pytest-xdist", "cupy-cuda12x", - "pytest-benchmark", ] test-cu13 = [ "cuda-cccl[cu13]", "pytest", "pytest-xdist", "cupy-cuda13x", - "pytest-benchmark", ] -bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]"] -bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]"] +bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]", "cupy-cuda12x"] +bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]", "cupy-cuda13x"] [project.urls] Homepage = "https://github.com/NVIDIA/cccl" From 46fb87b9ddb1f9fa38caf6d0299e77b7fa994ef2 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Wed, 1 Apr 2026 10:38:05 -0500 Subject: [PATCH 2/9] Make sure I didn't break CUB benchmarking CI [bench-only] --- ci/bench.yaml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/ci/bench.yaml b/ci/bench.yaml index fbb48fc6048..cc7001a84d4 100644 --- a/ci/bench.yaml +++ b/ci/bench.yaml @@ -27,9 +27,7 @@ benchmarks: # CUB C++ benchmark filters (regex matched against ninja target names). cub: filters: - # Examples: - # - '^cub\.bench\.for_each\.base' - # - '^cub\.bench\.reduce\.(sum|min)\.' + - '^cub\.bench\.reduce\.sum\.base$' # Python benchmark filters (regex matched against paths under benchmarks/). python: @@ -41,13 +39,7 @@ benchmarks: # Select GPUs. These are limited and shared, be intentional and conservative. gpus: - # - "t4" # sm_75, 16 GB - # - "rtx2080" # sm_75, 8 GB - # - "rtxa6000" # sm_86, 48 GB - # - "l4" # sm_89, 24 GB - # - "rtx4090" # sm_89, 24 GB - # - "h100" # sm_90, 80 GB - # - "rtxpro6000" # sm_120 + - "l4" # sm_89, 24 GB # Extra .devcontainer/launch.sh -d args # launch_args: "--cuda 13.1 --host gcc14" From d6685e60a227d40e586eb5b297b221387adecbf3 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Wed, 1 Apr 2026 10:59:14 -0500 Subject: [PATCH 3/9] Add CuPy to bench dependencies and remove pytest-benchmark from test dependencies --- python/cuda_cccl/pyproject.toml | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index e2f30759d34..221684ca624 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -62,17 +62,10 @@ test-cu12 = [ "pytest", "pytest-xdist", "cupy-cuda12x", - "pytest-benchmark", ] -test-cu13 = [ - "cuda-cccl[cu13]", - "pytest", - "pytest-xdist", - "cupy-cuda13x", - "pytest-benchmark", -] -bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]"] -bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]"] +test-cu13 = ["cuda-cccl[cu13]", "pytest", "pytest-xdist", "cupy-cuda13x"] +bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]", "cupy-cuda12x"] +bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]", "cupy-cuda13x"] [project.urls] Homepage = "https://github.com/NVIDIA/cccl" From 2be14d5a00826c7c2a973638ce20e7dee08dd018 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Wed, 1 Apr 2026 14:09:06 -0500 Subject: [PATCH 4/9] Test that python + CUB benchmarks work [bench-only] --- ci/bench.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ci/bench.yaml b/ci/bench.yaml index cc7001a84d4..9c7feff9dda 100644 --- a/ci/bench.yaml +++ b/ci/bench.yaml @@ -32,10 +32,7 @@ benchmarks: # Python benchmark filters (regex matched against paths under benchmarks/). python: filters: - # Examples: - # - 'compute/reduce/sum\.py' - # - 'compute/transform/.*\.py' - # - 'coop/bench_warp_reduce\.py' + - 'compute/reduce/sum\.py' # Select GPUs. These are limited and shared, be intentional and conservative. gpus: From ffc263e72de3f76269a470002828aa518192baf3 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Wed, 1 Apr 2026 14:32:56 -0500 Subject: [PATCH 5/9] Revert back to bench.yaml template --- ci/bench.yaml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ci/bench.yaml b/ci/bench.yaml index 9c7feff9dda..fbb48fc6048 100644 --- a/ci/bench.yaml +++ b/ci/bench.yaml @@ -27,16 +27,27 @@ benchmarks: # CUB C++ benchmark filters (regex matched against ninja target names). cub: filters: - - '^cub\.bench\.reduce\.sum\.base$' + # Examples: + # - '^cub\.bench\.for_each\.base' + # - '^cub\.bench\.reduce\.(sum|min)\.' # Python benchmark filters (regex matched against paths under benchmarks/). python: filters: - - 'compute/reduce/sum\.py' + # Examples: + # - 'compute/reduce/sum\.py' + # - 'compute/transform/.*\.py' + # - 'coop/bench_warp_reduce\.py' # Select GPUs. These are limited and shared, be intentional and conservative. gpus: - - "l4" # sm_89, 24 GB + # - "t4" # sm_75, 16 GB + # - "rtx2080" # sm_75, 8 GB + # - "rtxa6000" # sm_86, 48 GB + # - "l4" # sm_89, 24 GB + # - "rtx4090" # sm_89, 24 GB + # - "h100" # sm_90, 80 GB + # - "rtxpro6000" # sm_120 # Extra .devcontainer/launch.sh -d args # launch_args: "--cuda 13.1 --host gcc14" From ebfea2c1acde97eb83c108aa2136b97df8e1e6e6 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Wed, 1 Apr 2026 15:02:10 -0500 Subject: [PATCH 6/9] Fix comparison formatting --- ci/bench/compare_paths.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/bench/compare_paths.sh b/ci/bench/compare_paths.sh index 4e3a221ceb2..cf8b21f5a31 100755 --- a/ci/bench/compare_paths.sh +++ b/ci/bench/compare_paths.sh @@ -619,6 +619,7 @@ write_summary() { fi if [[ "${#selected_py_targets[@]}" -gt 0 ]]; then + echo echo "## Python Compare Reports" local py_target_path="" local py_target_name="" From d3d089cd3b28584ab6e3f7185febc30d2f39f350 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Thu, 2 Apr 2026 14:40:06 -0500 Subject: [PATCH 7/9] Use newer nvbench_compare version which contains flag --- ci/bench/compare_paths.sh | 4 ++-- cmake/CCCLGetDependencies.cmake | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/bench/compare_paths.sh b/ci/bench/compare_paths.sh index cf8b21f5a31..3237f1bd862 100755 --- a/ci/bench/compare_paths.sh +++ b/ci/bench/compare_paths.sh @@ -410,7 +410,7 @@ run_python_compare_target() { local elapsed_s=0 local rc=0 local -a compare_cmd - compare_cmd=("${venv_path}/bin/nvbench-compare" "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}") + compare_cmd=("${venv_path}/bin/nvbench-compare" --no-color "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}") : > "${compare_log}" echo "::group::${label}" @@ -483,7 +483,7 @@ run_compare_target() { local rc=0 local compare_pythonpath="${compare_script_dir}${PYTHONPATH:+:${PYTHONPATH}}" local -a compare_cmd - compare_cmd=(python3 "${compare_script}" "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}") + compare_cmd=(python3 "${compare_script}" --no-color "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}") : > "${compare_log}" echo "::group::${label}" diff --git a/cmake/CCCLGetDependencies.cmake b/cmake/CCCLGetDependencies.cmake index 16717131636..7cc560788d7 100644 --- a/cmake/CCCLGetDependencies.cmake +++ b/cmake/CCCLGetDependencies.cmake @@ -82,7 +82,7 @@ endmacro() set( CCCL_NVBENCH_SHA - "836a6c12f4330d9cbbe9e0041956b82f09e702ee" + "373970323f3e2a3995761ea682ca64dfcbdd1e26" CACHE STRING "SHA/tag to use for CCCL's NVBench." ) From ef341bfaa7aab389b02ca9099ea541d1b97e56ff Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Thu, 2 Apr 2026 14:41:23 -0500 Subject: [PATCH 8/9] Test --no-color flag [bench-only] --- ci/bench.yaml | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/ci/bench.yaml b/ci/bench.yaml index fbb48fc6048..9c7feff9dda 100644 --- a/ci/bench.yaml +++ b/ci/bench.yaml @@ -27,27 +27,16 @@ benchmarks: # CUB C++ benchmark filters (regex matched against ninja target names). cub: filters: - # Examples: - # - '^cub\.bench\.for_each\.base' - # - '^cub\.bench\.reduce\.(sum|min)\.' + - '^cub\.bench\.reduce\.sum\.base$' # Python benchmark filters (regex matched against paths under benchmarks/). python: filters: - # Examples: - # - 'compute/reduce/sum\.py' - # - 'compute/transform/.*\.py' - # - 'coop/bench_warp_reduce\.py' + - 'compute/reduce/sum\.py' # Select GPUs. These are limited and shared, be intentional and conservative. gpus: - # - "t4" # sm_75, 16 GB - # - "rtx2080" # sm_75, 8 GB - # - "rtxa6000" # sm_86, 48 GB - # - "l4" # sm_89, 24 GB - # - "rtx4090" # sm_89, 24 GB - # - "h100" # sm_90, 80 GB - # - "rtxpro6000" # sm_120 + - "l4" # sm_89, 24 GB # Extra .devcontainer/launch.sh -d args # launch_args: "--cuda 13.1 --host gcc14" From 10b520382ad54341080295955e1956c74a3de654 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Thu, 2 Apr 2026 15:10:03 -0500 Subject: [PATCH 9/9] Revert ci benchmarking test --- ci/bench.yaml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ci/bench.yaml b/ci/bench.yaml index 9c7feff9dda..fbb48fc6048 100644 --- a/ci/bench.yaml +++ b/ci/bench.yaml @@ -27,16 +27,27 @@ benchmarks: # CUB C++ benchmark filters (regex matched against ninja target names). cub: filters: - - '^cub\.bench\.reduce\.sum\.base$' + # Examples: + # - '^cub\.bench\.for_each\.base' + # - '^cub\.bench\.reduce\.(sum|min)\.' # Python benchmark filters (regex matched against paths under benchmarks/). python: filters: - - 'compute/reduce/sum\.py' + # Examples: + # - 'compute/reduce/sum\.py' + # - 'compute/transform/.*\.py' + # - 'coop/bench_warp_reduce\.py' # Select GPUs. These are limited and shared, be intentional and conservative. gpus: - - "l4" # sm_89, 24 GB + # - "t4" # sm_75, 16 GB + # - "rtx2080" # sm_75, 8 GB + # - "rtxa6000" # sm_86, 48 GB + # - "l4" # sm_89, 24 GB + # - "rtx4090" # sm_89, 24 GB + # - "h100" # sm_90, 80 GB + # - "rtxpro6000" # sm_120 # Extra .devcontainer/launch.sh -d args # launch_args: "--cuda 13.1 --host gcc14"