From ff2c7e7297ac55df99c0eb4bebd477ec5df1e456 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Wed, 1 Apr 2026 10:33:16 -0500
Subject: [PATCH 1/9] Implement benchmarking CI for Python

---
 .../workflows/{bench_cub.yml => bench.yml}    |  84 +-
 .../workflows/ci-workflow-pull-request.yml    |  27 +-
 ci-overview.md                                |   6 +-
 ci/bench.template.yaml                        |  25 +-
 ci/bench.yaml                                 |  25 +-
 ci/bench/README.md                            |  59 +-
 ci/bench/{cub.sh => bench.sh}                 |   0
 ci/bench/compare_git_refs.sh                  |   2 +-
 ci/bench/compare_paths.sh                     | 798 +++++++++++++-----
 ci/bench/parse_bench_matrix.sh                |  31 +-
 python/cuda_cccl/pyproject.toml               |   6 +-
 11 files changed, 771 insertions(+), 292 deletions(-)
 rename .github/workflows/{bench_cub.yml => bench.yml} (79%)
 rename ci/bench/{cub.sh => bench.sh} (100%)

diff --git a/.github/workflows/bench_cub.yml b/.github/workflows/bench.yml
similarity index 79%
rename from .github/workflows/bench_cub.yml
rename to .github/workflows/bench.yml
index 7810c858fc7..b5d54526c11 100644
--- a/.github/workflows/bench_cub.yml
+++ b/.github/workflows/bench.yml
@@ -1,4 +1,4 @@
-name: CUB Benchmark Compare
+name: Benchmark Compare
 
 defaults:
   run:
@@ -18,7 +18,7 @@ on:
         default: "--cuda 13.1 --host gcc14"
         type: string
       arch:
-        description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/cub.sh --arch"
+        description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/bench.sh --arch"
         required: false
         default: "native"
         type: string
@@ -32,8 +32,13 @@ on:
         required: false
         default: ""
         type: string
-      filters:
-        description: "Filters, space-separated if multiple (ex: '^cub.bench.copy.memcpy.base$' '.*foo.bar.*')"
+      cub_filters:
+        description: "CUB filters, space-separated (ex: '^cub.bench.copy.memcpy.base$')"
+        required: false
+        default: ""
+        type: string
+      python_filters:
+        description: "Python filters, space-separated (ex: 'compute/reduce/sum\\.py')"
         required: false
         default: ""
         type: string
@@ -65,7 +70,7 @@ on:
         default: "--cuda 13.1 --host gcc14"
         type: string
       arch:
-        description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/cub.sh --arch"
+        description: "CMAKE_CUDA_ARCHITECTURES value passed as ci/bench/bench.sh --arch"
         required: false
         default: "native"
         type: string
@@ -79,8 +84,13 @@ on:
         required: false
         default: ""
         type: string
-      filters:
-        description: "Filters, space-separated if multiple (ex: '^cub.bench.copy.memcpy.base$' '.*foo.bar.*')"
+      cub_filters:
+        description: "CUB filters, space-separated (ex: '^cub.bench.copy.memcpy.base$')"
+        required: false
+        default: ""
+        type: string
+      python_filters:
+        description: "Python filters, space-separated (ex: 'compute/reduce/sum\\.py')"
         required: false
         default: ""
         type: string
@@ -146,7 +156,8 @@ jobs:
           INPUT_ARCH: ${{ inputs.arch }}
           INPUT_BASE_REF: ${{ inputs.base_ref }}
           INPUT_TEST_REF: ${{ inputs.test_ref }}
-          INPUT_FILTERS: ${{ inputs.filters }}
+          INPUT_CUB_FILTERS: ${{ inputs.cub_filters }}
+          INPUT_PYTHON_FILTERS: ${{ inputs.python_filters }}
           INPUT_RAW_ARGS: ${{ inputs.raw_args }}
           INPUT_NVBENCH_ARGS: ${{ inputs.nvbench_args }}
           INPUT_NVBENCH_COMPARE_ARGS: ${{ inputs.nvbench_compare_args }}
@@ -178,8 +189,12 @@ jobs:
             mapfile -d '' -t bench_args < "${parsed_raw_args_file}"
             rm -f "${parsed_raw_args_file}"
           else
-            if [[ -z "${INPUT_BASE_REF}" || -z "${INPUT_TEST_REF}" || -z "${INPUT_FILTERS}" ]]; then
-              echo "::error::When Raw Args is empty, Base Ref, Test Ref, and Filters must all be set."
+            if [[ -z "${INPUT_BASE_REF}" || -z "${INPUT_TEST_REF}" ]]; then
+              echo "::error::When Raw Args is empty, Base Ref and Test Ref must be set."
+              exit 2
+            fi
+            if [[ -z "${INPUT_CUB_FILTERS}" && -z "${INPUT_PYTHON_FILTERS}" ]]; then
+              echo "::error::At least one of CUB Filters or Python Filters must be set."
               exit 2
             fi
 
@@ -190,23 +205,40 @@ jobs:
               bench_args+=(--arch "${INPUT_ARCH}")
             fi
 
-            declare -a parsed_filters
-            parsed_filters=()
-            parsed_filters_file="$(mktemp "${RUNNER_TEMP}/bench-filters-XXXXXX")"
-            if ! parse_quoted_args "${INPUT_FILTERS}" > "${parsed_filters_file}"; then
-              rm -f "${parsed_filters_file}"
-              exit 2
+            # Add CUB filters as --cub-filter flags.
+            if [[ -n "${INPUT_CUB_FILTERS}" ]]; then
+              declare -a parsed_cub_filters
+              parsed_cub_filters=()
+              parsed_cub_filters_file="$(mktemp "${RUNNER_TEMP}/bench-cub-filters-XXXXXX")"
+              if ! parse_quoted_args "${INPUT_CUB_FILTERS}" > "${parsed_cub_filters_file}"; then
+                rm -f "${parsed_cub_filters_file}"
+                exit 2
+              fi
+              mapfile -d '' -t parsed_cub_filters < "${parsed_cub_filters_file}"
+              rm -f "${parsed_cub_filters_file}"
+
+              for cub_filter in "${parsed_cub_filters[@]}"; do
+                bench_args+=(--cub-filter "${cub_filter}")
+              done
             fi
-            mapfile -d '' -t parsed_filters < "${parsed_filters_file}"
-            rm -f "${parsed_filters_file}"
 
-            if [[ "${#parsed_filters[@]}" -eq 0 ]]; then
-              echo "::error::Filters must parse to at least one argument."
-              exit 2
+            # Add Python filters as --python-filter flags.
+            if [[ -n "${INPUT_PYTHON_FILTERS}" ]]; then
+              declare -a parsed_py_filters
+              parsed_py_filters=()
+              parsed_py_filters_file="$(mktemp "${RUNNER_TEMP}/bench-py-filters-XXXXXX")"
+              if ! parse_quoted_args "${INPUT_PYTHON_FILTERS}" > "${parsed_py_filters_file}"; then
+                rm -f "${parsed_py_filters_file}"
+                exit 2
+              fi
+              mapfile -d '' -t parsed_py_filters < "${parsed_py_filters_file}"
+              rm -f "${parsed_py_filters_file}"
+
+              for py_filter in "${parsed_py_filters[@]}"; do
+                bench_args+=(--python-filter "${py_filter}")
+              done
             fi
 
-            bench_args+=("${parsed_filters[@]}")
-
             if [[ -n "${INPUT_NVBENCH_ARGS}" ]]; then
               bench_args+=(--nvbench-args "${INPUT_NVBENCH_ARGS}")
             fi
@@ -234,7 +266,7 @@ jobs:
 
       - name: Show resolved benchmark args
         run: |
-          echo "Resolved args passed to ci/bench/cub.sh:"
+          echo "Resolved args passed to ci/bench/bench.sh:"
           while IFS= read -r arg; do
             echo "  ${arg}"
           done <<< "${{ steps.resolve-args.outputs.resolved_args }}"
@@ -263,7 +295,7 @@ jobs:
 
           base_sha_short="${base_sha_full:0:8}"
           test_sha_short="${test_sha_full:0:8}"
-          artifact_name="bench-cub-${GPU_NAME}-${timestamp_utc}-${base_sha_short}-${test_sha_short}"
+          artifact_name="bench-${GPU_NAME}-${timestamp_utc}-${base_sha_short}-${test_sha_short}"
 
           mkdir -p "bench-artifacts"
 
@@ -289,7 +321,7 @@ jobs:
             jsondiff \
             tabulate
 
-          ./ci/bench/cub.sh "${bench_args[@]}"
+          ./ci/bench/bench.sh "${bench_args[@]}"
           EOF
           chmod +x "${ci_script}"
 
diff --git a/.github/workflows/ci-workflow-pull-request.yml b/.github/workflows/ci-workflow-pull-request.yml
index 35a7acc08a3..b94b89e337a 100644
--- a/.github/workflows/ci-workflow-pull-request.yml
+++ b/.github/workflows/ci-workflow-pull-request.yml
@@ -76,7 +76,7 @@ jobs:
         run: |
           echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}"
           echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}"
-      - name: Build CUB benchmark dispatch matrix
+      - name: Build benchmark dispatch matrix
         id: build-bench-matrix
         run: |
           { # Defaults:
@@ -86,7 +86,7 @@ jobs:
 
           # Compare ci/bench.yaml against its template. If they match, no benchmarks requested.
           if diff -q "ci/bench.template.yaml" "ci/bench.yaml" > /dev/null 2>&1; then
-            echo "ci/bench.yaml matches template; skipping CUB benchmark dispatch matrix."
+            echo "ci/bench.yaml matches template; skipping benchmark dispatch."
             exit 0
           fi
 
@@ -395,8 +395,8 @@ jobs:
           print('All CPU-only import tests passed!')
           "
 
-  dispatch-cub-bench:
-    name: CUB Bench Compare (${{ matrix.gpu }})
+  dispatch-bench:
+    name: Bench Compare (${{ matrix.gpu }})
     if: >-
       ${{
         needs.build-workflow.outputs.bench_enabled == 'true' &&
@@ -409,14 +409,15 @@ jobs:
     strategy:
       fail-fast: false
       matrix: ${{ fromJSON(needs.build-workflow.outputs.bench_matrix) }}
-    uses: ./.github/workflows/bench_cub.yml
+    uses: ./.github/workflows/bench.yml
     with:
       gpu: ${{ matrix.gpu }}
       launch_args: ${{ matrix.launch_args }}
       arch: ${{ matrix.arch }}
       base_ref: ${{ matrix.base_ref }}
       test_ref: ${{ matrix.test_ref }}
-      filters: ${{ matrix.filters }}
+      cub_filters: ${{ matrix.cub_filters }}
+      python_filters: ${{ matrix.python_filters }}
       nvbench_args: ${{ matrix.nvbench_args }}
       nvbench_compare_args: ${{ matrix.nvbench_compare_args }}
 
@@ -428,7 +429,7 @@ jobs:
         needs.build-workflow.outputs.bench_enabled == 'true' &&
         needs.build-workflow.outputs.pr_number != ''
       }}
-    needs: [build-workflow, dispatch-cub-bench]
+    needs: [build-workflow, dispatch-bench]
     permissions:
       pull-requests: write
     runs-on: ubuntu-latest
@@ -436,7 +437,7 @@ jobs:
       - name: Determine outcome
         id: outcome
         run: |
-          result="${{ needs.dispatch-cub-bench.result }}"
+          result="${{ needs.dispatch-bench.result }}"
           case "${result}" in
             success)  icon=":white_check_mark:"; status="completed successfully" ;;
             failure)  icon=":x:";                status="had failures" ;;
@@ -452,7 +453,7 @@ jobs:
           message: |
             ## ${{ steps.outcome.outputs.icon }} Benchmark Results
 
-            CUB benchmark comparison ${{ steps.outcome.outputs.status }}.
+            Benchmark comparison ${{ steps.outcome.outputs.status }}.
 
             **[Results](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})**
             **[Artifacts](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}#artifacts)**
@@ -473,7 +474,7 @@ jobs:
       - verify-devcontainers
       - docs-build
       - test-cpu-import
-      - dispatch-cub-bench
+      - dispatch-bench
     runs-on: ubuntu-latest
     steps:
       - name: Check results
@@ -498,12 +499,12 @@ jobs:
           check_result "docs-build"           "success" "${{needs.docs-build.result}}"
           check_result "test-cpu-import"      "success" "${{needs.test-cpu-import.result}}"
 
-          expected_cub_bench_result="skipped"
+          expected_bench_result="skipped"
           if [[ "${{ needs.build-workflow.outputs.bench_enabled }}" == "true" ]] \
             && [[ "${{ toJSON(fromJSON(needs.build-workflow.outputs.bench_matrix).include) }}" != "[]" ]]; then
-            expected_cub_bench_result="success"
+            expected_bench_result="success"
           fi
-          check_result "dispatch-cub-bench" "${expected_cub_bench_result}" "${{needs.dispatch-cub-bench.result}}"
+          check_result "dispatch-bench" "${expected_bench_result}" "${{needs.dispatch-bench.result}}"
 
           bench_enabled="${{ needs.build-workflow.outputs.bench_enabled }}"
           if [[ "${bench_enabled}" == "true" ]]; then
diff --git a/ci-overview.md b/ci-overview.md
index 850e40d885f..1cd8fe2ba2a 100644
--- a/ci-overview.md
+++ b/ci-overview.md
@@ -71,11 +71,11 @@ CCCL's CI uses [`sccache`](https://github.com/mozilla/sccache) to cache compiler
 
 CI jobs employ the build and test scripts in the `ci/` directory to build and run tests. These scripts provide a consistent entry point for building and testing in both local and CI environments. For more information on using these scripts, see the [CONTRIBUTING.md guide](CONTRIBUTING.md#building-and-testing).
 
-#### CUB Benchmark Comparison Workflow
+#### Benchmark Comparison Workflow
 
-The standalone CUB benchmark comparison workflow is implemented in `.github/workflows/bench_cub.yml` and uses:
+The benchmark comparison workflow is implemented in `.github/workflows/bench.yml` and uses:
 
-- `ci/bench/cub.sh`
+- `ci/bench/bench.sh`
 - `ci/bench/compare_git_refs.sh`
 - `ci/bench/compare_paths.sh`
 
diff --git a/ci/bench.template.yaml b/ci/bench.template.yaml
index 0527f349a96..fbb48fc6048 100644
--- a/ci/bench.template.yaml
+++ b/ci/bench.template.yaml
@@ -1,8 +1,8 @@
-# # CUB PR benchmark request config.
+# # CCCL PR benchmark request config.
 #
 # ## Overview:
 #
-# This file is used to request CUB benchmark comparisons in PR CI.
+# This file is used to request benchmark comparisons in PR CI.
 #
 # This file must match ci/bench.template.yaml to merge.
 # CI branch protections will fail if they differ. Reset before merging.
@@ -17,18 +17,27 @@
 #
 # ## Quick start:
 #
-# 1. Add one or more benchmark regexes under benchmarks.filters.
+# 1. Add one or more benchmark regexes under cub and/or python filters.
 # 2. Enable at least one GPU by uncommenting or adding entries in benchmarks.gpus.
 # 3. Push and inspect the dispatched benchmark jobs/artifacts.
 # 4. Remove/reset benchmark-request edits before final merge.
 
 benchmarks:
 
-  # Inclusive regex filters (required).
-  filters:
-    # Examples:
-    # - '^cub\.bench\.for_each\.base'
-    # - '^cub\.bench\.reduce\.(sum|min)\.'
+  # CUB C++ benchmark filters (regex matched against ninja target names).
+  cub:
+    filters:
+      # Examples:
+      # - '^cub\.bench\.for_each\.base'
+      # - '^cub\.bench\.reduce\.(sum|min)\.'
+
+  # Python benchmark filters (regex matched against paths under benchmarks/).
+  python:
+    filters:
+      # Examples:
+      # - 'compute/reduce/sum\.py'
+      # - 'compute/transform/.*\.py'
+      # - 'coop/bench_warp_reduce\.py'
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus:
diff --git a/ci/bench.yaml b/ci/bench.yaml
index 0527f349a96..fbb48fc6048 100644
--- a/ci/bench.yaml
+++ b/ci/bench.yaml
@@ -1,8 +1,8 @@
-# # CUB PR benchmark request config.
+# # CCCL PR benchmark request config.
 #
 # ## Overview:
 #
-# This file is used to request CUB benchmark comparisons in PR CI.
+# This file is used to request benchmark comparisons in PR CI.
 #
 # This file must match ci/bench.template.yaml to merge.
 # CI branch protections will fail if they differ. Reset before merging.
@@ -17,18 +17,27 @@
 #
 # ## Quick start:
 #
-# 1. Add one or more benchmark regexes under benchmarks.filters.
+# 1. Add one or more benchmark regexes under cub and/or python filters.
 # 2. Enable at least one GPU by uncommenting or adding entries in benchmarks.gpus.
 # 3. Push and inspect the dispatched benchmark jobs/artifacts.
 # 4. Remove/reset benchmark-request edits before final merge.
 
 benchmarks:
 
-  # Inclusive regex filters (required).
-  filters:
-    # Examples:
-    # - '^cub\.bench\.for_each\.base'
-    # - '^cub\.bench\.reduce\.(sum|min)\.'
+  # CUB C++ benchmark filters (regex matched against ninja target names).
+  cub:
+    filters:
+      # Examples:
+      # - '^cub\.bench\.for_each\.base'
+      # - '^cub\.bench\.reduce\.(sum|min)\.'
+
+  # Python benchmark filters (regex matched against paths under benchmarks/).
+  python:
+    filters:
+      # Examples:
+      # - 'compute/reduce/sum\.py'
+      # - 'compute/transform/.*\.py'
+      # - 'coop/bench_warp_reduce\.py'
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus:
diff --git a/ci/bench/README.md b/ci/bench/README.md
index 17609e47045..990d1508134 100644
--- a/ci/bench/README.md
+++ b/ci/bench/README.md
@@ -1,32 +1,40 @@
-# CUB Benchmark Compare Scripts
+# Benchmark Compare Scripts
 
-This directory contains the scripts used by `.github/workflows/bench_cub.yml` to compare CUB benchmark results between two code states.
+This directory contains the scripts used by `.github/workflows/bench.yml` to compare benchmark results between two code states.
 
 ## Scripts
 
-- `ci/bench/cub.sh`: CI-oriented wrapper that calls `ci/bench/compare_git_refs.sh`.
+- `ci/bench/bench.sh`: CI-oriented wrapper that calls `ci/bench/compare_git_refs.sh`.
 - `ci/bench/compare_git_refs.sh`: checks out `<base-ref>` and `<test-ref>` in temporary worktrees, then forwards all remaining args to `ci/bench/compare_paths.sh`.
-- `ci/bench/compare_paths.sh`: configures/builds/runs common `cub.bench.*` targets in two source trees and runs `nvbench_compare.py` on produced JSON outputs.
-- `ci/bench/parse_bench_matrix.sh`: parses `ci/bench.yaml` and emits a dispatch matrix JSON object for `.github/workflows/bench_cub.yml`.
+- `ci/bench/compare_paths.sh`: configures/builds/runs CUB benchmarks and/or Python benchmarks in two source trees and runs comparison tools on produced JSON outputs.
+- `ci/bench/parse_bench_matrix.sh`: parses `ci/bench.yaml` and emits a dispatch matrix JSON object for `.github/workflows/bench.yml`.
 
 ## Usage
 
-Compare two refs:
+Compare CUB benchmarks between two refs:
 
 ```bash
-"./ci/bench/cub.sh" "origin/main" "HEAD" "^cub\\.bench\\.copy\\.memcpy\\.base$"
+"./ci/bench/bench.sh" "origin/main" "HEAD" \
+  --cub-filter "^cub\\.bench\\.copy\\.memcpy\\.base$"
 ```
 
-Forward additional options (parsed by `compare_paths.sh`):
+Compare Python benchmarks between two refs:
 
 ```bash
-"./ci/bench/cub.sh" \
+"./ci/bench/bench.sh" "origin/main" "HEAD" \
+  --python-filter "compute/reduce/sum\\.py"
+```
+
+Run both CUB and Python benchmarks:
+
+```bash
+"./ci/bench/bench.sh" \
   "origin/main" \
   "HEAD" \
   --arch "native" \
   --nvbench-args "..." \
-  --nvbench-compare-args "..." \
-  "^cub\\.bench\\.reduce\\..*$"
+  --cub-filter "^cub\\.bench\\.reduce\\..*$" \
+  --python-filter "compute/reduce/sum\\.py"
 ```
 
 Compare already checked-out trees:
@@ -36,21 +44,40 @@ Compare already checked-out trees:
   "/path/to/base/cccl" \
   "/path/to/test/cccl" \
   --arch "native" \
-  "^cub\\.bench\\.copy\\.memcpy\\.base$"
+  --cub-filter "^cub\\.bench\\.copy\\.memcpy\\.base$" \
+  --python-filter "compute/transform/.*\\.py"
 ```
 
 ## Workflow Inputs
 
-In `.github/workflows/bench_cub.yml`:
+In `.github/workflows/bench.yml`:
 
-- If `raw_args` is non-empty, it is parsed and passed directly to `ci/bench/cub.sh`.
-- Otherwise, args are assembled from `base_ref`, `test_ref`, `arch`, `filters`, `nvbench_args`, and `nvbench_compare_args`.
+- If `raw_args` is non-empty, it is parsed and passed directly to `ci/bench/bench.sh`.
+- Otherwise, args are assembled from `base_ref`, `test_ref`, `arch`, `cub_filters`, `python_filters`, `nvbench_args`, and `nvbench_compare_args`.
+- CUB filters are passed as `--cub-filter` flags. Python filters are passed as `--python-filter` flags.
 - Malformed quoted input (for example unmatched quotes) fails the workflow step.
 
+## Python Benchmarks
+
+Python benchmarks live under `python/cuda_cccl/benchmarks/` and use `cuda.bench` (the Python nvbench bindings). Each benchmark script outputs nvbench-compatible JSON.
+
+For Python benchmarks, `compare_paths.sh`:
+
+1. Creates isolated virtual environments for base and test trees.
+2. Installs `cuda-cccl[bench-cuXX]` (editable, from each worktree), which pulls in `cuda-bench`, `cupy`, and all other benchmark dependencies.
+3. Runs matching benchmark scripts in each venv.
+4. Compares results using `nvbench-compare` (installed with `cuda-bench`).
+
+Python filters are regex patterns matched against relative paths under `python/cuda_cccl/benchmarks/`, for example:
+- `compute/reduce/sum\.py` — single benchmark
+- `compute/transform/.*\.py` — all transform benchmarks
+- `coop/.*\.py` — all coop benchmarks
+
 ## Artifacts
 
 `compare_paths.sh` writes a run directory under `${CCCL_BENCH_ARTIFACT_ROOT:-$(pwd)/bench-artifacts}` containing:
 
 - per-target JSON and markdown outputs for base/test runs,
 - grouped build logs (`build.base.log`, `build.test.log`), per-target run logs, and per-target compare logs (`compare.<target>.log`),
-- `summary.md` with run metadata and per-target collapsible full `nvbench_compare.py` reports.
+- Python venv setup logs (`py.venv.base.log`, `py.venv.test.log`),
+- `summary.md` with run metadata and per-target collapsible full compare reports.
diff --git a/ci/bench/cub.sh b/ci/bench/bench.sh
similarity index 100%
rename from ci/bench/cub.sh
rename to ci/bench/bench.sh
diff --git a/ci/bench/compare_git_refs.sh b/ci/bench/compare_git_refs.sh
index d2c0765fdab..540f1cbdd2b 100755
--- a/ci/bench/compare_git_refs.sh
+++ b/ci/bench/compare_git_refs.sh
@@ -13,7 +13,7 @@ usage() {
   cat <<EOF
 Usage: $0 <base-ref> <test-ref> [compare_paths args...]
 
-Compare CUB benchmark performance between two git refs from the current CCCL repo.
+Compare benchmark performance between two git refs from the current CCCL repo.
 Each ref is checked out in an isolated worktree and compared via compare_paths.sh.
 EOF
 }
diff --git a/ci/bench/compare_paths.sh b/ci/bench/compare_paths.sh
index ffb30c02580..4e3a221ceb2 100755
--- a/ci/bench/compare_paths.sh
+++ b/ci/bench/compare_paths.sh
@@ -11,17 +11,30 @@ die() {
 
 usage() {
   cat <<EOF
-Usage: $0 <base-path> <test-path> [filter1 [filter2 ...]] \
+Usage: $0 <base-path> <test-path> \
+  [--cub-filter "<regex>"] \
+  [--python-filter "<regex>"] \
   [--arch "<arch>"] \
   [--nvbench-args "<args>"] \
   [--nvbench-compare-args "<args>"]
 
-Compare CUB benchmark performance between two checked-out CCCL trees.
+Compare benchmark performance between two checked-out CCCL trees.
+
+At least one --cub-filter or --python-filter must be provided.
+CUB filters are regex patterns matched against ninja target names.
+Python filters are regex patterns matched against benchmark script paths
+under python/cuda_cccl/benchmarks/ (e.g. compute/reduce/sum.py).
 
 Arguments:
   <base-path>  Path to baseline CCCL source tree.
   <test-path>  Path to comparison CCCL source tree.
-  [filterN]    Optional regex filters matched against benchmark target names.
+
+Options:
+  --cub-filter <regex>      CUB benchmark regex filter (repeatable).
+  --python-filter <regex>   Python benchmark regex filter (repeatable).
+  --arch <arch>             CMAKE_CUDA_ARCHITECTURES for CUB builds.
+  --nvbench-args <args>     Extra args passed to benchmark binaries/scripts.
+  --nvbench-compare-args <args>  Extra args passed to nvbench_compare.
 
 Environment:
   CCCL_BENCH_ARTIFACT_ROOT   Root directory for outputs.
@@ -74,11 +87,13 @@ validate_repo_path() {
   fi
 }
 
-validate_filters() {
+validate_filter_array() {
+  local -n _validate_filters_ref="$1"
+  local label="$2"
   local filter=""
-  for filter in "${FILTERS[@]}"; do
+  for filter in "${_validate_filters_ref[@]}"; do
     grep -Eq -- "${filter}" <<< "" >/dev/null 2>&1 || {
-      [[ "$?" -eq 1 ]] || die "Invalid regex filter: ${filter}"
+      [[ "$?" -eq 1 ]] || die "Invalid ${label} regex filter: ${filter}"
     }
   done
 }
@@ -103,6 +118,10 @@ print_shell_command() {
   printf '\n'
 }
 
+# ============================================================================
+# CUB helpers
+# ============================================================================
+
 configure_build_tree() {
   local src_path="$1"
   local build_path="$2"
@@ -165,6 +184,259 @@ resolve_compare_script() {
   return 1
 }
 
+# Run one CUB benchmark binary, passing --json/--md output paths to it.
+# Args: <side> <build-path> <target> <json-path> <md-path> <run-log>
+# Returns 127 when <build-path>/bin/<target> is not executable; otherwise
+# whatever run_grouped_logged_command returns.
+run_target_for_side() {
+  local side="$1" build_dir="$2" target="$3"
+  local json_out="$4" md_out="$5" log_file="$6"
+  local exe="${build_dir}/bin/${target}"
+
+  # Bail out before logging anything if the binary is missing.
+  [[ -x "${exe}" ]] || {
+    echo "Benchmark binary missing: ${exe}" >&2
+    return 127
+  }
+
+  # Caller-supplied nvbench args sit between "-d 0" and the output flags.
+  local -a invocation=(
+    "${exe}"
+    -d 0
+    "${NVBENCH_RUN_ARGS[@]}"
+    --json "${json_out}"
+    --md "${md_out}"
+  )
+
+  run_grouped_logged_command \
+    "[run:${side}] ${target}" \
+    "${log_file}" "${invocation[@]}"
+}
+
+# Fill the array named by the third argument with the CUB benchmark targets
+# that list_all_benchmark_targets reports for both build trees and that
+# target_matches_filters accepts.
+# Args: <base-build-path> <test-build-path> <out-array-name>
+# Calls die (code 1) if either tree has no targets or nothing is selected.
+select_targets() {
+  local base_build_path="$1" test_build_path="$2"
+  local -n selected_targets_ref="$3"
+  local -a base_targets test_targets common_targets
+  local target=""
+
+  mapfile -t base_targets < <(list_all_benchmark_targets "${base_build_path}")
+  mapfile -t test_targets < <(list_all_benchmark_targets "${test_build_path}")
+
+  (( ${#base_targets[@]} > 0 )) \
+    || die "No CUB benchmark targets were found in base build tree." 1
+  (( ${#test_targets[@]} > 0 )) \
+    || die "No CUB benchmark targets were found in test build tree." 1
+
+  # Keep only the target names present on both sides.
+  mapfile -t common_targets < <(
+    comm -12 \
+      <(printf "%s\n" "${base_targets[@]}" | sort -u) \
+      <(printf "%s\n" "${test_targets[@]}" | sort -u)
+  )
+
+  selected_targets_ref=()
+  for target in "${common_targets[@]}"; do
+    # Blank entries never reach the filter check.
+    if [[ -n "${target}" ]] && target_matches_filters "${target}"; then
+      selected_targets_ref+=("${target}")
+    fi
+  done
+
+  (( ${#selected_targets_ref[@]} > 0 )) \
+    || die "No CUB benchmark targets matched the supplied filters." 1
+}
+
+# ============================================================================
+# Python helpers
+# ============================================================================
+
+# Print the CUDA major version parsed from the "release X.Y" text of
+# `nvcc --version`, without a trailing newline. Prints "12" when nvcc is
+# not available or no version could be parsed from its output.
+detect_cuda_major_version() {
+  local major=""
+  if command -v nvcc >/dev/null 2>&1; then
+    major="$(nvcc --version 2>/dev/null | sed -n 's/.*release \([0-9]*\)\..*/\1/p')"
+  fi
+  printf "%s" "${major:-12}"
+}
+
+# Print the dotted target name for a benchmark script path (no trailing
+# newline): drop a trailing ".py", turn "/" into ".", and prefix "py.".
+# Example: compute/reduce/sum.py -> py.compute.reduce.sum
+python_path_to_target_name() {
+  local stem="${1%.py}"
+  printf "py.%s" "${stem//\//.}"
+}
+
+# Print the benchmark scripts found under <benchmarks-path>, one per line,
+# as sorted unique paths relative to that directory. Helper modules,
+# __init__.py and anything inside __pycache__ are excluded. Prints nothing
+# and returns 0 when the directory does not exist.
+list_all_python_benchmarks() {
+  local root="$1" skipped=""
+  local -a tests=(-name '*.py' -type f)
+  [[ -d "${root}" ]] || return 0
+  for skipped in utils.py run_benchmarks.py device_side_benchmark.py __init__.py; do
+    tests+=(! -name "${skipped}")
+  done
+  tests+=(! -path '*/__pycache__/*' -printf '%P\n')
+  find "${root}" "${tests[@]}" | sort -u
+}
+
+# Return 0 when <target> matches at least one extended regex (grep -E) in
+# the global PYTHON_FILTERS array, 1 otherwise. Matching is unanchored
+# unless the pattern itself anchors.
+python_target_matches_filters() {
+  local candidate="$1" pattern=""
+  for pattern in "${PYTHON_FILTERS[@]}"; do
+    grep -Eq -- "${pattern}" <<< "${candidate}" && return 0
+  done
+  return 1
+}
+
+# Fill the array named by the third argument with the Python benchmark
+# scripts that list_all_python_benchmarks reports for both trees and that
+# python_target_matches_filters accepts.
+# Args: <base-bench-path> <test-bench-path> <out-array-name>
+# Calls die (code 1) if either tree has no scripts or nothing is selected.
+select_python_targets() {
+  local base_bench_path="$1" test_bench_path="$2"
+  local -n selected_py_targets_ref="$3"
+  local -a base_py_targets test_py_targets common_py_targets
+  local target=""
+
+  mapfile -t base_py_targets < <(list_all_python_benchmarks "${base_bench_path}")
+  mapfile -t test_py_targets < <(list_all_python_benchmarks "${test_bench_path}")
+
+  (( ${#base_py_targets[@]} > 0 )) \
+    || die "No Python benchmark scripts were found in base tree: ${base_bench_path}" 1
+  (( ${#test_py_targets[@]} > 0 )) \
+    || die "No Python benchmark scripts were found in test tree: ${test_bench_path}" 1
+
+  # Keep only the relative paths present on both sides.
+  mapfile -t common_py_targets < <(
+    comm -12 \
+      <(printf "%s\n" "${base_py_targets[@]}" | sort -u) \
+      <(printf "%s\n" "${test_py_targets[@]}" | sort -u)
+  )
+
+  selected_py_targets_ref=()
+  for target in "${common_py_targets[@]}"; do
+    # Blank entries never reach the filter check.
+    if [[ -n "${target}" ]] && python_target_matches_filters "${target}"; then
+      selected_py_targets_ref+=("${target}")
+    fi
+  done
+
+  (( ${#selected_py_targets_ref[@]} > 0 )) \
+    || die "No Python benchmark scripts matched the supplied --python-filter patterns." 1
+}
+
+# Create a virtualenv at <venv-path> and pip-install cuda_cccl from
+# <src-path>/python/cuda_cccl in editable mode with the bench-cu<cuda-major>
+# extra, followed by colorama, jsondiff and tabulate.
+# Args: <venv-path> <src-path> <side> <log-path> <cuda-major>
+setup_python_venv() {
+  local venv_path="$1" src_path="$2" side="$3"
+  local log_path="$4" cuda_major="$5"
+  local cuda_cccl_dir="${src_path}/python/cuda_cccl"
+
+  if [[ ! -d "${cuda_cccl_dir}" ]]; then
+    die "cuda_cccl source directory not found: ${cuda_cccl_dir}"
+  fi
+
+  # Paths reach the inner shell as positional parameters, not spliced script
+  # text, so a quote or other metacharacter in a path cannot alter commands.
+  local -a setup_cmds=(
+    bash -c '
+      set -euo pipefail
+      python3 -m venv "$1"
+      "$1/bin/pip" install --upgrade pip
+      "$1/bin/pip" install -e "${2}[bench-cu${3}]"
+      # nvbench-compare runtime deps (until cuda-bench declares them):
+      "$1/bin/pip" install colorama jsondiff tabulate
+    ' _ "${venv_path}" "${cuda_cccl_dir}" "${cuda_major}"
+  )
+
+  run_grouped_logged_command \
+    "[py-venv:${side}]" "${log_path}" "${setup_cmds[@]}"
+}
+
+# Run one Python benchmark script with the interpreter of the given venv.
+# Args: <side> <venv-path> <script-path> <json-path> <md-path> <run-log>
+# Returns 127 when <script-path> is not a regular file; otherwise whatever
+# run_grouped_logged_command returns.
+run_python_target_for_side() {
+  local side="$1" venv_dir="$2" script="$3"
+  local json_out="$4" md_out="$5" log_file="$6"
+  # The log label shows the script path with everything up to the last
+  # "/benchmarks/" stripped.
+  local label="[py-run:${side}] ${script##*/benchmarks/}"
+
+  [[ -f "${script}" ]] || {
+    echo "Python benchmark script missing: ${script}" >&2
+    return 127
+  }
+
+  local -a invocation=(
+    "${venv_dir}/bin/python"
+    "${script}"
+    -d 0
+    "${NVBENCH_RUN_ARGS[@]}"
+    --json "${json_out}"
+    --md "${md_out}"
+  )
+
+  run_grouped_logged_command \
+    "${label}" "${log_file}" "${invocation[@]}"
+}
+
+# Compare two nvbench JSON results with the venv's nvbench-compare.
+# Args: <target-name> <venv-path> <base-json> <test-json> <compare-out> <compare-log>
+# The tool's stdout is written to <compare-out>, appended to <compare-log>
+# and echoed; its stderr is appended to <compare-log> and passed through.
+# Returns the tool's exit status.
+run_python_compare_target() {
+  local target_name="$1" venv_path="$2"
+  local base_json="$3" test_json="$4"
+  local compare_out="$5" compare_log="$6"
+  local label="[py-compare] ${target_name}"
+  local started_at=0 elapsed_s=0 rc=0
+  local outcome="completed" suffix=""
+  local -a compare_cmd=(
+    "${venv_path}/bin/nvbench-compare"
+    "${NVBENCH_COMPARE_ARGS[@]}"
+    "${base_json}" "${test_json}"
+  )
+
+  : > "${compare_log}"  # start from an empty log
+  echo "::group::${label}"
+  print_shell_command "${compare_cmd[@]}"
+  started_at="${SECONDS}"
+  # "|| rc=$?" records a failure without aborting under `set -e`.
+  "${compare_cmd[@]}" \
+    > >(tee "${compare_out}" | tee -a "${compare_log}") \
+    2> >(tee -a "${compare_log}" >&2) || rc=$?
+  elapsed_s=$((SECONDS - started_at))
+  echo "::endgroup::"
+  if [[ "${rc}" -ne 0 ]]; then
+    outcome="failed"
+    suffix=" (rc=${rc})"
+  fi
+  echo "${label} ${outcome} in ${elapsed_s}s${suffix}"
+  return "${rc}"
+}
+
+# ============================================================================
+# Common helpers
+# ============================================================================
+
 run_grouped_logged_command() {
   local label="$1"
   local log_path="$2"
@@ -278,72 +550,9 @@ parse_quoted_args_to_array() {
   rm -f "${parsed_args_file}"
 }
 
-run_target_for_side() {
-  local side="$1"
-  local build_path="$2"
-  local target="$3"
-  local json_path="$4"
-  local md_path="$5"
-  local run_log="$6"
-  local binary_path="${build_path}/bin/${target}"
-  local -a bench_cmd
-
-  if [[ ! -x "${binary_path}" ]]; then
-    echo "Benchmark binary missing: ${binary_path}" >&2
-    return 127
-  fi
-
-  bench_cmd=(
-    "${binary_path}"
-    -d 0
-    "${NVBENCH_RUN_ARGS[@]}"
-    --json "${json_path}"
-    --md "${md_path}"
-  )
-
-  run_grouped_logged_command \
-    "[run:${side}] ${target}" \
-    "${run_log}" \
-    "${bench_cmd[@]}"
-}
-
-select_targets() {
-  local base_build_path="$1"
-  local test_build_path="$2"
-  local -n selected_targets_ref="$3"
-  local -a base_targets
-  local -a test_targets
-  local -a common_targets
-  local target=""
-
-  mapfile -t base_targets < <(list_all_benchmark_targets "${base_build_path}")
-  mapfile -t test_targets < <(list_all_benchmark_targets "${test_build_path}")
-
-  if [[ "${#base_targets[@]}" -eq 0 ]]; then
-    die "No CUB benchmark targets were found in base build tree." 1
-  fi
-  if [[ "${#test_targets[@]}" -eq 0 ]]; then
-    die "No CUB benchmark targets were found in test build tree." 1
-  fi
-
-  mapfile -t common_targets < <(
-    comm -12 \
-      <(printf "%s\n" "${base_targets[@]}" | sort -u) \
-      <(printf "%s\n" "${test_targets[@]}" | sort -u)
-  )
-
-  selected_targets_ref=()
-  for target in "${common_targets[@]}"; do
-    [[ -n "${target}" ]] || continue
-    if target_matches_filters "${target}"; then
-      selected_targets_ref+=("${target}")
-    fi
-  done
-
-  if [[ "${#selected_targets_ref[@]}" -eq 0 ]]; then
-    die "No benchmark targets matched the supplied filters." 1
-  fi
-}
+# ============================================================================
+# Summary
+# ============================================================================
 
 write_summary() {
   local summary_file="$1"
@@ -352,7 +561,7 @@ write_summary() {
   local reports_emitted=0
 
   {
-    echo "# CUB Benchmark Comparison Summary"
+    echo "# Benchmark Comparison Summary"
     echo
     echo "- Timestamp (UTC): ${timestamp}"
     echo "- GPU name: ${CCCL_BENCH_GPU_NAME:-not specified}"
@@ -360,39 +569,77 @@ write_summary() {
     echo "- Test label: ${test_label_raw}"
     echo "- Base source path: \`${BASE_PATH}\`"
     echo "- Test source path: \`${TEST_PATH}\`"
-    echo "- Base build dir: \`${base_build_dir}\`"
-    echo "- Test build dir: \`${test_build_dir}\`"
-    echo "- Selected targets: ${#selected_targets[@]}"
-    echo "- Comparisons attempted: ${compares_attempted}"
-    echo "- Comparisons succeeded (nvbench_compare exit 0): ${compares_succeeded}"
+    if [[ "${#FILTERS[@]}" -gt 0 ]]; then
+      echo "- Base build dir: \`${base_build_dir}\`"
+      echo "- Test build dir: \`${test_build_dir}\`"
+    fi
+    echo "- CUB targets selected: ${#selected_targets[@]}"
+    echo "- CUB comparisons attempted: ${compares_attempted}"
+    echo "- CUB comparisons succeeded: ${compares_succeeded}"
+    echo "- Python targets selected: ${#selected_py_targets[@]}"
+    echo "- Python comparisons attempted: ${py_compares_attempted}"
+    echo "- Python comparisons succeeded: ${py_compares_succeeded}"
     echo "- Target arch: ${TARGET_ARCH:-preset-default}"
     echo "- Artifact directory: \`${artifact_dir}\`"
     echo
-    echo "## Filters"
+
     if [[ "${#FILTERS[@]}" -gt 0 ]]; then
+      echo "## CUB Filters"
       for filter in "${FILTERS[@]}"; do
         echo "- \`${filter}\`"
       done
-    else
-      echo "- (none)"
-    fi
-    echo
-    echo "## Compare Reports"
-    for target in "${selected_targets[@]}"; do
-      compare_report_file="${artifact_dir}/compare/${target}.md"
-      if [[ ! -f "${compare_report_file}" ]]; then
-        continue
-      fi
-      reports_emitted=$((reports_emitted + 1))
-      echo
-      echo "### \`${target}\`"
-      echo
-      echo "<details><summary>Expand full compare output for \`${target}\`</summary>"
       echo
-      cat "${compare_report_file}"
+    fi
+
+    if [[ "${#PYTHON_FILTERS[@]}" -gt 0 ]]; then
+      echo "## Python Filters"
+      for filter in "${PYTHON_FILTERS[@]}"; do
+        echo "- \`${filter}\`"
+      done
       echo
-      echo "</details>"
-    done
+    fi
+
+    if [[ "${#selected_targets[@]}" -gt 0 ]]; then
+      echo "## CUB Compare Reports"
+      for target in "${selected_targets[@]}"; do
+        compare_report_file="${artifact_dir}/compare/${target}.md"
+        if [[ ! -f "${compare_report_file}" ]]; then
+          continue
+        fi
+        reports_emitted=$((reports_emitted + 1))
+        echo
+        echo "### \`${target}\`"
+        echo
+        echo "<details><summary>Expand full compare output for \`${target}\`</summary>"
+        echo
+        cat "${compare_report_file}"
+        echo
+        echo "</details>"
+      done
+    fi
+
+    if [[ "${#selected_py_targets[@]}" -gt 0 ]]; then
+      echo "## Python Compare Reports"
+      local py_target_path=""
+      local py_target_name=""
+      for py_target_path in "${selected_py_targets[@]}"; do
+        py_target_name="$(python_path_to_target_name "${py_target_path}")"
+        compare_report_file="${artifact_dir}/compare/${py_target_name}.md"
+        if [[ ! -f "${compare_report_file}" ]]; then
+          continue
+        fi
+        reports_emitted=$((reports_emitted + 1))
+        echo
+        echo "### \`${py_target_name}\` (\`${py_target_path}\`)"
+        echo
+        echo "<details><summary>Expand full compare output for \`${py_target_name}\`</summary>"
+        echo
+        cat "${compare_report_file}"
+        echo
+        echo "</details>"
+      done
+    fi
+
     if [[ "${reports_emitted}" -eq 0 ]]; then
       echo
       echo "_No per-target compare reports were produced._"
@@ -400,6 +647,10 @@ write_summary() {
   } > "${summary_file}"
 }
 
+# ============================================================================
+# CLI parsing
+# ============================================================================
+
 parse_cli_args() {
   if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
     usage
@@ -418,6 +669,7 @@ parse_cli_args() {
   NVBENCH_COMPARE_ARGS_STRING=""
   TARGET_ARCH=""
   FILTERS=()
+  PYTHON_FILTERS=()
   while [[ "$#" -gt 0 ]]; do
     case "$1" in
       --arch)
@@ -441,14 +693,26 @@ parse_cli_args() {
         NVBENCH_COMPARE_ARGS_STRING="$2"
         shift 2
         ;;
+      --cub-filter)
+        if [[ "$#" -lt 2 ]]; then
+          die "Missing value for --cub-filter"
+        fi
+        FILTERS+=("$2")
+        shift 2
+        ;;
+      --python-filter)
+        if [[ "$#" -lt 2 ]]; then
+          die "Missing value for --python-filter"
+        fi
+        PYTHON_FILTERS+=("$2")
+        shift 2
+        ;;
       --)
         shift
-        FILTERS+=("$@")
         break
         ;;
       *)
-        FILTERS+=("$1")
-        shift
+        die "Unknown option: $1"
         ;;
     esac
   done
@@ -465,7 +729,12 @@ parse_quoted_args_to_array NVBENCH_COMPARE_ARGS "${NVBENCH_COMPARE_ARGS_STRING}"
 
 validate_repo_path "${BASE_PATH}"
 validate_repo_path "${TEST_PATH}"
-validate_filters
+validate_filter_array FILTERS "CUB"
+validate_filter_array PYTHON_FILTERS "Python"
+
+# ============================================================================
+# Common setup
+# ============================================================================
 
 timestamp="$(date -u +'%Y%m%dT%H%M%SZ')"
 base_label_raw="${CCCL_BENCH_BASE_LABEL:-$(resolve_repo_label "${BASE_PATH}")}"
@@ -484,21 +753,6 @@ build_token="$(sanitize_label "${test_label}-${timestamp}-${base_label}")"
 base_build_dir="${build_root}/base-${build_token}"
 test_build_dir="${build_root}/test-${build_token}"
 
-external_base_build_dir="${CCCL_BENCH_BASE_BUILD_DIR:-}"
-external_test_build_dir="${CCCL_BENCH_TEST_BUILD_DIR:-}"
-if [[ -n "${external_base_build_dir}" || -n "${external_test_build_dir}" ]]; then
-  if [[ -z "${external_base_build_dir}" || -z "${external_test_build_dir}" ]]; then
-    die "Both CCCL_BENCH_BASE_BUILD_DIR and CCCL_BENCH_TEST_BUILD_DIR must be set together."
-  fi
-  base_build_dir="$(realpath "${external_base_build_dir}")"
-  test_build_dir="$(realpath "${external_test_build_dir}")"
-  validate_build_dir "${base_build_dir}" "base"
-  validate_build_dir "${test_build_dir}" "test"
-  if [[ -n "${TARGET_ARCH}" ]]; then
-    echo "Warning: --arch is ignored when using preconfigured build directories." >&2
-  fi
-fi
-
 for subdir in base compare logs meta test; do
   mkdir -p "${artifact_dir}/${subdir}"
 done
@@ -511,12 +765,20 @@ fi
 echo "Base source: ${BASE_PATH}"
 echo "Test source: ${TEST_PATH}"
 if [[ "${#FILTERS[@]}" -gt 0 ]]; then
-  echo "Filters:"
+  echo "CUB filters:"
   for filter in "${FILTERS[@]}"; do
     echo "  - ${filter}"
   done
 else
-  echo "Filters: (none, all benchmark targets)"
+  echo "CUB filters: (none)"
+fi
+if [[ "${#PYTHON_FILTERS[@]}" -gt 0 ]]; then
+  echo "Python filters:"
+  for filter in "${PYTHON_FILTERS[@]}"; do
+    echo "  - ${filter}"
+  done
+else
+  echo "Python filters: (none)"
 fi
 if [[ -n "${TARGET_ARCH}" ]]; then
   echo "Target arch: ${TARGET_ARCH}"
@@ -534,113 +796,239 @@ if [[ "${#NVBENCH_COMPARE_ARGS[@]}" -gt 0 ]]; then
   done
 fi
 
-if [[ -n "${external_base_build_dir}" ]]; then
-  echo "[configure:base] skipped (using existing build tree)"
-  echo "[configure:test] skipped (using existing build tree)"
-else
-  configure_build_tree "${BASE_PATH}" "${base_build_dir}" "base" "${artifact_dir}/logs/configure.base.log" "${TARGET_ARCH}"
-  configure_build_tree "${TEST_PATH}" "${test_build_dir}" "test" "${artifact_dir}/logs/configure.test.log" "${TARGET_ARCH}"
-fi
+any_failures=0
+compares_attempted=0
+compares_succeeded=0
+declare -a selected_targets=()
+py_compares_attempted=0
+py_compares_succeeded=0
+declare -a selected_py_targets=()
 
-declare -a selected_targets
-select_targets "${base_build_dir}" "${test_build_dir}" selected_targets
+# ============================================================================
+# CUB benchmark pipeline
+# ============================================================================
 
-printf "%s\n" "${selected_targets[@]}" > "${artifact_dir}/meta/selected_targets.txt"
+if [[ "${#FILTERS[@]}" -gt 0 ]]; then
+  echo
+  echo "=== CUB Benchmark Pipeline ==="
+  echo
+
+  external_base_build_dir="${CCCL_BENCH_BASE_BUILD_DIR:-}"
+  external_test_build_dir="${CCCL_BENCH_TEST_BUILD_DIR:-}"
+  if [[ -n "${external_base_build_dir}" || -n "${external_test_build_dir}" ]]; then
+    if [[ -z "${external_base_build_dir}" || -z "${external_test_build_dir}" ]]; then
+      die "Both CCCL_BENCH_BASE_BUILD_DIR and CCCL_BENCH_TEST_BUILD_DIR must be set together."
+    fi
+    base_build_dir="$(realpath "${external_base_build_dir}")"
+    test_build_dir="$(realpath "${external_test_build_dir}")"
+    validate_build_dir "${base_build_dir}" "base"
+    validate_build_dir "${test_build_dir}" "test"
+    if [[ -n "${TARGET_ARCH}" ]]; then
+      echo "Warning: --arch is ignored when using preconfigured build directories." >&2
+    fi
+  fi
 
-compare_script="$(resolve_compare_script "${test_build_dir}" || true)"
-if [[ -z "${compare_script}" ]]; then
-  compare_script="$(resolve_compare_script "${base_build_dir}" || true)"
-fi
-if [[ -z "${compare_script}" ]]; then
-  die "Unable to locate nvbench_compare.py in build dependencies." 1
-fi
-compare_script_dir="$(dirname "${compare_script}")"
+  if [[ -z "${external_base_build_dir:-}" ]]; then
+    configure_build_tree "${BASE_PATH}" "${base_build_dir}" "base" "${artifact_dir}/logs/configure.base.log" "${TARGET_ARCH}"
+    configure_build_tree "${TEST_PATH}" "${test_build_dir}" "test" "${artifact_dir}/logs/configure.test.log" "${TARGET_ARCH}"
+  else
+    echo "[configure:base] skipped (using existing build tree)"
+    echo "[configure:test] skipped (using existing build tree)"
+  fi
 
-any_failures=0
-compares_attempted=0
-compares_succeeded=0
-base_build_all_rc=0
-test_build_all_rc=0
+  select_targets "${base_build_dir}" "${test_build_dir}" selected_targets
 
-if run_grouped_logged_command \
-  "[build:base]" \
-  "${artifact_dir}/logs/build.base.log" \
-  ninja -C "${base_build_dir}" "${selected_targets[@]}"; then
-  base_build_all_rc=0
-else
-  base_build_all_rc=$?
-  any_failures=1
-fi
+  printf "%s\n" "${selected_targets[@]}" > "${artifact_dir}/meta/selected_targets.txt"
 
-if run_grouped_logged_command \
-  "[build:test]" \
-  "${artifact_dir}/logs/build.test.log" \
-  ninja -C "${test_build_dir}" "${selected_targets[@]}"; then
+  compare_script="$(resolve_compare_script "${test_build_dir}" || true)"
+  if [[ -z "${compare_script}" ]]; then
+    compare_script="$(resolve_compare_script "${base_build_dir}" || true)"
+  fi
+  if [[ -z "${compare_script}" ]]; then
+    die "Unable to locate nvbench_compare.py in build dependencies." 1
+  fi
+  compare_script_dir="$(dirname "${compare_script}")"
+
+  base_build_all_rc=0
   test_build_all_rc=0
-else
-  test_build_all_rc=$?
-  any_failures=1
+
+  if run_grouped_logged_command \
+    "[build:base]" \
+    "${artifact_dir}/logs/build.base.log" \
+    ninja -C "${base_build_dir}" "${selected_targets[@]}"; then
+    base_build_all_rc=0
+  else
+    base_build_all_rc=$?
+    any_failures=1
+  fi
+
+  if run_grouped_logged_command \
+    "[build:test]" \
+    "${artifact_dir}/logs/build.test.log" \
+    ninja -C "${test_build_dir}" "${selected_targets[@]}"; then
+    test_build_all_rc=0
+  else
+    test_build_all_rc=$?
+    any_failures=1
+  fi
+
+  for target in "${selected_targets[@]}"; do
+    base_target_run_rc=125
+    test_target_run_rc=125
+    base_run_log="${artifact_dir}/logs/run.base.${target}.log"
+    test_run_log="${artifact_dir}/logs/run.test.${target}.log"
+    compare_report_md="${artifact_dir}/compare/${target}.md"
+    compare_report_log="${artifact_dir}/logs/compare.${target}.log"
+
+    base_json="${artifact_dir}/base/${target}.json"
+    base_md="${artifact_dir}/base/${target}.md"
+    test_json="${artifact_dir}/test/${target}.json"
+    test_md="${artifact_dir}/test/${target}.md"
+
+    if [[ "${base_build_all_rc}" -eq 0 ]]; then
+      if run_target_for_side \
+        "base" \
+        "${base_build_dir}" \
+        "${target}" \
+        "${base_json}" \
+        "${base_md}" \
+        "${base_run_log}"; then
+        base_target_run_rc=0
+      else
+        base_target_run_rc=$?
+        any_failures=1
+      fi
+    fi
+
+    if [[ "${test_build_all_rc}" -eq 0 ]]; then
+      if run_target_for_side \
+        "test" \
+        "${test_build_dir}" \
+        "${target}" \
+        "${test_json}" \
+        "${test_md}" \
+        "${test_run_log}"; then
+        test_target_run_rc=0
+      else
+        test_target_run_rc=$?
+        any_failures=1
+      fi
+    fi
+
+    if [[ "${base_target_run_rc}" -eq 0 && "${test_target_run_rc}" -eq 0 ]]; then
+      compares_attempted=$((compares_attempted + 1))
+      if run_compare_target \
+        "${target}" \
+        "${compare_script}" \
+        "${compare_script_dir}" \
+        "${base_json}" \
+        "${test_json}" \
+        "${compare_report_md}" \
+        "${compare_report_log}"; then
+        compares_succeeded=$((compares_succeeded + 1))
+      else
+        any_failures=1
+      fi
+    fi
+  done
 fi
 
-for target in "${selected_targets[@]}"; do
-  base_target_run_rc=125
-  test_target_run_rc=125
-  base_run_log="${artifact_dir}/logs/run.base.${target}.log"
-  test_run_log="${artifact_dir}/logs/run.test.${target}.log"
-  compare_report_md="${artifact_dir}/compare/${target}.md"
-  compare_report_log="${artifact_dir}/logs/compare.${target}.log"
-
-  base_json="${artifact_dir}/base/${target}.json"
-  base_md="${artifact_dir}/base/${target}.md"
-  test_json="${artifact_dir}/test/${target}.json"
-  test_md="${artifact_dir}/test/${target}.md"
-
-  if [[ "${base_build_all_rc}" -eq 0 ]]; then
-    if run_target_for_side \
+# ============================================================================
+# Python benchmark pipeline
+# ============================================================================
+
+if [[ "${#PYTHON_FILTERS[@]}" -gt 0 ]]; then
+  echo
+  echo "=== Python Benchmark Pipeline ==="
+  echo
+
+  py_benchmarks_subdir="python/cuda_cccl/benchmarks"
+  base_py_bench_dir="${BASE_PATH}/${py_benchmarks_subdir}"
+  test_py_bench_dir="${TEST_PATH}/${py_benchmarks_subdir}"
+
+  if [[ ! -d "${base_py_bench_dir}" ]]; then
+    die "Python benchmarks directory not found in base tree: ${base_py_bench_dir}"
+  fi
+  if [[ ! -d "${test_py_bench_dir}" ]]; then
+    die "Python benchmarks directory not found in test tree: ${test_py_bench_dir}"
+  fi
+
+  # Select targets first: selection is given only the two source benchmark dirs,
+  # so any selection failure surfaces before either virtualenv is created.
+  select_python_targets "${base_py_bench_dir}" "${test_py_bench_dir}" selected_py_targets
+
+  cuda_major="$(detect_cuda_major_version)"
+  echo "Detected CUDA major version: ${cuda_major}"
+  base_py_venv="${build_root}/py-base-${build_token}"
+  test_py_venv="${build_root}/py-test-${build_token}"
+  setup_python_venv "${base_py_venv}" "${BASE_PATH}" "base" "${artifact_dir}/logs/py.venv.base.log" "${cuda_major}"
+  setup_python_venv "${test_py_venv}" "${TEST_PATH}" "test" "${artifact_dir}/logs/py.venv.test.log" "${cuda_major}"
+
+  # Append Python targets to the selected targets metadata file.
+  for py_target_path in "${selected_py_targets[@]}"; do
+    echo "$(python_path_to_target_name "${py_target_path}")" >> "${artifact_dir}/meta/selected_targets.txt"
+  done
+
+  for py_target_path in "${selected_py_targets[@]}"; do
+    py_target_name="$(python_path_to_target_name "${py_target_path}")"
+    base_py_target_run_rc=125
+    test_py_target_run_rc=125
+
+    base_py_json="${artifact_dir}/base/${py_target_name}.json"
+    base_py_md="${artifact_dir}/base/${py_target_name}.md"
+    test_py_json="${artifact_dir}/test/${py_target_name}.json"
+    test_py_md="${artifact_dir}/test/${py_target_name}.md"
+    base_py_run_log="${artifact_dir}/logs/run.base.${py_target_name}.log"
+    test_py_run_log="${artifact_dir}/logs/run.test.${py_target_name}.log"
+    compare_py_report_md="${artifact_dir}/compare/${py_target_name}.md"
+    compare_py_report_log="${artifact_dir}/logs/compare.${py_target_name}.log"
+
+    if run_python_target_for_side \
       "base" \
-      "${base_build_dir}" \
-      "${target}" \
-      "${base_json}" \
-      "${base_md}" \
-      "${base_run_log}"; then
-      base_target_run_rc=0
+      "${base_py_venv}" \
+      "${base_py_bench_dir}/${py_target_path}" \
+      "${base_py_json}" \
+      "${base_py_md}" \
+      "${base_py_run_log}"; then
+      base_py_target_run_rc=0
     else
-      base_target_run_rc=$?
+      base_py_target_run_rc=$?
       any_failures=1
     fi
-  fi
 
-  if [[ "${test_build_all_rc}" -eq 0 ]]; then
-    if run_target_for_side \
+    if run_python_target_for_side \
       "test" \
-      "${test_build_dir}" \
-      "${target}" \
-      "${test_json}" \
-      "${test_md}" \
-      "${test_run_log}"; then
-      test_target_run_rc=0
+      "${test_py_venv}" \
+      "${test_py_bench_dir}/${py_target_path}" \
+      "${test_py_json}" \
+      "${test_py_md}" \
+      "${test_py_run_log}"; then
+      test_py_target_run_rc=0
     else
-      test_target_run_rc=$?
+      test_py_target_run_rc=$?
       any_failures=1
     fi
-  fi
 
-  if [[ "${base_target_run_rc}" -eq 0 && "${test_target_run_rc}" -eq 0 ]]; then
-    compares_attempted=$((compares_attempted + 1))
-    if run_compare_target \
-      "${target}" \
-      "${compare_script}" \
-      "${compare_script_dir}" \
-      "${base_json}" \
-      "${test_json}" \
-      "${compare_report_md}" \
-      "${compare_report_log}"; then
-      compares_succeeded=$((compares_succeeded + 1))
-    else
-      any_failures=1
+    if [[ "${base_py_target_run_rc}" -eq 0 && "${test_py_target_run_rc}" -eq 0 ]]; then
+      py_compares_attempted=$((py_compares_attempted + 1))
+      if run_python_compare_target \
+        "${py_target_name}" \
+        "${test_py_venv}" \
+        "${base_py_json}" \
+        "${test_py_json}" \
+        "${compare_py_report_md}" \
+        "${compare_py_report_log}"; then
+        py_compares_succeeded=$((py_compares_succeeded + 1))
+      else
+        any_failures=1
+      fi
     fi
-  fi
-done
+  done
+fi
+
+# ============================================================================
+# Summary and exit
+# ============================================================================
 
 summary_file="${artifact_dir}/summary.md"
 write_summary "${summary_file}"
diff --git a/ci/bench/parse_bench_matrix.sh b/ci/bench/parse_bench_matrix.sh
index c745bdd0063..c13faee08b2 100755
--- a/ci/bench/parse_bench_matrix.sh
+++ b/ci/bench/parse_bench_matrix.sh
@@ -16,7 +16,7 @@ Usage: $0 [bench-yaml-path]
 Parse ci/bench.yaml and emit a GitHub Actions strategy matrix JSON object:
   {"include":[...]}
 
-Each include entry maps one enabled GPU to a bench_cub.yml workflow invocation.
+Each include entry maps one enabled GPU to a benchmark workflow invocation.
 EOF
 }
 
@@ -37,17 +37,31 @@ if ! bench_cfg_json="$(yq -o=json '.benchmarks // {}' "${bench_yaml_path}" 2>&1)
   die "Failed to parse ${bench_yaml_path} as YAML: ${bench_cfg_json}"
 fi
 
-if ! jq -e '.filters? | type == "array" and length > 0 and all(.[]; type == "string")' >/dev/null <<<"${bench_cfg_json}"; then
-  die "${bench_yaml_path} must define at least one string entry in benchmarks.filters."
+# Extract CUB and Python filter arrays (default to empty arrays).
+cub_filters_json="$(jq -c '.cub.filters // []' <<<"${bench_cfg_json}")"
+python_filters_json="$(jq -c '.python.filters // []' <<<"${bench_cfg_json}")"
+
+has_cub_filters="$(jq -e 'type == "array" and length > 0 and all(.[]; type == "string")' <<<"${cub_filters_json}" >/dev/null 2>&1 && echo true || echo false)"
+has_python_filters="$(jq -e 'type == "array" and length > 0 and all(.[]; type == "string")' <<<"${python_filters_json}" >/dev/null 2>&1 && echo true || echo false)"
+
+if [[ "${has_cub_filters}" != "true" && "${has_python_filters}" != "true" ]]; then
+  die "${bench_yaml_path} must define at least one string entry in benchmarks.cub.filters or benchmarks.python.filters."
 fi
 
-filters_arg="$(
-  jq -r '.filters | map(@sh) | join(" ")' <<<"${bench_cfg_json}"
-)"
+cub_filters_arg=""
+if [[ "${has_cub_filters}" == "true" ]]; then
+  cub_filters_arg="$(jq -r '.cub.filters | map(@sh) | join(" ")' <<<"${bench_cfg_json}")"
+fi
+
+python_filters_arg=""
+if [[ "${has_python_filters}" == "true" ]]; then
+  python_filters_arg="$(jq -r '.python.filters | map(@sh) | join(" ")' <<<"${bench_cfg_json}")"
+fi
 
 jq -cn \
   --argjson cfg "${bench_cfg_json}" \
-  --arg filters "${filters_arg}" \
+  --arg cub_filters "${cub_filters_arg}" \
+  --arg python_filters "${python_filters_arg}" \
   '{
     "include": [
       ($cfg.gpus // [])[] as $gpu
@@ -57,7 +71,8 @@ jq -cn \
           "arch": ($cfg.arch // "native"),
           "base_ref": ($cfg.base_ref // "origin/main"),
           "test_ref": ($cfg.test_ref // "HEAD"),
-          "filters": $filters,
+          "cub_filters": $cub_filters,
+          "python_filters": $python_filters,
           "nvbench_args": ($cfg.nvbench_args // ""),
           "nvbench_compare_args": ($cfg.nvbench_compare_args // "")
         }
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index e2f30759d34..07984e384bb 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -62,17 +62,15 @@ test-cu12 = [
   "pytest",
   "pytest-xdist",
   "cupy-cuda12x",
-  "pytest-benchmark",
 ]
 test-cu13 = [
   "cuda-cccl[cu13]",
   "pytest",
   "pytest-xdist",
   "cupy-cuda13x",
-  "pytest-benchmark",
 ]
-bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]"]
-bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]"]
+bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]", "cupy-cuda12x"]
+bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]", "cupy-cuda13x"]
 
 [project.urls]
 Homepage = "https://github.com/NVIDIA/cccl"

From 46fb87b9ddb1f9fa38caf6d0299e77b7fa994ef2 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Wed, 1 Apr 2026 10:38:05 -0500
Subject: [PATCH 2/9] Make sure I didn't break CUB benchmarking CI [bench-only]

---
 ci/bench.yaml | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/ci/bench.yaml b/ci/bench.yaml
index fbb48fc6048..cc7001a84d4 100644
--- a/ci/bench.yaml
+++ b/ci/bench.yaml
@@ -27,9 +27,7 @@ benchmarks:
   # CUB C++ benchmark filters (regex matched against ninja target names).
   cub:
     filters:
-      # Examples:
-      # - '^cub\.bench\.for_each\.base'
-      # - '^cub\.bench\.reduce\.(sum|min)\.'
+      - '^cub\.bench\.reduce\.sum\.base$'
 
   # Python benchmark filters (regex matched against paths under benchmarks/).
   python:
@@ -41,13 +39,7 @@ benchmarks:
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus:
-    # - "t4"         # sm_75, 16 GB
-    # - "rtx2080"    # sm_75,  8 GB
-    # - "rtxa6000"   # sm_86, 48 GB
-    # - "l4"         # sm_89, 24 GB
-    # - "rtx4090"    # sm_89, 24 GB
-    # - "h100"       # sm_90, 80 GB
-    # - "rtxpro6000" # sm_120
+    - "l4"         # sm_89, 24 GB
 
   # Extra .devcontainer/launch.sh -d args
   # launch_args: "--cuda 13.1 --host gcc14"

From d6685e60a227d40e586eb5b297b221387adecbf3 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Wed, 1 Apr 2026 10:59:14 -0500
Subject: [PATCH 3/9] Add CuPy to bench dependencies and remove
 pytest-benchmark from test dependencies

---
 python/cuda_cccl/pyproject.toml | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index e2f30759d34..221684ca624 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -62,17 +62,10 @@ test-cu12 = [
   "pytest",
   "pytest-xdist",
   "cupy-cuda12x",
-  "pytest-benchmark",
 ]
-test-cu13 = [
-  "cuda-cccl[cu13]",
-  "pytest",
-  "pytest-xdist",
-  "cupy-cuda13x",
-  "pytest-benchmark",
-]
-bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]"]
-bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]"]
+test-cu13 = ["cuda-cccl[cu13]", "pytest", "pytest-xdist", "cupy-cuda13x"]
+bench-cu12 = ["cuda-cccl[cu12]", "cuda-bench[cu12]", "cupy-cuda12x"]
+bench-cu13 = ["cuda-cccl[cu13]", "cuda-bench[cu13]", "cupy-cuda13x"]
 
 [project.urls]
 Homepage = "https://github.com/NVIDIA/cccl"

From 2be14d5a00826c7c2a973638ce20e7dee08dd018 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Wed, 1 Apr 2026 14:09:06 -0500
Subject: [PATCH 4/9] Test that python + CUB benchmarks work [bench-only]

---
 ci/bench.yaml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ci/bench.yaml b/ci/bench.yaml
index cc7001a84d4..9c7feff9dda 100644
--- a/ci/bench.yaml
+++ b/ci/bench.yaml
@@ -32,10 +32,7 @@ benchmarks:
   # Python benchmark filters (regex matched against paths under benchmarks/).
   python:
     filters:
-      # Examples:
-      # - 'compute/reduce/sum\.py'
-      # - 'compute/transform/.*\.py'
-      # - 'coop/bench_warp_reduce\.py'
+      - 'compute/reduce/sum\.py'
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus:

From ffc263e72de3f76269a470002828aa518192baf3 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Wed, 1 Apr 2026 14:32:56 -0500
Subject: [PATCH 5/9] Revert back to bench.yaml template

---
 ci/bench.yaml | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/ci/bench.yaml b/ci/bench.yaml
index 9c7feff9dda..fbb48fc6048 100644
--- a/ci/bench.yaml
+++ b/ci/bench.yaml
@@ -27,16 +27,27 @@ benchmarks:
   # CUB C++ benchmark filters (regex matched against ninja target names).
   cub:
     filters:
-      - '^cub\.bench\.reduce\.sum\.base$'
+      # Examples:
+      # - '^cub\.bench\.for_each\.base'
+      # - '^cub\.bench\.reduce\.(sum|min)\.'
 
   # Python benchmark filters (regex matched against paths under benchmarks/).
   python:
     filters:
-      - 'compute/reduce/sum\.py'
+      # Examples:
+      # - 'compute/reduce/sum\.py'
+      # - 'compute/transform/.*\.py'
+      # - 'coop/bench_warp_reduce\.py'
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus:
-    - "l4"         # sm_89, 24 GB
+    # - "t4"         # sm_75, 16 GB
+    # - "rtx2080"    # sm_75,  8 GB
+    # - "rtxa6000"   # sm_86, 48 GB
+    # - "l4"         # sm_89, 24 GB
+    # - "rtx4090"    # sm_89, 24 GB
+    # - "h100"       # sm_90, 80 GB
+    # - "rtxpro6000" # sm_120
 
   # Extra .devcontainer/launch.sh -d args
   # launch_args: "--cuda 13.1 --host gcc14"

From ebfea2c1acde97eb83c108aa2136b97df8e1e6e6 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Wed, 1 Apr 2026 15:02:10 -0500
Subject: [PATCH 6/9] Fix comparison formatting

---
 ci/bench/compare_paths.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/bench/compare_paths.sh b/ci/bench/compare_paths.sh
index 4e3a221ceb2..cf8b21f5a31 100755
--- a/ci/bench/compare_paths.sh
+++ b/ci/bench/compare_paths.sh
@@ -619,6 +619,7 @@ write_summary() {
     fi
 
     if [[ "${#selected_py_targets[@]}" -gt 0 ]]; then
+      echo
       echo "## Python Compare Reports"
       local py_target_path=""
       local py_target_name=""

From d3d089cd3b28584ab6e3f7185febc30d2f39f350 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Thu, 2 Apr 2026 14:40:06 -0500
Subject: [PATCH 7/9] Use newer nvbench_compare version which contains --no-color flag

---
 ci/bench/compare_paths.sh       | 4 ++--
 cmake/CCCLGetDependencies.cmake | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ci/bench/compare_paths.sh b/ci/bench/compare_paths.sh
index cf8b21f5a31..3237f1bd862 100755
--- a/ci/bench/compare_paths.sh
+++ b/ci/bench/compare_paths.sh
@@ -410,7 +410,7 @@ run_python_compare_target() {
   local elapsed_s=0
   local rc=0
   local -a compare_cmd
-  compare_cmd=("${venv_path}/bin/nvbench-compare" "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}")
+  compare_cmd=("${venv_path}/bin/nvbench-compare" --no-color "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}")
 
   : > "${compare_log}"
   echo "::group::${label}"
@@ -483,7 +483,7 @@ run_compare_target() {
   local rc=0
   local compare_pythonpath="${compare_script_dir}${PYTHONPATH:+:${PYTHONPATH}}"
   local -a compare_cmd
-  compare_cmd=(python3 "${compare_script}" "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}")
+  compare_cmd=(python3 "${compare_script}" --no-color "${NVBENCH_COMPARE_ARGS[@]}" "${base_json}" "${test_json}")
 
   : > "${compare_log}"
   echo "::group::${label}"
diff --git a/cmake/CCCLGetDependencies.cmake b/cmake/CCCLGetDependencies.cmake
index 16717131636..7cc560788d7 100644
--- a/cmake/CCCLGetDependencies.cmake
+++ b/cmake/CCCLGetDependencies.cmake
@@ -82,7 +82,7 @@ endmacro()
 
 set(
   CCCL_NVBENCH_SHA
-  "836a6c12f4330d9cbbe9e0041956b82f09e702ee"
+  "373970323f3e2a3995761ea682ca64dfcbdd1e26"
   CACHE STRING
   "SHA/tag to use for CCCL's NVBench."
 )

From ef341bfaa7aab389b02ca9099ea541d1b97e56ff Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Thu, 2 Apr 2026 14:41:23 -0500
Subject: [PATCH 8/9] Test --no-color flag [bench-only]

---
 ci/bench.yaml | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/ci/bench.yaml b/ci/bench.yaml
index fbb48fc6048..9c7feff9dda 100644
--- a/ci/bench.yaml
+++ b/ci/bench.yaml
@@ -27,27 +27,16 @@ benchmarks:
   # CUB C++ benchmark filters (regex matched against ninja target names).
   cub:
     filters:
-      # Examples:
-      # - '^cub\.bench\.for_each\.base'
-      # - '^cub\.bench\.reduce\.(sum|min)\.'
+      - '^cub\.bench\.reduce\.sum\.base$'
 
   # Python benchmark filters (regex matched against paths under benchmarks/).
   python:
     filters:
-      # Examples:
-      # - 'compute/reduce/sum\.py'
-      # - 'compute/transform/.*\.py'
-      # - 'coop/bench_warp_reduce\.py'
+      - 'compute/reduce/sum\.py'
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus:
-    # - "t4"         # sm_75, 16 GB
-    # - "rtx2080"    # sm_75,  8 GB
-    # - "rtxa6000"   # sm_86, 48 GB
-    # - "l4"         # sm_89, 24 GB
-    # - "rtx4090"    # sm_89, 24 GB
-    # - "h100"       # sm_90, 80 GB
-    # - "rtxpro6000" # sm_120
+    - "l4"         # sm_89, 24 GB
 
   # Extra .devcontainer/launch.sh -d args
   # launch_args: "--cuda 13.1 --host gcc14"

From 10b520382ad54341080295955e1956c74a3de654 Mon Sep 17 00:00:00 2001
From: Nader Al Awar <naderalawar@gmail.com>
Date: Thu, 2 Apr 2026 15:10:03 -0500
Subject: [PATCH 9/9] Revert ci benchmarking test

---
 ci/bench.yaml | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/ci/bench.yaml b/ci/bench.yaml
index 9c7feff9dda..fbb48fc6048 100644
--- a/ci/bench.yaml
+++ b/ci/bench.yaml
@@ -27,16 +27,27 @@ benchmarks:
   # CUB C++ benchmark filters (regex matched against ninja target names).
   cub:
     filters:
-      - '^cub\.bench\.reduce\.sum\.base$'
+      # Examples:
+      # - '^cub\.bench\.for_each\.base'
+      # - '^cub\.bench\.reduce\.(sum|min)\.'
 
   # Python benchmark filters (regex matched against paths under benchmarks/).
   python:
     filters:
-      - 'compute/reduce/sum\.py'
+      # Examples:
+      # - 'compute/reduce/sum\.py'
+      # - 'compute/transform/.*\.py'
+      # - 'coop/bench_warp_reduce\.py'
 
   # Select GPUs. These are limited and shared, be intentional and conservative.
   gpus:
-    - "l4"         # sm_89, 24 GB
+    # - "t4"         # sm_75, 16 GB
+    # - "rtx2080"    # sm_75,  8 GB
+    # - "rtxa6000"   # sm_86, 48 GB
+    # - "l4"         # sm_89, 24 GB
+    # - "rtx4090"    # sm_89, 24 GB
+    # - "h100"       # sm_90, 80 GB
+    # - "rtxpro6000" # sm_120
 
   # Extra .devcontainer/launch.sh -d args
   # launch_args: "--cuda 13.1 --host gcc14"