From f47ecaaa6d56b7f133a61688ec0c32ba7b1c1fcf Mon Sep 17 00:00:00 2001
From: Ihor Solodrai
Date: Tue, 17 Mar 2026 14:37:18 -0700
Subject: [PATCH] ci: add stagger gate to spread out CI runs during rebase storms

When KPD rebases all pending PR branches after an upstream commit lands
(on bpf-next/master, bpf/master) or after a vmtest CI update, hundreds
of workflow runs get triggered within seconds, which may cause various
glitches due to increased stress on the runners.

Add a stagger script that runs in the set-matrix job, right after
python3-requests is installed and before set-matrix-impl, and detects
the "storm" condition by checking:
- This is a PR synchronize event (force-push rebase, not a new PR)
- The base branch was updated within the last 30 minutes (KPD just
  mirrored upstream)
- Active workflow runs (queued + in-progress) are at least half the
  number of open PRs, indicating a bulk rebase rather than normal
  organic CI activity

When all conditions are met, the script sleeps for random 1-15 minute
intervals in a loop. As runs complete or get cancelled the ratio drops
and waiting runs proceed naturally. A hard cap of 2 hours prevents
indefinite waiting.

Because the workflow already uses cancel-in-progress on a per-branch
concurrency group, a newer force-push will cancel a sleeping set-matrix
job before any expensive build/test work starts.

During normal operation (developer pushes, single PR rebases, new PRs)
the storm condition is never true and the script exits immediately with
zero delay.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Ihor Solodrai
---
 .github/scripts/stagger.py | 150 +++++++++++++++++++++++++++++++++++++
 .github/workflows/test.yml |   7 ++
 2 files changed, 157 insertions(+)
 create mode 100644 .github/scripts/stagger.py

diff --git a/.github/scripts/stagger.py b/.github/scripts/stagger.py
new file mode 100644
index 00000000..3b6664e3
--- /dev/null
+++ b/.github/scripts/stagger.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""Stagger CI runs during KPD rebase storms.
+
+When KPD rebases all PR branches after an upstream commit, hundreds of
+workflow runs fire at once. This script detects the storm and waits for
+it to subside before letting the expensive build jobs start.
+
+Storm = all of:
+  1. PR synchronize event (force-push rebase, not a new PR)
+  2. Base branch updated within the last 30 minutes (KPD just mirrored)
+  3. Active workflow runs (queued + in-progress) >= half of open PRs
+
+Re-checks in a loop with random 1-15 min sleeps. Gives up after 2 hours.
+cancel-in-progress on the concurrency group kills sleeping runs on new pushes.
+"""
+
+import os
+import random
+import time
+from datetime import datetime, timezone
+
+import requests
+
+BASE_BRANCH_RECENCY_S = 1800  # base branch "just updated" threshold
+STORM_RATIO = 0.5  # active runs / open PRs threshold
+WAIT_MIN_S = 60  # min sleep per iteration
+WAIT_MAX_S = 900  # max sleep per iteration
+MAX_TOTAL_WAIT_S = 7200  # hard cap on total wait
+API_TIMEOUT_S = 30  # per-request timeout for GitHub API calls
+
+
+def gh_api(endpoint):
+    """GET a GitHub REST API endpoint and return the decoded JSON body.
+
+    Raises on connection errors, timeouts and HTTP error statuses; the
+    callers in this script catch those and degrade towards "no storm".
+    """
+    token = os.environ.get("GITHUB_TOKEN", "")
+    resp = requests.get(
+        f"https://api.github.com{endpoint}",
+        headers={
+            "Authorization": f"token {token}",
+            "Accept": "application/vnd.github.v3+json",
+        },
+        # Without a timeout requests can block forever on a stalled socket.
+        timeout=API_TIMEOUT_S,
+    )
+    resp.raise_for_status()
+    return resp.json()
+
+
+def base_branch_age_s(repo, base_branch):
+    """Seconds since last commit on the base branch, or None on error."""
+    try:
+        sha = gh_api(f"/repos/{repo}/branches/{base_branch}")["commit"]["sha"]
+        date_str = gh_api(f"/repos/{repo}/commits/{sha}")["commit"]["committer"]["date"]
+        commit_time = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+        return (datetime.now(timezone.utc) - commit_time).total_seconds()
+    except Exception as e:
+        print(f"Warning: could not get base branch age: {e}")
+        return None
+
+
+def active_run_count(repo):
+    """Number of queued + in-progress workflow runs."""
+    total = 0
+    for status in ("queued", "in_progress"):
+        try:
+            data = gh_api(f"/repos/{repo}/actions/runs?status={status}&per_page=1")
+            total += data.get("total_count", 0)
+        except Exception as e:
+            print(f"Warning: could not query {status} runs: {e}")
+    return total
+
+
+def open_pr_count(repo):
+    """Number of open pull requests."""
+    try:
+        data = gh_api(f"/search/issues?q=repo:{repo}+type:pr+state:open&per_page=1")
+        return data.get("total_count", 0)
+    except Exception as e:
+        print(f"Warning: could not query open PRs: {e}")
+        return 0
+
+
+def is_storm(repo, base_branch):
+    """Return True if a bulk rebase appears to be in progress.
+
+    Requires a base branch updated within BASE_BRANCH_RECENCY_S and an
+    active-runs / open-PRs ratio of at least STORM_RATIO.
+    """
+    age = base_branch_age_s(repo, base_branch)
+    if age is None:
+        print(f"Could not determine age of base branch {base_branch} — no storm.")
+        return False
+    if age > BASE_BRANCH_RECENCY_S:
+        print(f"Base branch {base_branch} updated {age}s ago — no storm.")
+        return False
+
+    active = active_run_count(repo)
+    open_prs = open_pr_count(repo)
+    if open_prs == 0:
+        return False
+
+    ratio = active / open_prs
+    if ratio < STORM_RATIO:
+        print(f"{active} active / {open_prs} PRs ({ratio:.0%}) — no storm.")
+        return False
+
+    print(
+        f"Storm: base {base_branch} updated {age:.0f}s ago, "
+        f"{active} active / {open_prs} PRs ({ratio:.0%})."
+    )
+    return True
+
+
+def main():
+    """Sleep in random intervals while a rebase storm is in progress.
+
+    Does nothing unless this is a PR "synchronize" event with the repository
+    and base branch known. Total waiting is capped at MAX_TOTAL_WAIT_S.
+    """
+    action = os.environ.get("GITHUB_EVENT_ACTION", "")
+    repo = os.environ.get("GITHUB_REPOSITORY", "")
+    base = os.environ.get("PR_BASE_BRANCH", "")
+
+    if action != "synchronize":
+        return
+
+    if not repo or not base:
+        return
+
+    start = time.monotonic()
+    while is_storm(repo, base):
+        elapsed = time.monotonic() - start
+        remaining = MAX_TOTAL_WAIT_S - elapsed
+        if remaining <= 0:
+            print(f"Hit {MAX_TOTAL_WAIT_S}s cap — proceeding.")
+            break
+        # Near the cap, remaining can be below WAIT_MIN_S; randint(a, b)
+        # raises ValueError when a > b, so clamp both bounds to the time
+        # left (at least 1s).
+        longest = max(1, min(WAIT_MAX_S, int(remaining)))
+        delay = random.randint(min(WAIT_MIN_S, longest), longest)
+        print(f"Waiting {delay}s (elapsed: {elapsed:.0f}s)...")
+        time.sleep(delay)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4bf0f107..e36885f1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -30,6 +30,13 @@ jobs:
         run: |
           sudo apt-get -y update
           sudo apt-get -y install python3-requests
+      - name: Stagger if runners are busy
+        if: ${{ github.event.action == 'synchronize' && github.repository == 'kernel-patches/bpf' }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_EVENT_ACTION: ${{ github.event.action }}
+          PR_BASE_BRANCH: ${{ github.event.pull_request.base.ref }}
+        run: python3 .github/scripts/stagger.py
       - id: set-matrix-impl
         env:
           GITHUB_TOKEN: ${{ secrets.GH_PAT_READ_RUNNERS }}