Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/RerunUnstableFailures.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Scans recent workflow runs and reruns the failed jobs of runs that look like
# unstable failures. Runs at minute 0 of every hour, or on demand through
# workflow_dispatch (optionally in WhatIf mode, which only reports what would be rerun).
name: Rerun Unstable Failures
on:
  workflow_dispatch:
    inputs:
      WhatIf:
        description: 'WhatIf'
        type: boolean
        default: false
  schedule:
    - cron: '0 * * * *'

permissions: read-all

defaults:
  run:
    shell: pwsh

jobs:
  RerunUnstableFailures:
    runs-on: ubuntu-slim
    permissions:
      actions: write # needed to trigger reruns
      contents: read
    env:
      MAX_FAILED_JOBS: 3 # Maximum number of failed jobs in order to consider rerunning
      MIN_TOTAL_JOBS: 10 # Minimum number of jobs that has run in order to consider rerunning
      MAX_ATTEMPTS: 1 # Maximum number of attempts to rerun a failed job
      LOOKBACK_HOURS: 24 # How far back to look for failed jobs
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Optional: only attempted when an app id is configured. A failure here is tolerated
      # because the next step falls back to the RERUNPAT secret.
      - name: Create GitHub App Token
        id: app-token
        if: vars.APP_ID != ''
        continue-on-error: true
        uses: actions/create-github-app-token@v1
        with:
          app-id: ${{ vars.APP_ID }}
          private-key: ${{ secrets.PRIVATE_KEY }}

      - name: Rerun unstable failures
        env:
          GH_TOKEN: ${{ steps.app-token.outputs.token || secrets.RERUNPAT }}
        run: |
          $whatIf = '${{ inputs.WhatIf }}'
          $params = @{
            Owner = "microsoft"
            Repo = "BCApps"
            MaxFailedJobs = $env:MAX_FAILED_JOBS
            MinTotalJobs = $env:MIN_TOTAL_JOBS
            MaxAttempts = $env:MAX_ATTEMPTS
            LookbackHours = $env:LOOKBACK_HOURS
          }
          # On scheduled runs the WhatIf input is not provided and renders as an empty string.
          # Only an explicit 'true' enables WhatIf; otherwise the hourly schedule would never rerun anything.
          if ($whatIf -eq 'true') { $params.WhatIf = $true }
          build/scripts/RerunUnstableFailures.ps1 @params
106 changes: 106 additions & 0 deletions build/scripts/RerunUnstableFailures.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
<#
.SYNOPSIS
    Reruns the failed jobs of recent workflow runs that look like unstable failures.
.DESCRIPTION
    For each monitored workflow, lists the runs created within the lookback window and picks the
    failed ones that have not exceeded the allowed number of attempts. For the pull request
    workflow only the latest run per pull request is considered. A run is rerun (failed jobs only)
    when it has at least one failed build job, at least MinTotalJobs build jobs and at most
    MaxFailedJobs failed build jobs. Requires the GitHub CLI (gh) to be authenticated.
.PARAMETER Owner
    Owner of the repository.
.PARAMETER Repo
    Name of the repository.
.PARAMETER MaxFailedJobs
    Maximum number of failed build jobs a run may have to be rerun.
.PARAMETER MinTotalJobs
    Minimum number of build jobs a run must have to be rerun.
.PARAMETER MaxAttempts
    Runs whose run_attempt is greater than this value are not rerun.
.PARAMETER LookbackHours
    How many hours back (by run creation time) to look for runs.
.PARAMETER WhatIf
    Only report which runs would be rerun; do not trigger anything.
#>
param(
    [Parameter(Mandatory = $true)]
    [string] $Owner,
    [Parameter(Mandatory = $true)]
    [string] $Repo,
    [Parameter(Mandatory = $false)]
    [int] $MaxFailedJobs = 3,
    [Parameter(Mandatory = $false)]
    [int] $MinTotalJobs = 10,
    [Parameter(Mandatory = $false)]
    [int] $MaxAttempts = 1,
    [Parameter(Mandatory = $false)]
    [int] $LookbackHours = 2,
    [Parameter(Mandatory = $false)]
    [switch] $WhatIf
)

# Fetches all pages of a list endpoint through 'gh api' and returns the items stored under $Property.
# Throws when gh exits with a non-zero code, so callers can tell a failed call from an empty result.
function Get-PagedApiItems {
    param(
        [Parameter(Mandatory = $true)]
        [string] $Path,
        [Parameter(Mandatory = $true)]
        [string] $Property
    )

    $pageSize = 100
    $maxPages = 20 # safety cap; results beyond $pageSize * $maxPages items are not read
    $separator = if ($Path.Contains('?')) { '&' } else { '?' }
    $items = @()
    for ($page = 1; $page -le $maxPages; $page++) {
        # stderr is deliberately not merged into the captured output: anything gh writes there
        # would otherwise end up in the text handed to ConvertFrom-Json and break parsing.
        $json = gh api "$Path${separator}per_page=$pageSize&page=$page"
        if ($LASTEXITCODE -ne 0) {
            throw "gh api failed with exit code $LASTEXITCODE for $Path"
        }
        $batch = @(($json | ConvertFrom-Json).$Property | Where-Object { $null -ne $_ })
        $items += $batch
        # A short page means there is nothing more to fetch
        if ($batch.Count -lt $pageSize) { break }
    }
    return $items
}

$cutoff = (Get-Date).ToUniversalTime().AddHours(-$LookbackHours).ToString("yyyy-MM-ddTHH:mm:ssZ")
$workflowFiles = @("CICD.yaml", "PullRequestHandler.yaml")
# Utility jobs that are not actual build jobs
$excludedJobs = @("Pull Request Status Check", "Initialization")

foreach ($workflowFile in $workflowFiles) {
    Write-Host "===== Processing workflow: $workflowFile ====="

    # Get recent runs in any state. Runs that are not completed are kept so that the per-PR
    # "latest run" check below can see a newer run that is still queued or in progress.
    try {
        $runs = @(Get-PagedApiItems -Path "/repos/$Owner/$Repo/actions/workflows/$workflowFile/runs?created=%3E$cutoff" -Property 'workflow_runs')
    }
    catch {
        Write-Host "::warning::Failed to fetch runs for $workflowFile ($($_.Exception.Message))"
        continue
    }

    # Filter to completed failures that have not exceeded the allowed number of attempts
    $failedRuns = @($runs | Where-Object { $_.status -eq 'completed' -and $_.conclusion -eq 'failure' -and $_.run_attempt -le $MaxAttempts })
    if ($failedRuns.Count -eq 0) {
        Write-Host "No eligible failed runs found."
        continue
    }

    # For PR builds: deduplicate by PR number, keep latest per PR, skip if latest run for that PR is not a failure
    # NOTE(review): runs with an empty pull_requests array are dropped here - confirm whether
    # runs for PRs coming from forks are reported that way.
    if ($workflowFile -eq "PullRequestHandler.yaml") {
        $prGroups = $runs | Where-Object { $_.pull_requests.Count -gt 0 } | Group-Object { ($_.pull_requests | Select-Object -First 1).number }
        $failedRuns = @(
            foreach ($group in $prGroups) {
                $latest = $group.Group | Sort-Object created_at -Descending | Select-Object -First 1
                if ($latest.status -eq 'completed' -and $latest.conclusion -eq 'failure' -and $latest.run_attempt -le $MaxAttempts) {
                    $latest
                }
            }
        )
    }

    if ($failedRuns.Count -eq 0) {
        Write-Host "No eligible failed runs after deduplication."
        continue
    }

    foreach ($run in $failedRuns) {
        Write-Host "--- Checking run $($run.id): $($run.display_title) ---"

        # Fetch all jobs of the latest attempt; paging matters because counting only the first
        # page could under-count failed jobs and let a badly broken run through.
        try {
            $jobs = @(Get-PagedApiItems -Path "/repos/$Owner/$Repo/actions/runs/$($run.id)/jobs?filter=latest" -Property 'jobs')
        }
        catch {
            Write-Host "::warning::Failed to fetch jobs for run $($run.id) ($($_.Exception.Message))"
            continue
        }

        # Count build jobs and failed build jobs, ignoring utility and skipped jobs
        $buildJobs = @($jobs | Where-Object { $_.name -notin $excludedJobs -and $_.conclusion -ne 'skipped' })
        $buildJobCount = $buildJobs.Count
        $failedJobs = @($buildJobs | Where-Object { $_.conclusion -eq 'failure' })
        $failedCount = $failedJobs.Count

        if ($failedCount -eq 0) {
            Write-Host "No failed build jobs found. Skipping."
            continue
        }

        if ($buildJobCount -lt $MinTotalJobs) {
            Write-Host "Too few build jobs ($buildJobCount < $MinTotalJobs). Run likely didn't reach the large matrix. Skipping."
            continue
        }

        if ($failedCount -gt $MaxFailedJobs) {
            Write-Host "Too many failed jobs ($failedCount > $MaxFailedJobs). Skipping."
            continue
        }

        Write-Host "Rerunning $failedCount failed job(s):"
        $failedJobs | ForEach-Object { Write-Host " - $($_.name)" }

        $runUrl = "https://github.com/$Owner/$Repo/actions/runs/$($run.id)"
        if ($WhatIf) {
            Write-Host "::notice::WhatIf: Would rerun '$($run.display_title)': $runUrl"
        } else {
            # Output is only shown in the log, never parsed, so merging stderr is harmless here
            gh run rerun $run.id --failed --repo "$Owner/$Repo" 2>&1
            if ($LASTEXITCODE -ne 0) {
                Write-Host "::warning::Failed to rerun run $($run.id)"
            } else {
                Write-Host "::notice::Rerun triggered for '$($run.display_title)': $runUrl"
            }
        }
    }
}

Write-Host "===== Done ====="
Loading