diff --git a/.github/workflows/RerunUnstableFailures.yaml b/.github/workflows/RerunUnstableFailures.yaml
new file mode 100644
index 0000000000..69db83705f
--- /dev/null
+++ b/.github/workflows/RerunUnstableFailures.yaml
@@ -0,0 +1,58 @@
+name: Rerun Unstable Failures
+on:
+  workflow_dispatch:
+    inputs:
+      WhatIf:
+        description: 'WhatIf'
+        type: boolean
+        default: false
+  schedule:
+    - cron: '0 * * * *'
+
+permissions: read-all
+
+defaults:
+  run:
+    shell: pwsh
+
+jobs:
+  RerunUnstableFailures:
+    runs-on: ubuntu-slim
+    permissions:
+      actions: write
+      contents: read
+    env:
+      MAX_FAILED_JOBS: 3 # Maximum number of failed jobs in order to consider rerunning
+      MIN_TOTAL_JOBS: 10 # Minimum number of jobs that have run in order to consider rerunning
+      MAX_ATTEMPTS: 1 # Maximum number of attempts to rerun a failed job
+      LOOKBACK_HOURS: 24 # How far back to look for failed jobs
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Create GitHub App Token
+        id: app-token
+        if: vars.APP_ID != ''
+        continue-on-error: true
+        uses: actions/create-github-app-token@v1
+        with:
+          app-id: ${{ vars.APP_ID }}
+          private-key: ${{ secrets.PRIVATE_KEY }}
+
+      - name: Rerun unstable failures
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token || secrets.RERUNPAT }}
+        run: |
+          # inputs.WhatIf is empty on scheduled runs (the inputs context is only populated for
+          # workflow_dispatch), so WhatIf is enabled only when it was explicitly set to 'true'.
+          $whatIf = '${{ inputs.WhatIf }}'
+          $params = @{
+            Owner = "microsoft"
+            Repo = "BCApps"
+            MaxFailedJobs = $env:MAX_FAILED_JOBS
+            MinTotalJobs = $env:MIN_TOTAL_JOBS
+            MaxAttempts = $env:MAX_ATTEMPTS
+            LookbackHours = $env:LOOKBACK_HOURS
+          }
+          if ($whatIf -eq 'true') { $params.WhatIf = $true }
+          build/scripts/RerunUnstableFailures.ps1 @params
diff --git a/build/scripts/RerunUnstableFailures.ps1 b/build/scripts/RerunUnstableFailures.ps1
new file mode 100644
index 0000000000..1197d0245e
--- /dev/null
+++ b/build/scripts/RerunUnstableFailures.ps1
@@ -0,0 +1,136 @@
+<#
+.SYNOPSIS
+    Reruns the failed jobs of recent workflow runs whose failures look unstable.
+.DESCRIPTION
+    For CICD.yaml and PullRequestHandler.yaml, lists the completed runs created within the
+    lookback window using the GitHub CLI (gh). A failed run is rerun with 'gh run rerun --failed'
+    when its run_attempt is at most MaxAttempts, it has at least MinTotalJobs build jobs and at
+    most MaxFailedJobs failed build jobs. For pull request runs, only the latest run per PR is
+    considered.
+.PARAMETER Owner
+    Owner of the repository to inspect.
+.PARAMETER Repo
+    Name of the repository to inspect.
+.PARAMETER MaxFailedJobs
+    Runs with more failed build jobs than this are skipped.
+.PARAMETER MinTotalJobs
+    Runs with fewer build jobs than this are skipped.
+.PARAMETER MaxAttempts
+    Runs whose run_attempt is greater than this are skipped.
+.PARAMETER LookbackHours
+    Only runs created within this many hours are considered.
+.PARAMETER WhatIf
+    Only report the runs that would be rerun; do not rerun them.
+#>
+param(
+    [Parameter(Mandatory = $true)]
+    [string] $Owner,
+    [Parameter(Mandatory = $true)]
+    [string] $Repo,
+    [Parameter(Mandatory = $false)]
+    [int] $MaxFailedJobs = 3,
+    [Parameter(Mandatory = $false)]
+    [int] $MinTotalJobs = 10,
+    [Parameter(Mandatory = $false)]
+    [int] $MaxAttempts = 1,
+    [Parameter(Mandatory = $false)]
+    [int] $LookbackHours = 2,
+    [Parameter(Mandatory = $false)]
+    [switch] $WhatIf
+)
+
+# Only runs created after this UTC timestamp are considered
+$cutoff = (Get-Date).ToUniversalTime().AddHours(-$LookbackHours).ToString("yyyy-MM-ddTHH:mm:ssZ")
+$workflowFiles = @("CICD.yaml", "PullRequestHandler.yaml")
+
+foreach ($workflowFile in $workflowFiles) {
+    Write-Host "===== Processing workflow: $workflowFile ====="
+
+    # Get recent completed runs; --paginate --slurp returns every page so more than 100 runs are not truncated
+    $runsJson = gh api --paginate --slurp "/repos/$Owner/$Repo/actions/workflows/$workflowFile/runs?status=completed&created=%3E$cutoff&per_page=100" 2>&1
+    if ($LASTEXITCODE -ne 0) {
+        Write-Host "::warning::Failed to fetch runs for $workflowFile"
+        # The gh output was captured by the redirect above; print it so the failure can be diagnosed
+        Write-Host ($runsJson | Out-String)
+        continue
+    }
+    # Member enumeration collects workflow_runs across all returned pages
+    $runs = ($runsJson | ConvertFrom-Json).workflow_runs
+
+    # Filter to failed runs that have not exceeded the allowed number of attempts
+    $failedRuns = $runs | Where-Object { $_.conclusion -eq 'failure' -and $_.run_attempt -le $MaxAttempts }
+    if (-not $failedRuns) {
+        Write-Host "No eligible failed runs found."
+        continue
+    }
+
+    # For PR builds: deduplicate by PR number, keep latest per PR, skip if latest run for that PR is not a failure
+    # NOTE(review): runs whose pull_requests array is empty are never candidates here - confirm this is intended
+    if ($workflowFile -eq "PullRequestHandler.yaml") {
+        $candidates = @()
+        $prGroups = $runs | Where-Object { $_.pull_requests.Count -gt 0 } | Group-Object { ($_.pull_requests | Select-Object -First 1).number }
+        foreach ($group in $prGroups) {
+            $latest = $group.Group | Sort-Object created_at -Descending | Select-Object -First 1
+            if ($latest.conclusion -eq 'failure' -and $latest.run_attempt -le $MaxAttempts) {
+                $candidates += $latest
+            }
+        }
+        $failedRuns = $candidates
+    }
+
+    if (-not $failedRuns) {
+        Write-Host "No eligible failed runs after deduplication."
+        continue
+    }
+
+    foreach ($run in $failedRuns) {
+        Write-Host "--- Checking run $($run.id): $($run.display_title) ---"
+
+        # Count failed jobs; paginate so that runs with more than 100 jobs are counted in full
+        $jobsJson = gh api --paginate --slurp "/repos/$Owner/$Repo/actions/runs/$($run.id)/jobs?filter=latest&per_page=100" 2>&1
+        if ($LASTEXITCODE -ne 0) {
+            Write-Host "::warning::Failed to fetch jobs for run $($run.id)"
+            Write-Host ($jobsJson | Out-String)
+            continue
+        }
+        $jobs = ($jobsJson | ConvertFrom-Json).jobs
+        # Exclude utility jobs that are not actual build jobs
+        $excludedJobs = @("Pull Request Status Check", "Initialization")
+        $buildJobs = $jobs | Where-Object { $_.name -notin $excludedJobs -and $_.conclusion -ne 'skipped' }
+        $buildJobCount = ($buildJobs | Measure-Object).Count
+        $failedJobs = $buildJobs | Where-Object { $_.conclusion -eq 'failure' }
+        $failedCount = ($failedJobs | Measure-Object).Count
+
+        if ($failedCount -eq 0) {
+            Write-Host "No failed build jobs found. Skipping."
+            continue
+        }
+
+        if ($buildJobCount -lt $MinTotalJobs) {
+            Write-Host "Too few build jobs ($buildJobCount < $MinTotalJobs). Run likely didn't reach the large matrix. Skipping."
+            continue
+        }
+
+        if ($failedCount -gt $MaxFailedJobs) {
+            Write-Host "Too many failed jobs ($failedCount > $MaxFailedJobs). Skipping."
+            continue
+        }
+
+        Write-Host "Rerunning $failedCount failed job(s):"
+        $failedJobs | ForEach-Object { Write-Host "  - $($_.name)" }
+
+        $runUrl = "https://github.com/$Owner/$Repo/actions/runs/$($run.id)"
+        if ($WhatIf) {
+            Write-Host "::notice::WhatIf: Would rerun '$($run.display_title)': $runUrl"
+        } else {
+            gh run rerun $run.id --failed --repo "$Owner/$Repo" 2>&1
+            if ($LASTEXITCODE -ne 0) {
+                Write-Host "::warning::Failed to rerun run $($run.id)"
+            } else {
+                Write-Host "::notice::Rerun triggered for '$($run.display_title)': $runUrl"
+            }
+        }
+    }
+}
+
+Write-Host "===== Done ====="