From 0ed1677195e0cb3f724c7b64dc03ce8352cefc2d Mon Sep 17 00:00:00 2001 From: Chris Lyons <52037738+mephmanx@users.noreply.github.com> Date: Fri, 13 Mar 2026 15:46:01 -0400 Subject: [PATCH 1/7] adding auto retry on terraform failure Signed-off-by: Chris Lyons <52037738+mephmanx@users.noreply.github.com> --- .../workflows/terraform-deploy-auto-retry.yml | 76 +++++++++++++++++++ .github/workflows/terraform-deploy.yml | 2 + 2 files changed, 78 insertions(+) create mode 100644 .github/workflows/terraform-deploy-auto-retry.yml diff --git a/.github/workflows/terraform-deploy-auto-retry.yml b/.github/workflows/terraform-deploy-auto-retry.yml new file mode 100644 index 0000000..dd46b68 --- /dev/null +++ b/.github/workflows/terraform-deploy-auto-retry.yml @@ -0,0 +1,76 @@ +name: Terraform Deploy Auto Retry + +on: + workflow_run: + workflows: + - Terraform Deploy + types: + - completed + +permissions: + actions: write + contents: read + +jobs: + rerun-on-tfc-discovery-timeout: + if: ${{ github.event.workflow_run.conclusion == 'failure' }} + runs-on: ubuntu-latest + + steps: + - name: Evaluate Terraform Deploy failure and optionally rerun + env: + GH_TOKEN: ${{ github.token }} + OWNER: ${{ github.repository_owner }} + REPO: ${{ github.event.repository.name }} + RUN_ID: ${{ github.event.workflow_run.id }} + HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} + run: | + set -euo pipefail + + echo "Terraform Deploy run id: ${RUN_ID}" + echo "Failed run branch: ${HEAD_BRANCH}" + + workdir="$(mktemp -d)" + trap 'rm -rf "${workdir}"' EXIT + logs_zip="${workdir}/logs.zip" + logs_dir="${workdir}/logs" + + downloaded="false" + for i in 1 2 3 4 5 6; do + if gh api "/repos/${OWNER}/${REPO}/actions/runs/${RUN_ID}/logs" > "${logs_zip}"; then + downloaded="true" + break + fi + echo "Run logs not ready yet, retrying in 10s (${i}/6)." + sleep 10 + done + + if [ "${downloaded}" != "true" ]; then + echo "Could not download logs for failed run; skipping automatic rerun." + exit 0 + fi + + unzip -q "${logs_zip}" -d "${logs_dir}" + + if rg -n -F "Failed to request discovery document" "${logs_dir}" >/dev/null \ + || rg -n -F "https://app.terraform.io/.well-known/terraform.json" "${logs_dir}" >/dev/null \ + || rg -n -F "context deadline exceeded (Client.Timeout exceeded while awaiting headers)" "${logs_dir}" >/dev/null; then + if [ -z "${HEAD_BRANCH}" ]; then + echo "Head branch is empty; cannot dispatch Terraform Deploy retry." + exit 0 + fi + + if [ "${HEAD_BRANCH}" != "dev" ] && [ "${HEAD_BRANCH}" != "prod" ]; then + echo "Head branch '${HEAD_BRANCH}' is not a deploy branch; skipping retry dispatch." + exit 0 + fi + + echo "Detected Terraform Cloud discovery timeout. Waiting 120s before retry dispatch." + sleep 120 + + echo "Dispatching Terraform Deploy on branch '${HEAD_BRANCH}'." + gh api --method POST "/repos/${OWNER}/${REPO}/actions/workflows/terraform-deploy.yml/dispatches" -f ref="${HEAD_BRANCH}" + exit 0 + fi + + echo "Failure reason did not match Terraform Cloud discovery timeout; no rerun requested." diff --git a/.github/workflows/terraform-deploy.yml b/.github/workflows/terraform-deploy.yml index 55bed21..03fdf4f 100644 --- a/.github/workflows/terraform-deploy.yml +++ b/.github/workflows/terraform-deploy.yml @@ -18,6 +18,8 @@ jobs: TF_API_TOKEN: ${{ secrets.TFE_TOKEN }} TFC_WORKSPACE_PRODUCTION: ${{ vars.TFC_WORKSPACE_PRODUCTION }} TFC_WORKSPACE_PREVIEW: ${{ vars.TFC_WORKSPACE_PREVIEW }} + TF_REGISTRY_CLIENT_TIMEOUT: "120" + TF_REGISTRY_DISCOVERY_RETRY: "8" steps: - name: Check state of initial-deploy workflow From 2d4ddafee16b0f9168b2a4eb372764471ea66d91 Mon Sep 17 00:00:00 2001 From: Chris Lyons <52037738+mephmanx@users.noreply.github.com> Date: Fri, 13 Mar 2026 16:25:38 -0400 Subject: [PATCH 2/7] limit terraform auto-retry detection to deploy branches --- .github/workflows/terraform-deploy-auto-retry.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/terraform-deploy-auto-retry.yml b/.github/workflows/terraform-deploy-auto-retry.yml index dd46b68..327f08b 100644 --- a/.github/workflows/terraform-deploy-auto-retry.yml +++ b/.github/workflows/terraform-deploy-auto-retry.yml @@ -4,6 +4,9 @@ on: workflow_run: workflows: - Terraform Deploy + branches: + - dev + - prod types: - completed From dae3f1b91a83ee6503c37dbf4bf35085eb5c9896 Mon Sep 17 00:00:00 2001 From: Chris Lyons <52037738+mephmanx@users.noreply.github.com> Date: Fri, 13 Mar 2026 16:37:56 -0400 Subject: [PATCH 3/7] use grep in terraform auto-retry log detection --- .github/workflows/terraform-deploy-auto-retry.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/terraform-deploy-auto-retry.yml b/.github/workflows/terraform-deploy-auto-retry.yml index 327f08b..31c64f2 100644 --- a/.github/workflows/terraform-deploy-auto-retry.yml +++ b/.github/workflows/terraform-deploy-auto-retry.yml @@ -55,9 +55,9 @@ jobs: unzip -q "${logs_zip}" -d "${logs_dir}" - if rg -n -F "Failed to request discovery document" "${logs_dir}" >/dev/null \ - || rg -n -F "https://app.terraform.io/.well-known/terraform.json" "${logs_dir}" >/dev/null \ - || rg -n -F "context deadline exceeded (Client.Timeout exceeded while awaiting headers)" "${logs_dir}" >/dev/null; then + if grep -R -n -F "Failed to request discovery document" "${logs_dir}" >/dev/null \ + || grep -R -n -F "https://app.terraform.io/.well-known/terraform.json" "${logs_dir}" >/dev/null \ + || grep -R -n -F "context deadline exceeded (Client.Timeout exceeded while awaiting headers)" "${logs_dir}" >/dev/null; then if [ -z "${HEAD_BRANCH}" ]; then echo "Head branch is empty; cannot dispatch Terraform Deploy retry." exit 0 From 6ca4ad2fb91cffebff506387f8b6c16efb40fde8 Mon Sep 17 00:00:00 2001 From: Chris Lyons <52037738+mephmanx@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:13:50 -0400 Subject: [PATCH 4/7] add scheduled terraform deploy auto-retry polling --- .../workflows/terraform-deploy-auto-retry.yml | 174 ++++++++++++++---- 1 file changed, 137 insertions(+), 37 deletions(-) diff --git a/.github/workflows/terraform-deploy-auto-retry.yml b/.github/workflows/terraform-deploy-auto-retry.yml index 31c64f2..b427345 100644 --- a/.github/workflows/terraform-deploy-auto-retry.yml +++ b/.github/workflows/terraform-deploy-auto-retry.yml @@ -9,6 +9,9 @@ on: - prod types: - completed + schedule: + - cron: "*/5 * * * *" + workflow_dispatch: permissions: actions: write @@ -16,64 +19,161 @@ permissions: jobs: rerun-on-tfc-discovery-timeout: - if: ${{ github.event.workflow_run.conclusion == 'failure' }} runs-on: ubuntu-latest steps: - - name: Evaluate Terraform Deploy failure and optionally rerun + - name: Evaluate Terraform Deploy failures and optionally rerun env: GH_TOKEN: ${{ github.token }} OWNER: ${{ github.repository_owner }} REPO: ${{ github.event.repository.name }} - RUN_ID: ${{ github.event.workflow_run.id }} - HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} + EVENT_NAME: ${{ github.event_name }} + WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }} + WORKFLOW_RUN_BRANCH: ${{ github.event.workflow_run.head_branch }} + WORKFLOW_RUN_CONCLUSION: ${{ github.event.workflow_run.conclusion }} run: | set -euo pipefail - echo "Terraform Deploy run id: ${RUN_ID}" - echo "Failed run branch: ${HEAD_BRANCH}" + is_timeout_failure() { + local run_id="$1" + local workdir + local logs_zip + local logs_dir + local downloaded="false" - workdir="$(mktemp -d)" - trap 'rm -rf "${workdir}"' EXIT - logs_zip="${workdir}/logs.zip" - logs_dir="${workdir}/logs" + workdir="$(mktemp -d)" + logs_zip="${workdir}/logs.zip" + logs_dir="${workdir}/logs" - downloaded="false" - for i in 1 2 3 4 5 6; do - if gh api "/repos/${OWNER}/${REPO}/actions/runs/${RUN_ID}/logs" > "${logs_zip}"; then - downloaded="true" - break + for i in 1 2 3 4 5 6 7 8 9 10 11 12; do + if gh api "/repos/${OWNER}/${REPO}/actions/runs/${run_id}/logs" > "${logs_zip}"; then + downloaded="true" + break + fi + echo "Run ${run_id} logs not ready yet, retrying in 10s (${i}/12)." + sleep 10 + done + + if [ "${downloaded}" != "true" ]; then + echo "Could not download logs for run ${run_id}." + rm -rf "${workdir}" + return 1 fi - echo "Run logs not ready yet, retrying in 10s (${i}/6)." - sleep 10 - done - if [ "${downloaded}" != "true" ]; then - echo "Could not download logs for failed run; skipping automatic rerun." - exit 0 - fi + unzip -q "${logs_zip}" -d "${logs_dir}" + + if grep -R -n -F "Failed to request discovery document" "${logs_dir}" >/dev/null \ + || grep -R -n -F "https://app.terraform.io/.well-known/terraform.json" "${logs_dir}" >/dev/null \ + || grep -R -n -F "context deadline exceeded (Client.Timeout exceeded while awaiting headers)" "${logs_dir}" >/dev/null; then + rm -rf "${workdir}" + return 0 + fi - unzip -q "${logs_zip}" -d "${logs_dir}" + rm -rf "${workdir}" + return 1 + } - if grep -R -n -F "Failed to request discovery document" "${logs_dir}" >/dev/null \ - || grep -R -n -F "https://app.terraform.io/.well-known/terraform.json" "${logs_dir}" >/dev/null \ - || grep -R -n -F "context deadline exceeded (Client.Timeout exceeded while awaiting headers)" "${logs_dir}" >/dev/null; then - if [ -z "${HEAD_BRANCH}" ]; then - echo "Head branch is empty; cannot dispatch Terraform Deploy retry." - exit 0 + dispatch_branch_if_idle() { + local branch="$1" + local latest_json + local latest_status + + if [ -z "${branch}" ]; then + echo "Branch is empty; cannot dispatch retry." + return 0 fi - if [ "${HEAD_BRANCH}" != "dev" ] && [ "${HEAD_BRANCH}" != "prod" ]; then - echo "Head branch '${HEAD_BRANCH}' is not a deploy branch; skipping retry dispatch." - exit 0 + if [ "${branch}" != "dev" ] && [ "${branch}" != "prod" ]; then + echo "Branch '${branch}' is not a deploy branch; skipping." + return 0 + fi + + latest_json="$(gh run list -R "${OWNER}/${REPO}" --workflow "Terraform Deploy" --branch "${branch}" --limit 1 --json databaseId,status,conclusion,createdAt,url || true)" + if [ -z "${latest_json}" ] || [ "$(echo "${latest_json}" | jq 'length')" -eq 0 ]; then + echo "No Terraform Deploy runs found for branch '${branch}'." + else + latest_status="$(echo "${latest_json}" | jq -r '.[0].status')" + if [ "${latest_status}" = "in_progress" ] || [ "${latest_status}" = "queued" ]; then + echo "Latest Terraform Deploy on '${branch}' is ${latest_status}; skipping dispatch." + return 0 + fi + fi + + echo "Dispatching Terraform Deploy on branch '${branch}'." + gh api --method POST "/repos/${OWNER}/${REPO}/actions/workflows/terraform-deploy.yml/dispatches" -f ref="${branch}" + return 0 + } + + handle_specific_failed_run() { + local run_id="$1" + local branch="$2" + + if [ -z "${run_id}" ] || [ "${run_id}" = "null" ]; then + echo "workflow_run payload did not include a run id; nothing to do." + return 0 + fi + + echo "Evaluating failed Terraform Deploy run ${run_id} on branch '${branch}'." + if is_timeout_failure "${run_id}"; then + echo "Detected Terraform Cloud discovery timeout in run ${run_id}. Waiting 120s before retry dispatch." + sleep 120 + dispatch_branch_if_idle "${branch}" + return 0 + fi + + echo "Run ${run_id} did not match timeout signature; no retry requested." + return 0 + } + + handle_latest_failed_run_for_branch() { + local branch="$1" + local latest_json + local run_id + local run_status + local run_conclusion + + latest_json="$(gh run list -R "${OWNER}/${REPO}" --workflow "Terraform Deploy" --branch "${branch}" --limit 1 --json databaseId,status,conclusion,createdAt,url || true)" + if [ -z "${latest_json}" ] || [ "$(echo "${latest_json}" | jq 'length')" -eq 0 ]; then + echo "No Terraform Deploy run history found for '${branch}'." + return 0 fi - echo "Detected Terraform Cloud discovery timeout. Waiting 120s before retry dispatch." - sleep 120 + run_id="$(echo "${latest_json}" | jq -r '.[0].databaseId')" + run_status="$(echo "${latest_json}" | jq -r '.[0].status')" + run_conclusion="$(echo "${latest_json}" | jq -r '.[0].conclusion // \"\"')" + + echo "Latest Terraform Deploy for '${branch}': run=${run_id} status=${run_status} conclusion=${run_conclusion}" + + if [ "${run_status}" != "completed" ]; then + echo "Branch '${branch}' currently has a deploy ${run_status}; no action needed." + return 0 + fi - echo "Dispatching Terraform Deploy on branch '${HEAD_BRANCH}'." - gh api --method POST "/repos/${OWNER}/${REPO}/actions/workflows/terraform-deploy.yml/dispatches" -f ref="${HEAD_BRANCH}" + if [ "${run_conclusion}" != "failure" ]; then + echo "Latest deploy for '${branch}' is not a failure; no action needed." + return 0 + fi + + if is_timeout_failure "${run_id}"; then + echo "Latest failed run ${run_id} on '${branch}' matches timeout signature. Dispatching retry." + dispatch_branch_if_idle "${branch}" + return 0 + fi + + echo "Latest failed run ${run_id} on '${branch}' is not the timeout signature; no retry requested." + return 0 + } + + echo "Event type: ${EVENT_NAME}" + if [ "${EVENT_NAME}" = "workflow_run" ]; then + if [ "${WORKFLOW_RUN_CONCLUSION}" != "failure" ]; then + echo "workflow_run conclusion was '${WORKFLOW_RUN_CONCLUSION}'; nothing to do." + exit 0 + fi + handle_specific_failed_run "${WORKFLOW_RUN_ID}" "${WORKFLOW_RUN_BRANCH}" exit 0 fi - echo "Failure reason did not match Terraform Cloud discovery timeout; no rerun requested." + echo "Polling latest deploy failures for dev/prod." + handle_latest_failed_run_for_branch "dev" + handle_latest_failed_run_for_branch "prod" From 5f93fa2226141bfcb405cb113cdb169d99af5a6b Mon Sep 17 00:00:00 2001 From: Chris Lyons <52037738+mephmanx@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:21:52 -0400 Subject: [PATCH 5/7] fix jq parsing in scheduled terraform auto-retry --- .github/workflows/terraform-deploy-auto-retry.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terraform-deploy-auto-retry.yml b/.github/workflows/terraform-deploy-auto-retry.yml index b427345..455ef32 100644 --- a/.github/workflows/terraform-deploy-auto-retry.yml +++ b/.github/workflows/terraform-deploy-auto-retry.yml @@ -140,7 +140,7 @@ jobs: run_id="$(echo "${latest_json}" | jq -r '.[0].databaseId')" run_status="$(echo "${latest_json}" | jq -r '.[0].status')" - run_conclusion="$(echo "${latest_json}" | jq -r '.[0].conclusion // \"\"')" + run_conclusion="$(echo "${latest_json}" | jq -r '.[0].conclusion // ""')" echo "Latest Terraform Deploy for '${branch}': run=${run_id} status=${run_status} conclusion=${run_conclusion}" From 72f525965204de38608c57b9f397ffea940d284a Mon Sep 17 00:00:00 2001 From: Chris Lyons <52037738+mephmanx@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:56:25 -0400 Subject: [PATCH 6/7] make terraform auto-retry self-monitor until success --- .../workflows/terraform-deploy-auto-retry.yml | 216 ++++++++---------- 1 file changed, 96 insertions(+), 120 deletions(-) diff --git a/.github/workflows/terraform-deploy-auto-retry.yml b/.github/workflows/terraform-deploy-auto-retry.yml index 455ef32..be9fe99 100644 --- a/.github/workflows/terraform-deploy-auto-retry.yml +++ b/.github/workflows/terraform-deploy-auto-retry.yml @@ -18,162 +18,138 @@ permissions: contents: read jobs: - rerun-on-tfc-discovery-timeout: + retry-until-success: runs-on: ubuntu-latest + if: ${{ github.event_name != 'workflow_run' || matrix.branch == github.event.workflow_run.head_branch }} + strategy: + fail-fast: false + matrix: + branch: + - dev + - prod + concurrency: + group: terraform-deploy-auto-retry-${{ matrix.branch }} + cancel-in-progress: false steps: - - name: Evaluate Terraform Deploy failures and optionally rerun + - name: Retry failed Terraform deploys until successful env: - GH_TOKEN: ${{ github.token }} + GH_TOKEN: ${{ secrets.ACTIONS_BOT_TOKEN || github.token }} OWNER: ${{ github.repository_owner }} REPO: ${{ github.event.repository.name }} EVENT_NAME: ${{ github.event_name }} - WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }} + BRANCH: ${{ matrix.branch }} WORKFLOW_RUN_BRANCH: ${{ github.event.workflow_run.head_branch }} WORKFLOW_RUN_CONCLUSION: ${{ github.event.workflow_run.conclusion }} + TARGET_WORKFLOW_NAME: "Terraform Deploy" + TARGET_WORKFLOW_FILE: "terraform-deploy.yml" run: | set -euo pipefail - is_timeout_failure() { - local run_id="$1" - local workdir - local logs_zip - local logs_dir - local downloaded="false" - - workdir="$(mktemp -d)" - logs_zip="${workdir}/logs.zip" - logs_dir="${workdir}/logs" - - for i in 1 2 3 4 5 6 7 8 9 10 11 12; do - if gh api "/repos/${OWNER}/${REPO}/actions/runs/${run_id}/logs" > "${logs_zip}"; then - downloaded="true" - break - fi - echo "Run ${run_id} logs not ready yet, retrying in 10s (${i}/12)." - sleep 10 - done - - if [ "${downloaded}" != "true" ]; then - echo "Could not download logs for run ${run_id}." - rm -rf "${workdir}" - return 1 - fi - - unzip -q "${logs_zip}" -d "${logs_dir}" - - if grep -R -n -F "Failed to request discovery document" "${logs_dir}" >/dev/null \ - || grep -R -n -F "https://app.terraform.io/.well-known/terraform.json" "${logs_dir}" >/dev/null \ - || grep -R -n -F "context deadline exceeded (Client.Timeout exceeded while awaiting headers)" "${logs_dir}" >/dev/null; then - rm -rf "${workdir}" - return 0 - fi - - rm -rf "${workdir}" - return 1 - } - - dispatch_branch_if_idle() { - local branch="$1" - local latest_json - local latest_status - - if [ -z "${branch}" ]; then - echo "Branch is empty; cannot dispatch retry." + should_monitor_branch() { + if [ "${EVENT_NAME}" != "workflow_run" ]; then return 0 fi - if [ "${branch}" != "dev" ] && [ "${branch}" != "prod" ]; then - echo "Branch '${branch}' is not a deploy branch; skipping." - return 0 + if [ "${WORKFLOW_RUN_BRANCH}" != "${BRANCH}" ]; then + echo "workflow_run was for '${WORKFLOW_RUN_BRANCH}', this job is '${BRANCH}'. Skipping." + return 1 fi - latest_json="$(gh run list -R "${OWNER}/${REPO}" --workflow "Terraform Deploy" --branch "${branch}" --limit 1 --json databaseId,status,conclusion,createdAt,url || true)" - if [ -z "${latest_json}" ] || [ "$(echo "${latest_json}" | jq 'length')" -eq 0 ]; then - echo "No Terraform Deploy runs found for branch '${branch}'." - else - latest_status="$(echo "${latest_json}" | jq -r '.[0].status')" - if [ "${latest_status}" = "in_progress" ] || [ "${latest_status}" = "queued" ]; then - echo "Latest Terraform Deploy on '${branch}' is ${latest_status}; skipping dispatch." - return 0 - fi + if [ "${WORKFLOW_RUN_CONCLUSION}" != "failure" ]; then + echo "workflow_run conclusion for '${BRANCH}' was '${WORKFLOW_RUN_CONCLUSION}'. Nothing to retry." + return 1 fi - echo "Dispatching Terraform Deploy on branch '${branch}'." - gh api --method POST "/repos/${OWNER}/${REPO}/actions/workflows/terraform-deploy.yml/dispatches" -f ref="${branch}" return 0 } - handle_specific_failed_run() { - local run_id="$1" - local branch="$2" - - if [ -z "${run_id}" ] || [ "${run_id}" = "null" ]; then - echo "workflow_run payload did not include a run id; nothing to do." - return 0 - fi - - echo "Evaluating failed Terraform Deploy run ${run_id} on branch '${branch}'." - if is_timeout_failure "${run_id}"; then - echo "Detected Terraform Cloud discovery timeout in run ${run_id}. Waiting 120s before retry dispatch." - sleep 120 - dispatch_branch_if_idle "${branch}" - return 0 - fi + latest_run_json() { + gh run list -R "${OWNER}/${REPO}" \ + --workflow "${TARGET_WORKFLOW_NAME}" \ + --branch "${BRANCH}" \ + --limit 1 \ + --json databaseId,status,conclusion,createdAt,url,event || echo "[]" + } - echo "Run ${run_id} did not match timeout signature; no retry requested." - return 0 + dispatch_branch() { + echo "Dispatching ${TARGET_WORKFLOW_NAME} on branch '${BRANCH}'." + gh api --method POST "/repos/${OWNER}/${REPO}/actions/workflows/${TARGET_WORKFLOW_FILE}/dispatches" -f ref="${BRANCH}" >/dev/null } - handle_latest_failed_run_for_branch() { - local branch="$1" + monitor_branch_until_success() { local latest_json local run_id - local run_status + local latest_status local run_conclusion + local previous_failed_run_id="" + local wait_count=0 + + while true; do + latest_json="$(latest_run_json)" + if [ -z "${latest_json}" ] || [ "$(echo "${latest_json}" | jq 'length')" -eq 0 ]; then + echo "No ${TARGET_WORKFLOW_NAME} runs found for '${BRANCH}'. Waiting 20s." + sleep 20 + continue + fi - latest_json="$(gh run list -R "${OWNER}/${REPO}" --workflow "Terraform Deploy" --branch "${branch}" --limit 1 --json databaseId,status,conclusion,createdAt,url || true)" - if [ -z "${latest_json}" ] || [ "$(echo "${latest_json}" | jq 'length')" -eq 0 ]; then - echo "No Terraform Deploy run history found for '${branch}'." - return 0 - fi - - run_id="$(echo "${latest_json}" | jq -r '.[0].databaseId')" - run_status="$(echo "${latest_json}" | jq -r '.[0].status')" - run_conclusion="$(echo "${latest_json}" | jq -r '.[0].conclusion // ""')" + run_id="$(echo "${latest_json}" | jq -r '.[0].databaseId')" + latest_status="$(echo "${latest_json}" | jq -r '.[0].status')" + run_conclusion="$(echo "${latest_json}" | jq -r '.[0].conclusion // ""')" - echo "Latest Terraform Deploy for '${branch}': run=${run_id} status=${run_status} conclusion=${run_conclusion}" + echo "Branch '${BRANCH}' latest run=${run_id} status=${latest_status} conclusion=${run_conclusion}" - if [ "${run_status}" != "completed" ]; then - echo "Branch '${branch}' currently has a deploy ${run_status}; no action needed." - return 0 - fi + if [ "${latest_status}" = "in_progress" ] || [ "${latest_status}" = "queued" ]; then + sleep 20 + continue + fi - if [ "${run_conclusion}" != "failure" ]; then - echo "Latest deploy for '${branch}' is not a failure; no action needed." - return 0 - fi + if [ "${run_conclusion}" = "success" ]; then + echo "Branch '${BRANCH}' latest run succeeded. Retry loop complete." + return 0 + fi - if is_timeout_failure "${run_id}"; then - echo "Latest failed run ${run_id} on '${branch}' matches timeout signature. Dispatching retry." - dispatch_branch_if_idle "${branch}" - return 0 - fi + if [ "${run_conclusion}" = "failure" ] || [ "${run_conclusion}" = "cancelled" ] || [ "${run_conclusion}" = "timed_out" ] || [ "${run_conclusion}" = "startup_failure" ]; then + if [ "${previous_failed_run_id}" = "${run_id}" ]; then + sleep 20 + continue + fi + + previous_failed_run_id="${run_id}" + dispatch_branch + wait_count=0 + + while true; do + sleep 15 + latest_json="$(latest_run_json)" + if [ -z "${latest_json}" ] || [ "$(echo "${latest_json}" | jq 'length')" -eq 0 ]; then + continue + fi + + run_id="$(echo "${latest_json}" | jq -r '.[0].databaseId')" + if [ "${run_id}" != "${previous_failed_run_id}" ]; then + echo "Detected new ${TARGET_WORKFLOW_NAME} run ${run_id} on '${BRANCH}'." + break + fi + + wait_count=$((wait_count + 1)) + if [ "${wait_count}" -ge 40 ]; then + echo "Dispatched run not visible yet on '${BRANCH}'. Re-dispatching." + dispatch_branch + wait_count=0 + fi + done + continue + fi - echo "Latest failed run ${run_id} on '${branch}' is not the timeout signature; no retry requested." - return 0 + echo "Branch '${BRANCH}' has non-retryable conclusion '${run_conclusion}'. Waiting 20s." + sleep 20 + done } - echo "Event type: ${EVENT_NAME}" - if [ "${EVENT_NAME}" = "workflow_run" ]; then - if [ "${WORKFLOW_RUN_CONCLUSION}" != "failure" ]; then - echo "workflow_run conclusion was '${WORKFLOW_RUN_CONCLUSION}'; nothing to do." - exit 0 - fi - handle_specific_failed_run "${WORKFLOW_RUN_ID}" "${WORKFLOW_RUN_BRANCH}" + if ! should_monitor_branch; then exit 0 fi - echo "Polling latest deploy failures for dev/prod." - handle_latest_failed_run_for_branch "dev" - handle_latest_failed_run_for_branch "prod" + echo "Starting retry monitor for branch '${BRANCH}' (event=${EVENT_NAME})." + monitor_branch_until_success From 3a1ddeca0f9f2f445015d9228bda0dbc6478a4b2 Mon Sep 17 00:00:00 2001 From: Chris Lyons <52037738+mephmanx@users.noreply.github.com> Date: Fri, 13 Mar 2026 17:58:36 -0400 Subject: [PATCH 7/7] fix auto-retry workflow matrix if compile error --- .github/workflows/terraform-deploy-auto-retry.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/terraform-deploy-auto-retry.yml b/.github/workflows/terraform-deploy-auto-retry.yml index be9fe99..91e72f5 100644 --- a/.github/workflows/terraform-deploy-auto-retry.yml +++ b/.github/workflows/terraform-deploy-auto-retry.yml @@ -20,7 +20,6 @@ permissions: jobs: retry-until-success: runs-on: ubuntu-latest - if: ${{ github.event_name != 'workflow_run' || matrix.branch == github.event.workflow_run.head_branch }} strategy: fail-fast: false matrix: