From 40feda156fa0dfd5f9ec276348baab7f5f83d365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Snorri=20Hj=C3=B6rvar=20J=C3=B3hannsson?= Date: Thu, 16 Apr 2026 18:35:25 +0000 Subject: [PATCH 1/2] fix: retry all SQS messages on unhandled errors instead of silently dropping them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the scale-up lambda encounters an unhandled error (e.g. SSM ThrottlingException during registration token creation), the catch block returned an empty batchItemFailures array. With ReportBatchItemFailures enabled, this tells SQS that all messages were processed successfully, permanently deleting them from the queue. This causes queued GitHub Actions jobs to be silently lost — they never get a runner and remain stuck in 'queued' state indefinitely. The fix returns all message IDs as batch item failures on unhandled errors, so SQS retries them after the visibility timeout. --- lambdas/functions/control-plane/src/lambda.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lambdas/functions/control-plane/src/lambda.ts b/lambdas/functions/control-plane/src/lambda.ts index e2a0451c95..f43a9e3531 100644 --- a/lambdas/functions/control-plane/src/lambda.ts +++ b/lambdas/functions/control-plane/src/lambda.ts @@ -55,9 +55,11 @@ export async function scaleUpHandler(event: SQSEvent, context: Context): Promise batchItemFailures.push(...e.toBatchItemFailures(sqsMessages)); logger.warn(`${e.detailedMessage} A retry will be attempted via SQS.`, { error: e }); } else { - logger.error(`Error processing batch (size: ${sqsMessages.length}): ${(e as Error).message}, ignoring batch`, { - error: e, - }); + batchItemFailures.push(...sqsMessages.map(({ messageId }) => ({ itemIdentifier: messageId }))); + logger.error( + `Error processing batch (size: ${sqsMessages.length}): ${(e as Error).message}, all messages will be retried via SQS.`, + { error: e }, + ); } return { batchItemFailures }; From 46347b8b7286ad2b1db58cae404b83b43d020820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Snorri=20Hj=C3=B6rvar=20J=C3=B3hannsson?= Date: Thu, 16 Apr 2026 18:37:04 +0000 Subject: [PATCH 2/2] test: update test to expect all messages retried on unhandled error --- lambdas/functions/control-plane/src/lambda.test.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/lambda.test.ts b/lambdas/functions/control-plane/src/lambda.test.ts index 2c9a98e420..03ff052c0e 100644 --- a/lambdas/functions/control-plane/src/lambda.test.ts +++ b/lambdas/functions/control-plane/src/lambda.test.ts @@ -215,7 +215,9 @@ describe('Test scale up lambda wrapper.', () => { vi.mocked(scaleUp).mockRejectedValue(new Error('Generic error')); const result = await scaleUpHandler(multiRecordEvent, context); - expect(result).toEqual({ batchItemFailures: [] }); + expect(result).toEqual({ + batchItemFailures: [{ itemIdentifier: 'message-0' }, { itemIdentifier: 'message-1' }], + }); }); it('Should throw when scaleUp throws ScaleError', async () => {