From e22ae663a29d3ec6cc718d77f151f8d86031c63a Mon Sep 17 00:00:00 2001 From: "Bernhard K. Weisshuhn (a.k.a. bernhorst)" Date: Thu, 9 Apr 2026 13:55:41 +0200 Subject: [PATCH 1/3] fix: treat deleted runner registration as non-retryable error When a runner's registration is garbage-collected by GitHub (OAuth error "invalid_client", message "Registration was not found"), the runner enters an infinite retry loop instead of exiting. This leaves the pod in Running status indefinitely, blocking the ARC controller from creating a replacement runner and causing jobs to queue with no runner to pick them up. The fix adds VssOAuthTokenRequestException with Error=="invalid_client" to the non-retryable exception lists in both BrokerServer.ShouldRetryException() and BrokerMessageListener.IsGetNextMessageExceptionRetriable(), matching the existing precedent in CreateSessionAsync() which already treats this error as terminal. With this fix, the runner exits immediately with TerminatedError on the first "Registration was not found" error, allowing the ARC controller to detect the exit and create a fresh replacement runner. Fixes #4191 --- src/Runner.Common/BrokerServer.cs | 9 +++++++++ src/Runner.Listener/BrokerMessageListener.cs | 13 ++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/Runner.Common/BrokerServer.cs b/src/Runner.Common/BrokerServer.cs index 751ae1eee32..90fd2334016 100644 --- a/src/Runner.Common/BrokerServer.cs +++ b/src/Runner.Common/BrokerServer.cs @@ -7,6 +7,7 @@ using GitHub.DistributedTask.WebApi; using GitHub.Runner.Sdk; using GitHub.Services.Common; +using GitHub.Services.OAuth; using GitHub.Services.WebApi; using Sdk.RSWebApi.Contracts; using Sdk.WebApi.WebApi.RawClient; @@ -113,6 +114,14 @@ public bool ShouldRetryException(Exception ex) return false; } + // "invalid_client" means the runner registration has been deleted from the server. + // Retrying will never succeed, so bail out immediately. 
+ if (ex is VssOAuthTokenRequestException oAuthEx && + string.Equals(oAuthEx.Error, "invalid_client", StringComparison.OrdinalIgnoreCase)) + { + return false; + } + return true; } } diff --git a/src/Runner.Listener/BrokerMessageListener.cs b/src/Runner.Listener/BrokerMessageListener.cs index 7c9ca401cba..360d00f571f 100644 --- a/src/Runner.Listener/BrokerMessageListener.cs +++ b/src/Runner.Listener/BrokerMessageListener.cs @@ -444,11 +444,18 @@ ex is RunnerNotFoundException || Trace.Info($"Non-retriable exception: {ex.Message}"); return false; } - else + + // "invalid_client" means the runner registration has been deleted from the server. + // This is permanent — retrying will never succeed. + if (ex is VssOAuthTokenRequestException oAuthEx && + string.Equals(oAuthEx.Error, "invalid_client", StringComparison.OrdinalIgnoreCase)) { - Trace.Info($"Retriable exception: {ex.Message}"); - return true; + Trace.Info($"Non-retriable exception: runner registration deleted. {ex.Message}"); + return false; } + + Trace.Info($"Retriable exception: {ex.Message}"); + return true; } private bool IsSessionCreationExceptionRetriable(Exception ex) From 4f6c72e2764b28ca984a12e4fab15c0adc35f306 Mon Sep 17 00:00:00 2001 From: "Bernhard K. Weisshuhn (a.k.a. 
bernhorst)" Date: Thu, 9 Apr 2026 14:03:59 +0200 Subject: [PATCH 2/3] test: add L0 tests for invalid_client non-retryable behavior - GetNextMessage_ThrowsNonRetryableOnInvalidClientOAuth: verifies that VssOAuthTokenRequestException with Error="invalid_client" causes GetNextMessageAsync to throw NonRetryableException without retrying - ShouldRetryException_ReturnsFalseForInvalidClientOAuth: verifies BrokerServer.ShouldRetryException returns false for invalid_client - ShouldRetryException_ReturnsTrueForOtherOAuthErrors: verifies other OAuth errors are still retried normally --- .../L0/Listener/BrokerMessageListenerL0.cs | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/src/Test/L0/Listener/BrokerMessageListenerL0.cs b/src/Test/L0/Listener/BrokerMessageListenerL0.cs index c42d134dd41..f17913f0f0e 100644 --- a/src/Test/L0/Listener/BrokerMessageListenerL0.cs +++ b/src/Test/L0/Listener/BrokerMessageListenerL0.cs @@ -7,6 +7,7 @@ using GitHub.Runner.Listener; using GitHub.Runner.Listener.Configuration; using GitHub.Services.Common; +using GitHub.Services.OAuth; using Moq; using Xunit; @@ -406,6 +407,87 @@ public async Task CreatesSessionWithProvidedSettings() } } + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public async Task GetNextMessage_ThrowsNonRetryableOnInvalidClientOAuth() + { + using (TestHostContext tc = CreateTestContext()) + using (var tokenSource = new CancellationTokenSource()) + { + Tracing trace = tc.GetTrace(); + + // Arrange. 
+ _credMgr.Setup(x => x.LoadCredentials(true)).Returns(new VssCredentials()); + + var expectedSession = new TaskAgentSession(); + _brokerServer + .Setup(x => x.CreateSessionAsync( + It.Is(y => y != null), + tokenSource.Token)) + .Returns(Task.FromResult(expectedSession)); + + // Simulate "Registration was not found" — OAuth invalid_client error + _brokerServer + .Setup(x => x.GetRunnerMessageAsync( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny())) + .ThrowsAsync(new VssOAuthTokenRequestException("Registration abc-123 was not found.") { Error = "invalid_client" }); + + // Act. + BrokerMessageListener listener = new(); + listener.Initialize(tc); + + CreateSessionResult result = await listener.CreateSessionAsync(tokenSource.Token); + Assert.Equal(CreateSessionResult.Success, result); + + // Assert — should throw NonRetryableException, not retry forever + var ex = await Assert.ThrowsAsync(() => listener.GetNextMessageAsync(tokenSource.Token)); + Assert.Contains("non-retryable", ex.Message, StringComparison.OrdinalIgnoreCase); + + // Should have been called exactly once (no retries) + _brokerServer.Verify(x => x.GetRunnerMessageAsync( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny()), Times.Once()); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public void ShouldRetryException_ReturnsFalseForInvalidClientOAuth() + { + // Arrange + var brokerServer = new BrokerServer(); + var oauthEx = new VssOAuthTokenRequestException("Registration abc-123 was not found.") { Error = "invalid_client" }; + + // Act & Assert — invalid_client should not be retried + Assert.False(brokerServer.ShouldRetryException(oauthEx)); + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public void ShouldRetryException_ReturnsTrueForOtherOAuthErrors() + { + // Arrange + var brokerServer = new BrokerServer(); + var oauthEx = new 
VssOAuthTokenRequestException("Temporary failure") { Error = "server_error" }; + + // Act & Assert — other OAuth errors should still be retried + Assert.True(brokerServer.ShouldRetryException(oauthEx)); + } + private TestHostContext CreateTestContext([CallerMemberName] String testName = "") { TestHostContext tc = new(this, testName); From 7eabbd2eabcdf3d6ba493e5872b41fe83a21d274 Mon Sep 17 00:00:00 2001 From: "Bernhard K. Weisshuhn (a.k.a. bernhorst)" Date: Thu, 9 Apr 2026 15:17:47 +0200 Subject: [PATCH 3/3] feat: add idle timeout for ephemeral runners waiting for jobs Ephemeral runners created via JIT config can end up registered with GitHub but never receive a job (e.g., due to pod recreation causing session conflicts, or the original job being reassigned). These orphaned runners sit idle indefinitely, blocking a slot in the ARC scale set and preventing replacement runners from being created. This adds a configurable idle timeout (default: 10 minutes) that exits the runner if no job is received after session creation. The timeout only applies to ephemeral/run-once runners and is disabled once a job is received. Configurable via ACTIONS_RUNNER_IDLE_TIMEOUT_MINUTES env var: - Default: 10 (minutes) for ephemeral runners; also used when the variable is unset or is not a valid integer - Set to 0 (or any negative value) to disable - Has no effect on non-ephemeral runners Reuses the Task.WhenAny pattern already used for auto-update and run-once job completion checks. Returns TerminatedError (exit code 1) so the ARC controller treats it as a non-retryable exit and creates a fresh replacement. 
--- src/Runner.Listener/Runner.cs | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/Runner.Listener/Runner.cs b/src/Runner.Listener/Runner.cs index e20eb6256e5..cfd736496f1 100644 --- a/src/Runner.Listener/Runner.cs +++ b/src/Runner.Listener/Runner.cs @@ -511,6 +511,22 @@ private async Task RunAsync(RunnerSettings settings, bool runOnce = false, jobDispatcher.JobStatus += _listener.OnJobStatus; + // For ephemeral runners, set up an idle timeout. If no job is received within + // this period, the runner exits to free up the slot for a replacement. + // This prevents orphaned runners (e.g., from pod recreation or session conflicts) + // from blocking scale set capacity indefinitely. + Task idleTimeoutTask = Task.Delay(Timeout.Infinite, messageQueueLoopTokenSource.Token); + if (runOnce) + { + var idleTimeoutEnv = Environment.GetEnvironmentVariable("ACTIONS_RUNNER_IDLE_TIMEOUT_MINUTES"); + var idleTimeoutMinutes = int.TryParse(idleTimeoutEnv, out var parsed) ? parsed : 10; + if (idleTimeoutMinutes > 0) + { + Trace.Info($"Ephemeral runner idle timeout set to {idleTimeoutMinutes} minutes."); + idleTimeoutTask = Task.Delay(TimeSpan.FromMinutes(idleTimeoutMinutes), messageQueueLoopTokenSource.Token); + } + } + while (!HostContext.RunnerShutdownToken.IsCancellationRequested) { // Check if we need to restart the session and can do so (job dispatcher not busy) @@ -527,6 +543,30 @@ private async Task RunAsync(RunnerSettings settings, bool runOnce = false, try { Task getNextMessage = _listener.GetNextMessageAsync(messageQueueLoopTokenSource.Token); + + // For ephemeral runners that haven't received a job yet, + // race the message poll against the idle timeout. + if (runOnce && !runOnceJobReceived && !idleTimeoutTask.IsCompleted) + { + Task completedTask = await Task.WhenAny(getNextMessage, idleTimeoutTask); + if (completedTask == idleTimeoutTask) + { + Trace.Info("Ephemeral runner idle timeout reached without receiving a job. 
Exiting."); + _term.WriteError($"{DateTime.UtcNow:u}: Ephemeral runner idle timeout reached without receiving a job. Exiting to free up capacity."); + messageQueueLoopTokenSource.Cancel(); + try + { + await getNextMessage; + } + catch (Exception ex) + { + Trace.Info($"Ignore any exception after cancel message loop. {ex}"); + } + + return Constants.Runner.ReturnCode.TerminatedError; + } + } + if (autoUpdateInProgress) { Trace.Verbose("Auto update task running at backend, waiting for getNextMessage or selfUpdateTask to finish.");