diff --git a/src/Runner.Common/BrokerServer.cs b/src/Runner.Common/BrokerServer.cs index 751ae1eee32..90fd2334016 100644 --- a/src/Runner.Common/BrokerServer.cs +++ b/src/Runner.Common/BrokerServer.cs @@ -7,6 +7,7 @@ using GitHub.DistributedTask.WebApi; using GitHub.Runner.Sdk; using GitHub.Services.Common; +using GitHub.Services.OAuth; using GitHub.Services.WebApi; using Sdk.RSWebApi.Contracts; using Sdk.WebApi.WebApi.RawClient; @@ -113,6 +114,14 @@ public bool ShouldRetryException(Exception ex) return false; } + // "invalid_client" means the runner registration has been deleted from the server. + // Retrying will never succeed, so bail out immediately. + if (ex is VssOAuthTokenRequestException oAuthEx && + string.Equals(oAuthEx.Error, "invalid_client", StringComparison.OrdinalIgnoreCase)) + { + return false; + } + return true; } } diff --git a/src/Runner.Listener/BrokerMessageListener.cs b/src/Runner.Listener/BrokerMessageListener.cs index 7c9ca401cba..360d00f571f 100644 --- a/src/Runner.Listener/BrokerMessageListener.cs +++ b/src/Runner.Listener/BrokerMessageListener.cs @@ -444,11 +444,18 @@ ex is RunnerNotFoundException || Trace.Info($"Non-retriable exception: {ex.Message}"); return false; } - else + + // "invalid_client" means the runner registration has been deleted from the server. + // This is permanent — retrying will never succeed. + if (ex is VssOAuthTokenRequestException oAuthEx && + string.Equals(oAuthEx.Error, "invalid_client", StringComparison.OrdinalIgnoreCase)) { - Trace.Info($"Retriable exception: {ex.Message}"); - return true; + Trace.Info($"Non-retriable exception: runner registration deleted. {ex.Message}"); + return false; } + + Trace.Info($"Retriable exception: {ex.Message}"); + return true; } private bool IsSessionCreationExceptionRetriable(Exception ex) diff --git a/src/Runner.Listener/Runner.cs b/src/Runner.Listener/Runner.cs index e20eb6256e5..cfd736496f1 100644 --- a/src/Runner.Listener/Runner.cs +++ b/src/Runner.Listener/Runner.cs @@ -511,6 +511,22 @@ private async Task RunAsync(RunnerSettings settings, bool runOnce = false, jobDispatcher.JobStatus += _listener.OnJobStatus; + // For ephemeral runners, set up an idle timeout. If no job is received within + // this period, the runner exits to free up the slot for a replacement. + // This prevents orphaned runners (e.g., from pod recreation or session conflicts) + // from blocking scale set capacity indefinitely. + Task idleTimeoutTask = Task.Delay(Timeout.Infinite, messageQueueLoopTokenSource.Token); + if (runOnce) + { + var idleTimeoutEnv = Environment.GetEnvironmentVariable("ACTIONS_RUNNER_IDLE_TIMEOUT_MINUTES"); + var idleTimeoutMinutes = int.TryParse(idleTimeoutEnv, out var parsed) ? parsed : 10; + if (idleTimeoutMinutes > 0) + { + Trace.Info($"Ephemeral runner idle timeout set to {idleTimeoutMinutes} minutes."); + idleTimeoutTask = Task.Delay(TimeSpan.FromMinutes(idleTimeoutMinutes), messageQueueLoopTokenSource.Token); + } + } + while (!HostContext.RunnerShutdownToken.IsCancellationRequested) { // Check if we need to restart the session and can do so (job dispatcher not busy) @@ -527,6 +543,30 @@ private async Task RunAsync(RunnerSettings settings, bool runOnce = false, try { Task getNextMessage = _listener.GetNextMessageAsync(messageQueueLoopTokenSource.Token); + + // For ephemeral runners that haven't received a job yet, + // race the message poll against the idle timeout. + if (runOnce && !runOnceJobReceived && !idleTimeoutTask.IsCompleted) + { + Task completedTask = await Task.WhenAny(getNextMessage, idleTimeoutTask); + if (completedTask == idleTimeoutTask) + { + Trace.Info("Ephemeral runner idle timeout reached without receiving a job. Exiting."); + _term.WriteError($"{DateTime.UtcNow:u}: Ephemeral runner idle timeout reached without receiving a job. Exiting to free up capacity."); + messageQueueLoopTokenSource.Cancel(); + try + { + await getNextMessage; + } + catch (Exception ex) + { + Trace.Info($"Ignore any exception after cancel message loop. {ex}"); + } + + return Constants.Runner.ReturnCode.TerminatedError; + } + } + if (autoUpdateInProgress) { Trace.Verbose("Auto update task running at backend, waiting for getNextMessage or selfUpdateTask to finish."); diff --git a/src/Test/L0/Listener/BrokerMessageListenerL0.cs b/src/Test/L0/Listener/BrokerMessageListenerL0.cs index c42d134dd41..f17913f0f0e 100644 --- a/src/Test/L0/Listener/BrokerMessageListenerL0.cs +++ b/src/Test/L0/Listener/BrokerMessageListenerL0.cs @@ -7,6 +7,7 @@ using GitHub.Runner.Listener; using GitHub.Runner.Listener.Configuration; using GitHub.Services.Common; +using GitHub.Services.OAuth; using Moq; using Xunit; @@ -406,6 +407,87 @@ public async Task CreatesSessionWithProvidedSettings() } } + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public async Task GetNextMessage_ThrowsNonRetryableOnInvalidClientOAuth() + { + using (TestHostContext tc = CreateTestContext()) + using (var tokenSource = new CancellationTokenSource()) + { + Tracing trace = tc.GetTrace(); + + // Arrange. + _credMgr.Setup(x => x.LoadCredentials(true)).Returns(new VssCredentials()); + + var expectedSession = new TaskAgentSession(); + _brokerServer + .Setup(x => x.CreateSessionAsync( + It.Is(y => y != null), + tokenSource.Token)) + .Returns(Task.FromResult(expectedSession)); + + // Simulate "Registration was not found" — OAuth invalid_client error + _brokerServer + .Setup(x => x.GetRunnerMessageAsync( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny())) + .ThrowsAsync(new VssOAuthTokenRequestException("Registration abc-123 was not found.") { Error = "invalid_client" }); + + // Act. + BrokerMessageListener listener = new(); + listener.Initialize(tc); + + CreateSessionResult result = await listener.CreateSessionAsync(tokenSource.Token); + Assert.Equal(CreateSessionResult.Success, result); + + // Assert — should throw NonRetryableException, not retry forever + var ex = await Assert.ThrowsAsync(() => listener.GetNextMessageAsync(tokenSource.Token)); + Assert.Contains("non-retryable", ex.Message, StringComparison.OrdinalIgnoreCase); + + // Should have been called exactly once (no retries) + _brokerServer.Verify(x => x.GetRunnerMessageAsync( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny()), Times.Once()); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public void ShouldRetryException_ReturnsFalseForInvalidClientOAuth() + { + // Arrange + var brokerServer = new BrokerServer(); + var oauthEx = new VssOAuthTokenRequestException("Registration abc-123 was not found.") { Error = "invalid_client" }; + + // Act & Assert — invalid_client should not be retried + Assert.False(brokerServer.ShouldRetryException(oauthEx)); + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public void ShouldRetryException_ReturnsTrueForOtherOAuthErrors() + { + // Arrange + var brokerServer = new BrokerServer(); + var oauthEx = new VssOAuthTokenRequestException("Temporary failure") { Error = "server_error" }; + + // Act & Assert — other OAuth errors should still be retried + Assert.True(brokerServer.ShouldRetryException(oauthEx)); + } + private TestHostContext CreateTestContext([CallerMemberName] String testName = "") { TestHostContext tc = new(this, testName);