From 13af26fb92178279099e2c3644166f6e3afb7f41 Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 08:39:30 +0000 Subject: [PATCH 1/2] fix: catch exceptions from user-provided retry handlers to prevent orchestration crashes When a user-provided retry handler (AsyncRetryHandler) or handleFailure predicate (RetryPolicy) throws an exception, the error previously propagated uncaught through tryHandleRetry() -> handleFailedTask() -> processEvent() -> execute(), crashing the entire orchestration. The fix wraps these user-provided function calls in try-catch blocks within tryHandleRetry(). When an exception is caught: - A warning is logged (EVENT_RETRY_HANDLER_EXCEPTION = 737) - The method returns false (don't retry) - The task fails normally with its original error - The orchestrator can catch the TaskFailedError as usual Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- packages/durabletask-js/src/worker/logs.ts | 14 +++ .../src/worker/orchestration-executor.ts | 22 +++- .../test/orchestration_executor.spec.ts | 109 ++++++++++++++++++ 3 files changed, 143 insertions(+), 2 deletions(-) diff --git a/packages/durabletask-js/src/worker/logs.ts b/packages/durabletask-js/src/worker/logs.ts index 47ce1c4..9e9beec 100644 --- a/packages/durabletask-js/src/worker/logs.ts +++ b/packages/durabletask-js/src/worker/logs.ts @@ -76,6 +76,7 @@ const CATEGORY_ENTITIES = "Microsoft.DurableTask.Worker.Entities"; /** @internal */ export const EVENT_ACTIVITY_EXECUTION_ERROR = 734; /** @internal */ export const EVENT_ACTIVITY_RESPONSE_ERROR = 735; /** @internal */ export const EVENT_STREAM_ERROR_INFO = 736; +/** @internal */ export const EVENT_RETRY_HANDLER_EXCEPTION = 737; // ── Entity-specific Event IDs (800+ range) ────────────────────────────────── @@ -194,6 +195,19 @@ export function retryingTask(logger: Logger, instanceId: string, name: string, a }, `${instanceId}: Evaluating custom retry handler for failed '${name}' task. Attempt = ${attempt}.`); } +/** + * Logs that a retry handler or handleFailure predicate threw an exception. + * The task will not be retried and will fail with its original error. + */ +export function retryHandlerException(logger: Logger, instanceId: string, name: string, error: unknown): void { + const msg = toErrorMessage(error); + emitLog(logger, "warn", { + eventId: EVENT_RETRY_HANDLER_EXCEPTION, + category: CATEGORY_ORCHESTRATIONS, + properties: { instanceId, name }, + }, `${instanceId}: Retry handler for '${name}' threw an exception and will not be retried: ${msg}`); +} + // ═══════════════════════════════════════════════════════════════════════════════ // JS-specific Worker Lifecycle Logs (Event IDs 700+) // ═══════════════════════════════════════════════════════════════════════════════ diff --git a/packages/durabletask-js/src/worker/orchestration-executor.ts b/packages/durabletask-js/src/worker/orchestration-executor.ts index 3d17d76..e081d32 100644 --- a/packages/durabletask-js/src/worker/orchestration-executor.ts +++ b/packages/durabletask-js/src/worker/orchestration-executor.ts @@ -765,7 +765,16 @@ export class OrchestrationExecutor { ): Promise { if (task instanceof RetryableTask) { task.recordFailure(errorMessage, failureDetails); - const nextDelayMs = task.computeNextDelayInMilliseconds(ctx._currentUtcDatetime); + let nextDelayMs: number | undefined; + try { + nextDelayMs = task.computeNextDelayInMilliseconds(ctx._currentUtcDatetime); + } catch (e: unknown) { + // The retry policy's handleFailure predicate threw an exception. + // Treat this as "don't retry" so the task fails with its original error + // rather than crashing the entire orchestration. + WorkerLogs.retryHandlerException(this._logger, ctx._instanceId, task.taskName, e); + return false; + } if (nextDelayMs !== undefined) { WorkerLogs.retryingTask(this._logger, ctx._instanceId, task.taskName, task.attemptCount); @@ -776,7 +785,16 @@ export class OrchestrationExecutor { } } else if (task instanceof RetryHandlerTask) { task.recordFailure(errorMessage, failureDetails); - const retryResult = await task.shouldRetry(ctx._currentUtcDatetime); + let retryResult: boolean | number; + try { + retryResult = await task.shouldRetry(ctx._currentUtcDatetime); + } catch (e: unknown) { + // The user-provided retry handler threw an exception. + // Treat this as "don't retry" so the task fails with its original error + // rather than crashing the entire orchestration. + WorkerLogs.retryHandlerException(this._logger, ctx._instanceId, task.taskName, e); + return false; + } // Only retry when the handler explicitly returns true or a finite number. // Using a positive check (=== true || finite number) instead of !== false diff --git a/packages/durabletask-js/test/orchestration_executor.spec.ts b/packages/durabletask-js/test/orchestration_executor.spec.ts index f1f0dba..741d231 100644 --- a/packages/durabletask-js/test/orchestration_executor.spec.ts +++ b/packages/durabletask-js/test/orchestration_executor.spec.ts @@ -1401,6 +1401,115 @@ describe("Orchestration Executor", () => { const completeAction = getAndValidateSingleCompleteOrchestrationAction(result); expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_FAILED); }); + + it("should fail the task (not crash orchestration) when retry handler throws an exception", async () => { + const { result: startResult, replay } = await startOrchestration( + async function* (ctx: OrchestrationContext): any { + const retryHandler = async (_retryCtx: any) => { + throw new Error("Handler exploded!"); + }; + return yield ctx.callActivity("flakyActivity", undefined, { retry: retryHandler }); + }, + ); + + expect(startResult.actions[0].hasScheduletask()).toBe(true); + + // Activity fails → handler throws → should NOT retry, task should fail with original error + const result = await replay( + [newTaskScheduledEvent(1, "flakyActivity")], + [newTaskFailedEvent(1, new Error("Original activity error"))], + ); + const completeAction = getAndValidateSingleCompleteOrchestrationAction(result); + // The orchestration fails with the original task error, not the handler exception + expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_FAILED); + expect(completeAction?.getFailuredetails()?.getErrormessage()).toContain("Original activity error"); + }); + + it("should allow orchestrator to catch TaskFailedError when retry handler throws", async () => { + const { result: startResult, replay } = await startOrchestration( + async function* (ctx: OrchestrationContext): any { + const retryHandler = async (_retryCtx: any) => { + throw new Error("Handler exploded!"); + }; + try { + yield ctx.callActivity("flakyActivity", undefined, { retry: retryHandler }); + } catch (e: any) { + return "caught: " + e.message; + } + }, + ); + + expect(startResult.actions[0].hasScheduletask()).toBe(true); + + // Activity fails → handler throws → task fails → orchestrator catches the error + const result = await replay( + [newTaskScheduledEvent(1, "flakyActivity")], + [newTaskFailedEvent(1, new Error("Original activity error"))], + ); + const completeAction = getAndValidateSingleCompleteOrchestrationAction(result); + expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_COMPLETED); + expect(completeAction?.getResult()?.getValue()).toContain("Original activity error"); + }); + + it("should fail the task (not crash orchestration) when handleFailure predicate throws", async () => { + const { RetryPolicy } = await import("../src/task/retry/retry-policy"); + + const { result: startResult, replay } = await startOrchestration( + async function* (ctx: OrchestrationContext): any { + const retryPolicy = new RetryPolicy({ + maxNumberOfAttempts: 3, + firstRetryIntervalInMilliseconds: 1000, + handleFailure: (_failure: any) => { + throw new Error("Predicate exploded!"); + }, + }); + return yield ctx.callActivity("flakyActivity", undefined, { retry: retryPolicy }); + }, + ); + + expect(startResult.actions[0].hasScheduletask()).toBe(true); + + // Activity fails → handleFailure throws → should NOT retry, task should fail with original error + const result = await replay( + [newTaskScheduledEvent(1, "flakyActivity")], + [newTaskFailedEvent(1, new Error("Original activity error"))], + ); + const completeAction = getAndValidateSingleCompleteOrchestrationAction(result); + expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_FAILED); + expect(completeAction?.getFailuredetails()?.getErrormessage()).toContain("Original activity error"); + }); + + it("should allow orchestrator to catch TaskFailedError when handleFailure predicate throws", async () => { + const { RetryPolicy } = await import("../src/task/retry/retry-policy"); + + const { result: startResult, replay } = await startOrchestration( + async function* (ctx: OrchestrationContext): any { + const retryPolicy = new RetryPolicy({ + maxNumberOfAttempts: 3, + firstRetryIntervalInMilliseconds: 1000, + handleFailure: (_failure: any) => { + throw new Error("Predicate exploded!"); + }, + }); + try { + yield ctx.callActivity("flakyActivity", undefined, { retry: retryPolicy }); + } catch (e: any) { + return "caught: " + e.message; + } + }, + ); + + expect(startResult.actions[0].hasScheduletask()).toBe(true); + + // Activity fails → predicate throws → task fails → orchestrator catches the error + const result = await replay( + [newTaskScheduledEvent(1, "flakyActivity")], + [newTaskFailedEvent(1, new Error("Original activity error"))], + ); + const completeAction = getAndValidateSingleCompleteOrchestrationAction(result); + expect(completeAction?.getOrchestrationstatus()).toEqual(pb.OrchestrationStatus.ORCHESTRATION_STATUS_COMPLETED); + expect(completeAction?.getResult()?.getValue()).toContain("Original activity error"); + }); }); it("should complete immediately when whenAll is called with an empty task array", async () => { From dd8f8616c4bd49bb93cba0e33a1ca930ad119dc1 Mon Sep 17 00:00:00 2001 From: wangbill Date: Tue, 24 Mar 2026 10:29:24 -0700 Subject: [PATCH 2/2] fix: address review - neutral log message and add error/stack to structured properties --- packages/durabletask-js/src/worker/logs.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/packages/durabletask-js/src/worker/logs.ts b/packages/durabletask-js/src/worker/logs.ts index 9e9beec..7f8795f 100644 --- a/packages/durabletask-js/src/worker/logs.ts +++ b/packages/durabletask-js/src/worker/logs.ts @@ -204,8 +204,13 @@ export function retryHandlerException(logger: Logger, instanceId: string, name: emitLog(logger, "warn", { eventId: EVENT_RETRY_HANDLER_EXCEPTION, category: CATEGORY_ORCHESTRATIONS, - properties: { instanceId, name }, - }, `${instanceId}: Retry handler for '${name}' threw an exception and will not be retried: ${msg}`); + properties: { + instanceId, + name, + error: msg, + ...(error instanceof Error && error.stack ? { stack: error.stack } : {}), + }, + }, `${instanceId}: Retry evaluation for '${name}' threw an exception and will not be retried: ${msg}`); } // ═══════════════════════════════════════════════════════════════════════════════