From 45e48877cc494f5b90a4df2e83b00533499aac01 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 17 Feb 2026 12:07:36 +0000 Subject: [PATCH 1/4] fix: prevent MVCC race in blockRunWithWaitpoint pending check Split the CTE in blockRunWithWaitpoint so the pending waitpoint check is a separate SQL statement. In READ COMMITTED isolation, each statement gets its own snapshot, so a separate SELECT sees the latest committed state from concurrent completeWaitpoint calls. Previously, the CTE did INSERT + pending check in one statement (one snapshot). If completeWaitpoint committed between the CTE start and the SELECT, the SELECT would still see PENDING due to the stale snapshot. Neither side would enqueue continueRunIfUnblocked, leaving the run stuck forever. --- .../src/engine/systems/waitpointSystem.ts | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts index c542be5aa4b..b97095c4f4b 100644 --- a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts @@ -399,8 +399,10 @@ export class WaitpointSystem { return await this.$.runLock.lock("blockRunWithWaitpoint", [runId], async () => { let snapshot: TaskRunExecutionSnapshot = await getLatestExecutionSnapshot(prisma, runId); - //block the run with the waitpoints, returning how many waitpoints are pending - const insert = await prisma.$queryRaw<{ pending_count: BigInt }[]>` + // Insert the blocking connections and the historical run connections. + // We use a CTE to do both inserts atomically. Data-modifying CTEs are + // always executed regardless of whether they're referenced in the outer query. 
+ await prisma.$queryRaw` WITH inserted AS ( INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt", "spanIdToComplete", "batchId", "batchIndex") SELECT @@ -425,12 +427,21 @@ export class WaitpointSystem { WHERE w.id IN (${Prisma.join($waitpoints)}) ON CONFLICT DO NOTHING ) + SELECT COUNT(*) FROM inserted`; + + // Check if the run is actually blocked using a separate query. + // This MUST be a separate statement from the CTE above because in READ COMMITTED + // isolation, each statement gets its own snapshot. The CTE's snapshot is taken when + // it starts, so if a concurrent completeWaitpoint commits during the CTE, the CTE + // won't see it. This fresh query gets a new snapshot that reflects the latest commits. + const pendingCheck = await prisma.$queryRaw<{ pending_count: BigInt }[]>` SELECT COUNT(*) as pending_count - FROM inserted i - JOIN "Waitpoint" w ON w.id = i."waitpointId" - WHERE w.status = 'PENDING';`; + FROM "Waitpoint" + WHERE id IN (${Prisma.join($waitpoints)}) + AND status = 'PENDING' + `; - const isRunBlocked = Number(insert.at(0)?.pending_count ?? 0) > 0; + const isRunBlocked = Number(pendingCheck.at(0)?.pending_count ?? 
0) > 0; let newStatus: TaskRunExecutionStatus = "SUSPENDED"; if ( From ef26a93416da536946d3f62484daf37517193de2 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 17 Feb 2026 12:23:28 +0000 Subject: [PATCH 2/4] docs: add detailed comment explaining MVCC-safe two-statement design in blockRunWithWaitpoint --- .../src/engine/systems/waitpointSystem.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts index b97095c4f4b..f2ad85cc95b 100644 --- a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts @@ -366,6 +366,22 @@ export class WaitpointSystem { /** * Prevents a run from continuing until the waitpoint is completed. + * + * This method uses two separate SQL statements intentionally: + * + * 1. A CTE that INSERTs TaskRunWaitpoint rows (blocking connections) and + * _WaitpointRunConnections rows (historical connections). + * + * 2. A separate SELECT that checks if any of the requested waitpoints are still PENDING. + * + * These MUST be separate statements because of PostgreSQL MVCC in READ COMMITTED isolation: + * each statement gets its own snapshot. If a concurrent `completeWaitpoint` commits between + * the CTE starting and finishing, the CTE's snapshot won't see the COMPLETED status. By using + * a separate SELECT, we get a fresh snapshot that reflects the latest committed state. + * + * The pending check queries ALL requested waitpoint IDs (not just the ones actually inserted + * by the CTE). This is intentional: if a TaskRunWaitpoint row already existed (ON CONFLICT + * DO NOTHING skipped the insert), a still-PENDING waitpoint should still count as blocking. 
*/ async blockRunWithWaitpoint({ runId, From 2cff7c0c94b68b44e375d383f9d1f5b8167ebf55 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 25 Feb 2026 17:05:50 +0000 Subject: [PATCH 3/4] add server changeset --- .server-changes/fix-blocking-waitpoint-race-condition.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .server-changes/fix-blocking-waitpoint-race-condition.md diff --git a/.server-changes/fix-blocking-waitpoint-race-condition.md b/.server-changes/fix-blocking-waitpoint-race-condition.md new file mode 100644 index 00000000000..421784ebdfa --- /dev/null +++ b/.server-changes/fix-blocking-waitpoint-race-condition.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Fix a race condition in the waitpoint system where a run could be blocked by a completed waitpoint but never be resumed because of an PostgreSQL MVCC issue. This was most likely to occur when creating a waitpoint via `wait.forToken()` at the exact same moment as completing the token with `wait.completeToken()`. Other types of waitpoints (timed, child runs) were not affected. From a70c48e8d1c89488001729f1fcae24569d286482 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 25 Feb 2026 17:29:28 +0000 Subject: [PATCH 4/4] fixed grammar --- .server-changes/fix-blocking-waitpoint-race-condition.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.server-changes/fix-blocking-waitpoint-race-condition.md b/.server-changes/fix-blocking-waitpoint-race-condition.md index 421784ebdfa..37799b76082 100644 --- a/.server-changes/fix-blocking-waitpoint-race-condition.md +++ b/.server-changes/fix-blocking-waitpoint-race-condition.md @@ -3,4 +3,4 @@ area: webapp type: fix --- -Fix a race condition in the waitpoint system where a run could be blocked by a completed waitpoint but never be resumed because of an PostgreSQL MVCC issue. 
This was most likely to occur when creating a waitpoint via `wait.forToken()` at the exact same moment as completing the token with `wait.completeToken()`. Other types of waitpoints (timed, child runs) were not affected. +Fix a race condition in the waitpoint system where a run could be blocked by a completed waitpoint but never be resumed because of a PostgreSQL MVCC issue. This was most likely to occur when creating a waitpoint via `wait.forToken()` at the same moment as completing the token with `wait.completeToken()`. Other types of waitpoints (timed, child runs) were not affected.