From 6c97f9629ea2571cc39e4d2d14dc266b578fd946 Mon Sep 17 00:00:00 2001 From: Andreas Jansson Date: Sun, 29 Mar 2026 09:52:05 +0200 Subject: [PATCH 1/7] fix: don't wait for gateway port in /api/status to avoid CPU limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ensureGateway's waitForPort blocks for up to 180s. Even with a 25s Promise.race timeout, the underlying RPC continues running and exhausts the Worker's 30s CPU limit (error 1043). Fix: add waitForReady option to ensureGateway. When false, it starts the process but returns immediately without waitForPort. The /api/status handler uses this — the loading page polls every 2s and subsequent polls check if the port is up via the existing process check. --- src/gateway/process.ts | 51 +++++++++++++++++++++++++----------------- src/routes/public.ts | 24 +++++++++----------- 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/gateway/process.ts b/src/gateway/process.ts index 613fa6959..b0a2fa349 100644 --- a/src/gateway/process.ts +++ b/src/gateway/process.ts @@ -107,10 +107,17 @@ export async function findExistingGatewayProcess(sandbox: Sandbox): Promise { +export async function ensureGateway( + sandbox: Sandbox, + env: OpenClawEnv, + options?: { waitForReady?: boolean }, +): Promise { + const waitForReady = options?.waitForReady !== false; // Check if gateway is already running or starting const existingProcess = await findExistingGatewayProcess(sandbox); if (existingProcess) { @@ -174,28 +181,32 @@ export async function ensureGateway(sandbox: Sandbox, env: OpenClawEnv): Promise throw startErr; } - // Wait for the gateway to be ready - try { - console.log('[Gateway] Waiting for OpenClaw gateway to be ready on port', GATEWAY_PORT); - await process.waitForPort(GATEWAY_PORT, { mode: 'tcp', timeout: STARTUP_TIMEOUT_MS }); - console.log('[Gateway] OpenClaw gateway is ready!'); - - const logs = await process.getLogs(); - if (logs.stdout) console.log('[Gateway] stdout:', logs.stdout); - if (logs.stderr) console.log('[Gateway] stderr:', logs.stderr); - } catch (e) { - console.error('[Gateway] waitForPort failed:', e); + if (waitForReady) { + // Wait for the gateway to be ready try { + console.log('[Gateway] Waiting for OpenClaw gateway to be ready on port', GATEWAY_PORT); + await process.waitForPort(GATEWAY_PORT, { mode: 'tcp', timeout: STARTUP_TIMEOUT_MS }); + console.log('[Gateway] OpenClaw gateway is ready!'); + const logs = await process.getLogs(); - console.error('[Gateway] startup failed. Stderr:', logs.stderr); - console.error('[Gateway] startup failed. Stdout:', logs.stdout); - throw new Error(`OpenClaw gateway failed to start. Stderr: ${logs.stderr || '(empty)'}`, { - cause: e, - }); - } catch (logErr) { - console.error('[Gateway] Failed to get logs:', logErr); - throw e; + if (logs.stdout) console.log('[Gateway] stdout:', logs.stdout); + if (logs.stderr) console.log('[Gateway] stderr:', logs.stderr); + } catch (e) { + console.error('[Gateway] waitForPort failed:', e); + try { + const logs = await process.getLogs(); + console.error('[Gateway] startup failed. Stderr:', logs.stderr); + console.error('[Gateway] startup failed. Stdout:', logs.stdout); + throw new Error(`OpenClaw gateway failed to start. Stderr: ${logs.stderr || '(empty)'}`, { + cause: e, + }); + } catch (logErr) { + console.error('[Gateway] Failed to get logs:', logErr); + throw e; + } } + } else { + console.log('[Gateway] Process started (not waiting for ready):', process.id); } // Verify gateway is actually responding diff --git a/src/routes/public.ts b/src/routes/public.ts index ceacd5419..10a382926 100644 --- a/src/routes/public.ts +++ b/src/routes/public.ts @@ -50,23 +50,19 @@ publicRoutes.get('/api/status', async (c) => { console.error('[api/status] Restore failed:', restoreError); } - // Start the gateway synchronously with a short timeout. Workers have a - // 30s CPU limit — restoreIfNeeded uses ~1-3s, leaving ~25s for the - // gateway. If it doesn't start in time, the loading page retries. - // We use synchronous start instead of waitUntil because waitUntil is - // unreliable in the Durable Object context. + // Start the gateway but DON'T wait for it to be ready. + // ensureGateway with waitForReady:false just starts the process + // (fast RPC, ~2-5s) without blocking on waitForPort (which takes + // up to 180s and would exceed the 30s Worker CPU limit). + // The loading page polls every 2s — subsequent polls will find + // the process and check if the port is up. console.log('[api/status] No process found, starting gateway...'); try { - await Promise.race([ - ensureGateway(sandbox, c.env), - new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 25_000)), - ]); - process = await findExistingGatewayProcess(sandbox); - if (process) { - return c.json({ ok: true, status: 'running', processId: process.id }); - } + await ensureGateway(sandbox, c.env, { waitForReady: false }); } catch (err) { - console.log('[api/status] Gateway start timed out or failed, will retry on next poll'); + const msg = err instanceof Error ? err.message : String(err); + console.error('[api/status] Gateway start failed:', msg); + return c.json({ ok: false, status: 'start_failed', error: msg, restoreError }); } return c.json({ ok: false, status: 'starting', restoreError }); } From 805c631f15c742ebe789897dea966acb94cae146 Mon Sep 17 00:00:00 2001 From: Andreas Jansson Date: Sun, 29 Mar 2026 10:43:21 +0200 Subject: [PATCH 2/7] fix: add 15s timeout to containerFetch for HTML requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit containerFetch blocks until the container responds, which can take 30-60s+ on cold start. The browser gets a blank page because the Worker times out before containerFetch returns. Add a 15s timeout for HTML requests — on timeout, the catch block serves the loading page. --- src/index.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/index.ts b/src/index.ts index 6b4e39472..f3aed2dc7 100644 --- a/src/index.ts +++ b/src/index.ts @@ -464,7 +464,19 @@ app.all('*', async (c) => { let httpResponse: Response; try { - httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT); + // For HTML requests, add a timeout to avoid hanging on cold start. + // containerFetch blocks until the container responds, which can take + // 30-60s+ on cold start. If it takes too long, serve the loading page. + if (acceptsHtml) { + httpResponse = await Promise.race([ + sandbox.containerFetch(request, GATEWAY_PORT), + new Promise((_, reject) => + setTimeout(() => reject(new Error('proxy timeout')), 15_000), + ), + ]); + } else { + httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT); + } } catch (err) { if (isGatewayCrashedError(err)) { console.log('[HTTP] Gateway crashed, attempting restore + restart and retry...'); From 8ee3fd4b582c73d1155b3acbc6360b9c0ef757eb Mon Sep 17 00:00:00 2001 From: Andreas Jansson Date: Sun, 29 Mar 2026 10:59:59 +0200 Subject: [PATCH 3/7] fix: add timeout to response body read for HTML requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If containerFetch returns headers but the body stream hangs (gateway partially initialized), httpResponse.text() blocks forever. The browser gets a blank page. Add a 10s timeout — on timeout, serve the loading page instead. --- src/index.ts | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/index.ts b/src/index.ts index f3aed2dc7..f78689d28 100644 --- a/src/index.ts +++ b/src/index.ts @@ -509,11 +509,22 @@ app.all('*', async (c) => { console.log('[HTTP] Response status:', httpResponse.status); // For HTML requests, verify we got actual content from the gateway. - // containerFetch can return a 200 with empty body if the gateway's + // containerFetch can return a 200 with empty/streaming body if the gateway's // HTTP handler hasn't fully initialized. Show the loading page instead // of a blank page that the user would be stuck on forever. if (acceptsHtml) { - const body = await httpResponse.text(); + let body: string; + try { + body = await Promise.race([ + httpResponse.text(), + new Promise((_, reject) => + setTimeout(() => reject(new Error('body read timeout')), 10_000), + ), + ]); + } catch { + console.log('[HTTP] Body read timed out, serving loading page'); + return c.html(loadingPageHtml); + } if (!body || body.length < 50) { console.log( `[HTTP] Empty/short response (${body.length} bytes) for HTML request, serving loading page`, From 38279cecf50dc5e8c7091856a998a48eb1bf5fcb Mon Sep 17 00:00:00 2001 From: Andreas Jansson Date: Sun, 29 Mar 2026 13:41:07 +0200 Subject: [PATCH 4/7] fix: skip env validation in E2E test mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'base' variant in CI was showing the 'Configuration Required' error page instead of the loading page because E2E_TEST_MODE didn't skip env validation. The AI gateway keys may not be set for all variants. The validateRequiredEnv function already checks isTestMode for CF Access vars — extend the middleware to skip validation entirely in E2E mode, matching the existing behavior in dev mode. --- src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index.ts b/src/index.ts index f78689d28..f127bb664 100644 --- a/src/index.ts +++ b/src/index.ts @@ -190,8 +190,8 @@ app.use('*', async (c, next) => { return next(); } - // Skip validation in dev mode - if (c.env.DEV_MODE === 'true') { + // Skip validation in dev/test mode + if (c.env.DEV_MODE === 'true' || c.env.E2E_TEST_MODE === 'true') { return next(); } From bed7d9dfc60ab09acd53e6d07389ecdde1e15077 Mon Sep 17 00:00:00 2001 From: Andreas Jansson Date: Sun, 29 Mar 2026 13:58:15 +0200 Subject: [PATCH 5/7] fix: check gateway process before proxying HTML to prevent blank page The blank page was caused by containerFetch hanging when the gateway wasn't ready. Even with timeouts, the uncancelled background RPC would exhaust the Worker CPU limit. Fix: for HTML requests, check if the gateway process exists first (3s timeout on findExistingGatewayProcess). If not running, serve the loading page immediately without calling containerFetch. The loading page handles polling, probing, and reloading. This completely avoids calling containerFetch when the gateway isn't ready, eliminating both the blank page and CPU limit issues. --- src/index.ts | 52 +++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/src/index.ts b/src/index.ts index f127bb664..bb93f6640 100644 --- a/src/index.ts +++ b/src/index.ts @@ -262,10 +262,27 @@ app.all('*', async (c) => { const isWebSocketRequest = request.headers.get('Upgrade')?.toLowerCase() === 'websocket'; const acceptsHtml = request.headers.get('Accept')?.includes('text/html'); - // For browser HTML requests, always try the proxy first but with a fallback - // to the loading page. This avoids calling listProcesses() which can hang - // on cold start (the DO RPC takes 30-60s and kills the Worker via CPU limit). - // The loading page polls /api/status which handles restore + gateway start. + // For browser HTML requests, check if the gateway is running before proxying. + // If not running, serve the loading page immediately. The loading page polls + // /api/status which handles restore + gateway start. We use a very short timeout + // (3s) on findExistingGatewayProcess to avoid blocking — if it doesn't respond, + // we assume the gateway isn't ready. + if (!isWebSocketRequest && acceptsHtml) { + let gatewayReady = false; + try { + const proc = await Promise.race([ + findExistingGatewayProcess(sandbox), + new Promise((resolve) => setTimeout(() => resolve(null), 3_000)), + ]); + gatewayReady = proc !== null && proc.status === 'running'; + } catch { + // Treat as not ready + } + if (!gatewayReady) { + console.log('[PROXY] Gateway not ready for HTML request, serving loading page'); + return c.html(loadingPageHtml); + } + } // For non-WebSocket, non-HTML requests (API calls, static assets), we need // the gateway to be running. Restore first, then start. @@ -464,19 +481,7 @@ app.all('*', async (c) => { let httpResponse: Response; try { - // For HTML requests, add a timeout to avoid hanging on cold start. - // containerFetch blocks until the container responds, which can take - // 30-60s+ on cold start. If it takes too long, serve the loading page. - if (acceptsHtml) { - httpResponse = await Promise.race([ - sandbox.containerFetch(request, GATEWAY_PORT), - new Promise((_, reject) => - setTimeout(() => reject(new Error('proxy timeout')), 15_000), - ), - ]); - } else { - httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT); - } + httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT); } catch (err) { if (isGatewayCrashedError(err)) { console.log('[HTTP] Gateway crashed, attempting restore + restart and retry...'); @@ -513,18 +518,7 @@ app.all('*', async (c) => { // HTTP handler hasn't fully initialized. Show the loading page instead // of a blank page that the user would be stuck on forever. if (acceptsHtml) { - let body: string; - try { - body = await Promise.race([ - httpResponse.text(), - new Promise((_, reject) => - setTimeout(() => reject(new Error('body read timeout')), 10_000), - ), - ]); - } catch { - console.log('[HTTP] Body read timed out, serving loading page'); - return c.html(loadingPageHtml); - } + const body = await httpResponse.text(); if (!body || body.length < 50) { console.log( `[HTTP] Empty/short response (${body.length} bytes) for HTML request, serving loading page`, From 3fa2c89dbf3c3bfc8341a131345bc90a710b952d Mon Sep 17 00:00:00 2001 From: Andreas Jansson Date: Sun, 29 Mar 2026 16:05:16 +0200 Subject: [PATCH 6/7] trigger CI reliability check From 51c56ce39e300951ed0caeee057b197f59c069ee Mon Sep 17 00:00:00 2001 From: Andreas Jansson Date: Sun, 29 Mar 2026 16:16:47 +0200 Subject: [PATCH 7/7] fix: poll for process shutdown after destroy-container instead of fixed sleep --- test/e2e/zzz_cron_wake.txt | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/test/e2e/zzz_cron_wake.txt b/test/e2e/zzz_cron_wake.txt index 13ba0ef32..b42bb070c 100644 --- a/test/e2e/zzz_cron_wake.txt +++ b/test/e2e/zzz_cron_wake.txt @@ -30,10 +30,16 @@ verify container is down %require === WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -# Check via debug/processes (does NOT trigger gateway restart like /api/status does) -sleep 3 -PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}') -COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0") +# Poll until no running processes (destroy may take a few seconds) +for i in $(seq 1 10); do + PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}') + COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0") + if [ "$COUNT" = "0" ]; then + echo "{\"running_processes\": $COUNT}" | jq . + exit 0 + fi + sleep 3 +done echo "{\"running_processes\": $COUNT}" | jq . --- {{ result: json object }}