diff --git a/src/gateway/process.ts b/src/gateway/process.ts index 613fa6959..b0a2fa349 100644 --- a/src/gateway/process.ts +++ b/src/gateway/process.ts @@ -107,10 +107,17 @@ export async function findExistingGatewayProcess(sandbox: Sandbox): Promise { +export async function ensureGateway( + sandbox: Sandbox, + env: OpenClawEnv, + options?: { waitForReady?: boolean }, +): Promise { + const waitForReady = options?.waitForReady !== false; // Check if gateway is already running or starting const existingProcess = await findExistingGatewayProcess(sandbox); if (existingProcess) { @@ -174,28 +181,32 @@ export async function ensureGateway(sandbox: Sandbox, env: OpenClawEnv): Promise throw startErr; } - // Wait for the gateway to be ready - try { - console.log('[Gateway] Waiting for OpenClaw gateway to be ready on port', GATEWAY_PORT); - await process.waitForPort(GATEWAY_PORT, { mode: 'tcp', timeout: STARTUP_TIMEOUT_MS }); - console.log('[Gateway] OpenClaw gateway is ready!'); - - const logs = await process.getLogs(); - if (logs.stdout) console.log('[Gateway] stdout:', logs.stdout); - if (logs.stderr) console.log('[Gateway] stderr:', logs.stderr); - } catch (e) { - console.error('[Gateway] waitForPort failed:', e); + if (waitForReady) { + // Wait for the gateway to be ready try { + console.log('[Gateway] Waiting for OpenClaw gateway to be ready on port', GATEWAY_PORT); + await process.waitForPort(GATEWAY_PORT, { mode: 'tcp', timeout: STARTUP_TIMEOUT_MS }); + console.log('[Gateway] OpenClaw gateway is ready!'); + const logs = await process.getLogs(); - console.error('[Gateway] startup failed. Stderr:', logs.stderr); - console.error('[Gateway] startup failed. Stdout:', logs.stdout); - throw new Error(`OpenClaw gateway failed to start. Stderr: ${logs.stderr || '(empty)'}`, { - cause: e, - }); - } catch (logErr) { - console.error('[Gateway] Failed to get logs:', logErr); - throw e; + if (logs.stdout) console.log('[Gateway] stdout:', logs.stdout); + if (logs.stderr) console.log('[Gateway] stderr:', logs.stderr); + } catch (e) { + console.error('[Gateway] waitForPort failed:', e); + try { + const logs = await process.getLogs(); + console.error('[Gateway] startup failed. Stderr:', logs.stderr); + console.error('[Gateway] startup failed. Stdout:', logs.stdout); + throw new Error(`OpenClaw gateway failed to start. Stderr: ${logs.stderr || '(empty)'}`, { + cause: e, + }); + } catch (logErr) { + console.error('[Gateway] Failed to get logs:', logErr); + throw e; + } } + } else { + console.log('[Gateway] Process started (not waiting for ready):', process.id); } // Verify gateway is actually responding diff --git a/src/index.ts b/src/index.ts index 6b4e39472..bb93f6640 100644 --- a/src/index.ts +++ b/src/index.ts @@ -190,8 +190,8 @@ app.use('*', async (c, next) => { return next(); } - // Skip validation in dev mode - if (c.env.DEV_MODE === 'true') { + // Skip validation in dev/test mode + if (c.env.DEV_MODE === 'true' || c.env.E2E_TEST_MODE === 'true') { return next(); } @@ -262,10 +262,27 @@ app.all('*', async (c) => { const isWebSocketRequest = request.headers.get('Upgrade')?.toLowerCase() === 'websocket'; const acceptsHtml = request.headers.get('Accept')?.includes('text/html'); - // For browser HTML requests, always try the proxy first but with a fallback - // to the loading page. This avoids calling listProcesses() which can hang - // on cold start (the DO RPC takes 30-60s and kills the Worker via CPU limit). - // The loading page polls /api/status which handles restore + gateway start. + // For browser HTML requests, check if the gateway is running before proxying. + // If not running, serve the loading page immediately. The loading page polls + // /api/status which handles restore + gateway start. We use a very short timeout + // (3s) on findExistingGatewayProcess to avoid blocking — if it doesn't respond, + // we assume the gateway isn't ready. + if (!isWebSocketRequest && acceptsHtml) { + let gatewayReady = false; + try { + const proc = await Promise.race([ + findExistingGatewayProcess(sandbox), + new Promise((resolve) => setTimeout(() => resolve(null), 3_000)), + ]); + gatewayReady = proc !== null && proc.status === 'running'; + } catch { + // Treat as not ready + } + if (!gatewayReady) { + console.log('[PROXY] Gateway not ready for HTML request, serving loading page'); + return c.html(loadingPageHtml); + } + } // For non-WebSocket, non-HTML requests (API calls, static assets), we need // the gateway to be running. Restore first, then start. @@ -497,7 +514,7 @@ app.all('*', async (c) => { console.log('[HTTP] Response status:', httpResponse.status); // For HTML requests, verify we got actual content from the gateway. - // containerFetch can return a 200 with empty body if the gateway's + // containerFetch can return a 200 with empty/streaming body if the gateway's // HTTP handler hasn't fully initialized. Show the loading page instead // of a blank page that the user would be stuck on forever. if (acceptsHtml) { diff --git a/src/routes/public.ts b/src/routes/public.ts index ceacd5419..10a382926 100644 --- a/src/routes/public.ts +++ b/src/routes/public.ts @@ -50,23 +50,19 @@ publicRoutes.get('/api/status', async (c) => { console.error('[api/status] Restore failed:', restoreError); } - // Start the gateway synchronously with a short timeout. Workers have a - // 30s CPU limit — restoreIfNeeded uses ~1-3s, leaving ~25s for the - // gateway. If it doesn't start in time, the loading page retries. - // We use synchronous start instead of waitUntil because waitUntil is - // unreliable in the Durable Object context. + // Start the gateway but DON'T wait for it to be ready. + // ensureGateway with waitForReady:false just starts the process + // (fast RPC, ~2-5s) without blocking on waitForPort (which takes + // up to 180s and would exceed the 30s Worker CPU limit). + // The loading page polls every 2s — subsequent polls will find + // the process and check if the port is up. console.log('[api/status] No process found, starting gateway...'); try { - await Promise.race([ - ensureGateway(sandbox, c.env), - new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 25_000)), - ]); - process = await findExistingGatewayProcess(sandbox); - if (process) { - return c.json({ ok: true, status: 'running', processId: process.id }); - } + await ensureGateway(sandbox, c.env, { waitForReady: false }); } catch (err) { - console.log('[api/status] Gateway start timed out or failed, will retry on next poll'); + const msg = err instanceof Error ? err.message : String(err); + console.error('[api/status] Gateway start failed:', msg); + return c.json({ ok: false, status: 'start_failed', error: msg, restoreError }); } return c.json({ ok: false, status: 'starting', restoreError }); } diff --git a/test/e2e/zzz_cron_wake.txt b/test/e2e/zzz_cron_wake.txt index 13ba0ef32..b42bb070c 100644 --- a/test/e2e/zzz_cron_wake.txt +++ b/test/e2e/zzz_cron_wake.txt @@ -30,10 +30,16 @@ verify container is down %require === WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt") -# Check via debug/processes (does NOT trigger gateway restart like /api/status does) -sleep 3 -PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}') -COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0") +# Poll until no running processes (destroy may take a few seconds) +for i in $(seq 1 10); do + PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}') + COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0") + if [ "$COUNT" = "0" ]; then + echo "{\"running_processes\": $COUNT}" | jq . + exit 0 + fi + sleep 3 +done echo "{\"running_processes\": $COUNT}" | jq . --- {{ result: json object }}