diff --git a/__tests__/unit/health.test.js b/__tests__/unit/health.test.js
new file mode 100644
index 0000000..6419b41
--- /dev/null
+++ b/__tests__/unit/health.test.js
@@ -0,0 +1,115 @@
+import { evaluateHealth } from '../../src/health.js';
+
+describe('evaluateHealth', () => {
+  it('reports healthy when no probes are provided', () => {
+    const result = evaluateHealth({});
+    expect(result).toEqual({ ok: true, status: 'healthy', checks: {} });
+  });
+
+  describe('scheduler probe', () => {
+    function fakeScheduler({ running, lastTickAt, intervalMs = 60_000 } = {}) {
+      return {
+        isRunning: () => running,
+        getLastTickAt: () => lastTickAt,
+        getIntervalMs: () => intervalMs
+      };
+    }
+
+    it('passes when not running (treated as not-yet-started)', () => {
+      const result = evaluateHealth({ scheduler: fakeScheduler({ running: false }) });
+      expect(result.status).toBe('healthy');
+      expect(result.checks.scheduler.ok).toBe(true);
+    });
+
+    it('reports degraded when running but no tick has completed yet', () => {
+      const result = evaluateHealth({
+        scheduler: fakeScheduler({ running: true, lastTickAt: null })
+      });
+      expect(result.status).toBe('degraded');
+      expect(result.checks.scheduler.status).toBe('warn');
+      expect(result.ok).toBe(true);
+    });
+
+    it('passes when last tick is within 2× the configured interval', () => {
+      const ageMs = 30_000; // less than 2 * 60_000
+      const result = evaluateHealth({
+        scheduler: fakeScheduler({ running: true, lastTickAt: Date.now() - ageMs })
+      });
+      expect(result.status).toBe('healthy');
+      expect(result.checks.scheduler.ok).toBe(true);
+      expect(result.checks.scheduler.detail.ageMs).toBeGreaterThanOrEqual(ageMs);
+    });
+
+    it('flags unhealthy when last tick is older than 2× the interval', () => {
+      const result = evaluateHealth({
+        scheduler: fakeScheduler({
+          running: true,
+          lastTickAt: Date.now() - 5 * 60_000, // way past 2× the 60s interval
+          intervalMs: 60_000
+        })
+      });
+      expect(result.status).toBe('unhealthy');
+      expect(result.ok).toBe(false);
+      expect(result.checks.scheduler.status).toBe('fail');
+    });
+
+    it('reports degraded when scheduler probe itself throws', () => {
+      const result = evaluateHealth({
+        scheduler: {
+          isRunning: () => { throw new Error('scheduler exploded'); },
+          getLastTickAt: () => Date.now()
+        }
+      });
+      expect(result.status).toBe('degraded');
+      expect(result.checks.scheduler.status).toBe('error');
+      expect(result.checks.scheduler.detail).toBe('scheduler exploded');
+    });
+  });
+
+  describe('kv probe', () => {
+    it('passes when ping() succeeds', () => {
+      const result = evaluateHealth({ kv: { ping: () => {} } });
+      expect(result.status).toBe('healthy');
+      expect(result.checks.kv.ok).toBe(true);
+    });
+
+    it('flags unhealthy when ping() throws', () => {
+      const result = evaluateHealth({
+        kv: { ping: () => { throw new Error('SQLITE_BUSY'); } }
+      });
+      expect(result.status).toBe('unhealthy');
+      expect(result.ok).toBe(false);
+      expect(result.checks.kv.status).toBe('fail');
+      expect(result.checks.kv.detail).toBe('SQLITE_BUSY');
+    });
+
+    it('skips the check when ping is not callable', () => {
+      const result = evaluateHealth({ kv: {} });
+      expect(result.checks.kv).toBeUndefined();
+    });
+  });
+
+  describe('disk probe', () => {
+    it('passes for a real directory with plenty of free space', () => {
+      // /tmp is virtually always > 100 MB free on dev/CI hosts.
+      const result = evaluateHealth({ dataDir: '/tmp' });
+      expect(result.checks.disk.ok).toBe(true);
+      expect(result.checks.disk.detail.freeBytes).toBeGreaterThan(0);
+    });
+
+    it('reports degraded when statfs throws (e.g. nonexistent path)', () => {
+      const result = evaluateHealth({ dataDir: '/this/path/does/not/exist/temper' });
+      expect(result.checks.disk.status).toBe('error');
+      expect(result.status).toBe('degraded');
+    });
+  });
+
+  it('combines unhealthy + degraded → unhealthy (most-severe wins)', () => {
+    const result = evaluateHealth({
+      kv: { ping: () => { throw new Error('down'); } }, // unhealthy
+      dataDir: '/this/does/not/exist' // degraded
+    });
+    expect(result.status).toBe('unhealthy');
+    expect(result.ok).toBe(false);
+  });
+});
diff --git a/src/app.js b/src/app.js
index db1446d..62dd8d4 100644
--- a/src/app.js
+++ b/src/app.js
@@ -30,12 +30,18 @@ import { initTaskStore } from './task-store.js';
 import { createScheduler } from './scheduler.js';
 import { initKVStore } from './persistent-kv.js';
 import { provisionRepo, parseIssueFormBody, validateProvisionRequest } from './provisioning.js';
+import { evaluateHealth } from './health.js';
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
 // Module-level state for the task store and scheduler
 let _taskStore = null;
 let _scheduler = null;
+// Bug #8: /health uses _dedupStore.ping() as a SQLite liveness probe.
+// Same instance that backs setIdempotencyStore — exposed here so the
+// /health handlers can access it without reaching into idempotency.js.
+let _dedupStore = null;
+let _dataDir = null;
 
 function getTaskStore() { return _taskStore; }
 function getScheduler() { return _scheduler; }
@@ -194,11 +200,17 @@ function createCustomRoutesHandler() {
 
   if (req.method === 'GET' && pathname === '/health') {
     applySecurityHeaders(res);
+    const probe = evaluateHealth({
+      scheduler: _scheduler,
+      kv: _dedupStore,
+      dataDir: _dataDir
+    });
     const healthData = {
-      status: 'healthy',
+      status: probe.status,
       timestamp: new Date().toISOString(),
       version: DEPLOY_SHA || '1.0.0',
-      queue: defaultQueue.stats()
+      queue: defaultQueue.stats(),
+      checks: probe.checks
     };
     if (_taskStore) {
       healthData.tasks = _taskStore.getStats();
@@ -207,7 +219,8 @@ function createCustomRoutesHandler() {
       healthData.scheduler = { running: _scheduler.isRunning() };
     }
     const body = JSON.stringify(healthData);
-    res.writeHead(200, { 'Content-Type': 'application/json' });
+    // Bug #8: 503 lets PM2 / external uptime monitors restart on hang.
+    res.writeHead(probe.ok ? 200 : 503, { 'Content-Type': 'application/json' });
     res.end(body);
     return true;
   }
@@ -982,11 +995,17 @@ function registerApp(app, options = {}) {
   applySecurityMiddleware(router);
 
   router.get('/health', (req, res) => {
+    const probe = evaluateHealth({
+      scheduler: _scheduler,
+      kv: _dedupStore,
+      dataDir: _dataDir
+    });
     const healthData = {
-      status: 'healthy',
+      status: probe.status,
       timestamp: new Date().toISOString(),
       version: DEPLOY_SHA || '1.0.0',
-      queue: defaultQueue.stats()
+      queue: defaultQueue.stats(),
+      checks: probe.checks
     };
     if (_taskStore) {
       healthData.tasks = _taskStore.getStats();
@@ -994,7 +1013,7 @@ function registerApp(app, options = {}) {
     if (_scheduler) {
       healthData.scheduler = { running: _scheduler.isRunning() };
     }
-    res.status(200).json(healthData);
+    res.status(probe.ok ? 200 : 503).json(healthData);
   });
 
   router.get('/webhook', (req, res) => {
@@ -1035,11 +1054,13 @@ function initPersistence() {
   try {
     const dataDir = getDataDir();
    if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir, { recursive: true });
+    _dataDir = dataDir;
 
     const dedupStore = initKVStore(path.join(dataDir, 'dedup.db'), 'webhook_dedup', {
       ttlMs: 60 * 60 * 1000
     });
     setIdempotencyStore(dedupStore);
+    _dedupStore = dedupStore;
 
     const rateStore = initKVStore(path.join(dataDir, 'dedup.db'), 'ai_rate_limits', {
       ttlMs: 5 * 60 * 1000
diff --git a/src/health.js b/src/health.js
new file mode 100644
index 0000000..d416681
--- /dev/null
+++ b/src/health.js
@@ -0,0 +1,120 @@
+/**
+ * Liveness probe for /health (Bug #8).
+ *
+ * The previous /health endpoint returned 200 unconditionally — which means
+ * PM2's restart-on-failed-healthcheck and any external uptime monitor were
+ * both blind to a hung scheduler, a locked SQLite DB, or a full data disk.
+ *
+ * The checks here are *liveness* signals (am I able to do my job RIGHT NOW),
+ * not readiness (have I finished initialising). All checks are best-effort:
+ * if a check itself throws unexpectedly, that check is reported as
+ * `'error'` and the overall verdict is `degraded` — we'd rather report
+ * something slightly wrong than mask a real outage.
+ *
+ *   healthy:   every check passed
+ *   degraded:  a probe threw unexpectedly, or the scheduler has no tick
+ *              yet; worth flagging in dashboards. /health returns 200.
+ *   unhealthy: a check has FAILED in a way that means the bot can't do
+ *              its job (DB ping throws, scheduler hung, disk full).
+ *              /health returns 503 so PM2 / uptime monitors restart us.
+ */
+
+import fs from 'node:fs';
+
+const MIN_FREE_DISK_BYTES = 100 * 1024 * 1024; // 100 MB — below this we
+// risk losing the next sweep / commit. Threshold low because Netcup VM
+// only has ~30 GB total.
+
+const SCHEDULER_STALE_FACTOR = 2; // a tick should land within 2× the
+// configured interval — beyond that, the scheduler is hung.
+
+/**
+ * Synchronously evaluate liveness. Cheap (millisecond-range) by design —
+ * /health is hit by PM2 every few seconds.
+ *
+ * @param {object} probes
+ * @param {object} [probes.scheduler] - { isRunning(), getLastTickAt(), getIntervalMs() }
+ * @param {object} [probes.kv] - { ping() } — typically the dedup KV store
+ * @param {string} [probes.dataDir] - filesystem path checked for free space
+ * @returns {{
+ *   ok: boolean,
+ *   status: 'healthy'|'degraded'|'unhealthy',
+ *   checks: Record<string, object>
+ * }}
+ */
+export function evaluateHealth(probes = {}) {
+  const checks = {};
+  let unhealthy = false;
+  let degraded = false;
+
+  // 1. Scheduler tick freshness ---------------------------------------------
+  if (probes.scheduler) {
+    try {
+      const isRunning = probes.scheduler.isRunning();
+      const lastTickAt = probes.scheduler.getLastTickAt?.() ?? null;
+      const intervalMs = probes.scheduler.getIntervalMs?.() ?? 5 * 60 * 1000;
+      const staleAfter = intervalMs * SCHEDULER_STALE_FACTOR;
+
+      if (!isRunning) {
+        // Scheduler was never started (or has been stopped). Not necessarily
+        // an outage — `start()` may not have been called yet (early in
+        // probot bootstrap). Don't flunk /health for it.
+        checks.scheduler = { ok: true, status: 'ok', detail: { running: false } };
+      } else if (lastTickAt === null) {
+        // Started but no tick has completed yet — the first tick fires
+        // immediately on start(), so we expect a value within milliseconds.
+        // Don't fail here either; report as warn so dashboards see it.
+        checks.scheduler = { ok: true, status: 'warn', detail: { running: true, lastTickAt: null } };
+        degraded = true;
+      } else {
+        const ageMs = Date.now() - lastTickAt;
+        const stale = ageMs > staleAfter;
+        checks.scheduler = {
+          ok: !stale,
+          status: stale ? 'fail' : 'ok',
+          detail: { running: true, ageMs, staleAfterMs: staleAfter }
+        };
+        if (stale) unhealthy = true;
+      }
+    } catch (err) {
+      checks.scheduler = { ok: false, status: 'error', detail: err.message };
+      degraded = true;
+    }
+  }
+
+  // 2. SQLite ping ----------------------------------------------------------
+  if (probes.kv && typeof probes.kv.ping === 'function') {
+    try {
+      probes.kv.ping();
+      checks.kv = { ok: true, status: 'ok' };
+    } catch (err) {
+      checks.kv = { ok: false, status: 'fail', detail: err.message };
+      unhealthy = true;
+    }
+  }
+
+  // 3. Free disk on the data directory --------------------------------------
+  if (probes.dataDir && typeof fs.statfsSync === 'function') {
+    try {
+      const stat = fs.statfsSync(probes.dataDir);
+      const freeBytes = Number(stat.bavail) * Number(stat.bsize);
+      const ok = freeBytes >= MIN_FREE_DISK_BYTES;
+      checks.disk = {
+        ok,
+        status: ok ? 'ok' : 'fail',
+        detail: { freeBytes, thresholdBytes: MIN_FREE_DISK_BYTES }
+      };
+      if (!ok) unhealthy = true;
+    } catch (err) {
+      checks.disk = { ok: false, status: 'error', detail: err.message };
+      degraded = true;
+    }
+  }
+
+  let status;
+  if (unhealthy) status = 'unhealthy';
+  else if (degraded) status = 'degraded';
+  else status = 'healthy';
+
+  return { ok: !unhealthy, status, checks };
+}
diff --git a/src/persistent-kv.js b/src/persistent-kv.js
index 86b505d..c38591c 100644
--- a/src/persistent-kv.js
+++ b/src/persistent-kv.js
@@ -77,5 +77,12 @@ export function initKVStore(dbPath, name, opts = {}) {
     db.close();
   }
 
-  return { has, get, set, sweep, close, _db: db };
+  // Bug #8: /health uses ping() as a SQLite liveness probe. Throws if
+  // the connection is dead, the file is locked beyond timeout, or disk
+  // I/O is jammed. SELECT 1 doesn't touch any table — minimum latency.
+  function ping() {
+    db.prepare('SELECT 1 AS ok').get();
+  }
+
+  return { has, get, set, sweep, close, ping, _db: db };
 }
diff --git a/src/scheduler.js b/src/scheduler.js
index 7977bcb..827d6fc 100644
--- a/src/scheduler.js
+++ b/src/scheduler.js
@@ -23,6 +23,9 @@ function createScheduler(store, octokitOrFactory, options = {}) {
   const handlers = new Map();
   let timer = null;
   let running = false;
+  // Bug #8: /health uses this to detect a hung scheduler. Updated at the
+  // *end* of every tick (success, no-op, or error caught inside tick).
+  let lastTickAt = null;
 
   async function getOctokit() {
     if (typeof octokitOrFactory === 'function') {
@@ -98,6 +101,7 @@ function createScheduler(store, octokitOrFactory, options = {}) {
       }
     } finally {
       running = false;
+      lastTickAt = Date.now();
     }
   }
 
@@ -128,6 +132,17 @@ function createScheduler(store, octokitOrFactory, options = {}) {
     /** Check if the scheduler is running. */
     isRunning() {
       return timer !== null;
+    },
+
+    /** Timestamp (ms since epoch) of the last completed tick, or null
+     *  before the first tick finishes. Used by /health for liveness. */
+    getLastTickAt() {
+      return lastTickAt;
+    },
+
+    /** Configured interval; /health uses 2× this as the staleness threshold. */
+    getIntervalMs() {
+      return intervalMs;
     }
   };
 }
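---

Usage note (not part of the patch): a minimal consumer of the new /health
contract, to make the 200-vs-503 semantics concrete. This is a sketch under
assumptions: the localhost:3000 origin is a placeholder rather than a value
taken from this repo, and it relies on Node 18+ for the global fetch.

    // healthcheck.mjs: exits 0 on 200 (healthy or degraded), 1 on 503.
    // Usable as a cron or systemd watchdog probe alongside PM2's restarts.
    const res = await fetch('http://localhost:3000/health');
    const body = await res.json();
    if (res.status === 503) {
      // body.checks names the failing probe (scheduler, kv, or disk).
      console.error('unhealthy:', JSON.stringify(body.checks));
      process.exit(1);
    }
    console.log(`${body.status} (HTTP ${res.status})`);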