Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions __tests__/unit/health.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import { evaluateHealth } from '../../src/health.js';

// Unit tests for the Bug #8 liveness probe (src/health.js). Probes are
// hand-rolled stubs; only the disk cases touch the real filesystem.
describe('evaluateHealth', () => {
it('reports healthy when no probes are provided', () => {
const result = evaluateHealth({});
expect(result).toEqual({ ok: true, status: 'healthy', checks: {} });
});

describe('scheduler probe', () => {
// Minimal stand-in exposing the three accessors evaluateHealth reads:
// isRunning(), getLastTickAt(), getIntervalMs().
function fakeScheduler({ running, lastTickAt, intervalMs = 60_000 } = {}) {
return {
isRunning: () => running,
getLastTickAt: () => lastTickAt,
getIntervalMs: () => intervalMs
};
}

it('passes when not running (treated as not-yet-started)', () => {
const result = evaluateHealth({ scheduler: fakeScheduler({ running: false }) });
expect(result.status).toBe('healthy');
expect(result.checks.scheduler.ok).toBe(true);
});

it('reports degraded when running but no tick has completed yet', () => {
const result = evaluateHealth({
scheduler: fakeScheduler({ running: true, lastTickAt: null })
});
expect(result.status).toBe('degraded');
expect(result.checks.scheduler.status).toBe('warn');
// degraded keeps ok === true — /health still answers 200 for it.
expect(result.ok).toBe(true);
});

it('passes when last tick is within 2× the configured interval', () => {
const ageMs = 30_000; // less than 2 * 60_000
const result = evaluateHealth({
scheduler: fakeScheduler({ running: true, lastTickAt: Date.now() - ageMs })
});
expect(result.status).toBe('healthy');
expect(result.checks.scheduler.ok).toBe(true);
// Date.now() advances between fixture setup and evaluation, so the
// measured age can only be >= the age we constructed.
expect(result.checks.scheduler.detail.ageMs).toBeGreaterThanOrEqual(ageMs);
});

it('flags unhealthy when last tick is older than 2× the interval', () => {
const result = evaluateHealth({
scheduler: fakeScheduler({
running: true,
lastTickAt: Date.now() - 5 * 60_000, // way past 2× the 60s interval
intervalMs: 60_000
})
});
expect(result.status).toBe('unhealthy');
expect(result.ok).toBe(false);
expect(result.checks.scheduler.status).toBe('fail');
});

it('reports degraded when scheduler probe itself throws', () => {
// A broken probe must not hard-fail /health — status 'error', not 'fail'.
const result = evaluateHealth({
scheduler: {
isRunning: () => { throw new Error('scheduler exploded'); },
getLastTickAt: () => Date.now()
}
});
expect(result.status).toBe('degraded');
expect(result.checks.scheduler.status).toBe('error');
expect(result.checks.scheduler.detail).toBe('scheduler exploded');
});
});

describe('kv probe', () => {
it('passes when ping() succeeds', () => {
const result = evaluateHealth({ kv: { ping: () => {} } });
expect(result.status).toBe('healthy');
expect(result.checks.kv.ok).toBe(true);
});

it('flags unhealthy when ping() throws', () => {
const result = evaluateHealth({
kv: { ping: () => { throw new Error('SQLITE_BUSY'); } }
});
expect(result.status).toBe('unhealthy');
expect(result.ok).toBe(false);
expect(result.checks.kv.status).toBe('fail');
expect(result.checks.kv.detail).toBe('SQLITE_BUSY');
});

it('skips the check when ping is not callable', () => {
// Mirrors the `typeof kv.ping === 'function'` guard in evaluateHealth.
const result = evaluateHealth({ kv: {} });
expect(result.checks.kv).toBeUndefined();
});
});

describe('disk probe', () => {
// These cases exercise the real fs.statfsSync — no stubbing.
it('passes for a real directory with plenty of free space', () => {
// /tmp is virtually always > 100 MB free on dev/CI hosts.
const result = evaluateHealth({ dataDir: '/tmp' });
expect(result.checks.disk.ok).toBe(true);
expect(result.checks.disk.detail.freeBytes).toBeGreaterThan(0);
});

it('reports degraded when statfs throws (e.g. nonexistent path)', () => {
const result = evaluateHealth({ dataDir: '/this/path/does/not/exist/temper' });
expect(result.checks.disk.status).toBe('error');
expect(result.status).toBe('degraded');
});
});

it('combines unhealthy + degraded → unhealthy (most-severe wins)', () => {
const result = evaluateHealth({
kv: { ping: () => { throw new Error('down'); } }, // unhealthy
dataDir: '/this/does/not/exist' // degraded
});
expect(result.status).toBe('unhealthy');
expect(result.ok).toBe(false);
});
});
33 changes: 27 additions & 6 deletions src/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,18 @@ import { initTaskStore } from './task-store.js';
import { createScheduler } from './scheduler.js';
import { initKVStore } from './persistent-kv.js';
import { provisionRepo, parseIssueFormBody, validateProvisionRequest } from './provisioning.js';
import { evaluateHealth } from './health.js';

const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Module-level state for the task store and scheduler
let _taskStore = null;
let _scheduler = null;
// Bug #8: /health uses _dedupStore.ping() as a SQLite liveness probe.
// Same instance that backs setIdempotencyStore — exposed here so the
// /health handlers can access it without reaching into idempotency.js.
let _dedupStore = null;
let _dataDir = null;

function getTaskStore() { return _taskStore; }
function getScheduler() { return _scheduler; }
Expand Down Expand Up @@ -194,11 +200,17 @@ function createCustomRoutesHandler() {

if (req.method === 'GET' && pathname === '/health') {
applySecurityHeaders(res);
const probe = evaluateHealth({
scheduler: _scheduler,
kv: _dedupStore,
dataDir: _dataDir
});
const healthData = {
status: 'healthy',
status: probe.status,
timestamp: new Date().toISOString(),
version: DEPLOY_SHA || '1.0.0',
queue: defaultQueue.stats()
queue: defaultQueue.stats(),
checks: probe.checks
};
if (_taskStore) {
healthData.tasks = _taskStore.getStats();
Expand All @@ -207,7 +219,8 @@ function createCustomRoutesHandler() {
healthData.scheduler = { running: _scheduler.isRunning() };
}
const body = JSON.stringify(healthData);
res.writeHead(200, { 'Content-Type': 'application/json' });
// Bug #8: 503 lets PM2 / external uptime monitors restart on hang.
res.writeHead(probe.ok ? 200 : 503, { 'Content-Type': 'application/json' });
res.end(body);
return true;
}
Expand Down Expand Up @@ -982,19 +995,25 @@ function registerApp(app, options = {}) {
applySecurityMiddleware(router);

router.get('/health', (req, res) => {
const probe = evaluateHealth({
scheduler: _scheduler,
kv: _dedupStore,
dataDir: _dataDir
});
const healthData = {
status: 'healthy',
status: probe.status,
timestamp: new Date().toISOString(),
version: DEPLOY_SHA || '1.0.0',
queue: defaultQueue.stats()
queue: defaultQueue.stats(),
checks: probe.checks
};
if (_taskStore) {
healthData.tasks = _taskStore.getStats();
}
if (_scheduler) {
healthData.scheduler = { running: _scheduler.isRunning() };
}
res.status(200).json(healthData);
res.status(probe.ok ? 200 : 503).json(healthData);
});

router.get('/webhook', (req, res) => {
Expand Down Expand Up @@ -1035,11 +1054,13 @@ function initPersistence() {
try {
const dataDir = getDataDir();
if (!fs.existsSync(dataDir)) fs.mkdirSync(dataDir, { recursive: true });
_dataDir = dataDir;

const dedupStore = initKVStore(path.join(dataDir, 'dedup.db'), 'webhook_dedup', {
ttlMs: 60 * 60 * 1000
});
setIdempotencyStore(dedupStore);
_dedupStore = dedupStore;

const rateStore = initKVStore(path.join(dataDir, 'dedup.db'), 'ai_rate_limits', {
ttlMs: 5 * 60 * 1000
Expand Down
120 changes: 120 additions & 0 deletions src/health.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/**
* Liveness probe for /health (Bug #8).
*
* The previous /health endpoint returned 200 unconditionally — which means
* PM2's restart-on-failed-healthcheck and any external uptime monitor were
* both blind to a hung scheduler, a locked SQLite DB, or a full data disk.
*
* The checks here are *liveness* signals (am I able to do my job RIGHT NOW),
* not readiness (have I finished initialising). All checks are best-effort:
* if a check itself throws unexpectedly, that check is reported as
* `'error'` and the overall verdict is `degraded` — we'd rather report
* something slightly wrong than mask a real outage.
*
* ok: every check passed
* degraded: a check is missing or threw; not actionable but worth
* flagging in dashboards. /health returns 200.
* unhealthy: a check has FAILED in a way that means the bot can't do
* its job (DB ping throws, scheduler hung, disk full).
* /health returns 503 so PM2 / uptime monitors restart us.
*/

import fs from 'node:fs';

// 100 MB free-space floor for the data directory. Below this we risk
// losing the next sweep / commit; the threshold is deliberately low
// because the Netcup VM only has ~30 GB total.
const MIN_FREE_DISK_BYTES = 100 * 1024 * 1024;

// A tick should land within 2× the configured interval — beyond that,
// the scheduler is considered hung.
const SCHEDULER_STALE_FACTOR = 2;

// Severity lattice used to roll per-check outcomes up into the overall
// verdict: most-severe wins.
const SEVERITY_RANK = { healthy: 0, degraded: 1, unhealthy: 2 };

/**
 * Best-effort extraction of a human-readable message from a thrown value.
 * Non-Error throws (strings, plain objects) are stringified instead of
 * silently producing an `undefined` check detail.
 *
 * @param {unknown} err
 * @returns {string}
 */
function describeError(err) {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Scheduler tick freshness.
 *
 * - not running: OK — `start()` may simply not have been called yet
 *   (early in probot bootstrap); don't flunk /health for it.
 * - running, no completed tick: warn → degraded. The first tick fires
 *   immediately on start(), so a timestamp is expected within ms.
 * - last tick older than SCHEDULER_STALE_FACTOR × interval: fail →
 *   unhealthy (hung scheduler).
 * - the probe itself throws: error → degraded.
 *
 * @param {{ isRunning: () => boolean, getLastTickAt?: () => (number|null), getIntervalMs?: () => number }} [scheduler]
 * @returns {{ severity: 'healthy'|'degraded'|'unhealthy', check: object }|null}
 *   null when no scheduler probe was supplied (check omitted entirely).
 */
function checkScheduler(scheduler) {
  if (!scheduler) return null;
  try {
    const running = scheduler.isRunning();
    const lastTickAt = scheduler.getLastTickAt?.() ?? null;
    const intervalMs = scheduler.getIntervalMs?.() ?? 5 * 60 * 1000;
    const staleAfterMs = intervalMs * SCHEDULER_STALE_FACTOR;

    if (!running) {
      return { severity: 'healthy', check: { ok: true, status: 'ok', detail: { running: false } } };
    }
    if (lastTickAt === null) {
      return {
        severity: 'degraded',
        check: { ok: true, status: 'warn', detail: { running: true, lastTickAt: null } }
      };
    }

    const ageMs = Date.now() - lastTickAt;
    const stale = ageMs > staleAfterMs;
    return {
      severity: stale ? 'unhealthy' : 'healthy',
      check: {
        ok: !stale,
        status: stale ? 'fail' : 'ok',
        detail: { running: true, ageMs, staleAfterMs }
      }
    };
  } catch (err) {
    // The probe blew up — report it, but don't hard-fail /health on it.
    return { severity: 'degraded', check: { ok: false, status: 'error', detail: describeError(err) } };
  }
}

/**
 * SQLite liveness via kv.ping(). A throwing ping means the bot cannot
 * persist dedup state — that's an outage, so it fails hard (unhealthy).
 *
 * @param {{ ping?: () => void }} [kv] - typically the dedup KV store
 * @returns {{ severity: string, check: object }|null} null when the
 *   probe is absent or has no callable ping (check omitted entirely).
 */
function checkKv(kv) {
  if (!kv || typeof kv.ping !== 'function') return null;
  try {
    kv.ping();
    return { severity: 'healthy', check: { ok: true, status: 'ok' } };
  } catch (err) {
    return { severity: 'unhealthy', check: { ok: false, status: 'fail', detail: describeError(err) } };
  }
}

/**
 * Free disk space on the data directory.
 *
 * @param {string} [dataDir] - filesystem path checked for free space
 * @returns {{ severity: string, check: object }|null} null when no path
 *   was given or the runtime lacks fs.statfsSync (Node < 18.15) — in
 *   both cases the check is skipped, same as an absent probe.
 */
function checkDisk(dataDir) {
  if (!dataDir || typeof fs.statfsSync !== 'function') return null;
  try {
    const stat = fs.statfsSync(dataDir);
    // bavail/bsize may be numbers or BigInt depending on platform options;
    // normalise before multiplying.
    const freeBytes = Number(stat.bavail) * Number(stat.bsize);
    const ok = freeBytes >= MIN_FREE_DISK_BYTES;
    return {
      severity: ok ? 'healthy' : 'unhealthy',
      check: {
        ok,
        status: ok ? 'ok' : 'fail',
        detail: { freeBytes, thresholdBytes: MIN_FREE_DISK_BYTES }
      }
    };
  } catch (err) {
    // Nonexistent path, permission error, etc. — not a capacity failure.
    return { severity: 'degraded', check: { ok: false, status: 'error', detail: describeError(err) } };
  }
}

/**
 * Synchronously evaluate liveness. Cheap (millisecond-range) by design —
 * /health is hit by PM2 every few seconds.
 *
 * @param {object} probes
 * @param {object} [probes.scheduler] - { isRunning(), getLastTickAt(), getIntervalMs() }
 * @param {object} [probes.kv] - { ping() } — typically the dedup KV store
 * @param {string} [probes.dataDir] - filesystem path checked for free space
 * @returns {{
 *   ok: boolean,
 *   status: 'healthy'|'degraded'|'unhealthy',
 *   checks: Record<string, { ok: boolean, status: 'ok'|'warn'|'fail'|'error', detail?: any }>
 * }}
 */
export function evaluateHealth(probes = {}) {
  const checks = {};
  let status = 'healthy';

  // Fold one helper outcome into the aggregate; null means "check skipped".
  const record = (name, outcome) => {
    if (outcome === null) return;
    checks[name] = outcome.check;
    if (SEVERITY_RANK[outcome.severity] > SEVERITY_RANK[status]) {
      status = outcome.severity;
    }
  };

  record('scheduler', checkScheduler(probes.scheduler));
  record('kv', checkKv(probes.kv));
  record('disk', checkDisk(probes.dataDir));

  // ok tracks only hard failures: degraded still answers 200 on /health.
  return { ok: status !== 'unhealthy', status, checks };
}
9 changes: 8 additions & 1 deletion src/persistent-kv.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,12 @@ export function initKVStore(dbPath, name, opts = {}) {
db.close();
}

return { has, get, set, sweep, close, _db: db };
// Bug #8: /health uses ping() as a SQLite liveness probe. Throws if
// the connection is dead, the file is locked beyond timeout, or disk
// I/O is jammed. SELECT 1 doesn't touch any table — minimum latency.
function ping() {
db.prepare('SELECT 1 AS ok').get();
}

return { has, get, set, sweep, close, ping, _db: db };
}
15 changes: 15 additions & 0 deletions src/scheduler.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ function createScheduler(store, octokitOrFactory, options = {}) {
const handlers = new Map();
let timer = null;
let running = false;
// Bug #8: /health uses this to detect a hung scheduler. Updated at the
// *end* of every tick (success, no-op, or error caught inside tick).
let lastTickAt = null;

async function getOctokit() {
if (typeof octokitOrFactory === 'function') {
Expand Down Expand Up @@ -98,6 +101,7 @@ function createScheduler(store, octokitOrFactory, options = {}) {
}
} finally {
running = false;
lastTickAt = Date.now();
}
}

Expand Down Expand Up @@ -128,6 +132,17 @@ function createScheduler(store, octokitOrFactory, options = {}) {
/** Check if the scheduler is running. */
isRunning() {
return timer !== null;
},

/** Timestamp (ms since epoch) of the last completed tick, or null
* before the first tick finishes. Used by /health for liveness. */
getLastTickAt() {
return lastTickAt;
},

/** Configured interval; /health uses 2× this as the staleness threshold. */
getIntervalMs() {
return intervalMs;
}
};
}
Expand Down
Loading