From 20dd768be918bd1c98416017bc2f69e7118d0f35 Mon Sep 17 00:00:00 2001 From: Luke Melia Date: Wed, 20 May 2026 00:30:00 -0400 Subject: [PATCH 1/2] CS-11182: widen cache invalidation to every reindex entry point MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original CS-11182 fix only fired the L2 `module_transpile_cache` bulk tombstone from `Realm.startReindex`'s post-completion `.then` — which covered POST `/_full-reindex` and `/_reindex` but nothing else. Production reindexes triggered via the operator-action endpoints (`/_grafana-reindex`, `/_grafana-full-reindex`, `/_post-deployment`), the publish-realm flow (`Realm.fullIndex`), and direct `enqueueReindexRealmJob` calls all bypassed `startReindex`. On staging today, a Grafana-button reindex of the base realm completed without ever tombstoning the L2 rows, so clients continued to be served pre-deploy bytes. Emit `notifyAllFileChanges(dbAdapter, realmURL)` from the worker side of `fromScratchIndex`, right after `IndexRunner.fromScratch` returns. The existing `realm_file_changes` wildcard listener on every replica then calls `realm.clearLocalSourceCaches()` — synchronous L1 wipe plus the fire-and-forget L2 bulk tombstone. One chokepoint covers every from-scratch trigger uniformly, including future ones. The regression test in `module-cache-race-test.ts` drives a reindex through `realm.realmIndexUpdater.fullIndex` (the bypass path that never wires up the original `startReindex` callback) and asserts the L2 rows are tombstoned. Verified to fail without the fix and pass with it. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tests/module-cache-race-test.ts | 147 ++++++++++++++++++ packages/runtime-common/tasks/indexer.ts | 15 ++ 2 files changed, 162 insertions(+) diff --git a/packages/realm-server/tests/module-cache-race-test.ts b/packages/realm-server/tests/module-cache-race-test.ts index d8f959d80f..0198fbc7da 100644 --- a/packages/realm-server/tests/module-cache-race-test.ts +++ b/packages/realm-server/tests/module-cache-race-test.ts @@ -11,9 +11,11 @@ import { SupportedMimeType, param, query, + userInitiatedPriority, } from '@cardstack/runtime-common'; import type { PgAdapter } from '@cardstack/postgres'; import { ModuleCacheCoordinator } from '../lib/module-cache-coordination'; +import { RealmFileChangesListener } from '../lib/realm-file-changes-listener'; import { setupPermissionedRealmCached, setupDB, @@ -1300,4 +1302,149 @@ module(basename(__filename), function () { }); }, ); + + // CS-11182 follow-up: the original fix only fired the L2 bulk + // tombstone from `Realm.startReindex`'s post-completion `.then`, which + // only covers `POST /_full-reindex` / `POST /_reindex`. + // Production reindexes triggered via the operator-action endpoints + // (`/_grafana-reindex`, `/_grafana-full-reindex`, `/_post-deployment`) + // and the publish-realm flow (`Realm.fullIndex`) all bypass + // `startReindex` and so left the L2 row live with pre-reindex bytes. + // The wider fix emits `notifyAllFileChanges(dbAdapter, realmURL)` from + // the worker side of the `from-scratch-index` task — every replica's + // `realm_file_changes` wildcard listener then drops L1 and fires the + // L2 bulk tombstone. This test exercises the bypass path + // (`realmIndexUpdater.fullIndex`, which never wires up the + // `startReindex` callback) and pins the new cross-replica behavior. 
+ module( + 'Worker-side notify covers reindexes that bypass Realm.startReindex (CS-11182)', + function (hooks) { + let realmURL = new URL('http://127.0.0.1:4444/test/'); + let testRealm: Realm; + let request: RealmRequest; + let dbAdapter: PgAdapter; + let listener: RealmFileChangesListener | undefined; + + function onRealmSetup(args: { + testRealm: Realm; + testRealmHttpServer: Server; + request: SuperTest; + dbAdapter: PgAdapter; + }) { + testRealm = args.testRealm; + request = withRealmPath(args.request, realmURL); + dbAdapter = args.dbAdapter; + } + + setupPermissionedRealmCached(hooks, { + fixture: 'blank', + realmURL, + permissions: { + '*': ['read', 'write'], + user: ['read', 'write', 'realm-owner'], + '@node-test_realm:localhost': ['read', 'realm-owner'], + }, + onRealmSetup, + }); + + hooks.beforeEach(async function () { + // Production wires `RealmFileChangesListener` up in `main.ts`; the + // permissioned-realm test fixture doesn't, so set up the equivalent + // here. Without it, the worker's NOTIFY would fire into the void + // and no replica would receive the wildcard wipe — the test + // would fail on the listener side regardless of the + // worker-side emit. + listener = new RealmFileChangesListener({ + dbAdapter, + lookupMountedRealm: (url) => + url === realmURL.href ? 
testRealm : undefined, + }); + await listener.start(); + }); + + hooks.afterEach(async function () { + await listener?.shutDown(); + listener = undefined; + }); + + const reindexSource = ` + import { contains, field, CardDef, Component } from "https://cardstack.com/base/card-api"; + import StringField from "https://cardstack.com/base/string"; + + export class WorkerNotifyCard extends CardDef { + @field name = contains(StringField); + static isolated = class Isolated extends Component { + + } + } + `; + + function authHeader() { + return `Bearer ${createJWT(testRealm, 'user', ['read', 'write'])}`; + } + + async function countLiveRowsForRealm(): Promise { + let rows = (await query(dbAdapter, [ + 'SELECT COUNT(*)::int AS n FROM module_transpile_cache WHERE realm_url =', + param(realmURL.href), + 'AND body IS NOT NULL', + ])) as { n: number }[]; + return rows[0]?.n ?? 0; + } + + async function seedL2Row(modulePath: string): Promise { + await testRealm.write(modulePath, reindexSource); + let response = await request + .get(`/${modulePath}`) + .set('Accept', SupportedMimeType.All) + .set('Authorization', authHeader()); + if (response.status !== 200) { + throw new Error( + `seedL2Row: expected 200 for /${modulePath}, got ${response.status}`, + ); + } + } + + async function waitForZeroLiveRows(timeoutMs = 5000): Promise { + // The worker emits NOTIFY synchronously after batch.done(); the + // listener's clearLocalSourceCaches fires-and-forgets the L2 bulk + // tombstone. Both legs settle quickly but neither is on the + // job.done critical path. Poll briefly so the assertion isn't + // racing the tombstone landing. 
+ const started = Date.now(); + while (true) { + const n = await countLiveRowsForRealm(); + if (n === 0) return n; + if (Date.now() - started > timeoutMs) return n; + await new Promise((resolve) => setTimeout(resolve, 50)); + } + } + + test('realmIndexUpdater.fullIndex (no startReindex .then wired up) still tombstones L2 rows via the worker-side NOTIFY', async function (assert) { + await seedL2Row('worker-notify.gts'); + assert.ok( + (await countLiveRowsForRealm()) >= 1, + 'precondition: at least one live L2 row before reindex', + ); + + // Bypass `Realm.startReindex` (which DOES wire up the cache-drop + // .then per the original CS-11182 fix) and go straight through + // `RealmIndexUpdater.fullIndex`. This mirrors the production + // bypass paths (`handle-reindex.ts:reindex`, the `full-reindex` + // queue task, `Realm.fullIndex`) — none of them touch the + // `startReindex` chain. With only the original fix in place this + // assertion would fail; the worker-side `notifyAllFileChanges` + // is what makes it pass. 
+ await testRealm.realmIndexUpdater.fullIndex(userInitiatedPriority); + + assert.strictEqual( + await waitForZeroLiveRows(), + 0, + 'L2 rows tombstoned by the worker-side NOTIFY even though startReindex never ran', + ); + }); + }, + ); }); diff --git a/packages/runtime-common/tasks/indexer.ts b/packages/runtime-common/tasks/indexer.ts index aa8c584e6e..7b2c1f0b67 100644 --- a/packages/runtime-common/tasks/indexer.ts +++ b/packages/runtime-common/tasks/indexer.ts @@ -2,6 +2,7 @@ import type * as JSONTypes from 'json-typescript'; import type { Task, WorkerArgs } from './index'; import { jobIdentity, + notifyAllFileChanges, userIdFromUsername, fetchUserPermissions, type RealmPermissions, @@ -365,6 +366,20 @@ const fromScratchIndex: Task = ({ args.realmURL }:\n${JSON.stringify(stats, null, 2)}`, ); + // CS-11182: emit the cross-replica `:*` wildcard so every + // mounted Realm drops its in-memory `#sourceCache` / `#transpiledModuleCache` + // and fires the L2 `module_transpile_cache` bulk tombstone for this + // realm. This is the single chokepoint that every from-scratch + // reindex flows through — startReindex's post-completion `.then` + // (the original fix) only covered POST /_full-reindex and + // POST /_reindex; the Grafana `/_grafana-reindex`, + // `/_grafana-full-reindex`, `/_post-deployment`, publish-realm + // `Realm.fullIndex`, and direct `enqueueReindexRealmJob` paths all + // bypassed it, leaving stale L1+L2 even after a successful reindex. + // Doing it here covers them all uniformly. Best-effort: failures + // fall back to a bounded staleness window because the next + // reader's transpile path re-tombstones the L2 row. 
+ await notifyAllFileChanges(dbAdapter, args.realmURL); reportStatus(args.jobInfo, 'finish'); return { invalidations, From 6d491f30390fd8f1ced9b10fb2e4d89eb80e9631 Mon Sep 17 00:00:00 2001 From: Luke Melia Date: Wed, 20 May 2026 07:05:37 -0400 Subject: [PATCH 2/2] Fix CI: poll for fire-and-forget L2 tombstone, drop while(true) Two CI-only failures on the previous commit: - `while (true)` in `waitForZeroLiveRows` tripped `no-constant-condition`. Rewrote the loop to test the condition explicitly so the lint rule is satisfied (and the code reads more directly). - The pre-existing failure-isolation test (`reindex still tombstones L2 rows when clearRealmDefinitions throws`) asserted `countLiveRowsForRealm() === 0` immediately after `await testRealm.reindex()` returned, but `#dropAllTranspiledModuleCacheEntries` fires the L2 bulk UPDATE as `void` (fire-and-forget). Locally the UPDATE landed before the next query; on slower CI runners it didn't, so the test was flaky. Both tests in that sub-module now go through the same poll-with-timeout helper, matching the new worker-side test. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tests/module-cache-race-test.ts | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/packages/realm-server/tests/module-cache-race-test.ts b/packages/realm-server/tests/module-cache-race-test.ts index 0198fbc7da..5811249e7a 100644 --- a/packages/realm-server/tests/module-cache-race-test.ts +++ b/packages/realm-server/tests/module-cache-race-test.ts @@ -1255,6 +1255,20 @@ module(basename(__filename), function () { } } + // `#dropAllTranspiledModuleCacheEntries` fires the L2 bulk UPDATE as + // a fire-and-forget — the .then chain doesn't await it. Poll briefly + // so the assertion isn't racing the UPDATE landing on slower CI + // machines. 
+ async function waitForZeroLiveRows(timeoutMs = 5000): Promise { + let started = Date.now(); + let n = await countLiveRowsForRealm(); + while (n > 0 && Date.now() - started <= timeoutMs) { + await new Promise((resolve) => setTimeout(resolve, 50)); + n = await countLiveRowsForRealm(); + } + return n; + } + test('reindex tombstones live L2 rows for the realm', async function (assert) { await seedL2Row('reindex-happy.gts'); assert.ok( @@ -1265,7 +1279,7 @@ module(basename(__filename), function () { await testRealm.reindex(); assert.strictEqual( - await countLiveRowsForRealm(), + await waitForZeroLiveRows(), 0, 'reindex tombstoned every live L2 row for the realm', ); @@ -1295,7 +1309,7 @@ module(basename(__filename), function () { } assert.strictEqual( - await countLiveRowsForRealm(), + await waitForZeroLiveRows(), 0, 'bulk L2 tombstone ran even though clearRealmDefinitions threw', ); @@ -1413,13 +1427,13 @@ module(basename(__filename), function () { // tombstone. Both legs settle quickly but neither is on the // job.done critical path. Poll briefly so the assertion isn't // racing the tombstone landing. - const started = Date.now(); - while (true) { - const n = await countLiveRowsForRealm(); - if (n === 0) return n; - if (Date.now() - started > timeoutMs) return n; + let started = Date.now(); + let n = await countLiveRowsForRealm(); + while (n > 0 && Date.now() - started <= timeoutMs) { await new Promise((resolve) => setTimeout(resolve, 50)); + n = await countLiveRowsForRealm(); } + return n; } test('realmIndexUpdater.fullIndex (no startReindex .then wired up) still tombstones L2 rows via the worker-side NOTIFY', async function (assert) {