diff --git a/packages/realm-server/tests/module-cache-race-test.ts b/packages/realm-server/tests/module-cache-race-test.ts index d8f959d80f..5811249e7a 100644 --- a/packages/realm-server/tests/module-cache-race-test.ts +++ b/packages/realm-server/tests/module-cache-race-test.ts @@ -11,9 +11,11 @@ import { SupportedMimeType, param, query, + userInitiatedPriority, } from '@cardstack/runtime-common'; import type { PgAdapter } from '@cardstack/postgres'; import { ModuleCacheCoordinator } from '../lib/module-cache-coordination'; +import { RealmFileChangesListener } from '../lib/realm-file-changes-listener'; import { setupPermissionedRealmCached, setupDB, @@ -1253,6 +1255,20 @@ module(basename(__filename), function () { } } + // `#dropAllTranspiledModuleCacheEntries` fires the L2 bulk tombstone + // fire-and-forget — the .then chain doesn't await it. Poll every 50ms + // (up to `timeoutMs`) so the assertion isn't racing the tombstone UPDATE on + // slower CI machines. Resolves to the last observed live-row count. + async function waitForZeroLiveRows(timeoutMs = 5000): Promise<number> { + let started = Date.now(); + let n = await countLiveRowsForRealm(); + while (n > 0 && Date.now() - started <= timeoutMs) { + await new Promise((resolve) => setTimeout(resolve, 50)); + n = await countLiveRowsForRealm(); + } + return n; + } + test('reindex tombstones live L2 rows for the realm', async function (assert) { await seedL2Row('reindex-happy.gts'); assert.ok( @@ -1263,7 +1279,7 @@ module(basename(__filename), function () { await testRealm.reindex(); assert.strictEqual( - await countLiveRowsForRealm(), + await waitForZeroLiveRows(), 0, 'reindex tombstoned every live L2 row for the realm', ); @@ -1293,11 +1309,156 @@ module(basename(__filename), function () { } assert.strictEqual( - await countLiveRowsForRealm(), + await waitForZeroLiveRows(), 0, 'bulk L2 tombstone ran even though clearRealmDefinitions threw', ); }); }, ); + + // CS-11182 follow-up: the original fix only fired the L2 bulk + // tombstone from `Realm.startReindex`'s post-completion `.then`, 
which + // only covers `POST /_full-reindex` / `POST /_reindex`. + // Production reindexes triggered via the operator-action endpoints + // (`/_grafana-reindex`, `/_grafana-full-reindex`, `/_post-deployment`) + // and the publish-realm flow (`Realm.fullIndex`) all bypass + // `startReindex` and so left the L2 row live with pre-reindex bytes. + // The wider fix emits `notifyAllFileChanges(dbAdapter, realmURL)` from + // the worker side of the `from-scratch-index` task — every replica's + // `realm_file_changes` wildcard listener then drops L1 and fires the + // L2 bulk tombstone. This test exercises the bypass path + // (`realmIndexUpdater.fullIndex`, which never wires up the + // `startReindex` callback) and pins the new cross-replica behavior. + module( + 'Worker-side notify covers reindexes that bypass Realm.startReindex (CS-11182)', + function (hooks) { + let realmURL = new URL('http://127.0.0.1:4444/test/'); + let testRealm: Realm; + let request: RealmRequest; + let dbAdapter: PgAdapter; + let listener: RealmFileChangesListener | undefined; + + function onRealmSetup(args: { + testRealm: Realm; + testRealmHttpServer: Server; + request: SuperTest; + dbAdapter: PgAdapter; + }) { + testRealm = args.testRealm; + request = withRealmPath(args.request, realmURL); + dbAdapter = args.dbAdapter; + } + + setupPermissionedRealmCached(hooks, { + fixture: 'blank', + realmURL, + permissions: { + '*': ['read', 'write'], + user: ['read', 'write', 'realm-owner'], + '@node-test_realm:localhost': ['read', 'realm-owner'], + }, + onRealmSetup, + }); + + hooks.beforeEach(async function () { + // Production wires `RealmFileChangesListener` up in `main.ts`; the + // permissioned-realm test fixture doesn't, so set up the equivalent + // here. Without it, the worker's NOTIFY would fire into the void + // and no replica would receive the wildcard wipe — the L2 rows + // would stay live and the assertion would fail regardless of the + // worker-side emit. 
+ listener = new RealmFileChangesListener({ + dbAdapter, + lookupMountedRealm: (url) => + url === realmURL.href ? testRealm : undefined, + }); + await listener.start(); + }); + + hooks.afterEach(async function () { + await listener?.shutDown(); + listener = undefined; + }); + + const reindexSource = ` + import { contains, field, CardDef, Component } from "https://cardstack.com/base/card-api"; + import StringField from "https://cardstack.com/base/string"; + + export class WorkerNotifyCard extends CardDef { + @field name = contains(StringField); + static isolated = class Isolated extends Component { + + } + } + `; + + function authHeader() { + return `Bearer ${createJWT(testRealm, 'user', ['read', 'write'])}`; + } + + async function countLiveRowsForRealm(): Promise<number> { + let rows = (await query(dbAdapter, [ + 'SELECT COUNT(*)::int AS n FROM module_transpile_cache WHERE realm_url =', + param(realmURL.href), + 'AND body IS NOT NULL', + ])) as { n: number }[]; + return rows[0]?.n ?? 0; + } + + async function seedL2Row(modulePath: string): Promise<void> { + await testRealm.write(modulePath, reindexSource); + let response = await request + .get(`/${modulePath}`) + .set('Accept', SupportedMimeType.All) + .set('Authorization', authHeader()); + if (response.status !== 200) { + throw new Error( + `seedL2Row: expected 200 for /${modulePath}, got ${response.status}`, + ); + } + } + + async function waitForZeroLiveRows(timeoutMs = 5000): Promise<number> { + // The worker awaits its NOTIFY emit before the task returns, but + // delivery to the listener and the listener's fire-and-forget L2 + // bulk tombstone (via clearLocalSourceCaches) are not awaited by + // the reindex caller. Poll briefly so the assertion isn't + // racing the tombstone landing. 
+ let started = Date.now(); + let n = await countLiveRowsForRealm(); + while (n > 0 && Date.now() - started <= timeoutMs) { + await new Promise((resolve) => setTimeout(resolve, 50)); + n = await countLiveRowsForRealm(); + } + return n; + } + + test('realmIndexUpdater.fullIndex (no startReindex .then wired up) still tombstones L2 rows via the worker-side NOTIFY', async function (assert) { + await seedL2Row('worker-notify.gts'); + assert.ok( + (await countLiveRowsForRealm()) >= 1, + 'precondition: at least one live L2 row before reindex', + ); + + // Bypass `Realm.startReindex` (which DOES wire up the cache-drop + // .then per the original CS-11182 fix) and go straight through + // `RealmIndexUpdater.fullIndex`. This mirrors the production + // bypass paths (`handle-reindex.ts:reindex`, the `full-reindex` + // queue task, `Realm.fullIndex`) — none of them touch the + // `startReindex` chain. With only the original fix in place this + // assertion would fail; the worker-side `notifyAllFileChanges` + // is what makes it pass. 
+ await testRealm.realmIndexUpdater.fullIndex(userInitiatedPriority); + + assert.strictEqual( + await waitForZeroLiveRows(), + 0, + 'L2 rows tombstoned by the worker-side NOTIFY even though startReindex never ran', + ); + }); + }, + ); }); diff --git a/packages/runtime-common/tasks/indexer.ts b/packages/runtime-common/tasks/indexer.ts index aa8c584e6e..7b2c1f0b67 100644 --- a/packages/runtime-common/tasks/indexer.ts +++ b/packages/runtime-common/tasks/indexer.ts @@ -2,6 +2,7 @@ import type * as JSONTypes from 'json-typescript'; import type { Task, WorkerArgs } from './index'; import { jobIdentity, + notifyAllFileChanges, userIdFromUsername, fetchUserPermissions, type RealmPermissions, @@ -365,6 +366,20 @@ const fromScratchIndex: Task = ({ args.realmURL }:\n${JSON.stringify(stats, null, 2)}`, ); + // CS-11182: emit the cross-replica `:*` wildcard so every + // mounted Realm drops its in-memory `#sourceCache` / `#transpiledModuleCache` + // and fires the L2 `module_transpile_cache` bulk tombstone for this + // realm. This is the single chokepoint that every from-scratch + // reindex flows through — startReindex's post-completion `.then` + // (the original fix) only covered POST /_full-reindex and + // POST /_reindex; the Grafana `/_grafana-reindex`, + // `/_grafana-full-reindex`, `/_post-deployment`, publish-realm + // `Realm.fullIndex`, and direct `enqueueReindexRealmJob` paths all + // bypassed it, leaving stale L1+L2 even after a successful reindex. + // Doing it here covers them all uniformly. Best-effort: failures + // fall back to a bounded staleness window because the next + // reader's transpile path re-tombstones the L2 row. + await notifyAllFileChanges(dbAdapter, args.realmURL); reportStatus(args.jobInfo, 'finish'); return { invalidations,