From 4a1879532bcb9f0a90a3aa899feda305e9228543 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 8 May 2026 10:02:56 +0000 Subject: [PATCH] fix(registry): crawler probes all registered agent types, not just sales MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `startPeriodicCrawl` was seeded once at startup with a snapshot of sales-only agents. Signals, buying, and creative agents enrolled in the member profile were excluded from the periodic health + capability probe and stuck permanently on stale "degraded" status with "No health snapshot yet". Agents registered after server startup were also never picked up until restart. Fixes both root causes: - Filter bug: `listAgents("sales")` → `listAgents()` so all registered agent types enter `refreshAgentSnapshots` on every cycle. - Static-list bug: callback-getter pattern re-fetches agents on every tick so newly enrolled agents appear within one crawl cycle. `POST /api/crawler/run` (admin on-demand trigger) updated to match so manual and scheduled runs behave identically. `members_only` agents remain intentionally excluded from the periodic crawl (public-facing registry surface is the target); per-agent owner refresh continues through `POST /api/registry/agents/:url/refresh`. Closes #4213 https://claude.ai/code/session_01Qm6rGjKvgavkV9rynMebXx --- ...fix-crawler-probe-all-registered-agents.md | 4 +++ server/src/crawler.ts | 16 ++++----- server/src/http.ts | 36 ++++++++++--------- 3 files changed, 32 insertions(+), 24 deletions(-) create mode 100644 .changeset/fix-crawler-probe-all-registered-agents.md diff --git a/.changeset/fix-crawler-probe-all-registered-agents.md b/.changeset/fix-crawler-probe-all-registered-agents.md new file mode 100644 index 0000000000..1a297104b8 --- /dev/null +++ b/.changeset/fix-crawler-probe-all-registered-agents.md @@ -0,0 +1,4 @@ +--- +--- + +Fix registry crawler skipping non-sales registered agents for health/capability snapshots. 
The periodic crawl now re-fetches all registered agents on every tick instead of capturing only sales agents at startup, so signals/buying/creative agents get probed without a server restart. diff --git a/server/src/crawler.ts b/server/src/crawler.ts index 2a2b9cab18..3df612ce7e 100644 --- a/server/src/crawler.ts +++ b/server/src/crawler.ts @@ -162,14 +162,14 @@ export class CrawlerService { } } - startPeriodicCrawl(agents: Agent[], intervalMinutes: number = 60) { - // Initial crawl - this.crawlAllAgents(agents); - - // Periodic crawl - this.intervalId = setInterval(() => { - this.crawlAllAgents(agents); - }, intervalMinutes * 60 * 1000); + startPeriodicCrawl(getAgents: () => Promise<Agent[]>, intervalMinutes: number = 60) { + const run = () => + getAgents() + .then(agents => this.crawlAllAgents(agents)) + .catch(err => log.error({ err }, 'Periodic crawl failed')); + + run(); + this.intervalId = setInterval(run, intervalMinutes * 60 * 1000); log.info({ intervalMinutes }, 'Periodic crawl started'); } diff --git a/server/src/http.ts b/server/src/http.ts index 1190692717..09ed717fae 100644 --- a/server/src/http.ts +++ b/server/src/http.ts @@ -2078,12 +2078,15 @@ export class HTTPServer { // outbound traffic to every registered agent. Per-agent refresh is // available to owners at POST /api/registry/agents/:encodedUrl/refresh. this.app.post("/api/crawler/run", requireAuth, requireAdmin, async (req, res) => { - // Crawler iterates sales agents — they're the ones with publisher - // authorizations and list_authorized_properties responses to walk. - // Pre-#3540 this filter was inverted (matched 'buying' against the - // accidentally-aligned classification); see #3774 for the sweep - // that closed the remaining gaps. - const agents = await this.agentService.listAgents("sales"); + // Full-registry crawl: all registered agents. Sales agents drive the + // publisher adagents.json walk; all agent types get health + capability + // snapshots via refreshAgentSnapshots. 
Mirrors the periodic-crawl scope + // added in #4213 so a manual admin run and the scheduled run behave + // identically. `viewerHasApiAccess` defaults to false — members_only + // agents are excluded from both paths intentionally (periodic crawl + // probes the public-facing registry surface; refreshSingleAgent covers + // owner-triggered probes for members_only agents). + const agents = await this.agentService.listAgents(); const result = await this.crawler.crawlAllAgents(agents); res.json(result); }); @@ -9031,16 +9034,17 @@ ${p.category ? `${p.category}\n` : ''}${publishedUrl}< logger.info({ isWorker }, 'Process role resolved'); if (isWorker) { - // Start periodic property crawler for sales agents — they're the - // ones with publisher authorizations and list_authorized_properties - // responses to walk. Pre-#3540 this filtered on 'buying' (inverted- - // but-aligned with the classification bug); see #3774 for the - // sweep that closed remaining gaps. - const salesAgents = await this.agentService.listAgents("sales"); - if (salesAgents.length > 0) { - logger.debug({ salesAgentCount: salesAgents.length }, 'Starting property crawler'); - this.crawler.startPeriodicCrawl(salesAgents, 360); // Crawl every 6 hours - } + // Start periodic registry crawler for all registered agents. Re-fetches + // the agent list on every tick so newly registered agents are picked up + // without a restart. Sales agents drive publisher adagents.json + // discovery; signals/buying/creative agents still need health + + // capability snapshots on the same cycle. `viewerHasApiAccess` defaults + // to false — members_only agents are intentionally excluded from the + // periodic crawl (the public-facing registry surface is the target); + // owner-triggered probes for members_only agents go through + // POST /api/registry/agents/:encodedUrl/refresh. Fixes #4213. 
+ logger.debug('Starting registry crawler'); + this.crawler.startPeriodicCrawl(() => this.agentService.listAgents(), 360); // Crawl every 6 hours // Crawl catalog domains for adagents.json (demand-driven queue) this.crawler.startPeriodicCatalogCrawl(30); // Process queue every 30 minutes