diff --git a/.changeset/fix-crawler-probe-all-registered-agents.md b/.changeset/fix-crawler-probe-all-registered-agents.md new file mode 100644 index 0000000000..1a297104b8 --- /dev/null +++ b/.changeset/fix-crawler-probe-all-registered-agents.md @@ -0,0 +1,4 @@ +--- +--- + +Fix registry crawler skipping non-sales registered agents for health/capability snapshots. The periodic crawl now re-fetches all registered agents on every tick instead of capturing only sales agents at startup, so signals/buying/creative agents get probed without a server restart. diff --git a/server/src/crawler.ts b/server/src/crawler.ts index 2a2b9cab18..3df612ce7e 100644 --- a/server/src/crawler.ts +++ b/server/src/crawler.ts @@ -162,14 +162,14 @@ export class CrawlerService { } } - startPeriodicCrawl(agents: Agent[], intervalMinutes: number = 60) { - // Initial crawl - this.crawlAllAgents(agents); - - // Periodic crawl - this.intervalId = setInterval(() => { - this.crawlAllAgents(agents); - }, intervalMinutes * 60 * 1000); + startPeriodicCrawl(getAgents: () => Promise<Agent[]>, intervalMinutes: number = 60) { + const run = () => + getAgents() + .then(agents => this.crawlAllAgents(agents)) + .catch(err => log.error({ err }, 'Periodic crawl failed')); + + run(); + this.intervalId = setInterval(run, intervalMinutes * 60 * 1000); log.info({ intervalMinutes }, 'Periodic crawl started'); } diff --git a/server/src/http.ts b/server/src/http.ts index 1190692717..09ed717fae 100644 --- a/server/src/http.ts +++ b/server/src/http.ts @@ -2078,12 +2078,15 @@ export class HTTPServer { // outbound traffic to every registered agent. Per-agent refresh is // available to owners at POST /api/registry/agents/:encodedUrl/refresh. this.app.post("/api/crawler/run", requireAuth, requireAdmin, async (req, res) => { - // Crawler iterates sales agents — they're the ones with publisher - // authorizations and list_authorized_properties responses to walk. 
- // Pre-#3540 this filter was inverted (matched 'buying' against the - // accidentally-aligned classification); see #3774 for the sweep - // that closed the remaining gaps. - const agents = await this.agentService.listAgents("sales"); + // Full-registry crawl: all registered agents. Sales agents drive the + // publisher adagents.json walk; all agent types get health + capability + // snapshots via refreshAgentSnapshots. Mirrors the periodic-crawl scope + // added in #4213 so a manual admin run and the scheduled run behave + // identically. `viewerHasApiAccess` defaults to false — members_only + // agents are excluded from both paths intentionally (periodic crawl + // probes the public-facing registry surface; refreshSingleAgent covers + // owner-triggered probes for members_only agents). + const agents = await this.agentService.listAgents(); const result = await this.crawler.crawlAllAgents(agents); res.json(result); }); @@ -9031,16 +9034,17 @@ ${p.category ? `${p.category}\n` : ''}${publishedUrl}< logger.info({ isWorker }, 'Process role resolved'); if (isWorker) { - // Start periodic property crawler for sales agents — they're the - // ones with publisher authorizations and list_authorized_properties - // responses to walk. Pre-#3540 this filtered on 'buying' (inverted- - // but-aligned with the classification bug); see #3774 for the - // sweep that closed remaining gaps. - const salesAgents = await this.agentService.listAgents("sales"); - if (salesAgents.length > 0) { - logger.debug({ salesAgentCount: salesAgents.length }, 'Starting property crawler'); - this.crawler.startPeriodicCrawl(salesAgents, 360); // Crawl every 6 hours - } + // Start periodic registry crawler for all registered agents. Re-fetches + // the agent list on every tick so newly registered agents are picked up + // without a restart. Sales agents drive publisher adagents.json + // discovery; signals/buying/creative agents still need health + + // capability snapshots on the same cycle. 
`viewerHasApiAccess` defaults + // to false — members_only agents are intentionally excluded from the + // periodic crawl (the public-facing registry surface is the target); + // owner-triggered probes for members_only agents go through + // POST /api/registry/agents/:encodedUrl/refresh. Fixes #4213. + logger.debug('Starting registry crawler'); + this.crawler.startPeriodicCrawl(() => this.agentService.listAgents(), 360); // Crawl every 6 hours // Crawl catalog domains for adagents.json (demand-driven queue) this.crawler.startPeriodicCatalogCrawl(30); // Process queue every 30 minutes