From 7f48e811bb99262a6b5e55f907d3bbf0fd066a47 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Sat, 11 Apr 2026 19:34:47 -0400 Subject: [PATCH] fix: scope URL discovery to a given subpath when supplied --- .../llms-txt-links-markdown.ts | 41 +++-- .../llms-txt-links-resolve.ts | 15 +- src/helpers/get-page-urls.ts | 51 ++++++- .../checks/llms-txt-links-markdown.test.ts | 44 ++++++ .../checks/llms-txt-links-resolve.test.ts | 47 ++++++ test/unit/helpers/get-page-urls.test.ts | 143 ++++++++++++++++++ 6 files changed, 320 insertions(+), 21 deletions(-) diff --git a/src/checks/content-discoverability/llms-txt-links-markdown.ts b/src/checks/content-discoverability/llms-txt-links-markdown.ts index 59e4c24..fd8ff5a 100644 --- a/src/checks/content-discoverability/llms-txt-links-markdown.ts +++ b/src/checks/content-discoverability/llms-txt-links-markdown.ts @@ -1,5 +1,6 @@ import { registerCheck } from '../registry.js'; import { extractMarkdownLinks } from './llms-txt-valid.js'; +import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js'; import { toMdUrls } from '../../helpers/to-md-urls.js'; import { looksLikeMarkdown } from '../../helpers/detect-markdown.js'; import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js'; @@ -36,35 +37,45 @@ async function checkLlmsTxtLinksMarkdown(ctx: CheckContext): Promise(); for (const file of discovered) { const links = extractMarkdownLinks(file.content); for (const link of links) { if (link.url.startsWith('http://') || link.url.startsWith('https://')) { - try { - const linkOrigin = new URL(link.url).origin; - if (linkOrigin === siteOrigin) { - if (!sameOriginLinks.includes(link.url)) sameOriginLinks.push(link.url); - } else { - if (!crossOriginLinks.includes(link.url)) crossOriginLinks.push(link.url); - } - } catch { - if (!sameOriginLinks.includes(link.url)) sameOriginLinks.push(link.url); - } + allExtractedUrls.add(link.url); + } + } + } + const scopedUrls = filterByPathPrefix(Array.from(allExtractedUrls), getPathFilterBase(ctx)); + + const siteOrigin = ctx.effectiveOrigin ?? ctx.origin; + const sameOriginLinks: string[] = []; + const crossOriginLinks: string[] = []; + for (const url of scopedUrls) { + try { + const linkOrigin = new URL(url).origin; + if (linkOrigin === siteOrigin) { + sameOriginLinks.push(url); + } else { + crossOriginLinks.push(url); } + } catch { + sameOriginLinks.push(url); } } const totalLinks = sameOriginLinks.length + crossOriginLinks.length; if (totalLinks === 0) { + const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, ''); + const filteredOut = allExtractedUrls.size > 0 && baseUrlPath && baseUrlPath !== '/'; return { id: 'llms-txt-links-markdown', category: 'content-discoverability', status: 'skip', - message: 'No HTTP(S) links found in llms.txt', + message: filteredOut + ? `llms.txt contains ${allExtractedUrls.size} link${allExtractedUrls.size === 1 ? '' : 's'}, but none are under ${baseUrlPath}` + : 'No HTTP(S) links found in llms.txt', }; } diff --git a/src/checks/content-discoverability/llms-txt-links-resolve.ts b/src/checks/content-discoverability/llms-txt-links-resolve.ts index 09546f0..a09c10d 100644 --- a/src/checks/content-discoverability/llms-txt-links-resolve.ts +++ b/src/checks/content-discoverability/llms-txt-links-resolve.ts @@ -1,6 +1,7 @@ import { registerCheck } from '../registry.js'; import { LINK_RESOLVE_THRESHOLD } from '../../constants.js'; import { extractMarkdownLinks } from './llms-txt-valid.js'; +import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js'; import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js'; interface LinkCheckResult { @@ -35,12 +36,20 @@ async function checkLlmsTxtLinksResolve(ctx: CheckContext): Promise } } - if (allLinks.size === 0) { + // Scope links to the baseUrl path prefix so that docs at a subpath + // (e.g. /docs) don't include unrelated site content from root llms.txt. + const scopedUrls = filterByPathPrefix(Array.from(allLinks.keys()), getPathFilterBase(ctx)); + + if (scopedUrls.length === 0) { + const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, ''); + const filteredOut = allLinks.size > 0 && baseUrlPath && baseUrlPath !== '/'; return { id: 'llms-txt-links-resolve', category: 'content-discoverability', status: 'skip', - message: 'No HTTP(S) links found in llms.txt', + message: filteredOut + ? `llms.txt contains ${allLinks.size} link${allLinks.size === 1 ? '' : 's'}, but none are under ${baseUrlPath}` + : 'No HTTP(S) links found in llms.txt', }; } @@ -48,7 +57,7 @@ async function checkLlmsTxtLinksResolve(ctx: CheckContext): Promise const siteOrigin = ctx.effectiveOrigin ?? ctx.origin; const sameOriginLinks: string[] = []; const crossOriginLinks: string[] = []; - for (const url of allLinks.keys()) { + for (const url of scopedUrls) { try { const linkOrigin = new URL(url).origin; if (linkOrigin === siteOrigin) { diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index 6ead14a..a096a3f 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -309,6 +309,43 @@ export async function getUrlsFromSitemap( return urls; } +/** + * Get the base URL for path-prefix filtering, accounting for cross-host redirects. + * + * When a cross-host redirect is in play (e.g. example.com/docs → docs.example.com), + * the original baseUrl path doesn't apply to the redirected host, so we return the + * effectiveOrigin (a root URL) which makes path filtering a no-op. + */ +export function getPathFilterBase(ctx: CheckContext): string { + return ctx.effectiveOrigin && ctx.effectiveOrigin !== ctx.origin + ? ctx.effectiveOrigin + : ctx.baseUrl; +} + +/** + * Filter URLs to those under the baseUrl's path prefix. + * + * When the input URL has a non-root path (e.g. `https://plaid.com/docs`), + * only URLs whose pathname starts with that prefix are kept. This prevents + * blog posts, marketing pages, and other non-docs content from polluting + * the URL pool when llms.txt or sitemaps cover the entire domain. + * + * Root URLs (path is `/`) pass all same-origin URLs through unfiltered. + */ +export function filterByPathPrefix(urls: string[], baseUrl: string): string[] { + const baseUrlPath = new URL(baseUrl).pathname.replace(/\/$/, ''); + if (!baseUrlPath || baseUrlPath === '') return urls; + + return urls.filter((url) => { + try { + const parsed = new URL(url); + return parsed.pathname === baseUrlPath || parsed.pathname.startsWith(baseUrlPath + '/'); + } catch { + return true; // keep malformed URLs rather than silently dropping them + } + }); +} + /** * Discover page URLs from llms.txt links, sitemap, or fall back to baseUrl. * @@ -316,23 +353,31 @@ export async function getUrlsFromSitemap( * 1. llms.txt links (from previous check results) * 2. Sitemap URLs (robots.txt Sitemap directives, then /sitemap.xml fallback) * 3. baseUrl fallback + * + * All discovered URLs are filtered to the baseUrl's path prefix so that + * docs at a subpath (e.g. `/docs`) don't include unrelated site content. */ export async function getPageUrls(ctx: CheckContext): Promise { const warnings: string[] = []; + const filterBase = getPathFilterBase(ctx); + // 1. Try llms.txt links from cached results (if llms-txt-exists ran) const cachedUrls = await getUrlsFromCachedLlmsTxt(ctx); - if (cachedUrls.length > 0) return { urls: cachedUrls, warnings }; + const scopedCachedUrls = filterByPathPrefix(cachedUrls, filterBase); + if (scopedCachedUrls.length > 0) return { urls: scopedCachedUrls, warnings }; // 2. Try fetching llms.txt directly (standalone mode, llms-txt-exists didn't run) if (!ctx.previousResults.has('llms-txt-exists')) { const fetchedUrls = await fetchLlmsTxtUrls(ctx); - if (fetchedUrls.length > 0) return { urls: fetchedUrls, warnings }; + const scopedFetchedUrls = filterByPathPrefix(fetchedUrls, filterBase); + if (scopedFetchedUrls.length > 0) return { urls: scopedFetchedUrls, warnings }; } // 3. Try sitemap const sitemapUrls = await getUrlsFromSitemap(ctx, warnings); - if (sitemapUrls.length > 0) return { urls: sitemapUrls, warnings }; + const scopedSitemapUrls = filterByPathPrefix(sitemapUrls, filterBase); + if (scopedSitemapUrls.length > 0) return { urls: scopedSitemapUrls, warnings }; // 4. Fallback return { urls: [ctx.baseUrl], warnings }; diff --git a/test/unit/checks/llms-txt-links-markdown.test.ts b/test/unit/checks/llms-txt-links-markdown.test.ts index e63951c..19ed51d 100644 --- a/test/unit/checks/llms-txt-links-markdown.test.ts +++ b/test/unit/checks/llms-txt-links-markdown.test.ts @@ -263,6 +263,50 @@ Just text, no links here. expect(result.status).toBe('fail'); }); + // ── Path-prefix scoping ── + + it('scopes links to baseUrl path prefix', async () => { + // llms.txt has both docs and non-docs links; only docs links should be tested + const content = `# Site\n- [Intro](http://scope-md.local/docs/intro.md): Intro\n- [Blog](http://scope-md.local/blog/post.md): Blog\n- [Careers](http://scope-md.local/careers): Careers\n`; + const ctx = createContext('http://scope-md.local/docs', { requestDelay: 0 }); + const discovered: DiscoveredFile[] = [ + { url: 'http://scope-md.local/llms.txt', content, status: 200, redirected: false }, + ]; + ctx.previousResults.set('llms-txt-exists', { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'pass', + message: 'Found', + details: { discoveredFiles: discovered }, + }); + + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + // Only the /docs/intro.md link should be tested (markdownRate 100%) + expect(result.details?.testedLinks).toBe(1); + expect(result.details?.markdownRate).toBe(100); + }); + + it('skips with descriptive message when all links are outside the baseUrl path prefix', async () => { + const content = `# Site\n- [Blog](http://scope-md2.local/blog/post): Blog\n`; + const ctx = createContext('http://scope-md2.local/docs', { requestDelay: 0 }); + const discovered: DiscoveredFile[] = [ + { url: 'http://scope-md2.local/llms.txt', content, status: 200, redirected: false }, + ]; + ctx.previousResults.set('llms-txt-exists', { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'pass', + message: 'Found', + details: { discoveredFiles: discovered }, + }); + + const result = await check.run(ctx); + expect(result.status).toBe('skip'); + expect(result.message).toContain('1 link'); + expect(result.message).toContain('none are under /docs'); + }); + it('uses toMdUrls to find .md variants (handles trailing slash and .html)', async () => { server.use( http.head( diff --git a/test/unit/checks/llms-txt-links-resolve.test.ts b/test/unit/checks/llms-txt-links-resolve.test.ts index 8e362c6..934b810 100644 --- a/test/unit/checks/llms-txt-links-resolve.test.ts +++ b/test/unit/checks/llms-txt-links-resolve.test.ts @@ -194,6 +194,53 @@ Just text, no links. expect(result.message).toContain('rate-limited (HTTP 429)'); }); + // ── Path-prefix scoping ── + + it('scopes links to baseUrl path prefix', async () => { + server.use( + http.head('http://scope-res.local/docs/page1', () => new HttpResponse(null, { status: 200 })), + ); + + // llms.txt has both docs and non-docs links; only docs links should be tested + const content = `# Site\n- [Page](http://scope-res.local/docs/page1): Page\n- [Blog](http://scope-res.local/blog/post): Blog\n`; + const ctx = createContext('http://scope-res.local/docs', { requestDelay: 0 }); + const discovered: DiscoveredFile[] = [ + { url: 'http://scope-res.local/llms.txt', content, status: 200, redirected: false }, + ]; + ctx.previousResults.set('llms-txt-exists', { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'pass', + message: 'Found', + details: { discoveredFiles: discovered }, + }); + + const result = await check.run(ctx); + expect(result.status).toBe('pass'); + // Only the /docs/page1 link should be tested + expect(result.details?.sameOrigin).toMatchObject({ tested: 1, resolved: 1 }); + }); + + it('skips with descriptive message when all links are outside the baseUrl path prefix', async () => { + const content = `# Site\n- [Blog](http://scope-res2.local/blog/post): Blog\n`; + const ctx = createContext('http://scope-res2.local/docs', { requestDelay: 0 }); + const discovered: DiscoveredFile[] = [ + { url: 'http://scope-res2.local/llms.txt', content, status: 200, redirected: false }, + ]; + ctx.previousResults.set('llms-txt-exists', { + id: 'llms-txt-exists', + category: 'content-discoverability', + status: 'pass', + message: 'Found', + details: { discoveredFiles: discovered }, + }); + + const result = await check.run(ctx); + expect(result.status).toBe('skip'); + expect(result.message).toContain('1 link'); + expect(result.message).toContain('none are under /docs'); + }); + it('includes "sampled" in message when results are sampled', async () => { const links = Array.from( { length: 5 }, diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts index edb79b8..daf0d78 100644 --- a/test/unit/helpers/get-page-urls.test.ts +++ b/test/unit/helpers/get-page-urls.test.ts @@ -6,6 +6,7 @@ import { discoverAndSamplePages, parseSitemapUrls, parseSitemapDirectives, + filterByPathPrefix, } from '../../../src/helpers/get-page-urls.js'; import { MAX_SITEMAP_URLS } from '../../../src/constants.js'; import { createContext } from '../../../src/runner.js'; @@ -89,6 +90,64 @@ Sitemap: https://example.com/sitemap-blog.xml }); }); +describe('filterByPathPrefix', () => { + it('filters URLs to those under the path prefix', () => { + const urls = [ + 'https://example.com/docs/intro', + 'https://example.com/docs/guide', + 'https://example.com/blog/post1', + 'https://example.com/careers', + ]; + const result = filterByPathPrefix(urls, 'https://example.com/docs'); + expect(result).toEqual(['https://example.com/docs/intro', 'https://example.com/docs/guide']); + }); + + it('includes the exact baseUrl path itself', () => { + const urls = ['https://example.com/docs', 'https://example.com/docs/page']; + const result = filterByPathPrefix(urls, 'https://example.com/docs'); + expect(result).toEqual(['https://example.com/docs', 'https://example.com/docs/page']); + }); + + it('passes all URLs through when baseUrl is at the root', () => { + const urls = [ + 'https://example.com/docs/intro', + 'https://example.com/blog/post1', + 'https://example.com/careers', + ]; + const result = filterByPathPrefix(urls, 'https://example.com'); + expect(result).toEqual(urls); + }); + + it('passes all URLs through when baseUrl has a trailing slash root', () => { + const urls = ['https://example.com/a', 'https://example.com/b']; + const result = filterByPathPrefix(urls, 'https://example.com/'); + expect(result).toEqual(urls); + }); + + it('does not match partial path segments', () => { + // /docs-extra should NOT match /docs prefix + const urls = ['https://example.com/docs/page', 'https://example.com/docs-extra/page']; + const result = filterByPathPrefix(urls, 'https://example.com/docs'); + expect(result).toEqual(['https://example.com/docs/page']); + }); + + it('handles deeper path prefixes', () => { + const urls = [ + 'https://example.com/api/v2/docs/page', + 'https://example.com/api/v2/other', + 'https://example.com/api/v1/docs/page', + ]; + const result = filterByPathPrefix(urls, 'https://example.com/api/v2/docs'); + expect(result).toEqual(['https://example.com/api/v2/docs/page']); + }); + + it('keeps malformed URLs rather than dropping them', () => { + const urls = ['not-a-url', 'https://example.com/docs/page']; + const result = filterByPathPrefix(urls, 'https://example.com/docs'); + expect(result).toEqual(['not-a-url', 'https://example.com/docs/page']); + }); +}); + describe('getPageUrls', () => { function makeCtx(baseUrl = 'http://test.local', llmsTxtContent?: string) { const ctx = createContext(baseUrl, { requestDelay: 0 }); @@ -621,6 +680,90 @@ describe('getPageUrls', () => { // ── Existing sitemap tests ── + // ── Path-prefix scoping ── + + it('scopes llms.txt URLs to the baseUrl path prefix', async () => { + const content = `# Docs\n> Summary\n## Links\n- [Intro](http://scope-test.local/docs/intro): Intro\n- [Guide](http://scope-test.local/docs/guide): Guide\n- [Blog](http://scope-test.local/blog/post1): A blog post\n- [Careers](http://scope-test.local/careers): Careers page\n`; + const ctx = makeCtx('http://scope-test.local/docs', content); + + const result = await getPageUrls(ctx); + expect(result.urls).toEqual([ + 'http://scope-test.local/docs/intro', + 'http://scope-test.local/docs/guide', + ]); + }); + + it('does not filter when baseUrl is at the root', async () => { + const content = `# Docs\n- [A](http://root-scope.local/docs/a): A\n- [B](http://root-scope.local/blog/b): B\n`; + const ctx = makeCtx('http://root-scope.local', content); + + const result = await getPageUrls(ctx); + expect(result.urls).toEqual([ + 'http://root-scope.local/docs/a', + 'http://root-scope.local/blog/b', + ]); + }); + + it('scopes sitemap URLs to the baseUrl path prefix', async () => { + server.use( + http.get( + 'http://sitemap-scope.local/robots.txt', + () => new HttpResponse('', { status: 404 }), + ), + http.get( + 'http://sitemap-scope.local/sitemap.xml', + () => + new HttpResponse( + ` + + http://sitemap-scope.local/docs/intro + http://sitemap-scope.local/docs/guide + http://sitemap-scope.local/blog/post1 + http://sitemap-scope.local/careers +`, + { status: 200, headers: { 'Content-Type': 'application/xml' } }, + ), + ), + ); + + const ctx = makeCtx('http://sitemap-scope.local/docs'); + const result = await getPageUrls(ctx); + expect(result.urls).toEqual([ + 'http://sitemap-scope.local/docs/intro', + 'http://sitemap-scope.local/docs/guide', + ]); + }); + + it('skips path filtering when effectiveOrigin differs from origin (cross-host redirect)', async () => { + // Simulate: user provides example.com/docs, which redirects to docs.example.com + // llms.txt on docs.example.com has links at root paths, not under /docs + const content = `# Docs\n- [Intro](http://xhost.local/intro): Intro\n- [Guide](http://xhost.local/guide): Guide\n`; + const ctx = makeCtx('http://original.local/docs', content); + // Simulate cross-host redirect detection + ctx.effectiveOrigin = 'http://xhost.local'; + + const result = await getPageUrls(ctx); + // Without the cross-host bypass, these would be filtered out (not under /docs) + expect(result.urls).toContain('http://xhost.local/intro'); + expect(result.urls).toContain('http://xhost.local/guide'); + expect(result.urls).toHaveLength(2); + }); + + it('falls back to baseUrl when path scoping filters out all discovered URLs', async () => { + // llms.txt has only non-docs URLs + const content = `# Site\n- [Blog](http://filter-all.local/blog/post): Post\n- [About](http://filter-all.local/about): About\n`; + + server.use( + http.get('http://filter-all.local/robots.txt', () => new HttpResponse('', { status: 404 })), + http.get('http://filter-all.local/sitemap.xml', () => new HttpResponse('', { status: 404 })), + ); + + const ctx = makeCtx('http://filter-all.local/docs', content); + const result = await getPageUrls(ctx); + // Path filtering removed all llms.txt URLs, no sitemap available → fallback + expect(result.urls).toEqual(['http://filter-all.local/docs']); + }); + it('processes non-gzipped sitemaps alongside gzipped ones from robots.txt', async () => { server.use( http.get(