From 7f48e811bb99262a6b5e55f907d3bbf0fd066a47 Mon Sep 17 00:00:00 2001
From: dacharyc <dc@dacharycarey.com>
Date: Sat, 11 Apr 2026 19:34:47 -0400
Subject: [PATCH] fix: scope URL discovery to a given subpath when supplied

---
 .../llms-txt-links-markdown.ts                |  41 +++--
 .../llms-txt-links-resolve.ts                 |  15 +-
 src/helpers/get-page-urls.ts                  |  51 ++++++-
 .../checks/llms-txt-links-markdown.test.ts    |  44 ++++++
 .../checks/llms-txt-links-resolve.test.ts     |  47 ++++++
 test/unit/helpers/get-page-urls.test.ts       | 143 ++++++++++++++++++
 6 files changed, 320 insertions(+), 21 deletions(-)
diff --git a/src/checks/content-discoverability/llms-txt-links-markdown.ts b/src/checks/content-discoverability/llms-txt-links-markdown.ts
index 59e4c24..fd8ff5a 100644
--- a/src/checks/content-discoverability/llms-txt-links-markdown.ts
+++ b/src/checks/content-discoverability/llms-txt-links-markdown.ts
@@ -1,5 +1,6 @@
 import { registerCheck } from '../registry.js';
 import { extractMarkdownLinks } from './llms-txt-valid.js';
+import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js';
 import { toMdUrls } from '../../helpers/to-md-urls.js';
 import { looksLikeMarkdown } from '../../helpers/detect-markdown.js';
 import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
@@ -36,35 +37,45 @@ async function checkLlmsTxtLinksMarkdown(ctx: CheckContext): Promise<CheckResult
     };
   }
 
-  // Collect unique links and partition by origin
-  const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
-  const sameOriginLinks: string[] = [];
-  const crossOriginLinks: string[] = [];
+  // Collect unique links, scope to baseUrl path prefix, and partition by origin
+  const allExtractedUrls = new Set<string>();
   for (const file of discovered) {
     const links = extractMarkdownLinks(file.content);
     for (const link of links) {
       if (link.url.startsWith('http://') || link.url.startsWith('https://')) {
-        try {
-          const linkOrigin = new URL(link.url).origin;
-          if (linkOrigin === siteOrigin) {
-            if (!sameOriginLinks.includes(link.url)) sameOriginLinks.push(link.url);
-          } else {
-            if (!crossOriginLinks.includes(link.url)) crossOriginLinks.push(link.url);
-          }
-        } catch {
-          if (!sameOriginLinks.includes(link.url)) sameOriginLinks.push(link.url);
-        }
+        allExtractedUrls.add(link.url);
+      }
+    }
+  }
+  const scopedUrls = filterByPathPrefix(Array.from(allExtractedUrls), getPathFilterBase(ctx));
+
+  const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
+  const sameOriginLinks: string[] = [];
+  const crossOriginLinks: string[] = [];
+  for (const url of scopedUrls) {
+    try {
+      const linkOrigin = new URL(url).origin;
+      if (linkOrigin === siteOrigin) {
+        sameOriginLinks.push(url);
+      } else {
+        crossOriginLinks.push(url);
       }
+    } catch {
+      sameOriginLinks.push(url);
     }
   }
 
   const totalLinks = sameOriginLinks.length + crossOriginLinks.length;
   if (totalLinks === 0) {
+    const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
+    const filteredOut = allExtractedUrls.size > 0 && baseUrlPath && baseUrlPath !== '/';
     return {
       id: 'llms-txt-links-markdown',
       category: 'content-discoverability',
       status: 'skip',
-      message: 'No HTTP(S) links found in llms.txt',
+      message: filteredOut
+        ? `llms.txt contains ${allExtractedUrls.size} link${allExtractedUrls.size === 1 ? '' : 's'}, but none are under ${baseUrlPath}`
+        : 'No HTTP(S) links found in llms.txt',
     };
   }
 
diff --git a/src/checks/content-discoverability/llms-txt-links-resolve.ts b/src/checks/content-discoverability/llms-txt-links-resolve.ts
index 09546f0..a09c10d 100644
--- a/src/checks/content-discoverability/llms-txt-links-resolve.ts
+++ b/src/checks/content-discoverability/llms-txt-links-resolve.ts
@@ -1,6 +1,7 @@
 import { registerCheck } from '../registry.js';
 import { LINK_RESOLVE_THRESHOLD } from '../../constants.js';
 import { extractMarkdownLinks } from './llms-txt-valid.js';
+import { filterByPathPrefix, getPathFilterBase } from '../../helpers/get-page-urls.js';
 import type { CheckContext, CheckResult, DiscoveredFile } from '../../types.js';
 
 interface LinkCheckResult {
@@ -35,12 +36,20 @@ async function checkLlmsTxtLinksResolve(ctx: CheckContext): Promise<CheckResult>
     }
   }
 
-  if (allLinks.size === 0) {
+  // Scope links to the baseUrl path prefix so that docs at a subpath
+  // (e.g. /docs) don't include unrelated site content from root llms.txt.
+  const scopedUrls = filterByPathPrefix(Array.from(allLinks.keys()), getPathFilterBase(ctx));
+
+  if (scopedUrls.length === 0) {
+    const baseUrlPath = new URL(ctx.baseUrl).pathname.replace(/\/$/, '');
+    const filteredOut = allLinks.size > 0 && baseUrlPath && baseUrlPath !== '/';
     return {
       id: 'llms-txt-links-resolve',
       category: 'content-discoverability',
       status: 'skip',
-      message: 'No HTTP(S) links found in llms.txt',
+      message: filteredOut
+        ? `llms.txt contains ${allLinks.size} link${allLinks.size === 1 ? '' : 's'}, but none are under ${baseUrlPath}`
+        : 'No HTTP(S) links found in llms.txt',
     };
   }
 
@@ -48,7 +57,7 @@ async function checkLlmsTxtLinksResolve(ctx: CheckContext): Promise<CheckResult>
   const siteOrigin = ctx.effectiveOrigin ?? ctx.origin;
   const sameOriginLinks: string[] = [];
   const crossOriginLinks: string[] = [];
-  for (const url of allLinks.keys()) {
+  for (const url of scopedUrls) {
     try {
       const linkOrigin = new URL(url).origin;
       if (linkOrigin === siteOrigin) {
diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts
index 6ead14a..a096a3f 100644
--- a/src/helpers/get-page-urls.ts
+++ b/src/helpers/get-page-urls.ts
@@ -309,6 +309,43 @@ export async function getUrlsFromSitemap(
   return urls;
 }
 
+/**
+ * Choose the URL whose path acts as the prefix for path-scoped URL filtering.
+ *
+ * @param ctx - Check context; reads `baseUrl`, `origin` and `effectiveOrigin`.
+ * @returns `ctx.effectiveOrigin` when it is set and differs from `ctx.origin` (a cross-host
+ *   redirect such as example.com/docs → docs.example.com), otherwise `ctx.baseUrl`.
+ */
+export function getPathFilterBase(ctx: CheckContext): string {
+  return ctx.effectiveOrigin && ctx.effectiveOrigin !== ctx.origin
+    ? ctx.effectiveOrigin // original path does not apply on the new host; presumably path-less, so filtering becomes a no-op
+    : ctx.baseUrl;
+}
+
+/**
+ * Keep only the URLs whose pathname equals, or is nested under, the path of `baseUrl`.
+ * Matching is segment-aligned (`/docs` keeps `/docs` and `/docs/x`, not `/docs-extra`)
+ * and compares pathnames only — the origin of each URL is NOT checked. This keeps blog
+ * posts, marketing pages, and other non-docs content out of the URL pool when llms.txt
+ * or sitemaps cover the entire domain.
+ * @param urls - Candidate URLs; entries that fail to parse are kept, not dropped.
+ * @param baseUrl - Absolute URL supplying the prefix (`new URL(baseUrl)` throws if invalid).
+ * @returns `urls` itself when the base path is root (`/`), otherwise a filtered copy.
+ */
+export function filterByPathPrefix(urls: string[], baseUrl: string): string[] {
+  const baseUrlPath = new URL(baseUrl).pathname.replace(/\/$/, '');
+  if (!baseUrlPath || baseUrlPath === '') return urls; // root path: nothing to scope (NOTE: second test is redundant with the first)
+
+  return urls.filter((url) => {
+    try {
+      const parsed = new URL(url);
+      return parsed.pathname === baseUrlPath || parsed.pathname.startsWith(baseUrlPath + '/');
+    } catch {
+      return true; // keep malformed URLs rather than silently dropping them
+    }
+  });
+}
+
 /**
  * Discover page URLs from llms.txt links, sitemap, or fall back to baseUrl.
  *
@@ -316,23 +353,31 @@ export async function getUrlsFromSitemap(
  * 1. llms.txt links (from previous check results)
  * 2. Sitemap URLs (robots.txt Sitemap directives, then /sitemap.xml fallback)
  * 3. baseUrl fallback
+ *
+ * URLs from steps 1–2 are filtered to the path prefix chosen by `getPathFilterBase`
+ * so docs at a subpath (e.g. `/docs`) don't pull in unrelated site content.
  */
 export async function getPageUrls(ctx: CheckContext): Promise<PageUrlResult> {
   const warnings: string[] = [];
 
+  const filterBase = getPathFilterBase(ctx);
+
   // 1. Try llms.txt links from cached results (if llms-txt-exists ran)
   const cachedUrls = await getUrlsFromCachedLlmsTxt(ctx);
-  if (cachedUrls.length > 0) return { urls: cachedUrls, warnings };
+  const scopedCachedUrls = filterByPathPrefix(cachedUrls, filterBase);
+  if (scopedCachedUrls.length > 0) return { urls: scopedCachedUrls, warnings };
 
   // 2. Try fetching llms.txt directly (standalone mode, llms-txt-exists didn't run)
   if (!ctx.previousResults.has('llms-txt-exists')) {
     const fetchedUrls = await fetchLlmsTxtUrls(ctx);
-    if (fetchedUrls.length > 0) return { urls: fetchedUrls, warnings };
+    const scopedFetchedUrls = filterByPathPrefix(fetchedUrls, filterBase);
+    if (scopedFetchedUrls.length > 0) return { urls: scopedFetchedUrls, warnings };
   }
 
   // 3. Try sitemap
   const sitemapUrls = await getUrlsFromSitemap(ctx, warnings);
-  if (sitemapUrls.length > 0) return { urls: sitemapUrls, warnings };
+  const scopedSitemapUrls = filterByPathPrefix(sitemapUrls, filterBase);
+  if (scopedSitemapUrls.length > 0) return { urls: scopedSitemapUrls, warnings };
 
   // 4. Fallback
   return { urls: [ctx.baseUrl], warnings };
diff --git a/test/unit/checks/llms-txt-links-markdown.test.ts b/test/unit/checks/llms-txt-links-markdown.test.ts
index e63951c..19ed51d 100644
--- a/test/unit/checks/llms-txt-links-markdown.test.ts
+++ b/test/unit/checks/llms-txt-links-markdown.test.ts
@@ -263,6 +263,50 @@ Just text, no links here.
     expect(result.status).toBe('fail');
   });
 
+  // ── Path-prefix scoping ──
+
+  it('scopes links to baseUrl path prefix', async () => {
+    // llms.txt has both docs and non-docs links; only docs links should be tested
+    const content = `# Site\n- [Intro](http://scope-md.local/docs/intro.md): Intro\n- [Blog](http://scope-md.local/blog/post.md): Blog\n- [Careers](http://scope-md.local/careers): Careers\n`;
+    const ctx = createContext('http://scope-md.local/docs', { requestDelay: 0 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://scope-md.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    // Only the /docs/intro.md link should be tested (markdownRate 100%)
+    expect(result.details?.testedLinks).toBe(1);
+    expect(result.details?.markdownRate).toBe(100);
+  });
+
+  it('skips with descriptive message when all links are outside the baseUrl path prefix', async () => {
+    const content = `# Site\n- [Blog](http://scope-md2.local/blog/post): Blog\n`;
+    const ctx = createContext('http://scope-md2.local/docs', { requestDelay: 0 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://scope-md2.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+
+    const result = await check.run(ctx);
+    expect(result.status).toBe('skip');
+    expect(result.message).toContain('1 link');
+    expect(result.message).toContain('none are under /docs');
+  });
+
   it('uses toMdUrls to find .md variants (handles trailing slash and .html)', async () => {
     server.use(
       http.head(
diff --git a/test/unit/checks/llms-txt-links-resolve.test.ts b/test/unit/checks/llms-txt-links-resolve.test.ts
index 8e362c6..934b810 100644
--- a/test/unit/checks/llms-txt-links-resolve.test.ts
+++ b/test/unit/checks/llms-txt-links-resolve.test.ts
@@ -194,6 +194,53 @@ Just text, no links.
     expect(result.message).toContain('rate-limited (HTTP 429)');
   });
 
+  // ── Path-prefix scoping ──
+
+  it('scopes links to baseUrl path prefix', async () => {
+    server.use(
+      http.head('http://scope-res.local/docs/page1', () => new HttpResponse(null, { status: 200 })),
+    );
+
+    // llms.txt has both docs and non-docs links; only docs links should be tested
+    const content = `# Site\n- [Page](http://scope-res.local/docs/page1): Page\n- [Blog](http://scope-res.local/blog/post): Blog\n`;
+    const ctx = createContext('http://scope-res.local/docs', { requestDelay: 0 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://scope-res.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+
+    const result = await check.run(ctx);
+    expect(result.status).toBe('pass');
+    // Only the /docs/page1 link should be tested
+    expect(result.details?.sameOrigin).toMatchObject({ tested: 1, resolved: 1 });
+  });
+
+  it('skips with descriptive message when all links are outside the baseUrl path prefix', async () => {
+    const content = `# Site\n- [Blog](http://scope-res2.local/blog/post): Blog\n`;
+    const ctx = createContext('http://scope-res2.local/docs', { requestDelay: 0 });
+    const discovered: DiscoveredFile[] = [
+      { url: 'http://scope-res2.local/llms.txt', content, status: 200, redirected: false },
+    ];
+    ctx.previousResults.set('llms-txt-exists', {
+      id: 'llms-txt-exists',
+      category: 'content-discoverability',
+      status: 'pass',
+      message: 'Found',
+      details: { discoveredFiles: discovered },
+    });
+
+    const result = await check.run(ctx);
+    expect(result.status).toBe('skip');
+    expect(result.message).toContain('1 link');
+    expect(result.message).toContain('none are under /docs');
+  });
+
   it('includes "sampled" in message when results are sampled', async () => {
     const links = Array.from(
       { length: 5 },
diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts
index edb79b8..daf0d78 100644
--- a/test/unit/helpers/get-page-urls.test.ts
+++ b/test/unit/helpers/get-page-urls.test.ts
@@ -6,6 +6,7 @@ import {
   discoverAndSamplePages,
   parseSitemapUrls,
   parseSitemapDirectives,
+  filterByPathPrefix,
 } from '../../../src/helpers/get-page-urls.js';
 import { MAX_SITEMAP_URLS } from '../../../src/constants.js';
 import { createContext } from '../../../src/runner.js';
@@ -89,6 +90,64 @@ Sitemap: https://example.com/sitemap-blog.xml
   });
 });
 
+describe('filterByPathPrefix', () => { // pure string-in/string-out cases; no server or CheckContext involved
+  it('filters URLs to those under the path prefix', () => {
+    const urls = [
+      'https://example.com/docs/intro',
+      'https://example.com/docs/guide',
+      'https://example.com/blog/post1',
+      'https://example.com/careers',
+    ];
+    const result = filterByPathPrefix(urls, 'https://example.com/docs');
+    expect(result).toEqual(['https://example.com/docs/intro', 'https://example.com/docs/guide']);
+  });
+
+  it('includes the exact baseUrl path itself', () => {
+    const urls = ['https://example.com/docs', 'https://example.com/docs/page'];
+    const result = filterByPathPrefix(urls, 'https://example.com/docs');
+    expect(result).toEqual(['https://example.com/docs', 'https://example.com/docs/page']);
+  });
+
+  it('passes all URLs through when baseUrl is at the root', () => {
+    const urls = [
+      'https://example.com/docs/intro',
+      'https://example.com/blog/post1',
+      'https://example.com/careers',
+    ];
+    const result = filterByPathPrefix(urls, 'https://example.com');
+    expect(result).toEqual(urls);
+  });
+
+  it('passes all URLs through when baseUrl has a trailing slash root', () => {
+    const urls = ['https://example.com/a', 'https://example.com/b'];
+    const result = filterByPathPrefix(urls, 'https://example.com/');
+    expect(result).toEqual(urls);
+  });
+
+  it('does not match partial path segments', () => {
+    // Prefix matching is segment-aligned: /docs-extra must NOT match the /docs prefix
+    const urls = ['https://example.com/docs/page', 'https://example.com/docs-extra/page'];
+    const result = filterByPathPrefix(urls, 'https://example.com/docs');
+    expect(result).toEqual(['https://example.com/docs/page']);
+  });
+
+  it('handles deeper path prefixes', () => {
+    const urls = [
+      'https://example.com/api/v2/docs/page',
+      'https://example.com/api/v2/other',
+      'https://example.com/api/v1/docs/page',
+    ];
+    const result = filterByPathPrefix(urls, 'https://example.com/api/v2/docs');
+    expect(result).toEqual(['https://example.com/api/v2/docs/page']);
+  });
+
+  it('keeps malformed URLs rather than dropping them', () => {
+    const urls = ['not-a-url', 'https://example.com/docs/page'];
+    const result = filterByPathPrefix(urls, 'https://example.com/docs');
+    expect(result).toEqual(['not-a-url', 'https://example.com/docs/page']);
+  });
+});
+
 describe('getPageUrls', () => {
   function makeCtx(baseUrl = 'http://test.local', llmsTxtContent?: string) {
     const ctx = createContext(baseUrl, { requestDelay: 0 });
@@ -621,6 +680,90 @@ describe('getPageUrls', () => {
 
   // ── Existing sitemap tests ──
 
+  // ── Path-prefix scoping ──
+
+  it('scopes llms.txt URLs to the baseUrl path prefix', async () => {
+    const content = `# Docs\n> Summary\n## Links\n- [Intro](http://scope-test.local/docs/intro): Intro\n- [Guide](http://scope-test.local/docs/guide): Guide\n- [Blog](http://scope-test.local/blog/post1): A blog post\n- [Careers](http://scope-test.local/careers): Careers page\n`;
+    const ctx = makeCtx('http://scope-test.local/docs', content);
+
+    const result = await getPageUrls(ctx);
+    expect(result.urls).toEqual([
+      'http://scope-test.local/docs/intro',
+      'http://scope-test.local/docs/guide',
+    ]);
+  });
+
+  it('does not filter when baseUrl is at the root', async () => {
+    const content = `# Docs\n- [A](http://root-scope.local/docs/a): A\n- [B](http://root-scope.local/blog/b): B\n`;
+    const ctx = makeCtx('http://root-scope.local', content);
+
+    const result = await getPageUrls(ctx);
+    expect(result.urls).toEqual([
+      'http://root-scope.local/docs/a',
+      'http://root-scope.local/blog/b',
+    ]);
+  });
+
+  it('scopes sitemap URLs to the baseUrl path prefix', async () => {
+    server.use(
+      http.get(
+        'http://sitemap-scope.local/robots.txt',
+        () => new HttpResponse('', { status: 404 }),
+      ),
+      http.get(
+        'http://sitemap-scope.local/sitemap.xml',
+        () =>
+          new HttpResponse(
+            `<?xml version="1.0"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>http://sitemap-scope.local/docs/intro</loc></url>
+  <url><loc>http://sitemap-scope.local/docs/guide</loc></url>
+  <url><loc>http://sitemap-scope.local/blog/post1</loc></url>
+  <url><loc>http://sitemap-scope.local/careers</loc></url>
+</urlset>`,
+            { status: 200, headers: { 'Content-Type': 'application/xml' } },
+          ),
+      ),
+    );
+
+    const ctx = makeCtx('http://sitemap-scope.local/docs');
+    const result = await getPageUrls(ctx);
+    expect(result.urls).toEqual([
+      'http://sitemap-scope.local/docs/intro',
+      'http://sitemap-scope.local/docs/guide',
+    ]);
+  });
+
+  it('skips path filtering when effectiveOrigin differs from origin (cross-host redirect)', async () => {
+    // Simulate: user provides example.com/docs, which redirects to docs.example.com
+    // llms.txt on docs.example.com has links at root paths, not under /docs
+    const content = `# Docs\n- [Intro](http://xhost.local/intro): Intro\n- [Guide](http://xhost.local/guide): Guide\n`;
+    const ctx = makeCtx('http://original.local/docs', content);
+    // Simulate cross-host redirect detection
+    ctx.effectiveOrigin = 'http://xhost.local';
+
+    const result = await getPageUrls(ctx);
+    // Without the cross-host bypass, these would be filtered out (not under /docs)
+    expect(result.urls).toContain('http://xhost.local/intro');
+    expect(result.urls).toContain('http://xhost.local/guide');
+    expect(result.urls).toHaveLength(2);
+  });
+
+  it('falls back to baseUrl when path scoping filters out all discovered URLs', async () => {
+    // llms.txt has only non-docs URLs
+    const content = `# Site\n- [Blog](http://filter-all.local/blog/post): Post\n- [About](http://filter-all.local/about): About\n`;
+
+    server.use(
+      http.get('http://filter-all.local/robots.txt', () => new HttpResponse('', { status: 404 })),
+      http.get('http://filter-all.local/sitemap.xml', () => new HttpResponse('', { status: 404 })),
+    );
+
+    const ctx = makeCtx('http://filter-all.local/docs', content);
+    const result = await getPageUrls(ctx);
+    // Path filtering removed all llms.txt URLs, no sitemap available → fallback
+    expect(result.urls).toEqual(['http://filter-all.local/docs']);
+  });
+
   it('processes non-gzipped sitemaps alongside gzipped ones from robots.txt', async () => {
     server.use(
       http.get(