From ae277c5fc195b0fc3d895589b7ca6b6ec6558b8e Mon Sep 17 00:00:00 2001 From: Ben Kremer Date: Tue, 24 Feb 2026 22:41:05 +0700 Subject: [PATCH 1/2] feat(auto-review): add deduplication detection subagent Add conditional subagent that detects near-duplicate files in PRs using n-gram Jaccard similarity. Spawns when newly added files share >70% similarity with existing repo files or other added files. - Heuristic script with configurable threshold, binary/small file exclusion, docs/test-only skip, label and force overrides - Agent spec with severity scale, false-positive guardrails, DRY refactoring suggestions - 34 tests covering n-gram computation, similarity, decision logic, filesystem helpers, and GitHub API calls - Integrated into action.yml pipeline and findings extraction --- AGENTS.md | 1 + claude/auto-review/action.yml | 51 +++ .../agents/review-deduplication.md | 79 ++++ .../should-spawn-deduplication.test.js | 340 ++++++++++++++++++ .../scripts/extract-findings-from-comment.js | 3 +- .../scripts/should-spawn-deduplication.js | 310 ++++++++++++++++ 6 files changed, 783 insertions(+), 1 deletion(-) create mode 100644 claude/auto-review/agents/review-deduplication.md create mode 100644 claude/auto-review/scripts/__tests__/should-spawn-deduplication.test.js create mode 100644 claude/auto-review/scripts/should-spawn-deduplication.js diff --git a/AGENTS.md b/AGENTS.md index 4d5dae1..982bc66 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -53,6 +53,7 @@ Specialized review tasks run as conditional subagents to keep the main review co - **License Compliance** (`agents/review-license-compliance.md`) — spawned when dependency manifest/lockfiles change. Heuristic: `scripts/should-spawn-license-compliance.js`. Findings use `lic-` prefixed IDs. - **Data Classification** (`agents/review-data-classification.md`) — spawned when infrastructure, secret/env files, DB schemas, or API routes change, or when patches contain sensitive data keywords. Heuristic: `scripts/should-spawn-data-classification.js`. Findings use `dcl-` prefixed IDs. +- **Deduplication** (`agents/review-deduplication.md`) — spawned when newly added files have >70% n-gram Jaccard similarity with existing repo files or other added files. Heuristic: `scripts/should-spawn-deduplication.js`. Findings use `dup-` prefixed IDs. ### Workflows diff --git a/claude/auto-review/action.yml b/claude/auto-review/action.yml index aa3cc8a..09ebe4d 100644 --- a/claude/auto-review/action.yml +++ b/claude/auto-review/action.yml @@ -35,6 +35,10 @@ inputs: description: "Force data classification agent regardless of heuristic" required: false default: "false" + force_deduplication_agent: + description: "Force deduplication agent regardless of heuristic" + required: false + default: "false" runs: using: "composite" @@ -90,6 +94,25 @@ runs: echo "DATA_CLASSIFICATION_REASON=$REASON" >> $GITHUB_ENV echo "Data classification agent: spawn=$SPAWN reason=\"$REASON\"" + - name: Determine if deduplication agent should spawn + shell: bash + env: + GH_TOKEN: ${{ github.token }} + GITHUB_TOKEN: ${{ github.token }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_EVENT_PATH: ${{ github.event_path }} + FORCE_DEDUPLICATION_AGENT: ${{ inputs.force_deduplication_agent }} + run: | + SCRIPT_PATH="${{ github.action_path }}/scripts/should-spawn-deduplication.js" + RESULT=$(node "$SCRIPT_PATH") + SPAWN=$(echo "$RESULT" | jq -r '.spawn') + REASON=$(echo "$RESULT" | jq -r '.reason') + SIMILAR_PAIRS=$(echo "$RESULT" | jq -c '.similarPairs // []') + echo "SPAWN_DEDUPLICATION=$SPAWN" >> $GITHUB_ENV + echo "DEDUPLICATION_REASON=$REASON" >> $GITHUB_ENV + echo "DEDUP_SIMILAR_PAIRS=$SIMILAR_PAIRS" >> $GITHUB_ENV + echo "Deduplication agent: spawn=$SPAWN reason=\"$REASON\"" + - name: Set up review prompt shell: bash run: | @@ -250,6 +273,34 @@ runs: - Sort all findings by severity: CRITICAL > HIGH > MEDIUM > LOW" fi + # Conditionally add deduplication subagent instructions + if [[ "$SPAWN_DEDUPLICATION" == "true" ]]; then + PROMPT="$PROMPT + + --- + + ## DEDUPLICATION SUBAGENT + + Based on PR analysis: ${DEDUPLICATION_REASON} + + Similar file pairs detected: + ${DEDUP_SIMILAR_PAIRS} + + Spawn ONE specialized subagent to analyze code duplication. + + ### Instructions: + Use the Task tool with subagent_type=\"general-purpose\" to launch the agent. In the prompt include: + 1. \"Read your spec file at ${{ github.action_path }}/agents/review-deduplication.md and follow its instructions.\" + 2. PR number: ${{ github.event.pull_request.number }}, Repository: ${{ github.repository }} + 3. The list of changed files in this PR + 4. The similar file pairs data: ${DEDUP_SIMILAR_PAIRS} + + After the agent completes, merge its findings into your consolidated output. + - Use the agent's dup- prefixed IDs as-is + - Deduplicate if you found the same issue independently (prefer dup- prefixed ID) + - Sort all findings by severity: CRITICAL > HIGH > MEDIUM > LOW" + fi + # Add project context if [[ -n "${{ inputs.project_context }}" ]]; then PROMPT="$PROMPT diff --git a/claude/auto-review/agents/review-deduplication.md b/claude/auto-review/agents/review-deduplication.md new file mode 100644 index 0000000..98d226e --- /dev/null +++ b/claude/auto-review/agents/review-deduplication.md @@ -0,0 +1,79 @@ +# Deduplication Review Agent + +You are a specialized reviewer that detects near-duplicate and copy-pasted code introduced in a PR. Your job is to identify files or code blocks that share high structural similarity with existing repository code or with other newly added files, and recommend DRY refactoring. + +## Focus Areas + +### 1. Exact / Near-Exact Duplicates +Files that are identical or differ only in trivial ways (whitespace, comments, variable names). These are the strongest signals of copy-paste. + +### 2. Structural Duplicates +Files with the same control flow, function signatures, or class structure but different domain-specific values. Common in handler/controller/route files that were cloned from a template. + +### 3. Partial Duplicates +Significant code blocks (>20 lines) duplicated across files — utility functions, configuration blocks, error handling patterns, API call wrappers. + +### 4. Cross-Boundary Duplication +Same logic implemented in multiple layers (e.g., validation duplicated in frontend and backend, or identical transformations in multiple services). + +## Refactoring Suggestions + +When recommending fixes, prefer these strategies (in order): + +1. **Extract shared module** — Move common logic into a shared file imported by both consumers +2. **Composition** — Extract shared behavior into composable functions/hooks/mixins +3. **Generics / parameterization** — Make one implementation configurable instead of maintaining two near-identical copies +4. **Configuration-driven** — Replace duplicated code with data-driven dispatch (config objects, maps, registries) +5. **Code generation** — If the duplication is intentional scaffolding, suggest a generator/template + +## False-Positive Guardrails + +**CRITICAL: Minimize false positives. Follow these rules strictly:** + +- **Don't flag test fixtures or test data**: Test files often legitimately contain similar structures for different test cases +- **Don't flag boilerplate / scaffolding**: Framework-required files (e.g., `__init__.py`, `index.ts` barrel exports, `package.json`) naturally look similar +- **Don't flag generated code**: Files with generation headers, lock files, or clearly auto-generated content +- **Don't flag config files**: Multiple similar config files (webpack, eslint, tsconfig) for different packages in a monorepo +- **Don't flag protocol implementations**: Standards-compliant implementations (OpenAPI handlers, GraphQL resolvers) may share structure by design +- **Don't flag small files**: Files under 20 lines are too small to warrant deduplication concern +- **Read both files fully** before concluding they are duplicates. Similarity in structure does not always mean duplicated logic. +- **Consider the cost of abstraction**: If deduplicating would create a fragile shared dependency or reduce clarity, note this trade-off + +## Severity Scale + +- **CRITICAL**: >90% identical content, both files >100 lines — clear copy-paste that must be deduplicated +- **HIGH**: >70% similar, non-trivial files (>50 lines) — strong candidate for shared module extraction +- **MEDIUM**: Moderate structural overlap with meaningful shared logic blocks — worth refactoring +- **LOW**: Partial overlap or similar patterns — worth noting for future consideration + +## Output Format + +Use the same `#### Issue N:` format as the main review. **All IDs MUST use the `dup-` prefix.** + +``` +#### Issue N: Brief description of the duplication +**ID:** dup-{file-slug}-{semantic-slug}-{hash} +**File:** path/to/new-file.ext:line +**Severity:** CRITICAL/HIGH/MEDIUM/LOW +**Category:** duplication + +**Context:** +- **Pattern:** What duplication was detected (exact copy, structural clone, shared block) +- **Risk:** Maintenance burden — changes must be synchronized across N locations +- **Impact:** Divergence risk, bug duplication, increased code surface area +- **Trigger:** When one copy is updated but the other is forgotten + +**Recommendation:** How to deduplicate (extract module, parameterize, compose, etc.) +``` + +**ID Generation:** `dup-{filename}-{2-4-key-terms}-{SHA256(path+desc).substr(0,4)}` +Examples: +- `dup-handler-clone-user-api-a3f1` +- `dup-config-identical-webpack-b2c4` +- `dup-utils-shared-parse-logic-e7d2` + +## If No Duplication Issues Found + +If you find no duplication issues after thorough analysis, respond with exactly: + +"No duplication issues found." diff --git a/claude/auto-review/scripts/__tests__/should-spawn-deduplication.test.js b/claude/auto-review/scripts/__tests__/should-spawn-deduplication.test.js new file mode 100644 index 0000000..f50341a --- /dev/null +++ b/claude/auto-review/scripts/__tests__/should-spawn-deduplication.test.js @@ -0,0 +1,340 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { + computeNGrams, + jaccardSimilarity, + shouldSpawnDeduplication, + listRepoFilesByExtension, + fetchPrFiles, + fetchPrLabels, +} from '../should-spawn-deduplication.js'; +import { ghApi } from '../lib/github-utils.js'; +import { spawnSync } from 'child_process'; + +vi.mock('../lib/github-utils.js', async () => { + const actual = await vi.importActual('../lib/github-utils.js'); + return { + ...actual, + ghApi: vi.fn(), + }; +}); + +vi.mock('child_process', async () => { + const actual = await vi.importActual('child_process'); + return { + ...actual, + spawnSync: vi.fn(actual.spawnSync), + }; +}); + +// ---- computeNGrams -------------------------------------------------------- + +describe('computeNGrams', () => { + it('should return correct n-grams for a simple string', () => { + const grams = computeNGrams('abcdef'); + expect(grams).toEqual(new Set(['abcde', 'bcdef'])); + }); + + it('should return empty set for string shorter than n', () => { + const grams = computeNGrams('abc'); + expect(grams.size).toBe(0); + }); + + it('should normalize whitespace', () => { + const grams1 = computeNGrams('ab cd ef'); + const grams2 = computeNGrams('ab cd ef'); + expect(grams1).toEqual(grams2); + }); + + it('should return empty set for empty string', () => { + const grams = computeNGrams(''); + expect(grams.size).toBe(0); + }); + + it('should support custom n parameter', () => { + const grams = computeNGrams('abcdef', 3); + expect(grams).toEqual(new Set(['abc', 'bcd', 'cde', 'def'])); + }); + + it('should return single n-gram when string length equals n', () => { + const grams = computeNGrams('abcde'); + expect(grams).toEqual(new Set(['abcde'])); + }); +}); + +// ---- jaccardSimilarity ---------------------------------------------------- + +describe('jaccardSimilarity', () => { + it('should return 1.0 for identical sets', () => { + const set = new Set(['a', 'b', 'c']); + expect(jaccardSimilarity(set, set)).toBe(1.0); + }); + + it('should return 0.0 for disjoint sets', () => { + const a = new Set(['a', 'b']); + const b = new Set(['c', 'd']); + expect(jaccardSimilarity(a, b)).toBe(0.0); + }); + + it('should return correct value for partial overlap', () => { + const a = new Set(['a', 'b', 'c']); + const b = new Set(['b', 'c', 'd']); + // intersection=2, union=4 + expect(jaccardSimilarity(a, b)).toBe(0.5); + }); + + it('should return 0.0 for two empty sets', () => { + expect(jaccardSimilarity(new Set(), new Set())).toBe(0); + }); + + it('should return 0.0 when one set is empty', () => { + expect(jaccardSimilarity(new Set(['a']), new Set())).toBe(0); + }); +}); + +// ---- shouldSpawnDeduplication --------------------------------------------- + +describe('shouldSpawnDeduplication', () => { + // ---- Skip / force conditions ---- + + it('should not spawn when skip-review label present', () => { + const files = [{ filename: 'src/new.js', status: 'added' }]; + const result = shouldSpawnDeduplication(files, { labels: ['skip-review'] }); + expect(result.spawn).toBe(false); + expect(result.reason).toBe('skip-review label present'); + }); + + it('should spawn when force flag set', () => { + const files = [{ filename: 'README.md', status: 'modified' }]; + const result = shouldSpawnDeduplication(files, { force: true }); + expect(result.spawn).toBe(true); + expect(result.reason).toContain('forced'); + expect(result.similarPairs).toEqual([]); + }); + + it('should not spawn for empty files array', () => { + const result = shouldSpawnDeduplication([]); + expect(result.spawn).toBe(false); + expect(result.reason).toBe('No files in PR'); + }); + + it('should not spawn for null files', () => { + const result = shouldSpawnDeduplication(null); + expect(result.spawn).toBe(false); + expect(result.reason).toBe('No files in PR'); + }); + + it('should spawn on deduplication label with empty pairs', () => { + const files = [{ filename: 'src/app.js', status: 'modified' }]; + const result = shouldSpawnDeduplication(files, { labels: ['deduplication'] }); + expect(result.spawn).toBe(true); + expect(result.reason).toBe('deduplication label'); + expect(result.similarPairs).toEqual([]); + }); + + // ---- No added files ---- + + it('should not spawn when no added files', () => { + const files = [{ filename: 'src/existing.js', status: 'modified' }]; + const result = shouldSpawnDeduplication(files); + expect(result.spawn).toBe(false); + expect(result.reason).toBe('No added files in PR'); + }); + + // ---- Docs-only / test-only exclusion ---- + + it('should not spawn for docs-only changes', () => { + const files = [{ filename: 'README.md', status: 'added' }]; + const result = shouldSpawnDeduplication(files); + expect(result.spawn).toBe(false); + expect(result.reason).toContain('documentation-only'); + }); + + it('should not spawn for test-only changes', () => { + const files = [{ filename: 'src/__tests__/foo.test.js', status: 'added' }]; + const result = shouldSpawnDeduplication(files); + expect(result.spawn).toBe(false); + expect(result.reason).toContain('test-only'); + }); + + // ---- Similarity detection ---- + + it('should spawn when added file is highly similar to repo file', () => { + const content = 'function handleRequest(req, res) {\n const data = req.body;\n validate(data);\n process(data);\n res.json({ ok: true });\n}\n'; + const files = [{ filename: 'src/new-handler.js', status: 'added' }]; + const result = shouldSpawnDeduplication(files, {}, { + addedFileContents: new Map([['src/new-handler.js', content]]), + repoFileContents: new Map([['src/old-handler.js', content]]), + }); + expect(result.spawn).toBe(true); + expect(result.similarPairs.length).toBeGreaterThan(0); + expect(result.similarPairs[0].similarity).toBeGreaterThanOrEqual(0.7); + expect(result.similarPairs[0].newFile).toBe('src/new-handler.js'); + expect(result.similarPairs[0].existingFile).toBe('src/old-handler.js'); + }); + + it('should not spawn when similarity is below threshold', () => { + const addedContent = 'const x = 1;\nconst y = 2;\nconst z = 3;\nconst w = 4;\nconst v = 5;\n'; + const repoContent = 'function totally() {\n different();\n code();\n here();\n really();\n}\n'; + const files = [{ filename: 'src/new.js', status: 'added' }]; + const result = shouldSpawnDeduplication(files, {}, { + addedFileContents: new Map([['src/new.js', addedContent]]), + repoFileContents: new Map([['src/existing.js', repoContent]]), + }); + expect(result.spawn).toBe(false); + expect(result.reason).toContain('No similar file pairs'); + }); + + it('should detect new-vs-new similarity', () => { + const content = 'export function process(input) {\n const validated = validate(input);\n const transformed = transform(validated);\n return save(transformed);\n}\n'; + const files = [ + { filename: 'src/a.js', status: 'added' }, + { filename: 'src/b.js', status: 'added' }, + ]; + const result = shouldSpawnDeduplication(files, {}, { + addedFileContents: new Map([ + ['src/a.js', content], + ['src/b.js', content], + ]), + repoFileContents: new Map(), + }); + expect(result.spawn).toBe(true); + expect(result.similarPairs.length).toBe(1); + expect(result.similarPairs[0].similarity).toBe(1); + }); + + it('should return correct similarPairs data structure', () => { + const content = 'line one\nline two\nline three\nline four\nline five\n'; + const files = [{ filename: 'src/new.js', status: 'added' }]; + const result = shouldSpawnDeduplication(files, {}, { + addedFileContents: new Map([['src/new.js', content]]), + repoFileContents: new Map([['src/existing.js', content]]), + }); + expect(result.spawn).toBe(true); + const pair = result.similarPairs[0]; + expect(pair).toHaveProperty('newFile'); + expect(pair).toHaveProperty('existingFile'); + expect(pair).toHaveProperty('similarity'); + expect(typeof pair.similarity).toBe('number'); + }); + + it('should cap similar pairs at 20', () => { + // Create 25 repo files identical to the added file + const content = 'export const handler = (req, res) => {\n res.send("ok");\n return true;\n // padding\n // more padding\n}\n'; + const repoContents = new Map(); + for (let i = 0; i < 25; i++) { + repoContents.set(`src/clone-${i}.js`, content); + } + const files = [{ filename: 'src/new.js', status: 'added' }]; + const result = shouldSpawnDeduplication(files, {}, { + addedFileContents: new Map([['src/new.js', content]]), + repoFileContents: repoContents, + }); + expect(result.spawn).toBe(true); + expect(result.similarPairs.length).toBeLessThanOrEqual(20); + }); + + // ---- Binary / small file exclusion ---- + + it('should skip binary extensions', () => { + const files = [{ filename: 'assets/logo.png', status: 'added' }]; + const result = shouldSpawnDeduplication(files, {}, { + addedFileContents: new Map([['assets/logo.png', 'binary data here but lets pretend\nline2\nline3\nline4\nline5\n']]), + repoFileContents: new Map(), + }); + expect(result.spawn).toBe(false); + expect(result.reason).toContain('No eligible added files'); + }); + + it('should skip files with fewer than 5 lines', () => { + const files = [{ filename: 'src/tiny.js', status: 'added' }]; + const result = shouldSpawnDeduplication(files, {}, { + addedFileContents: new Map([['src/tiny.js', 'one\ntwo\nthree']]), + repoFileContents: new Map(), + }); + expect(result.spawn).toBe(false); + expect(result.reason).toContain('No eligible added files'); + }); +}); + +// ---- listRepoFilesByExtension --------------------------------------------- + +describe('listRepoFilesByExtension', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('should return file paths for given extension', () => { + spawnSync.mockReturnValue({ + stdout: './src/foo.js\n./src/bar.js\n', + status: 0, + error: null, + }); + const result = listRepoFilesByExtension('.js'); + expect(result).toEqual(['./src/foo.js', './src/bar.js']); + }); + + it('should pass exclusion dirs to find command', () => { + spawnSync.mockReturnValue({ stdout: '', status: 0, error: null }); + listRepoFilesByExtension('.ts'); + const args = spawnSync.mock.calls[0][1]; + expect(args).toContain('*/node_modules/*'); + expect(args).toContain('*/.git/*'); + expect(args).toContain('*/vendor/*'); + }); + + it('should handle empty result', () => { + spawnSync.mockReturnValue({ stdout: '', status: 0, error: null }); + const result = listRepoFilesByExtension('.rs'); + expect(result).toEqual([]); + }); + + it('should return empty array on error', () => { + spawnSync.mockReturnValue({ stdout: '', status: 1, error: new Error('fail') }); + const result = listRepoFilesByExtension('.js'); + expect(result).toEqual([]); + }); +}); + +// ---- fetchPrFiles / fetchPrLabels ----------------------------------------- + +describe('fetchPrFiles', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('should call ghApi with correct endpoint', () => { + ghApi.mockReturnValue([{ filename: 'test.js' }]); + const context = { repo: { owner: 'org', repo: 'repo' }, issue: { number: 42 } }; + const result = fetchPrFiles(context); + expect(ghApi).toHaveBeenCalledWith('/repos/org/repo/pulls/42/files'); + expect(result).toEqual([{ filename: 'test.js' }]); + }); + + it('should return empty array when ghApi returns null', () => { + ghApi.mockReturnValue(null); + const context = { repo: { owner: 'org', repo: 'repo' }, issue: { number: 1 } }; + const result = fetchPrFiles(context); + expect(result).toEqual([]); + }); +}); + +describe('fetchPrLabels', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('should call ghApi and return label names', () => { + ghApi.mockReturnValue([{ name: 'bug' }, { name: 'deduplication' }]); + const context = { repo: { owner: 'org', repo: 'repo' }, issue: { number: 42 } }; + const result = fetchPrLabels(context); + expect(ghApi).toHaveBeenCalledWith('/repos/org/repo/issues/42/labels'); + expect(result).toEqual(['bug', 'deduplication']); + }); + + it('should return empty array when ghApi returns null', () => { + ghApi.mockReturnValue(null); + const context = { repo: { owner: 'org', repo: 'repo' }, issue: { number: 1 } }; + const result = fetchPrLabels(context); + expect(result).toEqual([]); + }); +}); diff --git a/claude/auto-review/scripts/extract-findings-from-comment.js b/claude/auto-review/scripts/extract-findings-from-comment.js index cb45c1d..42193df 100644 --- a/claude/auto-review/scripts/extract-findings-from-comment.js +++ b/claude/auto-review/scripts/extract-findings-from-comment.js @@ -62,8 +62,9 @@ export function parseClaudeComment(commentBody) { 'brk': 'review-breaking-changes', 'lic': 'review-license-compliance', 'dcl': 'review-data-classification', + 'dup': 'review-deduplication', }; - const agentPrefixMatch = finding.id.match(/^(brk|lic|dcl)-/); + const agentPrefixMatch = finding.id.match(/^(brk|lic|dcl|dup)-/); if (agentPrefixMatch) { finding.agent = agentPrefixMap[agentPrefixMatch[1]]; } diff --git a/claude/auto-review/scripts/should-spawn-deduplication.js b/claude/auto-review/scripts/should-spawn-deduplication.js new file mode 100644 index 0000000..375226a --- /dev/null +++ b/claude/auto-review/scripts/should-spawn-deduplication.js @@ -0,0 +1,310 @@ +#!/usr/bin/env node + +/** + * Determine whether the deduplication subagent should be spawned + * based on n-gram Jaccard similarity of newly added files vs existing repo files. + * + * Outputs JSON: { spawn: boolean, reason: string, similarPairs: Array } + */ + +import fs from 'fs'; +import { spawnSync } from 'child_process'; +import { ghApi, loadGitHubContext, createLogger } from './lib/github-utils.js'; + +const logger = createLogger('should-spawn-deduplication.js'); + +// ---- Constants ------------------------------------------------------------ + +const SIMILARITY_THRESHOLD = 0.7; +const MAX_SIMILAR_PAIRS = 20; +const MAX_REPO_FILES_PER_EXT = 500; +const MIN_FILE_LINES = 5; +const MAX_FILE_LINES = 10000; + +const BINARY_EXTENSIONS = new Set([ + '.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp', '.svg', + '.woff', '.woff2', '.ttf', '.eot', + '.pdf', '.zip', '.tar', '.gz', '.lock', +]); + +const EXCLUDED_DIRS = [ + 'node_modules', 'vendor', 'dist', 'build', '.git', + '__pycache__', '.terraform', '.next', 'coverage', '.cache', +]; + +const DOCS_ONLY_REGEX = /\.(md|txt|rst|adoc)$/i; +const TEST_ONLY_REGEX = /(\/__tests__\/|\.test\.|\.spec\.|test\/|tests\/|__mocks__\/)/i; + +// ---- N-gram / similarity helpers ------------------------------------------ + +/** + * Compute character-level n-gram shingles from whitespace-normalized text. + * @param {string} text - Input text + * @param {number} n - Shingle size (default 5) + * @returns {Set} + */ +export function computeNGrams(text, n = 5) { + const normalized = text.replace(/\s+/g, ' ').trim(); + if (normalized.length < n) return new Set(); + const grams = new Set(); + for (let i = 0; i <= normalized.length - n; i++) { + grams.add(normalized.substring(i, i + n)); + } + return grams; +} + +/** + * Compute Jaccard similarity between two sets. + * @param {Set} set1 + * @param {Set} set2 + * @returns {number} 0-1 + */ +export function jaccardSimilarity(set1, set2) { + if (set1.size === 0 && set2.size === 0) return 0; + let intersection = 0; + const [smaller, larger] = set1.size <= set2.size ? [set1, set2] : [set2, set1]; + for (const item of smaller) { + if (larger.has(item)) intersection++; + } + const union = set1.size + set2.size - intersection; + if (union === 0) return 0; + return intersection / union; +} + +// ---- Filesystem helpers --------------------------------------------------- + +/** + * List repo files matching a given extension, excluding common non-source dirs. + * @param {string} extension - File extension including dot (e.g. '.js') + * @returns {string[]} + */ +export function listRepoFilesByExtension(extension) { + const excludeArgs = EXCLUDED_DIRS.flatMap(dir => ['-path', `*/${dir}/*`, '-o']); + // Remove trailing '-o' + excludeArgs.pop(); + + const result = spawnSync('find', [ + '.', '(', ...excludeArgs, ')', '-prune', '-o', + '-type', 'f', '-name', `*${extension}`, '-print', + ], { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'], maxBuffer: 10 * 1024 * 1024 }); + + if (result.error || result.status !== 0) return []; + + return result.stdout + .split('\n') + .map(l => l.trim()) + .filter(Boolean) + .slice(0, MAX_REPO_FILES_PER_EXT); +} + +// ---- Core decision function ----------------------------------------------- + +/** + * Determine whether the deduplication agent should be spawned. + * + * @param {Array} files - PR file objects from GitHub API + * @param {Object} metadata + * @param {string[]} [metadata.labels] + * @param {boolean} [metadata.force] + * @param {Object} options - Injectable content maps for testing + * @param {Map} [options.addedFileContents] + * @param {Map} [options.repoFileContents] + * @returns {{ spawn: boolean, reason: string, similarPairs: Array<{newFile: string, existingFile: string, similarity: number}> }} + */ +export function shouldSpawnDeduplication(files, metadata = {}, options = {}) { + const { labels = [], force = false } = metadata; + + // Force override + if (force) { + return { spawn: true, reason: 'forced via input', similarPairs: [] }; + } + + // Skip conditions + if (labels.includes('skip-review')) { + return { spawn: false, reason: 'skip-review label present', similarPairs: [] }; + } + + if (!files || files.length === 0) { + return { spawn: false, reason: 'No files in PR', similarPairs: [] }; + } + + // Label trigger — subagent will do its own analysis + if (labels.includes('deduplication')) { + return { spawn: true, reason: 'deduplication label', similarPairs: [] }; + } + + // Filter to added files only + const addedFiles = files.filter(f => f.status === 'added'); + if (addedFiles.length === 0) { + return { spawn: false, reason: 'No added files in PR', similarPairs: [] }; + } + + // Docs-only / test-only exclusion (across all PR files, not just added) + const allDocs = files.every(f => DOCS_ONLY_REGEX.test(f.filename)); + if (allDocs) { + return { spawn: false, reason: 'All files are documentation-only', similarPairs: [] }; + } + + const allTests = files.every(f => TEST_ONLY_REGEX.test(f.filename)); + if (allTests) { + return { spawn: false, reason: 'All files are test-only', similarPairs: [] }; + } + + // Build content + n-gram maps for added files + const addedContents = options.addedFileContents || new Map(); + const repoContents = options.repoFileContents || new Map(); + const addedNGrams = new Map(); + + for (const file of addedFiles) { + const ext = extname(file.filename); + if (BINARY_EXTENSIONS.has(ext)) continue; + + let content = addedContents.get(file.filename); + if (content === undefined) { + try { + content = fs.readFileSync(file.filename, 'utf8'); + } catch { + continue; + } + } + + const lines = content.split('\n').length; + if (lines < MIN_FILE_LINES || lines > MAX_FILE_LINES) continue; + + addedNGrams.set(file.filename, computeNGrams(content)); + } + + if (addedNGrams.size === 0) { + return { spawn: false, reason: 'No eligible added files for similarity check', similarPairs: [] }; + } + + const similarPairs = []; + + // Compare added files vs repo files + const checkedExtensions = new Set(); + for (const [addedPath, addedGrams] of addedNGrams) { + const ext = extname(addedPath); + + // Discover repo files for this extension (once per extension) + if (!checkedExtensions.has(ext)) { + checkedExtensions.add(ext); + if (!repoContents.size) { + const repoPaths = listRepoFilesByExtension(ext); + for (const rp of repoPaths) { + // Normalize ./prefix + const normalized = rp.startsWith('./') ? rp.slice(2) : rp; + if (addedNGrams.has(normalized)) continue; // skip self + try { + repoContents.set(normalized, fs.readFileSync(rp, 'utf8')); + } catch { + // skip unreadable + } + } + } + } + + for (const [repoPath, repoContent] of repoContents) { + if (extname(repoPath) !== ext) continue; + const repoGrams = computeNGrams(repoContent); + const sim = jaccardSimilarity(addedGrams, repoGrams); + if (sim >= SIMILARITY_THRESHOLD) { + similarPairs.push({ newFile: addedPath, existingFile: repoPath, similarity: Math.round(sim * 1000) / 1000 }); + } + } + } + + // Compare added files vs each other + const addedPaths = [...addedNGrams.keys()]; + for (let i = 0; i < addedPaths.length; i++) { + for (let j = i + 1; j < addedPaths.length; j++) { + const sim = jaccardSimilarity(addedNGrams.get(addedPaths[i]), addedNGrams.get(addedPaths[j])); + if (sim >= SIMILARITY_THRESHOLD) { + similarPairs.push({ newFile: addedPaths[i], existingFile: addedPaths[j], similarity: Math.round(sim * 1000) / 1000 }); + } + } + } + + // Sort descending, cap + similarPairs.sort((a, b) => b.similarity - a.similarity); + const capped = similarPairs.slice(0, MAX_SIMILAR_PAIRS); + + if (capped.length === 0) { + return { spawn: false, reason: 'No similar file pairs above threshold', similarPairs: [] }; + } + + const topSim = capped[0].similarity; + return { + spawn: true, + reason: `${capped.length} similar pair(s) found (top similarity: ${topSim})`, + similarPairs: capped, + }; +} + +// ---- Utility -------------------------------------------------------------- + +function extname(filepath) { + const idx = filepath.lastIndexOf('.'); + return idx === -1 ? '' : filepath.slice(idx).toLowerCase(); +} + +// ---- GitHub API helpers --------------------------------------------------- + +/** + * Fetch PR files from GitHub API + * @param {Object} context - GitHub context + * @returns {Array} PR files + */ +export function fetchPrFiles(context) { + return ghApi( + `/repos/${context.repo.owner}/${context.repo.repo}/pulls/${context.issue.number}/files` + ) || []; +} + +/** + * Fetch PR labels from GitHub API + * @param {Object} context - GitHub context + * @returns {string[]} Label names + */ +export function fetchPrLabels(context) { + const labels = ghApi( + `/repos/${context.repo.owner}/${context.repo.repo}/issues/${context.issue.number}/labels` + ) || []; + return labels.map(l => l.name); +} + +// ---- CLI entry point ------------------------------------------------------ + +/** + * Main entry point + */ +export function main() { + const context = loadGitHubContext(); + + if (!context.issue.number) { + const result = { spawn: false, reason: 'Not a pull request event', similarPairs: [] }; + console.log(JSON.stringify(result)); + return result; + } + + const force = process.env.FORCE_DEDUPLICATION_AGENT === 'true'; + const files = fetchPrFiles(context); + const labels = fetchPrLabels(context); + + const result = shouldSpawnDeduplication(files, { labels, force }); + + logger.error(`Decision: spawn=${result.spawn}, reason="${result.reason}", pairs=${result.similarPairs.length}`); + console.log(JSON.stringify(result)); + + return result; +} + +// Execute main() only when run directly +if (import.meta.url === `file://${process.argv[1]}`) { + try { + main(); + } catch (error) { + logger.error(`Error: ${error.message}`); + console.log(JSON.stringify({ spawn: false, reason: `Error: ${error.message}`, similarPairs: [] })); + process.exit(0); + } +} From a98ece25af96b4acef0dfdf8f7840e60dc77cf91 Mon Sep 17 00:00:00 2001 From: Ben Kremer Date: Tue, 24 Feb 2026 22:52:09 +0700 Subject: [PATCH 2/2] fix: remove inner repoContents guard breaking multi-extension loading The `if (!repoContents.size)` guard prevented repo file loading for 2nd+ extensions since the shared Map was already populated. The outer `checkedExtensions` Set already deduplicates per extension. Also add multi-extension test to cover this scenario. --- .../should-spawn-deduplication.test.js | 17 ++++++++++++++++ .../scripts/should-spawn-deduplication.js | 20 +++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/claude/auto-review/scripts/__tests__/should-spawn-deduplication.test.js b/claude/auto-review/scripts/__tests__/should-spawn-deduplication.test.js index f50341a..3cd68a7 100644 --- a/claude/auto-review/scripts/__tests__/should-spawn-deduplication.test.js +++ b/claude/auto-review/scripts/__tests__/should-spawn-deduplication.test.js @@ -233,6 +233,23 @@ describe('shouldSpawnDeduplication', () => { expect(result.similarPairs.length).toBeLessThanOrEqual(20); }); + // ---- Multi-extension scenario ---- + + it('should detect similarity across different extensions in same PR', () => { + const content = 'export function handler(req, res) {\n validate(req);\n process(req);\n res.json({ ok: true });\n return;\n}\n'; + const files = [ + { filename: 'src/new.js', status: 'added' }, + { filename: 'src/new.ts', status: 'added' }, + ]; + const result = shouldSpawnDeduplication(files, {}, { + addedFileContents: new Map([['src/new.js', content], ['src/new.ts', content]]), + repoFileContents: new Map([['src/old.js', content], ['src/old.ts', content]]), + }); + expect(result.spawn).toBe(true); + // Should find: new.js vs old.js, new.ts vs old.ts, plus cross-ext new-vs-new + expect(result.similarPairs.length).toBeGreaterThanOrEqual(2); + }); + // ---- Binary / small file exclusion ---- it('should skip binary extensions', () => { diff --git a/claude/auto-review/scripts/should-spawn-deduplication.js b/claude/auto-review/scripts/should-spawn-deduplication.js index 375226a..c19070c 100644 --- a/claude/auto-review/scripts/should-spawn-deduplication.js +++ b/claude/auto-review/scripts/should-spawn-deduplication.js @@ -188,17 +188,15 @@ export function shouldSpawnDeduplication(files, metadata = {}, options = {}) { // Discover repo files for this extension (once per extension) if (!checkedExtensions.has(ext)) { checkedExtensions.add(ext); - if (!repoContents.size) { - const repoPaths = listRepoFilesByExtension(ext); - for (const rp of repoPaths) { - // Normalize ./prefix - const normalized = rp.startsWith('./') ? rp.slice(2) : rp; - if (addedNGrams.has(normalized)) continue; // skip self - try { - repoContents.set(normalized, fs.readFileSync(rp, 'utf8')); - } catch { - // skip unreadable - } + const repoPaths = listRepoFilesByExtension(ext); + for (const rp of repoPaths) { + // Normalize ./prefix + const normalized = rp.startsWith('./') ? rp.slice(2) : rp; + if (addedNGrams.has(normalized)) continue; // skip self + try { + repoContents.set(normalized, fs.readFileSync(rp, 'utf8')); + } catch { + // skip unreadable } } }