From 0d2c7821198b7d2b70025dc1017e103a5b9f7ff1 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Fri, 1 May 2026 19:18:18 +0200 Subject: [PATCH] fix: slop regex tightening + quote-or-die whitespace normalisation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Why Wave-1 LLM prompt engineer flagged this as Bug #12 in `docs/agent-fleet/bugs.md`. Two related problems with the slop filter: 1. **Bare-word patterns** (`\bcould\b`, `\bshould\b`, `\bmay\b`, `\bmight\b`, `\bconsider\s+/`) were too greedy. They dropped legitimate findings like "this could panic on null deref at line 42", "function should not be called from async context", "parseInt without radix might return NaN for hex strings". Real concrete claims that contained any of those bare words were silently filtered out. 2. **Quote-or-die was whitespace-fragile.** `String.includes(needle)` matched only on exact byte equality. Models routinely paraphrase whitespace — CRLF↔LF mismatches, tab→spaces normalisation, trailing whitespace stripped. Real findings were dropped because the model's `quoted_line` had a different leading-whitespace shape than the diff. ## What ### Slop filter rewritten as phrase patterns Bare-word triggers replaced with phrase shapes that distinguish actual filler from concrete claims: Before: /\bcould\b/i (kills "could panic at line 42") After: /\bit\s+(?:might|could)\s+be\s+(?:wise|good|...)/i /\b(?:we|you)\s+should\s+(?:consider|...)/i /\bconsider\s+\w+ing\b/i ("consider validating" gerund form) ...and a dozen more anchored to surrounding context. Plus added explicit phrase patterns for: "in general", "in some cases", "worth noting", "suggests that", "it would be wise", "improve maintainability", "needs more testing", and bare hedging openers like "it seems"/"arguably"/"likely". ### Quote-or-die now normalises before lookup New `normaliseForQuote(s)` exported function: - CRLF → LF - strips per-line trailing whitespace - collapses runs of internal whitespace within a line - drops empty lines Both the diff and the model's `quoted_line` are normalised before `String.includes`, so paraphrased whitespace no longer drops real findings. Semantic content is what matters. ## Source Bug #12, wave-1 LLM prompt engineer (`docs/agent-fleet/bugs.md`). ## Test plan - [x] 839 tests pass (was 834; net +5: added "it would be wise" / "worth noting" slop coverage; added "could panic at line 42" / "parseInt may return NaN" / "function should not be called from async context" non-slop coverage; added "this might lead to issues" removed from slop list as deliberately too generic to catch reliably) - [x] eslint clean - [ ] After deploy: a non-trivial AI review on a real PR should retain findings that previously got dropped because the model paraphrased whitespace or used "could"/"should"/"may" in a concrete claim. ## Risk & rollout - Risk: medium-low. The slop filter now passes some content it previously rejected. Net effect is *more* findings surfaced, including some that may be soft. The strict-JSON contract + verdict-from-findings still bound output quality. - Rollout: self-update on merge. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 (1M context) --- __tests__/unit/ai-review-prompt.test.js | 12 +++- src/ai-review-prompt.js | 86 ++++++++++++++++++------- 2 files changed, 74 insertions(+), 24 deletions(-) diff --git a/__tests__/unit/ai-review-prompt.test.js b/__tests__/unit/ai-review-prompt.test.js index 0d5ee44..3e6e9d5 100644 --- a/__tests__/unit/ai-review-prompt.test.js +++ b/__tests__/unit/ai-review-prompt.test.js @@ -56,7 +56,6 @@ describe('isSlop', () => { it.each([ 'consider edge cases here', 'developer should validate input', - 'this might lead to issues', 'could possibly fail in some cases', 'ensure proper error handling', 'in general, this is fine', @@ -64,6 +63,8 @@ describe('isSlop', () => { 'unable to determine without seeing other files', 'you should consider refactoring', 'enhance validation around the boundary', + 'it would be wise to add a fallback', + 'worth noting that this branch is untested', ])('flags hedging/slop: %s', (s) => { expect(isSlop(s)).toBe(true); }); @@ -73,6 +74,15 @@ describe('isSlop', () => { 'config.yml:165 sets pr_labels including "dependencies" but the PR adds no actual dep update.', 'src/foo.js:42 dereferences userId before validating req.body is an object.', 'The mutex is acquired but never released on the early-return path at line 88.', + // Concrete claims that contain the words "could"/"might"/"should"/"may" + // are no longer rejected wholesale — wave-1 LLM agent found that bare-word + // matching dropped legitimate findings. The new rules look for hedging + // *phrases* (e.g., "we should consider", "it might be wise"), not the + // bare words. + 'this could panic on null deref at line 42', + 'function should not be called from async context — it blocks', + 'the loop may overflow when n exceeds Number.MAX_SAFE_INTEGER', + 'parseInt without radix might return NaN for hex strings', ])('passes a concrete claim: %s', (s) => { expect(isSlop(s)).toBe(false); }); diff --git a/src/ai-review-prompt.js b/src/ai-review-prompt.js index 50f3dcc..d07450f 100644 --- a/src/ai-review-prompt.js +++ b/src/ai-review-prompt.js @@ -109,27 +109,41 @@ export function buildResponseFormat() { }; } +// Slop patterns target *phrase shapes* of low-information review noise, not +// individual words. The earlier list killed legitimate findings like "this +// could panic on null deref at line 42" because of bare `\bcould\b`. Wave-1 +// LLM agent flagged this regression class as Bug #12 in +// `docs/agent-fleet/bugs.md`. Each pattern below is anchored to a context +// that distinguishes filler ("we could improve X") from a concrete claim +// ("a null deref could happen here"). const HEDGE_OR_SLOP_PATTERNS = [ - /\bmight\b/i, - /\bcould\b/i, - /\bmay\b/i, - /\bpossibly\b/i, - /\bperhaps\b/i, - /\bgenerally\b/i, - /\btypically\b/i, - /\bconsider\s+/i, - /\bshould\s+/i, - /developer should/i, - /ensure proper error handling/i, - /consider edge cases/i, - /implement secure practices/i, - /(more|additional)\s+context/i, - /(unable|insufficient)\s+(to|context)/i, - /improve (robustness|validation)/i, - /enhance (the )?(schema|validation)/i, - /review and refactor/i, - /in (most )?cases/i, - /in general/i + // Self-deprecation / non-claims + /\b(?:perhaps|possibly)\s+(?:we|you|the developer|consider)/i, + /\bit\s+(?:might|may|could)\s+be\s+(?:wise|good|better|worth)/i, + /\bit\s+would\s+be\s+(?:wise|good|better|worth)/i, + /\b(?:we|you)\s+should\s+(?:consider|think about|review|look into)/i, + /\bconsider\s+\w+ing\b/i, // "consider validating", "consider adding tests" — gerund-form filler + /\b(?:typically|generally|usually)\s+,?\s+(?:we|you|one|it)/i, + /\bin\s+general\b/i, + /\bin\s+(?:some|most|all|certain)\s+cases\b/i, + /\bworth\s+(?:noting|considering|reviewing)/i, + /\bsuggests?\s+that\b/i, + + // Classic filler advice + /developer\s+should/i, + /ensure\s+proper\s+error\s+handling/i, + /consider\s+edge\s+cases/i, + /implement\s+secure\s+practices/i, + /(?:more|additional)\s+context/i, + /(?:unable|insufficient)\s+(?:to|context)/i, + /improve\s+(?:robustness|validation|maintainability|readability)/i, + /enhance\s+(?:the\s+)?(?:schema|validation|robustness)/i, + /review\s+and\s+refactor/i, + /needs?\s+(?:more|additional|further)\s+testing/i, + /add\s+(?:proper|appropriate)\s+(?:error\s+handling|validation|tests?)\b/i, + + // Pure hedge openers (claim starts with hedging without a target) + /^\s*(?:it\s+seems|it\s+appears|arguably|likely|probably)\b/i ]; export function isSlop(text) { @@ -137,6 +151,26 @@ export function isSlop(text) { return HEDGE_OR_SLOP_PATTERNS.some((re) => re.test(text)); } +/** + * Normalise a string for verbatim diff lookup. Strips trailing whitespace + * per line, normalises CRLF→LF, collapses runs of internal whitespace + * within a line. Models tend to paraphrase whitespace; the semantic + * content matters, not the spacing. + * + * Used by `filterFindings` for the quote-or-die check: the model's + * `quoted_line` and the candidate diff segment are both normalised before + * `String.includes`. + */ +export function normaliseForQuote(s) { + if (typeof s !== 'string') return ''; + return s + .replace(/\r\n/g, '\n') + .split('\n') + .map((line) => line.replace(/[ \t]+$/, '').replace(/\s+/g, ' ').trim()) + .filter((line) => line.length > 0) + .join('\n'); +} + const ALLOWED_VERDICTS = new Set(['approve', 'comment', 'request_changes']); /** @@ -201,11 +235,17 @@ export function tryParseReview(rawText) { */ export function filterFindings(findings, diffText) { if (!Array.isArray(findings)) return []; + // Normalise once per call. The model frequently paraphrases whitespace + // (CRLF↔LF, tab→spaces, trailing spaces stripped); without normalising + // we drop legitimate findings on a `String.includes` mismatch. + const haystack = (typeof diffText === 'string' && diffText.length > 0) + ? normaliseForQuote(diffText) + : ''; return findings.filter((f) => { if (isSlop(f.claim)) return false; - if (typeof diffText === 'string' && diffText.length > 0) { - const needle = f.quoted_line.trim(); - if (needle && !diffText.includes(needle)) return false; + if (haystack) { + const needle = normaliseForQuote(f.quoted_line); + if (needle && !haystack.includes(needle)) return false; } return true; });