Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion __tests__/unit/ai-review-prompt.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,15 @@ describe('isSlop', () => {
it.each([
'consider edge cases here',
'developer should validate input',
'this might lead to issues',
'could possibly fail in some cases',
'ensure proper error handling',
'in general, this is fine',
'more context is needed',
'unable to determine without seeing other files',
'you should consider refactoring',
'enhance validation around the boundary',
'it would be wise to add a fallback',
'worth noting that this branch is untested',
])('flags hedging/slop: %s', (s) => {
expect(isSlop(s)).toBe(true);
});
Expand All @@ -73,6 +74,15 @@ describe('isSlop', () => {
'config.yml:165 sets pr_labels including "dependencies" but the PR adds no actual dep update.',
'src/foo.js:42 dereferences userId before validating req.body is an object.',
'The mutex is acquired but never released on the early-return path at line 88.',
// Concrete claims that contain the words "could"/"might"/"should"/"may"
// are no longer rejected wholesale — wave-1 LLM agent found that bare-word
// matching dropped legitimate findings. The new rules look for hedging
// *phrases* (e.g., "we should consider", "it might be wise"), not the
// bare words.
'this could panic on null deref at line 42',
'function should not be called from async context — it blocks',
'the loop may overflow when n exceeds Number.MAX_SAFE_INTEGER',
'parseInt without radix might return NaN for hex strings',
])('passes a concrete claim: %s', (s) => {
expect(isSlop(s)).toBe(false);
});
Expand Down
86 changes: 63 additions & 23 deletions src/ai-review-prompt.js
Original file line number Diff line number Diff line change
Expand Up @@ -109,34 +109,68 @@ export function buildResponseFormat() {
};
}

// Slop patterns target *phrase shapes* of low-information review noise, not
// individual words. The earlier list killed legitimate findings like "this
// could panic on null deref at line 42" because of bare `\bcould\b`. Wave-1
// LLM agent flagged this regression class as Bug #12 in
// `docs/agent-fleet/bugs.md`. Each pattern below is anchored to a context
// that distinguishes filler ("we could improve X") from a concrete claim
// ("a null deref could happen here").
const HEDGE_OR_SLOP_PATTERNS = [
/\bmight\b/i,
/\bcould\b/i,
/\bmay\b/i,
/\bpossibly\b/i,
/\bperhaps\b/i,
/\bgenerally\b/i,
/\btypically\b/i,
/\bconsider\s+/i,
/\bshould\s+/i,
/developer should/i,
/ensure proper error handling/i,
/consider edge cases/i,
/implement secure practices/i,
/(more|additional)\s+context/i,
/(unable|insufficient)\s+(to|context)/i,
/improve (robustness|validation)/i,
/enhance (the )?(schema|validation)/i,
/review and refactor/i,
/in (most )?cases/i,
/in general/i
// Self-deprecation / non-claims
/\b(?:perhaps|possibly)\s+(?:we|you|the developer|consider)/i,
/\bit\s+(?:might|may|could)\s+be\s+(?:wise|good|better|worth)/i,
/\bit\s+would\s+be\s+(?:wise|good|better|worth)/i,
/\b(?:we|you)\s+should\s+(?:consider|think about|review|look into)/i,
/\bconsider\s+\w+ing\b/i, // "consider validating", "consider adding tests" — gerund-form filler
/\b(?:typically|generally|usually)\s+,?\s+(?:we|you|one|it)/i,
/\bin\s+general\b/i,
/\bin\s+(?:some|most|all|certain)\s+cases\b/i,
/\bworth\s+(?:noting|considering|reviewing)/i,
/\bsuggests?\s+that\b/i,

// Classic filler advice
/developer\s+should/i,
/ensure\s+proper\s+error\s+handling/i,
/consider\s+edge\s+cases/i,
/implement\s+secure\s+practices/i,
/(?:more|additional)\s+context/i,
/(?:unable|insufficient)\s+(?:to|context)/i,
/improve\s+(?:robustness|validation|maintainability|readability)/i,
/enhance\s+(?:the\s+)?(?:schema|validation|robustness)/i,
/review\s+and\s+refactor/i,
/needs?\s+(?:more|additional|further)\s+testing/i,
/add\s+(?:proper|appropriate)\s+(?:error\s+handling|validation|tests?)\b/i,

// Pure hedge openers (claim starts with hedging without a target)
/^\s*(?:it\s+seems|it\s+appears|arguably|likely|probably)\b/i
];

export function isSlop(text) {
if (typeof text !== 'string') return true;
return HEDGE_OR_SLOP_PATTERNS.some((re) => re.test(text));
}

/**
* Normalise a string for verbatim diff lookup. Strips trailing whitespace
* per line, normalises CRLF→LF, collapses runs of internal whitespace
* within a line. Models tend to paraphrase whitespace; the semantic
* content matters, not the spacing.
*
* Used by `filterFindings` for the quote-or-die check: the model's
* `quoted_line` and the candidate diff segment are both normalised before
* `String.includes`.
*/
export function normaliseForQuote(s) {
if (typeof s !== 'string') return '';
return s
.replace(/\r\n/g, '\n')
.split('\n')
.map((line) => line.replace(/[ \t]+$/, '').replace(/\s+/g, ' ').trim())
.filter((line) => line.length > 0)
.join('\n');
}

const ALLOWED_VERDICTS = new Set(['approve', 'comment', 'request_changes']);

/**
Expand Down Expand Up @@ -201,11 +235,17 @@ export function tryParseReview(rawText) {
*/
export function filterFindings(findings, diffText) {
if (!Array.isArray(findings)) return [];
// Normalise once per call. The model frequently paraphrases whitespace
// (CRLF↔LF, tab→spaces, trailing spaces stripped); without normalising
// we drop legitimate findings on a `String.includes` mismatch.
const haystack = (typeof diffText === 'string' && diffText.length > 0)
? normaliseForQuote(diffText)
: '';
return findings.filter((f) => {
if (isSlop(f.claim)) return false;
if (typeof diffText === 'string' && diffText.length > 0) {
const needle = f.quoted_line.trim();
if (needle && !diffText.includes(needle)) return false;
if (haystack) {
const needle = normaliseForQuote(f.quoted_line);
if (needle && !haystack.includes(needle)) return false;
}
return true;
});
Expand Down
Loading