diff --git a/.claude/agents/compliance.md b/.claude/agents/compliance.md new file mode 100644 index 0000000..262021c --- /dev/null +++ b/.claude/agents/compliance.md @@ -0,0 +1,8 @@ +--- +name: compliance +model: claude-sonnet-4-5 +description: CLAUDE.md compliance specialist +--- + +Audit changed files against relevant CLAUDE.md guidance. +Return only JSON findings with concrete rule references. diff --git a/.claude/agents/quality.md b/.claude/agents/quality.md new file mode 100644 index 0000000..9b64144 --- /dev/null +++ b/.claude/agents/quality.md @@ -0,0 +1,8 @@ +--- +name: quality +model: claude-opus-4-6 +description: Code quality specialist for correctness and reliability +--- + +Find high-signal correctness, reliability, and performance issues. +Return only JSON findings. diff --git a/.claude/agents/security.md b/.claude/agents/security.md new file mode 100644 index 0000000..2295274 --- /dev/null +++ b/.claude/agents/security.md @@ -0,0 +1,8 @@ +--- +name: security +model: claude-opus-4-6 +description: Security specialist for exploitable vulnerabilities +--- + +Find exploitable vulnerabilities in changed code with concrete attack paths. +Return only JSON findings including exploit preconditions and trust boundary. diff --git a/.claude/agents/triage.md b/.claude/agents/triage.md new file mode 100644 index 0000000..69fa84c --- /dev/null +++ b/.claude/agents/triage.md @@ -0,0 +1,8 @@ +--- +name: triage +model: claude-haiku-4-5 +description: Fast PR triage for skip/continue decisions +--- + +Determine whether review can be skipped safely. +Return only JSON with `skip_review`, `reason`, and `risk_level`. diff --git a/.claude/agents/validator.md b/.claude/agents/validator.md new file mode 100644 index 0000000..1d69298 --- /dev/null +++ b/.claude/agents/validator.md @@ -0,0 +1,8 @@ +--- +name: validator +model: claude-sonnet-4-5 +description: Finding validation and deduplication specialist +--- + +Validate candidate findings with strict confidence and impact criteria. +Return only JSON decisions for keep/drop. diff --git a/.claude/commands/review.md b/.claude/commands/review.md index 50ae979..9a9e14f 100644 --- a/.claude/commands/review.md +++ b/.claude/commands/review.md @@ -40,6 +40,12 @@ To do this, follow these steps precisely: Agent 4: Opus security agent Look for security vulnerabilities in the introduced code. This includes injection, auth bypass, data exposure, unsafe deserialization, or other exploitable issues. Only look for issues that fall within the changed code. + Security evidence requirements for every reported issue: + - Include a concrete exploit or abuse path. + - Include attacker preconditions. + - Identify the impacted trust boundary or sensitive asset. + - Provide an actionable mitigation. + **CRITICAL: We only want HIGH SIGNAL issues.** Flag issues where: - The code will fail to compile or parse (syntax errors, type errors, missing imports, unresolved references) - The code will definitely produce wrong results regardless of inputs (clear logic errors) @@ -52,6 +58,7 @@ To do this, follow these steps precisely: - Subjective suggestions or improvements - Security issues that depend on speculative inputs or unverified assumptions - Denial of Service (DoS) or rate limiting issues without concrete exploitability + - Findings based only on diff snippets without validating surrounding repository context If you are not certain an issue is real, do not flag it. False positives erode trust and waste reviewer time. diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 64a20e9..9be849d 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -29,3 +29,4 @@ jobs: comment-pr: true upload-results: true claude-api-key: ${{ secrets.CLAUDE_API_KEY }} + trigger-on-commit: true diff --git a/.gitignore b/.gitignore index 8b8dd8e..dd3b326 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,22 @@ +# OS-generated files +.DS_Store +Thumbs.db + # Cache directories .cache/ +.pytest_cache/ # Python __pycache__/ *.py[cod] *$py.class *.pyc +.python-version +.mypy_cache/ +.ruff_cache/ +.coverage +.coverage.* +htmlcov/ # Output files *.csv @@ -21,4 +32,17 @@ env/ claudecode/claudecode-prompt.txt eval_results/ -.env \ No newline at end of file +.env +.env.* + +# Editor / IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Node / Bun +node_modules/ + +# Logs +*.log diff --git a/README.md b/README.md index 2474766..b55c320 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Nutrient Code Reviewer -An AI-powered code review GitHub Action using Claude to analyze code changes. Uses a unified multi-agent approach for both code quality (correctness, reliability, performance, maintainability, testing) and security in a single pass. This action provides intelligent, context-aware review for pull requests using Anthropic's Claude Code tool for deep semantic analysis. +An AI-powered code review GitHub Action using Claude to analyze code changes. Uses a unified multi-agent, multi-phase approach for both code quality (correctness, reliability, performance, maintainability, testing) and security. This action provides intelligent, context-aware review for pull requests using Anthropic's Claude Code tool for deep semantic analysis. Based on the original work from [anthropics/claude-code-security-review](https://github.com/anthropics/claude-code-security-review). @@ -111,7 +111,12 @@ This action is not hardened against prompt injection attacks and should only be | `comment-pr` | Whether to comment on PRs with findings | `true` | No | | `upload-results` | Whether to upload results as artifacts | `true` | No | | `exclude-directories` | Comma-separated list of directories to exclude from scanning | None | No | -| `claude-model` | Claude [model name](https://docs.anthropic.com/en/docs/about-claude/models/overview#model-names) to use. Defaults to Opus 4.5. | `claude-opus-4-5-20251101` | No | +| `claude-model` | Claude [model name](https://docs.anthropic.com/en/docs/about-claude/models/overview#model-names) to use. Defaults to Opus 4.6. | `claude-opus-4-6` | No | +| `model-triage` | Model used for triage phase (skip/continue decision). | `claude-haiku-4-5` | No | +| `model-compliance` | Model used for CLAUDE.md compliance phase. | `claude-sonnet-4-5` | No | +| `model-quality` | Model used for code quality phase. | `claude-opus-4-6` | No | +| `model-security` | Model used for security phase. | `claude-opus-4-6` | No | +| `model-validation` | Model used for finding validation phase. | `claude-sonnet-4-5` | No | | `claudecode-timeout` | Timeout for ClaudeCode analysis in minutes | `20` | No | | `run-every-commit` | Run ClaudeCode on every commit (skips cache check). Warning: May increase false positives on PRs with many commits. **Deprecated**: Use `trigger-on-commit` instead. | `false` | No | | `trigger-on-open` | Run review when PR is first opened | `true` | No | @@ -127,6 +132,7 @@ This action is not hardened against prompt injection attacks and should only be | `skip-draft-prs` | Skip code review on draft pull requests | `true` | No | | `app-slug` | GitHub App slug for bot mention detection. If using `actions/create-github-app-token@v1.9.0+`, pass `${{ steps.app-token.outputs.app-slug }}`. Otherwise defaults to `github-actions`. | `github-actions` | No | | `require-label` | Only run review if this label is present. Leave empty to review all PRs. Add `labeled` to your workflow `pull_request` types to trigger on label addition. | None | No | +| `max-diff-lines` | Maximum inline diff lines included as prompt anchor; repository tool reads are still required in all cases. | `5000` | No | ### Action Outputs @@ -294,11 +300,12 @@ claudecode/ ### Workflow -1. **PR Analysis**: When a pull request is opened, Claude analyzes the diff to understand what changed -2. **Contextual Review**: Claude examines the code changes in context, understanding the purpose and potential impacts -3. **Finding Generation**: Issues are identified with detailed explanations, severity ratings, and remediation guidance -4. **False Positive Filtering**: Advanced filtering removes low-impact or false positive prone findings to reduce noise -5. **PR Comments**: Findings are posted as review comments on the specific lines of code +1. **Triage Phase**: A fast triage pass determines if review should proceed. +2. **Context Discovery**: Claude discovers relevant CLAUDE.md files, hotspots, and risky code paths. +3. **Specialist Review**: Dedicated compliance, quality, and security phases run with configurable models. +4. **Validation Phase**: Candidate findings are validated and deduplicated for high signal. +5. **False Positive Filtering**: Additional filtering removes low-impact noise. +6. **PR Comments**: Findings are posted as review comments on specific lines in the PR. ## Review Capabilities diff --git a/action.yml b/action.yml index ec846aa..a5708a5 100644 --- a/action.yml +++ b/action.yml @@ -29,10 +29,35 @@ inputs: default: '' claude-model: - description: 'Claude model to use for code review analysis (e.g., claude-sonnet-4-20250514)' + description: 'Claude model to use for code review analysis (e.g., claude-sonnet-4-5)' required: false default: '' + model-triage: + description: 'Model for triage phase' + required: false + default: 'claude-haiku-4-5' + + model-compliance: + description: 'Model for CLAUDE.md compliance phase' + required: false + default: 'claude-sonnet-4-5' + + model-quality: + description: 'Model for code quality phase' + required: false + default: 'claude-opus-4-6' + + model-security: + description: 'Model for security phase' + required: false + default: 'claude-opus-4-6' + + model-validation: + description: 'Model for validation phase' + required: false + default: 'claude-sonnet-4-5' + run-every-commit: description: 'DEPRECATED: Use trigger-on-commit instead. Run ClaudeCode on every commit (skips cache check). Warning: This may lead to more false positives on PRs with many commits as the AI analyzes the same code multiple times.' required: false @@ -351,6 +376,11 @@ runs: CUSTOM_REVIEW_INSTRUCTIONS: ${{ inputs.custom-review-instructions }} CUSTOM_SECURITY_SCAN_INSTRUCTIONS: ${{ inputs.custom-security-scan-instructions }} CLAUDE_MODEL: ${{ inputs.claude-model }} + MODEL_TRIAGE: ${{ inputs.model-triage }} + MODEL_COMPLIANCE: ${{ inputs.model-compliance }} + MODEL_QUALITY: ${{ inputs.model-quality }} + MODEL_SECURITY: ${{ inputs.model-security }} + MODEL_VALIDATION: ${{ inputs.model-validation }} CLAUDECODE_TIMEOUT: ${{ inputs.claudecode-timeout }} MAX_DIFF_LINES: ${{ inputs.max-diff-lines }} ACTION_PATH: ${{ github.action_path }} diff --git a/claudecode/__init__.py b/claudecode/__init__.py index d0cbfdc..07611ed 100644 --- a/claudecode/__init__.py +++ b/claudecode/__init__.py @@ -12,11 +12,16 @@ from claudecode.github_action_audit import ( GitHubActionClient, SimpleClaudeRunner, + get_review_model_config, main ) +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator __all__ = [ "GitHubActionClient", "SimpleClaudeRunner", + "ReviewModelConfig", + "ReviewOrchestrator", + "get_review_model_config", "main" ] diff --git a/claudecode/claude_api_client.py b/claudecode/claude_api_client.py index 5504b93..849f733 100644 --- a/claudecode/claude_api_client.py +++ b/claudecode/claude_api_client.py @@ -59,7 +59,7 @@ def validate_api_access(self) -> Tuple[bool, str]: try: # Simple test call to verify API access self.client.messages.create( - model="claude-3-5-haiku-20241022", + model="claude-haiku-4-5", max_tokens=10, messages=[{"role": "user", "content": "Hello"}], timeout=10 diff --git a/claudecode/constants.py b/claudecode/constants.py index bbf7988..6c81257 100644 --- a/claudecode/constants.py +++ b/claudecode/constants.py @@ -5,7 +5,7 @@ import os # API Configuration -DEFAULT_CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL') or 'claude-opus-4-5-20251101' +DEFAULT_CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL') or 'claude-opus-4-6' DEFAULT_TIMEOUT_SECONDS = 180 # 3 minutes DEFAULT_MAX_RETRIES = 3 RATE_LIMIT_BACKOFF_MAX = 30 # Maximum backoff time for rate limits @@ -20,4 +20,3 @@ # Subprocess Configuration SUBPROCESS_TIMEOUT = 1200 # 20 minutes for Claude Code execution - diff --git a/claudecode/example_utils.py b/claudecode/example_utils.py new file mode 100644 index 0000000..38f2800 --- /dev/null +++ b/claudecode/example_utils.py @@ -0,0 +1,50 @@ +"""Example utilities with intentional issues for testing code review.""" + +import pickle +import subprocess +import os + + +def load_user_data(serialized_data): + """Load user data from serialized format.""" + # Security issue: unsafe pickle deserialization + return pickle.loads(serialized_data) + + +def run_command(user_input): + """Run a shell command based on user input.""" + # Security issue: command injection + result = subprocess.run(f"echo {user_input}", shell=True, capture_output=True) + return result.stdout.decode() + + +def read_file(filename): + """Read a file from disk.""" + # Security issue: path traversal + path = f"/data/{filename}" + with open(path, "r") as f: + return f.read() + + +def divide_numbers(a, b): + """Divide two numbers.""" + # Code quality issue: no zero division check + return a / b + + +def process_items(items): + """Process a list of items.""" + results = [] + for i in range(len(items)): + # Code quality issue: inefficient iteration + for j in range(len(items)): + if items[i] == items[j]: + results.append(items[i]) + return results + + +def get_user_by_id(user_id, connection): + """Get user from database.""" + # Security issue: SQL injection + query = f"SELECT * FROM users WHERE id = {user_id}" + return connection.execute(query) diff --git a/claudecode/findings_merge.py b/claudecode/findings_merge.py new file mode 100644 index 0000000..aae73cf --- /dev/null +++ b/claudecode/findings_merge.py @@ -0,0 +1,61 @@ +"""Utilities for merging and deduplicating findings from multiple phases.""" + +from typing import Any, Dict, List, Tuple + + +def _normalize_text(value: Any) -> str: + return str(value or "").strip().lower() + + +def _finding_key(finding: Dict[str, Any]) -> Tuple[str, int, str, str]: + file_path = _normalize_text(finding.get("file")) + line = finding.get("line") + try: + line_no = int(line) + except (TypeError, ValueError): + line_no = 1 + category = _normalize_text(finding.get("category")) + title = _normalize_text(finding.get("title")) + return file_path, line_no, category, title + + +def _severity_rank(value: Any) -> int: + sev = _normalize_text(value).upper() + if sev == "HIGH": + return 3 + if sev == "MEDIUM": + return 2 + if sev == "LOW": + return 1 + return 0 + + +def _confidence_value(value: Any) -> float: + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def merge_findings(findings: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Merge duplicate findings and keep the strongest candidate.""" + merged: Dict[Tuple[str, int, str, str], Dict[str, Any]] = {} + + for finding in findings: + if not isinstance(finding, dict): + continue + + key = _finding_key(finding) + existing = merged.get(key) + + if existing is None: + merged[key] = finding + continue + + incoming_score = (_severity_rank(finding.get("severity")), _confidence_value(finding.get("confidence"))) + existing_score = (_severity_rank(existing.get("severity")), _confidence_value(existing.get("confidence"))) + + if incoming_score > existing_score: + merged[key] = finding + + return list(merged.values()) diff --git a/claudecode/github_action_audit.py b/claudecode/github_action_audit.py index dbf6766..2ba78d4 100644 --- a/claudecode/github_action_audit.py +++ b/claudecode/github_action_audit.py @@ -15,10 +15,11 @@ import time # Import existing components we can reuse -from claudecode.prompts import get_unified_review_prompt from claudecode.findings_filter import FindingsFilter from claudecode.json_parser import parse_json_with_fallbacks from claudecode.format_pr_comments import format_pr_comments_for_prompt, is_bot_comment +from claudecode.prompts import get_unified_review_prompt # Backward-compatible import for tests/extensions. +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator from claudecode.constants import ( EXIT_CONFIGURATION_ERROR, DEFAULT_CLAUDE_MODEL, @@ -186,33 +187,25 @@ def get_pr_data(self, repo_name: str, pr_number: int) -> Dict[str, Any]: def get_pr_diff(self, repo_name: str, pr_number: int) -> str: """Get complete PR diff in unified format. - + Args: repo_name: Repository name in format "owner/repo" pr_number: Pull request number - + Returns: Complete PR diff in unified format """ url = f"https://api.github.com/repos/{repo_name}/pulls/{pr_number}" headers = dict(self.headers) headers['Accept'] = 'application/vnd.github.diff' - + response = requests.get(url, headers=headers) response.raise_for_status() - + return self._filter_generated_files(response.text) def get_pr_comments(self, repo_name: str, pr_number: int) -> List[Dict[str, Any]]: - """Get all review comments for a PR with pagination. - - Args: - repo_name: Repository name in format "owner/repo" - pr_number: Pull request number - - Returns: - List of comment dictionaries from GitHub API - """ + """Get all review comments for a PR with pagination.""" all_comments = [] page = 1 per_page = 100 @@ -230,13 +223,9 @@ def get_pr_comments(self, repo_name: str, pr_number: int) -> List[Dict[str, Any] break all_comments.extend(comments) - - # Check if there are more pages if len(comments) < per_page: break - page += 1 - except requests.RequestException as e: logger.warning(f"Failed to fetch comments page {page}: {e}") break @@ -244,15 +233,7 @@ def get_pr_comments(self, repo_name: str, pr_number: int) -> List[Dict[str, Any] return all_comments def get_comment_reactions(self, repo_name: str, comment_id: int) -> Dict[str, int]: - """Get reactions for a specific comment, excluding bot reactions. - - Args: - repo_name: Repository name in format "owner/repo" - comment_id: The comment ID to fetch reactions for - - Returns: - Dictionary with reaction counts (e.g., {'+1': 3, '-1': 1}) - """ + """Get reactions for a specific comment, excluding bot reactions.""" url = f"https://api.github.com/repos/{repo_name}/pulls/comments/{comment_id}/reactions" try: @@ -260,24 +241,19 @@ def get_comment_reactions(self, repo_name: str, comment_id: int) -> Dict[str, in response.raise_for_status() reactions = response.json() - # Count reactions, excluding those from bots - counts = {} + counts: Dict[str, int] = {} for reaction in reactions: user = reaction.get('user', {}) - # Skip bot reactions (the bot adds its own 👍👎 as seeds) if user.get('type') == 'Bot': continue - content = reaction.get('content', '') if content: counts[content] = counts.get(content, 0) + 1 - return counts - except requests.RequestException as e: logger.debug(f"Failed to fetch reactions for comment {comment_id}: {e}") return {} - + def _is_excluded(self, filepath: str) -> bool: """Check if a file should be excluded based on directory or file patterns.""" import fnmatch @@ -310,6 +286,10 @@ def _is_excluded(self, filepath: str) -> bool: return True return False + + def is_excluded(self, filepath: str) -> bool: + """Public wrapper for exclusion checks.""" + return self._is_excluded(filepath) def _filter_generated_files(self, diff_text: str) -> str: """Filter out generated files and excluded directories from diff content.""" @@ -355,113 +335,121 @@ def __init__(self, timeout_minutes: Optional[int] = None): else: self.timeout_seconds = SUBPROCESS_TIMEOUT - def run_code_review(self, repo_dir: Path, prompt: str) -> Tuple[bool, str, Dict[str, Any]]: - """Run Claude Code review. - - Args: - repo_dir: Path to repository directory - prompt: Code review prompt - - Returns: - Tuple of (success, error_message, parsed_results) - """ + def run_prompt( + self, + repo_dir: Path, + prompt: str, + model: Optional[str] = None, + json_schema: Optional[Dict[str, Any]] = None, + ) -> Tuple[bool, str, Any]: + """Run a single Claude prompt and return parsed JSON payload or text.""" if not repo_dir.exists(): return False, f"Repository directory does not exist: {repo_dir}", {} - - # Check prompt size + prompt_size = len(prompt.encode('utf-8')) if prompt_size > 1024 * 1024: # 1MB print(f"[Warning] Large prompt size: {prompt_size / 1024 / 1024:.2f}MB", file=sys.stderr) - + + model_name = model or DEFAULT_CLAUDE_MODEL try: - # Construct Claude Code command - # Use stdin for prompt to avoid "argument list too long" error cmd = [ 'claude', '--output-format', 'json', - '--model', DEFAULT_CLAUDE_MODEL, + '--model', model_name, '--disallowed-tools', 'Bash(ps:*)', - '--json-schema', json.dumps(REVIEW_OUTPUT_SCHEMA) ] - - # Run Claude Code with retry logic + if json_schema: + cmd.extend(['--json-schema', json.dumps(json_schema)]) + NUM_RETRIES = 3 for attempt in range(NUM_RETRIES): result = subprocess.run( cmd, - input=prompt, # Pass prompt via stdin + input=prompt, cwd=repo_dir, capture_output=True, text=True, timeout=self.timeout_seconds ) - # Parse JSON output (even if returncode != 0, to detect specific errors) - success, parsed_result = parse_json_with_fallbacks(result.stdout, "Claude Code output") + if result.returncode != 0: + if attempt == NUM_RETRIES - 1: + error_details = f"Claude Code execution failed with return code {result.returncode}\n" + error_details += f"Stderr: {result.stderr}\n" + error_details += f"Stdout: {result.stdout[:500]}..." + return False, error_details, {} + time.sleep(5 * (attempt + 1)) + continue + success, parsed_result = parse_json_with_fallbacks(result.stdout, "Claude Code output") if success: - # Check for "Prompt is too long" error that should trigger fallback to agentic mode - if (isinstance(parsed_result, dict) and - parsed_result.get('type') == 'result' and + if (isinstance(parsed_result, dict) and + parsed_result.get('type') == 'result' and parsed_result.get('subtype') == 'success' and parsed_result.get('is_error') and parsed_result.get('result') == 'Prompt is too long'): return False, "PROMPT_TOO_LONG", {} - # Check for error_during_execution that should trigger retry - if (isinstance(parsed_result, dict) and - parsed_result.get('type') == 'result' and + if (isinstance(parsed_result, dict) and + parsed_result.get('type') == 'result' and parsed_result.get('subtype') == 'error_during_execution' and - attempt == 0): - continue # Retry - - # If returncode is 0, extract review findings - if result.returncode == 0: - parsed_results = self._extract_review_findings(parsed_result) - return True, "", parsed_results + attempt < NUM_RETRIES - 1): + continue - # Handle non-zero return codes after parsing - if result.returncode != 0: - if attempt == NUM_RETRIES - 1: - error_details = f"Claude Code execution failed with return code {result.returncode}\n" - error_details += f"Stderr: {result.stderr}\n" - error_details += f"Stdout: {result.stdout[:500]}..." # First 500 chars - return False, error_details, {} - else: - time.sleep(5*attempt) - # Note: We don't do exponential backoff here to keep the runtime reasonable - continue # Retry + if isinstance(parsed_result, dict) and 'result' in parsed_result and isinstance(parsed_result['result'], str): + nested_success, nested = parse_json_with_fallbacks(parsed_result['result'], "Claude result text") + if nested_success: + return True, "", nested + return True, "", parsed_result - # Parse failed if attempt == 0: - continue # Retry once - else: - return False, "Failed to parse Claude output", {} - + continue + return False, "Failed to parse Claude output", {} + return False, "Unexpected error in retry logic", {} - except subprocess.TimeoutExpired: return False, f"Claude Code execution timed out after {self.timeout_seconds // 60} minutes", {} except Exception as e: return False, f"Claude Code execution error: {str(e)}", {} + + def run_code_review(self, repo_dir: Path, prompt: str, model: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]: + """Run code review prompt and normalize to findings payload.""" + success, error_msg, parsed = self.run_prompt( + repo_dir, + prompt, + model=model or DEFAULT_CLAUDE_MODEL, + json_schema=REVIEW_OUTPUT_SCHEMA, + ) + if not success: + return False, error_msg, {} + if isinstance(parsed, dict) and 'findings' in parsed: + return True, "", parsed + return True, "", self._extract_review_findings(parsed) def _extract_review_findings(self, claude_output: Any) -> Dict[str, Any]: - """Extract review findings and PR summary from Claude's JSON response.""" + """Extract review findings from Claude's JSON response.""" if isinstance(claude_output, dict): - # Only accept Claude Code wrapper with result field - # Direct format without wrapper is not supported if 'result' in claude_output: result_text = claude_output['result'] if isinstance(result_text, str): # Try to extract JSON from the result text success, result_json = parse_json_with_fallbacks(result_text, "Claude result text") - if success and result_json and 'findings' in result_json and 'pr_summary' in result_json: + if success and result_json and 'findings' in result_json: + if 'pr_summary' not in result_json: + result_json['pr_summary'] = {} return result_json - + # Return empty structure if no findings found return { 'findings': [], - 'pr_summary': {} + 'pr_summary': {}, + 'analysis_summary': { + 'files_reviewed': 0, + 'high_severity': 0, + 'medium_severity': 0, + 'low_severity': 0, + 'review_completed': False, + } } def validate_claude_available(self) -> Tuple[bool, str]: @@ -559,17 +547,14 @@ def initialize_findings_filter(custom_filtering_instructions: Optional[str] = No ConfigurationError: If filter initialization fails """ try: - # Check if we should use heuristic (pattern-based) filtering - use_heuristic_filtering = os.environ.get('ENABLE_HEURISTIC_FILTERING', 'true').lower() == 'true' - # Check if we should use Claude API filtering use_claude_filtering = os.environ.get('ENABLE_CLAUDE_FILTERING', 'false').lower() == 'true' api_key = os.environ.get('ANTHROPIC_API_KEY') - + if use_claude_filtering and api_key: # Use full filtering with Claude API return FindingsFilter( - use_hard_exclusions=use_heuristic_filtering, + use_hard_exclusions=True, use_claude_filtering=True, api_key=api_key, custom_filtering_instructions=custom_filtering_instructions @@ -577,13 +562,28 @@ def initialize_findings_filter(custom_filtering_instructions: Optional[str] = No else: # Fallback to filtering with hard rules only return FindingsFilter( - use_hard_exclusions=use_heuristic_filtering, + use_hard_exclusions=True, use_claude_filtering=False ) except Exception as e: raise ConfigurationError(f'Failed to initialize findings filter: {str(e)}') +def get_review_model_config() -> ReviewModelConfig: + """Resolve per-phase model configuration from environment.""" + return ReviewModelConfig.from_env(os.environ, DEFAULT_CLAUDE_MODEL) + + +def get_max_diff_lines() -> int: + """Return bounded inline diff budget in lines.""" + max_diff_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') + try: + parsed = int(max_diff_lines_str) + except ValueError: + parsed = 5000 + return max(0, parsed) + + def run_code_review(claude_runner: SimpleClaudeRunner, prompt: str) -> Dict[str, Any]: """Run the code review with Claude Code. @@ -744,35 +744,26 @@ def main(): print(json.dumps({'error': f'Failed to fetch PR data: {str(e)}'})) sys.exit(EXIT_GENERAL_ERROR) - # Fetch PR comments and build review context + # Backward-compatible context collection for review thread history. review_context = None try: pr_comments = github_client.get_pr_comments(repo_name, pr_number) if pr_comments: - # Build threads: find bot comments, their replies, and reactions bot_comment_threads = [] for comment in pr_comments: if is_bot_comment(comment): - # This is a bot comment (thread root) reactions = github_client.get_comment_reactions(repo_name, comment['id']) - - # Find replies to this comment replies = [ c for c in pr_comments if c.get('in_reply_to_id') == comment['id'] ] - # Sort replies by creation time replies.sort(key=lambda c: c.get('created_at', '')) - bot_comment_threads.append({ 'bot_comment': comment, 'replies': replies, 'reactions': reactions, }) - - # Sort threads by bot comment creation time (oldest first) bot_comment_threads.sort(key=lambda t: t['bot_comment'].get('created_at', '')) - if bot_comment_threads: review_context = format_pr_comments_for_prompt(bot_comment_threads) if review_context: @@ -781,110 +772,52 @@ def main(): logger.warning(f"Failed to fetch review context (continuing without it): {e}") review_context = None - # Determine whether to embed diff or use agentic file reading - max_diff_lines_str = os.environ.get('MAX_DIFF_LINES', '5000') - try: - max_diff_lines = int(max_diff_lines_str) - except ValueError: - max_diff_lines = 5000 - + max_diff_lines = get_max_diff_lines() diff_line_count = len(pr_diff.splitlines()) - use_agentic_mode = max_diff_lines == 0 or diff_line_count > max_diff_lines - - if use_agentic_mode: - print(f"[Info] Using agentic file reading mode (diff has {diff_line_count} lines, threshold: {max_diff_lines})", file=sys.stderr) + if max_diff_lines and diff_line_count > max_diff_lines: + print(f"[Info] Hybrid mode with truncated inline diff ({max_diff_lines}/{diff_line_count} lines)", file=sys.stderr) else: - print(f"[Debug] Embedding diff in prompt ({diff_line_count} lines)", file=sys.stderr) + print(f"[Info] Hybrid mode with full inline diff ({diff_line_count} lines)", file=sys.stderr) # Get repo directory from environment or use current directory repo_path = os.environ.get('REPO_PATH') repo_dir = Path(repo_path) if repo_path else Path.cwd() + model_config = get_review_model_config() + orchestrator = ReviewOrchestrator( + claude_runner=claude_runner, + findings_filter=findings_filter, + github_client=github_client, + model_config=model_config, + max_diff_lines=max_diff_lines, + ) - def run_review(include_diff: bool): - prompt_text = get_unified_review_prompt( - pr_data, - pr_diff if include_diff else None, - include_diff=include_diff, - custom_review_instructions=custom_review_instructions, - custom_security_instructions=custom_security_instructions, - review_context=review_context, - ) - return claude_runner.run_code_review(repo_dir, prompt_text), len(prompt_text) - - all_findings = [] - pr_summary_from_review = {} - - try: - (success, error_msg, review_results), prompt_len = run_review(include_diff=not use_agentic_mode) - - # Fallback to agentic mode if prompt still too long - if not success and error_msg == "PROMPT_TOO_LONG": - print(f"[Info] Prompt too long ({prompt_len} chars), falling back to agentic mode", file=sys.stderr) - (success, error_msg, review_results), prompt_len = run_review(include_diff=False) - - if not success: - raise AuditError(f'Code review failed: {error_msg}') - - pr_summary_from_review = review_results.get('pr_summary', {}) - for finding in review_results.get('findings', []): - if isinstance(finding, dict): - # Set review_type based on category - category = finding.get('category', '').lower() - if category == 'security': - finding.setdefault('review_type', 'security') - else: - finding.setdefault('review_type', 'general') - all_findings.append(finding) - - except AuditError as e: - print(json.dumps({'error': f'Code review failed: {str(e)}'})) + success, review_results, review_error = orchestrator.run( + repo_dir=repo_dir, + pr_data=pr_data, + pr_diff=pr_diff, + custom_review_instructions=custom_review_instructions, + custom_security_instructions=custom_security_instructions, + ) + if not success: + print(json.dumps({'error': f'Code review failed: {review_error}'})) sys.exit(EXIT_GENERAL_ERROR) - # Filter findings to reduce false positives - original_findings = all_findings - - # Prepare PR context for better filtering - pr_context = { - 'repo_name': repo_name, - 'pr_number': pr_number, - 'title': pr_data.get('title', ''), - 'description': pr_data.get('body', '') - } - - # Apply findings filter (including final directory exclusion) - kept_findings, excluded_findings, analysis_summary = apply_findings_filter( - findings_filter, original_findings, pr_context, github_client - ) - - # Prepare output summary - def severity_counts(findings_list): - high = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) - medium = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'MEDIUM']) - low = len([f for f in findings_list if isinstance(f, dict) and f.get('severity', '').upper() == 'LOW']) - return high, medium, low - - high_count, medium_count, low_count = severity_counts(kept_findings) - - # Calculate files_reviewed from pr_summary.file_changes - files_reviewed = 0 - if isinstance(pr_summary_from_review, dict): - file_changes = pr_summary_from_review.get('file_changes', []) - if isinstance(file_changes, list): - # Count unique files from all 'files' arrays - all_files = set() - for entry in file_changes: - if isinstance(entry, dict): - files_list = entry.get('files', []) - if isinstance(files_list, list): - all_files.update(files_list) - files_reviewed = len(all_files) + findings = review_results.get('findings', []) + analysis_summary = review_results.get('analysis_summary', {}) + high_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) + medium_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'MEDIUM']) + low_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'LOW']) + files_reviewed = analysis_summary.get('files_reviewed', pr_data.get('changed_files', 0)) + try: + files_reviewed = int(files_reviewed) + except (TypeError, ValueError): + files_reviewed = 0 # Prepare output output = { 'pr_number': pr_number, 'repo': repo_name, - 'pr_summary': pr_summary_from_review, - 'findings': kept_findings, + 'findings': findings, 'analysis_summary': { 'files_reviewed': files_reviewed, 'high_severity': high_count, @@ -892,20 +825,18 @@ def severity_counts(findings_list): 'low_severity': low_count, 'review_completed': True }, - 'filtering_summary': { - 'total_original_findings': len(original_findings), - 'excluded_findings': len(excluded_findings), - 'kept_findings': len(kept_findings), - 'filter_analysis': analysis_summary, - 'excluded_findings_details': excluded_findings # Include full details of what was filtered - } + 'filtering_summary': review_results.get('filtering_summary', { + 'total_original_findings': len(findings), + 'excluded_findings': 0, + 'kept_findings': len(findings), + }) } # Output JSON to stdout print(json.dumps(output, indent=2)) # Exit with appropriate code - high_severity_count = len([f for f in kept_findings if f.get('severity', '').upper() == 'HIGH']) + high_severity_count = len([f for f in findings if isinstance(f, dict) and f.get('severity', '').upper() == 'HIGH']) sys.exit(EXIT_GENERAL_ERROR if high_severity_count > 0 else EXIT_SUCCESS) except Exception as e: diff --git a/claudecode/prompts.py b/claudecode/prompts.py index 6786858..9ae3f2e 100644 --- a/claudecode/prompts.py +++ b/claudecode/prompts.py @@ -1,13 +1,47 @@ -"""Code review prompt templates.""" +"""Prompt templates for multi-phase PR review orchestration.""" +from typing import Any, Dict, List, Optional -def _format_files_changed(pr_data): - """Format changed files for prompt context.""" - return "\n".join([f"- {f['filename']}" for f in pr_data['files']]) +COMPLIANCE_EXTRA_FIELDS = ',\n "rule_reference": "path/to/CLAUDE.md#section"' +SECURITY_EXTRA_FIELDS = ( + ',\n "exploit_preconditions": "...",\n "trust_boundary": "...",\n' + ' "cwe": "optional CWE-###"' +) -def _build_diff_section(pr_diff, include_diff): - """Build prompt section for inline diff or agentic file reading.""" +def _format_files_changed(pr_data: Dict[str, Any]) -> str: + """Format changed files for prompt context.""" + files = pr_data.get("files", []) + return "\n".join([f"- {f.get('filename', 'unknown')}" for f in files]) + + +def _build_hybrid_diff_section(pr_diff: str, max_lines: int) -> str: + """Build a bounded inline diff section while requiring tool-based context reads.""" + if not pr_diff: + return "\nNo inline diff available. Use repository tools to inspect changed files.\n" + if max_lines == 0: + return ( + "\nInline diff intentionally omitted (max-diff-lines=0). " + "Use repository tools to inspect changed files and context.\n" + ) + + lines = pr_diff.splitlines() + if max_lines > 0 and len(lines) > max_lines: + shown = "\n".join(lines[:max_lines]) + truncated_note = ( + f"\n[Diff truncated to {max_lines} lines out of {len(lines)} total lines. " + "You MUST use repository tools to inspect full context and missing hunks.]" + ) + return f"\nINLINE DIFF ANCHOR (TRUNCATED):\n```diff\n{shown}\n```{truncated_note}\n" + + return ( + "\nINLINE DIFF ANCHOR:\n" + f"```diff\n{pr_diff}\n```\n" + "Use this as a starting point only. You MUST validate findings with repository tool reads.\n" + ) + +def _build_diff_section(pr_diff: Optional[str], include_diff: bool) -> str: + """Build unified-prompt diff section for backward compatibility.""" if pr_diff and include_diff: return f""" @@ -32,174 +66,288 @@ def _build_diff_section(pr_diff, include_diff): """ -def get_unified_review_prompt( - pr_data, - pr_diff=None, - include_diff=True, - custom_review_instructions=None, - custom_security_instructions=None, - review_context=None, -): - """Generate unified code review + security prompt for Claude Code. - - This prompt covers both code quality (correctness, reliability, performance, - maintainability, testing) and security in a single pass. - - Args: - pr_data: PR data dictionary from GitHub API - pr_diff: Optional complete PR diff in unified format - include_diff: Whether to include the diff in the prompt (default: True) - custom_review_instructions: Optional custom review instructions to append - custom_security_instructions: Optional custom security instructions to append - review_context: Optional previous review context (bot findings and user replies) - - Returns: - Formatted prompt string - """ - +def _base_context_block(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Shared context block used across prompts.""" files_changed = _format_files_changed(pr_data) - diff_section = _build_diff_section(pr_diff, include_diff) + return f""" +PR CONTEXT: +- PR Number: {pr_data.get('number', 'unknown')} +- Title: {pr_data.get('title', 'unknown')} +- Author: {pr_data.get('user', 'unknown')} +- Repository: {pr_data.get('head', {}).get('repo', {}).get('full_name', 'unknown')} +- Files changed: {pr_data.get('changed_files', 0)} +- Lines added: {pr_data.get('additions', 0)} +- Lines deleted: {pr_data.get('deletions', 0)} +- PR body: {pr_data.get('body', '') or 'No description'} + +MODIFIED FILES: +{files_changed or '- None listed'} +{_build_hybrid_diff_section(pr_diff, max_diff_lines)} +MANDATORY CONTEXT VALIDATION RULES: +1. You MUST use repository tools to read each relevant changed file before finalizing findings. +2. For every finding, verify at least one additional contextual location (caller, callee, config, or sibling path). +3. Do not rely on inline diff alone, even when diff is fully present. +""" + + +def _findings_output_schema(extra_fields: str = "") -> str: + return f""" +OUTPUT JSON SCHEMA (exact keys): +{{ + "findings": [ + {{ + "file": "path/to/file.py", + "line": 42, + "severity": "HIGH|MEDIUM|LOW", + "category": "correctness|reliability|performance|maintainability|testing|security|compliance", + "title": "Short issue title", + "description": "What is wrong", + "impact": "Concrete failure mode or exploit path", + "recommendation": "Actionable fix", + "confidence": 0.93{extra_fields} + }} + ], + "analysis_summary": {{ + "files_reviewed": 0, + "high_severity": 0, + "medium_severity": 0, + "low_severity": 0, + "review_completed": true + }} +}} +""" + + +def build_triage_prompt(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Prompt for triage phase.""" + return f""" +You are the triage specialist for a pull request review. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +Decide whether review should be skipped. +Skip only if one of the following is true: +- PR is clearly trivial and cannot contain correctness/security/compliance risk +- PR is obviously generated deployment churn with no business logic changes +- There are no meaningful code changes in reviewed files + +Return JSON only: +{{ + "skip_review": false, + "reason": "short reason", + "risk_level": "low|medium|high" +}} +""" + + +def build_context_discovery_prompt(pr_data: Dict[str, Any], pr_diff: str, max_diff_lines: int) -> str: + """Prompt for context discovery phase.""" + return f""" +You are the repository context specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +Tasks: +1. Find relevant CLAUDE.md files: root and those in changed-file parent paths. +2. Summarize PR intent and risky hotspots. +3. Identify top files for deep review. + +Return JSON only: +{{ + "claude_md_files": ["path/CLAUDE.md"], + "change_summary": "brief summary", + "hotspots": ["path/to/file"], + "priority_files": ["path/to/file"] +}} +""" + + +def build_compliance_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], +) -> str: + """Prompt for CLAUDE.md compliance analysis.""" + context_json = discovered_context or {} + return f""" +You are the CLAUDE.md compliance specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +DISCOVERED CONTEXT: +{context_json} + +Focus exclusively on clear CLAUDE.md violations in changed code. Cite concrete violated rule text in each finding. +Reject ambiguous or preference-only claims. - custom_review_section = "" - if custom_review_instructions: - custom_review_section = f"\n{custom_review_instructions}\n" + {_findings_output_schema(COMPLIANCE_EXTRA_FIELDS)} + +Return JSON only. +""" + + +def build_quality_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], + custom_review_instructions: Optional[str] = None, +) -> str: + """Prompt for code quality analysis.""" + custom_block = f"\nCUSTOM QUALITY INSTRUCTIONS:\n{custom_review_instructions}\n" if custom_review_instructions else "" + return f""" +You are the code quality specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +DISCOVERED CONTEXT: +{discovered_context or {}} + +Focus on high-signal issues only: +- correctness and logic defects +- reliability regressions +- significant performance regressions +- maintainability risks with concrete failure/bug potential +- testing gaps only when they block confidence for risky behavior + +Exclude style-only feedback and speculative concerns. +{custom_block} +{_findings_output_schema()} + +Return JSON only. +""" - custom_security_section = "" - if custom_security_instructions: - custom_security_section = f"\n{custom_security_instructions}\n" - # Build PR description section - pr_description = pr_data.get('body', '').strip() if pr_data.get('body') else '' +def build_security_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + discovered_context: Dict[str, Any], + custom_security_instructions: Optional[str] = None, +) -> str: + """Prompt for security analysis with explicit exploitability criteria.""" + custom_block = ( + f"\nCUSTOM SECURITY INSTRUCTIONS:\n{custom_security_instructions}\n" + if custom_security_instructions + else "" + ) + + return f""" +You are the security specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +DISCOVERED CONTEXT: +{discovered_context or {}} + +Security review scope: +- injection (SQL/command/template/NoSQL/path traversal) +- authn/authz bypass and privilege escalation +- unsafe deserialization and code execution paths +- crypto and secrets handling flaws +- sensitive data exposure and trust-boundary breaks + +For every security finding you MUST provide: +1. exploit or abuse path +2. required attacker preconditions +3. impacted trust boundary or sensitive asset +4. concrete mitigation + +Do NOT report: +- generic DoS/rate-limiting comments without concrete exploitability +- speculative attacks without evidence in changed code paths +- issues outside modified scope unless required to prove exploitability +{custom_block} + {_findings_output_schema(SECURITY_EXTRA_FIELDS)} + +Return JSON only. +""" + + +def build_validation_prompt( + pr_data: Dict[str, Any], + pr_diff: str, + max_diff_lines: int, + candidate_findings: List[Dict[str, Any]], +) -> str: + """Prompt for finding validation and deduplication support.""" + return f""" +You are the validation specialist. + +{_base_context_block(pr_data, pr_diff, max_diff_lines)} + +CANDIDATE FINDINGS: +{candidate_findings} + +Validate each finding with strict criteria: +- must be reproducible or clearly inferable from changed code and surrounding context +- must have concrete impact +- confidence must be >= 0.8 to keep +- if two findings are duplicates, keep the stronger one only + +Return JSON only: +{{ + "validated_findings": [ + {{ + "finding_index": 0, + "keep": true, + "confidence": 0.92, + "reason": "short reason" + }} + ] +}} +""" + + +def get_unified_review_prompt( + pr_data: Dict[str, Any], + pr_diff: Optional[str] = None, + include_diff: bool = True, + custom_review_instructions: Optional[str] = None, + custom_security_instructions: Optional[str] = None, + review_context: Optional[str] = None, +) -> str: + """Backward-compatible unified prompt used by tests and direct calls.""" + files_changed = _format_files_changed(pr_data) + diff_section = _build_diff_section(pr_diff, include_diff) + custom_review_section = f"\n{custom_review_instructions}\n" if custom_review_instructions else "" + custom_security_section = f"\n{custom_security_instructions}\n" if custom_security_instructions else "" + + pr_description = (pr_data.get('body', '') or '').strip() pr_description_section = "" if pr_description: - # Truncate very long descriptions if len(pr_description) > 2000: pr_description = pr_description[:2000] + "... (truncated)" pr_description_section = f"\nPR Description:\n{pr_description}\n" - # Build review context section (previous bot reviews and user replies) - review_context_section = "" - if review_context: - review_context_section = review_context + review_context_section = review_context or "" return f""" -You are a senior engineer conducting a comprehensive code review of GitHub PR #{pr_data['number']}: "{pr_data['title']}" +You are a senior engineer conducting a comprehensive code review of GitHub PR #{pr_data.get('number', 'unknown')}: "{pr_data.get('title', 'unknown')}" CONTEXT: - Repository: {pr_data.get('head', {}).get('repo', {}).get('full_name', 'unknown')} -- Author: {pr_data['user']} -- Files changed: {pr_data['changed_files']} -- Lines added: {pr_data['additions']} -- Lines deleted: {pr_data['deletions']} +- Author: {pr_data.get('user', 'unknown')} +- Files changed: {pr_data.get('changed_files', 0)} +- Lines added: {pr_data.get('additions', 0)} +- Lines deleted: {pr_data.get('deletions', 0)} {pr_description_section} Files modified: {files_changed}{diff_section}{review_context_section} OBJECTIVE: -Perform a focused, high-signal code review to identify HIGH-CONFIDENCE issues introduced by this PR. This covers both code quality (correctness, reliability, performance, maintainability, testing) AND security. Do not comment on pre-existing issues or purely stylistic preferences. - -CRITICAL INSTRUCTIONS: -1. MINIMIZE FALSE POSITIVES: Only flag issues where you're >80% confident they are real and impactful -2. AVOID NOISE: Skip style nits, subjective preferences, or low-impact suggestions -3. FOCUS ON IMPACT: Prioritize bugs, regressions, data loss, significant performance problems, or security vulnerabilities -4. SCOPE: Only evaluate code introduced or modified in this PR. Ignore unrelated existing issues +Perform a focused, high-signal code review to identify HIGH-CONFIDENCE issues introduced by this PR. CODE QUALITY CATEGORIES: - -**Correctness & Logic:** -- Incorrect business logic or wrong results -- Edge cases or null/empty handling regressions -- Incorrect error handling or missing validations leading to bad state -- Invariants broken by changes - -**Reliability & Resilience:** -- Concurrency or race conditions introduced by changes -- Resource leaks, timeouts, or missing retries in critical paths -- Partial failure handling or inconsistent state updates -- Idempotency or ordering issues - -**Performance & Scalability:** -- Algorithmic regressions in hot paths (O(n^2) where O(n) expected) -- N+1 query patterns -- Excessive synchronous I/O in latency-sensitive code -- Unbounded memory growth introduced by changes - -**Maintainability & Design:** -- Changes that significantly increase complexity or make future changes risky -- Tight coupling or unclear responsibility boundaries introduced -- Misleading APIs or brittle contracts - -**Testing & Observability:** -- Missing tests for high-risk changes -- Lack of logging/metrics around new critical behavior -- Flaky behavior due to nondeterministic changes +- correctness, reliability, performance, maintainability, testing {custom_review_section} SECURITY CATEGORIES: - -**Input Validation Vulnerabilities:** -- SQL injection via unsanitized user input -- Command injection in system calls or subprocesses -- XXE injection in XML parsing -- Template injection in templating engines -- NoSQL injection in database queries -- Path traversal in file operations - -**Authentication & Authorization Issues:** -- Authentication bypass logic -- Privilege escalation paths -- Session management flaws -- JWT token vulnerabilities -- Authorization logic bypasses - -**Crypto & Secrets Management:** -- Hardcoded API keys, passwords, or tokens -- Weak cryptographic algorithms or implementations -- Improper key storage or management -- Cryptographic randomness issues -- Certificate validation bypasses - -**Injection & Code Execution:** -- Remote code execution via deserialization -- Pickle injection in Python -- YAML deserialization vulnerabilities -- Eval injection in dynamic code execution -- XSS vulnerabilities in web applications (reflected, stored, DOM-based) - -**Data Exposure:** -- Sensitive data logging or storage -- PII handling violations -- API endpoint data leakage -- Debug information exposure +- input validation, authn/authz, crypto/secrets, code execution, data exposure {custom_security_section} -EXCLUSIONS - DO NOT REPORT: -- Denial of Service (DOS) vulnerabilities or resource exhaustion attacks -- Secrets/credentials stored on disk (these are managed separately) -- Rate limiting concerns or service overload scenarios - -Additional notes: -- Even if something is only exploitable from the local network, it can still be a HIGH severity issue - -ANALYSIS METHODOLOGY: - -Phase 1 - Repository Context Research (Use file search tools): -- Identify existing patterns, conventions, and critical paths -- Understand data flow, invariants, and error handling expectations -- Look for established security frameworks and patterns - -Phase 2 - Comparative Analysis: -- Compare new changes to existing patterns and contracts -- Identify deviations that introduce risk, regressions, or security issues -- Look for inconsistent handling between similar code paths - -Phase 3 - Issue Assessment: -- Examine each modified file for code quality and security implications -- Trace data flow from inputs to sensitive operations -- Identify concurrency, state management, and injection risks REQUIRED OUTPUT FORMAT: -You MUST output your findings as structured JSON with this exact schema: - {{ "pr_summary": {{ "overview": "2-4 sentence summary of what this PR changes and why it matters", @@ -208,11 +356,6 @@ def get_unified_review_prompt( "label": "src/auth.py", "files": ["src/auth.py"], "changes": "Brief description of changes (~10 words)" - }}, - {{ - "label": "tests/test_*.py", - "files": ["tests/test_auth.py", "tests/test_login.py"], - "changes": "Unit tests for authentication" }} ] }}, @@ -220,15 +363,12 @@ def get_unified_review_prompt( {{ "file": "path/to/file.py", "line": 42, - "severity": "HIGH", + "severity": "HIGH|MEDIUM|LOW", "category": "correctness|reliability|performance|maintainability|testing|security", "title": "Short summary of the issue", "description": "What is wrong and where it happens", - "impact": "Concrete impact or failure mode (use exploit scenario for security issues)", + "impact": "Concrete impact or failure mode", "recommendation": "Actionable fix or mitigation", - "suggestion": "Exact replacement code (optional). Can be multi-line. Must replace lines from suggestion_start_line to suggestion_end_line.", - "suggestion_start_line": 42, - "suggestion_end_line": 44, "confidence": 0.95 }} ] @@ -237,32 +377,5 @@ def get_unified_review_prompt( PR SUMMARY GUIDELINES: - overview: 2-4 sentences describing WHAT changed and WHY (purpose/goal) - file_changes: One entry per file or group of related files - - label: Display name (single file path, or pattern like "tests/*.py" for groups) - - files: Array of actual file paths covered by this entry (used for counting) - - changes: Brief description (~10 words), focus on purpose not implementation - - Group related files when it improves readability (e.g., test files, config files) - -SUGGESTION GUIDELINES: -- Only include `suggestion` if you can provide exact, working replacement code -- For single-line fixes: set suggestion_start_line = suggestion_end_line = the line number -- For multi-line fixes: set the range of lines being replaced -- The suggestion replaces all lines from suggestion_start_line to suggestion_end_line (inclusive) - -SEVERITY GUIDELINES: -- **HIGH**: Likely production bug, data loss, significant regression, or directly exploitable security vulnerability -- **MEDIUM**: Real issue with limited scope or specific triggering conditions -- **LOW**: Minor but real issue; use sparingly and only if clearly actionable - -CONFIDENCE SCORING: -- 0.9-1.0: Certain issue with clear evidence and impact -- 0.8-0.9: Strong signal with likely real-world impact -- 0.7-0.8: Plausible issue but may require specific conditions -- Below 0.7: Don't report (too speculative) - -FINAL REMINDER: -Focus on HIGH and MEDIUM findings only. Better to miss some theoretical issues than flood the report with false positives. Each finding should be something a senior engineer would confidently raise in a PR review. - -Begin your analysis now. Use the repository exploration tools to understand the codebase context, then analyze the PR changes for code quality and security implications. - -Your final reply must contain the JSON and nothing else. You should not reply again after outputting the JSON. +- changes: Brief description (~10 words), focus on purpose not implementation """ diff --git a/claudecode/review_orchestrator.py b/claudecode/review_orchestrator.py new file mode 100644 index 0000000..8f70bac --- /dev/null +++ b/claudecode/review_orchestrator.py @@ -0,0 +1,266 @@ +"""Multi-phase review orchestration for GitHub Action execution.""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from claudecode.findings_merge import merge_findings +from claudecode.json_parser import parse_json_with_fallbacks +from claudecode.logger import get_logger +from claudecode.prompts import ( + build_compliance_prompt, + build_context_discovery_prompt, + build_quality_prompt, + build_security_prompt, + build_triage_prompt, + build_validation_prompt, +) + +logger = get_logger(__name__) + + +@dataclass +class ReviewModelConfig: + """Per-phase model configuration with global fallback.""" + + triage: str + compliance: str + quality: str + security: str + validation: str + + @classmethod + def from_env(cls, env: Dict[str, str], default_model: str) -> "ReviewModelConfig": + def resolve(key: str, fallback: str) -> str: + value = (env.get(key) or "").strip() + return value or fallback + + global_model = resolve("CLAUDE_MODEL", default_model) + return cls( + triage=resolve("MODEL_TRIAGE", global_model), + compliance=resolve("MODEL_COMPLIANCE", global_model), + quality=resolve("MODEL_QUALITY", global_model), + security=resolve("MODEL_SECURITY", global_model), + validation=resolve("MODEL_VALIDATION", global_model), + ) + + +class ReviewOrchestrator: + """Coordinates multi-phase review and returns final findings.""" + + def __init__( + self, + claude_runner: Any, + findings_filter: Any, + github_client: Any, + model_config: ReviewModelConfig, + max_diff_lines: int, + ): + self.claude_runner = claude_runner + self.findings_filter = findings_filter + self.github_client = github_client + self.model_config = model_config + self.max_diff_lines = max(0, max_diff_lines) + + def _run_phase(self, repo_dir: Path, prompt: str, model: str, phase_name: str) -> Tuple[bool, Dict[str, Any], str]: + run_prompt = getattr(self.claude_runner, "run_prompt", None) + if not callable(run_prompt): + return False, {}, f"Runner missing run_prompt for {phase_name}" + + raw_result = run_prompt(repo_dir, prompt, model=model) + if not (isinstance(raw_result, tuple) and len(raw_result) == 3): + return False, {}, f"Invalid runner response for {phase_name}" + + success, error_msg, raw = raw_result + if not success: + return False, {}, error_msg + + if isinstance(raw, dict): + return True, raw, "" + + parsed_ok, parsed = parse_json_with_fallbacks(str(raw), f"{phase_name} output") + if not parsed_ok: + return False, {}, f"Failed to parse {phase_name} output" + return True, parsed, "" + + def _is_excluded(self, filepath: str) -> bool: + checker = getattr(self.github_client, "is_excluded", None) + if callable(checker): + return bool(checker(filepath)) + # Fallback for current client/tests that still expose private method. + return bool(self.github_client._is_excluded(filepath)) + + def _collect_phase_findings(self, phase_result: Dict[str, Any], source_agent: str) -> List[Dict[str, Any]]: + findings: List[Dict[str, Any]] = [] + for finding in phase_result.get("findings", []): + if isinstance(finding, dict): + enriched = finding.copy() + enriched.setdefault("source_agent", source_agent) + category = str(enriched.get("category", "")).lower() + if "review_type" not in enriched: + enriched["review_type"] = "security" if category == "security" else "general" + findings.append(enriched) + return findings + + def _ensure_review_type(self, finding: Dict[str, Any]) -> Dict[str, Any]: + enriched = finding.copy() + category = str(enriched.get("category", "")).lower() + if "review_type" not in enriched: + enriched["review_type"] = "security" if category == "security" else "general" + return enriched + + def run( + self, + repo_dir: Path, + pr_data: Dict[str, Any], + pr_diff: str, + custom_review_instructions: Optional[str] = None, + custom_security_instructions: Optional[str] = None, + ) -> Tuple[bool, Dict[str, Any], str]: + # Phase A: triage + triage_prompt = build_triage_prompt(pr_data, pr_diff, self.max_diff_lines) + ok, triage_result, err = self._run_phase(repo_dir, triage_prompt, self.model_config.triage, "triage") + if not ok: + return False, {}, f"Triage phase failed: {err}" + + if not isinstance(triage_result, dict) or "skip_review" not in triage_result: + return False, {}, "Triage phase returned invalid schema" + + if triage_result.get("skip_review") is True: + logger.info("Skipping review based on triage decision: %s", triage_result.get("reason", "")) + return True, { + "findings": [], + "analysis_summary": { + "files_reviewed": 0, + "high_severity": 0, + "medium_severity": 0, + "low_severity": 0, + "review_completed": True, + }, + "triage": triage_result, + }, "" + + # Phase B: context discovery + context_prompt = build_context_discovery_prompt(pr_data, pr_diff, self.max_diff_lines) + ok, context_result, err = self._run_phase(repo_dir, context_prompt, self.model_config.compliance, "context discovery") + if not ok: + return False, {}, f"Context discovery phase failed: {err}" + + # Phase C: specialist passes + compliance_prompt = build_compliance_prompt(pr_data, pr_diff, self.max_diff_lines, context_result) + quality_prompt = build_quality_prompt( + pr_data, + pr_diff, + self.max_diff_lines, + context_result, + custom_review_instructions=custom_review_instructions, + ) + security_prompt = build_security_prompt( + pr_data, + pr_diff, + self.max_diff_lines, + context_result, + custom_security_instructions=custom_security_instructions, + ) + + ok_c, compliance_result, err_c = self._run_phase(repo_dir, compliance_prompt, self.model_config.compliance, "compliance") + ok_q, quality_result, err_q = self._run_phase(repo_dir, quality_prompt, self.model_config.quality, "quality") + ok_s, security_result, err_s = self._run_phase(repo_dir, security_prompt, self.model_config.security, "security") + + if not ok_c: + return False, {}, f"Compliance phase failed: {err_c}" + if not ok_q: + return False, {}, f"Quality phase failed: {err_q}" + if not ok_s: + return False, {}, f"Security phase failed: {err_s}" + + all_candidates: List[Dict[str, Any]] = [] + all_candidates.extend(self._collect_phase_findings(compliance_result, "compliance")) + all_candidates.extend(self._collect_phase_findings(quality_result, "quality")) + all_candidates.extend(self._collect_phase_findings(security_result, "security")) + + all_candidates = merge_findings(all_candidates) + + # Phase D: validation + validation_prompt = build_validation_prompt(pr_data, pr_diff, self.max_diff_lines, all_candidates) + ok_v, validation_result, err_v = self._run_phase(repo_dir, validation_prompt, self.model_config.validation, "validation") + if not ok_v: + return False, {}, f"Validation phase failed: {err_v}" + + validated: List[Dict[str, Any]] = [] + has_validation_output = isinstance(validation_result, dict) and "validated_findings" in validation_result + decisions = validation_result.get("validated_findings", []) if isinstance(validation_result, dict) else [] + if not isinstance(decisions, list): + decisions = [] + for decision in decisions: + if not isinstance(decision, dict): + continue + idx = decision.get("finding_index") + keep = bool(decision.get("keep")) + confidence = decision.get("confidence", 0.0) + try: + idx_int = int(idx) + except (TypeError, ValueError): + continue + if idx_int < 0 or idx_int >= len(all_candidates): + continue + if keep and float(confidence or 0.0) >= 0.8: + finding = all_candidates[idx_int].copy() + finding["confidence"] = float(confidence) + validated.append(finding) + + # If validator did not return decisions at all, preserve candidates. + # If it explicitly returned validated_findings (including empty list), trust validator output. + if not has_validation_output: + validated = all_candidates + + # Apply existing filtering pipeline + pr_context = { + "repo_name": pr_data.get("head", {}).get("repo", {}).get("full_name", "unknown"), + "pr_number": pr_data.get("number"), + "title": pr_data.get("title", ""), + "description": pr_data.get("body", ""), + } + kept_findings = validated + original_count = len(all_candidates) + filter_response = self.findings_filter.filter_findings(validated, pr_context) + if isinstance(filter_response, tuple) and len(filter_response) == 3: + filter_success, filter_results, _ = filter_response + if filter_success and isinstance(filter_results, dict): + kept_findings = filter_results.get("filtered_findings", validated) + + final_findings: List[Dict[str, Any]] = [] + for finding in kept_findings: + if not isinstance(finding, dict): + continue + if self._is_excluded(finding.get("file", "")): + continue + final_findings.append(self._ensure_review_type(finding)) + + high = len([f for f in final_findings if str(f.get("severity", "")).upper() == "HIGH"]) + medium = len([f for f in final_findings if str(f.get("severity", "")).upper() == "MEDIUM"]) + low = len([f for f in final_findings if str(f.get("severity", "")).upper() == "LOW"]) + + files_reviewed = pr_data.get("changed_files", 0) + try: + files_reviewed = int(files_reviewed) + except (TypeError, ValueError): + files_reviewed = 0 + + return True, { + "findings": final_findings, + "analysis_summary": { + "files_reviewed": files_reviewed, + "high_severity": high, + "medium_severity": medium, + "low_severity": low, + "review_completed": True, + }, + "filtering_summary": { + "total_original_findings": original_count, + "excluded_findings": max(0, original_count - len(final_findings)), + "kept_findings": len(final_findings), + }, + "context_summary": context_result, + "triage": triage_result, + }, "" diff --git a/claudecode/test_findings_merge.py b/claudecode/test_findings_merge.py new file mode 100644 index 0000000..9129e7a --- /dev/null +++ b/claudecode/test_findings_merge.py @@ -0,0 +1,32 @@ +"""Unit tests for findings_merge utilities.""" + +from claudecode.findings_merge import merge_findings + + +def test_merge_findings_empty_input(): + assert merge_findings([]) == [] + + +def test_merge_findings_keeps_higher_severity_then_confidence(): + findings = [ + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "MEDIUM", "confidence": 0.9}, + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "HIGH", "confidence": 0.8}, + {"file": "a.py", "line": 10, "category": "security", "title": "Issue", "severity": "HIGH", "confidence": 0.95}, + ] + + merged = merge_findings(findings) + assert len(merged) == 1 + assert merged[0]["severity"] == "HIGH" + assert merged[0]["confidence"] == 0.95 + + +def test_merge_findings_separate_keys_are_preserved(): + findings = [ + {"file": "a.py", "line": 1, "category": "correctness", "title": "One", "severity": "LOW"}, + {"file": "a.py", "line": 2, "category": "correctness", "title": "Two", "severity": "LOW"}, + {"file": "b.py", "line": 1, "category": "security", "title": "One", "severity": "MEDIUM"}, + ] + + merged = merge_findings(findings) + assert len(merged) == 3 + diff --git a/claudecode/test_main_function.py b/claudecode/test_main_function.py index 0b2bdae..f5239d2 100644 --- a/claudecode/test_main_function.py +++ b/claudecode/test_main_function.py @@ -12,6 +12,28 @@ from claudecode.github_action_audit import main +def _mock_multiphase_success(mock_runner, findings=None): + """Configure runner mock for strict multi-phase orchestration.""" + findings = findings or [] + mock_runner.run_prompt.side_effect = [ + (True, "", {"skip_review": False, "reason": "", "risk_level": "medium"}), # triage + (True, "", {"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}), # context + (True, "", {"findings": findings}), # compliance + (True, "", {"findings": []}), # quality + (True, "", {"findings": []}), # security + ( + True, + "", + { + "validated_findings": [ + {"finding_index": idx, "keep": True, "confidence": 0.95, "reason": "valid"} + for idx in range(len(findings)) + ] + }, + ), # validation + ] + + class TestMainFunction: """Test main function execution flow.""" @@ -232,20 +254,7 @@ def test_main_successful_audit_no_findings(self, mock_client_class, mock_runner_ mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - # Unified review - single call - mock_runner.run_code_review.return_value = ( - True, - "", - { - 'findings': [], - 'analysis_summary': { - 'files_reviewed': 5, - 'high_severity': 0, - 'medium_severity': 0, - 'low_severity': 0 - } - } - ) + _mock_multiphase_success(mock_runner, findings=[]) mock_runner_class.return_value = mock_runner mock_filter = Mock() @@ -299,6 +308,7 @@ def test_main_successful_audit_with_findings(self, mock_client_class, mock_runne } mock_client.get_pr_diff.return_value = "diff content" mock_client._is_excluded.return_value = False # Don't exclude any files in tests + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client findings = [ @@ -320,20 +330,7 @@ def test_main_successful_audit_with_findings(self, mock_client_class, mock_runne mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - # Unified review - single call with all findings - mock_runner.run_code_review.return_value = ( - True, - "", - { - 'findings': findings, - 'analysis_summary': { - 'files_reviewed': 2, - 'high_severity': 1, - 'medium_severity': 1, - 'low_severity': 0 - } - } - ) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner # Filter keeps only high severity @@ -391,13 +388,14 @@ def test_main_with_full_filter(self, mock_client_class, mock_runner_class, } mock_client.get_pr_diff.return_value = "diff content" mock_client._is_excluded.return_value = False # Don't exclude any files in tests + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client findings = [{'file': 'test.py', 'line': 10, 'severity': 'HIGH', 'description': 'Issue'}] mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner mock_prompt_func.return_value = "prompt" @@ -450,6 +448,7 @@ def test_main_filter_failure_keeps_all_findings(self, mock_client_class, mock_ru mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} mock_client.get_pr_diff.return_value = "diff" mock_client._is_excluded.return_value = False # Don't exclude any files in tests + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client findings = [ @@ -459,7 +458,7 @@ def test_main_filter_failure_keeps_all_findings(self, mock_client_class, mock_ru mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner mock_prompt_func.return_value = "prompt" @@ -539,11 +538,7 @@ def test_audit_failure(self, mock_client_class, mock_runner_class, mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = ( - False, - "Claude execution failed", - {} - ) + mock_runner.run_prompt.return_value = (False, "Claude execution failed", {}) mock_runner_class.return_value = mock_runner mock_filter_class.return_value = Mock() @@ -582,6 +577,7 @@ def test_findings_have_correct_review_type(self, mock_client_class, mock_runner_ mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} mock_client.get_pr_diff.return_value = "diff" mock_client._is_excluded.return_value = False + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client # Mixed findings - security and non-security @@ -592,7 +588,7 @@ def test_findings_have_correct_review_type(self, mock_client_class, mock_runner_ mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner # Filter passes through all findings @@ -644,6 +640,7 @@ def test_review_type_not_overwritten_if_already_set(self, mock_client_class, moc mock_client.get_pr_data.return_value = {'number': 123, 'title': 'Test', 'body': ''} mock_client.get_pr_diff.return_value = "diff" mock_client._is_excluded.return_value = False + mock_client.is_excluded.return_value = False mock_client_class.return_value = mock_client # Finding already has review_type set (e.g., from a custom analyzer) @@ -654,7 +651,7 @@ def test_review_type_not_overwritten_if_already_set(self, mock_client_class, moc mock_runner = Mock() mock_runner.validate_claude_available.return_value = (True, "") - mock_runner.run_code_review.return_value = (True, "", {'findings': findings}) + _mock_multiphase_success(mock_runner, findings=findings) mock_runner_class.return_value = mock_runner mock_filter = Mock() diff --git a/claudecode/test_prompts.py b/claudecode/test_prompts.py index dbb9d4f..2e9ed52 100644 --- a/claudecode/test_prompts.py +++ b/claudecode/test_prompts.py @@ -192,4 +192,12 @@ def test_get_unified_review_prompt_includes_summary_guidelines(self): assert "PR SUMMARY GUIDELINES:" in prompt assert "2-4 sentences" in prompt - assert "~10 words" in prompt \ No newline at end of file + assert "~10 words" in prompt + + def test_build_hybrid_diff_section_max_lines_zero_omits_inline_diff(self): + from claudecode.prompts import _build_hybrid_diff_section + + section = _build_hybrid_diff_section("diff --git a/a.py b/a.py\n+print('x')", 0) + + assert "intentionally omitted" in section + assert "```diff" not in section diff --git a/claudecode/test_review_orchestrator.py b/claudecode/test_review_orchestrator.py new file mode 100644 index 0000000..8dec762 --- /dev/null +++ b/claudecode/test_review_orchestrator.py @@ -0,0 +1,44 @@ +"""Unit tests for review_orchestrator.""" + +from pathlib import Path +from unittest.mock import Mock + +from claudecode.review_orchestrator import ReviewModelConfig, ReviewOrchestrator + + +def test_review_model_config_from_env_with_fallbacks(): + env = { + "CLAUDE_MODEL": "global-model", + "MODEL_TRIAGE": "triage-model", + "MODEL_SECURITY": "security-model", + } + cfg = ReviewModelConfig.from_env(env, "default-model") + assert cfg.triage == "triage-model" + assert cfg.compliance == "global-model" + assert cfg.quality == "global-model" + assert cfg.security == "security-model" + assert cfg.validation == "global-model" + + +def test_orchestrator_invalid_triage_schema_fails(): + runner = Mock() + runner.run_prompt.return_value = ( + True, + "", + {"findings": [{"file": "a.py", "line": 3, "severity": "HIGH", "category": "security", "title": "Issue"}]}, + ) + findings_filter = Mock() + github_client = Mock() + github_client.is_excluded.return_value = False + cfg = ReviewModelConfig("t", "c", "q", "s", "v") + + orchestrator = ReviewOrchestrator(runner, findings_filter, github_client, cfg, 100) + ok, result, err = orchestrator.run( + repo_dir=Path("."), + pr_data={"number": 1, "changed_files": 1, "head": {"repo": {"full_name": "x/y"}}}, + pr_diff="diff --git a/a.py b/a.py", + ) + + assert ok is False + assert result == {} + assert "invalid schema" in err.lower() diff --git a/claudecode/test_workflow_integration.py b/claudecode/test_workflow_integration.py index e5410d6..325d062 100644 --- a/claudecode/test_workflow_integration.py +++ b/claudecode/test_workflow_integration.py @@ -13,6 +13,31 @@ from claudecode.github_action_audit import main +def _mock_claude_result(payload): + """Build a mocked subprocess result containing JSON payload.""" + return Mock(returncode=0, stdout=json.dumps(payload), stderr='') + + +def _multiphase_run_side_effect(findings): + """Create subprocess.run side effects for strict multi-phase flow.""" + return [ + Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), # claude --version + _mock_claude_result({"skip_review": False, "reason": "", "risk_level": "medium"}), # triage + _mock_claude_result({"claude_md_files": [], "change_summary": "", "hotspots": [], "priority_files": []}), # context + _mock_claude_result({"findings": findings}), # compliance + _mock_claude_result({"findings": []}), # quality + _mock_claude_result({"findings": []}), # security + _mock_claude_result( + { + "validated_findings": [ + {"finding_index": idx, "keep": True, "confidence": 0.95, "reason": "valid"} + for idx in range(len(findings)) + ] + } + ), # validation + ] + + class TestFullWorkflowIntegration: """Test complete workflow scenarios.""" @@ -242,23 +267,7 @@ def test_full_workflow_with_real_pr_structure(self, mock_get, mock_run): ] } - # Mock Claude CLI - version_result = Mock() - version_result.returncode = 0 - version_result.stdout = 'claude version 1.0.0' - version_result.stderr = '' - - audit_result = Mock() - audit_result.returncode = 0 - # Claude wraps the result in a specific format - claude_wrapped_response = { - 'result': json.dumps(claude_response) - } - audit_result.stdout = json.dumps(claude_wrapped_response) - audit_result.stderr = '' - - # Provide results for unified review (single pass) - mock_run.side_effect = [version_result, audit_result] + mock_run.side_effect = _multiphase_run_side_effect(claude_response["findings"]) # Run the workflow with tempfile.TemporaryDirectory() as tmpdir: @@ -281,10 +290,10 @@ def test_full_workflow_with_real_pr_structure(self, mock_get, mock_run): # Verify API calls # 5 calls: PR data, files, diff, comments, reactions for bot comment assert mock_get.call_count == 5 - assert mock_run.call_count == 2 # 1 version check + 1 unified review + assert mock_run.call_count == 7 # 1 version check + 6 phase calls # Verify the audit was run with proper prompt - audit_call = mock_run.call_args_list[1] + audit_call = mock_run.call_args_list[1] # triage prompt prompt = audit_call[1]['input'] assert 'Add new authentication feature' in prompt # Title assert 'src/auth/oauth2.py' in prompt # File name @@ -337,11 +346,7 @@ def test_workflow_with_llm_filtering(self, mock_get, mock_run): } ] - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout=json.dumps({"findings": claude_findings}), stderr=''), - Mock(returncode=0, stdout=json.dumps({"findings": []}), stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect(claude_findings) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -415,11 +420,7 @@ def test_workflow_with_no_review_issues(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] # Claude finds no issues - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": [], "analysis_summary": {"review_completed": true}}', stderr=''), - Mock(returncode=0, stdout='{"findings": [], "analysis_summary": {"review_completed": true}}', stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -506,11 +507,7 @@ def test_workflow_with_massive_pr(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] # Claude handles it gracefully - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) @@ -586,11 +583,7 @@ def test_workflow_with_binary_files(self, mock_get, mock_run): mock_get.side_effect = [pr_response, files_response, diff_response] - mock_run.side_effect = [ - Mock(returncode=0, stdout='claude version 1.0.0', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr=''), - Mock(returncode=0, stdout='{"findings": []}', stderr='') - ] + mock_run.side_effect = _multiphase_run_side_effect([]) with tempfile.TemporaryDirectory() as tmpdir: os.chdir(tmpdir) diff --git a/scripts/comment-pr-findings.bun.test.js b/scripts/comment-pr-findings.bun.test.js index e5dae90..d28878c 100644 --- a/scripts/comment-pr-findings.bun.test.js +++ b/scripts/comment-pr-findings.bun.test.js @@ -1026,7 +1026,7 @@ describe('comment-pr-findings.js', () => { id: 201, state: 'APPROVED', user: { type: 'Bot' }, - body: '📋 **PR Summary:**\nThis PR adds auth.\n\n---\n\nNo issues found. Changes look good.' + body: '\nNo issues found. Changes look good.' } ]; @@ -1076,7 +1076,7 @@ describe('comment-pr-findings.js', () => { test('should update existing review in place when state is unchanged and no inline comments', async () => { // Existing APPROVED review, new findings also result in APPROVED (no HIGH severity) const mockReviews = [ - { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: 'No issues found. Changes look good.' } + { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: '\nNo issues found. Changes look good.' } ]; readFileSyncSpy.mockImplementation((path) => { @@ -1129,7 +1129,7 @@ describe('comment-pr-findings.js', () => { test('should dismiss and create new review when state changes', async () => { // Existing APPROVED review, but new findings have HIGH severity = CHANGES_REQUESTED const mockReviews = [ - { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: 'No issues found. Changes look good.' } + { id: 101, state: 'APPROVED', user: { type: 'Bot' }, body: '\nNo issues found. Changes look good.' } ]; const mockFindings = [{ diff --git a/scripts/comment-pr-findings.js b/scripts/comment-pr-findings.js index 270f98b..0dbd4ee 100755 --- a/scripts/comment-pr-findings.js +++ b/scripts/comment-pr-findings.js @@ -6,6 +6,7 @@ const fs = require('fs'); const { spawnSync } = require('child_process'); +const REVIEW_MARKER = ''; // PR Summary marker for identifying our summary sections const PR_SUMMARY_MARKER = '📋 **PR Summary:**'; @@ -102,26 +103,7 @@ function addReactionsToReview(reviewId) { // Check if a review was posted by this action function isOwnReview(review) { if (!review.body) return false; - - // Check for our review summary patterns - const ownPatterns = [ - PR_SUMMARY_MARKER, - 'No issues found. Changes look good.', - /^Found \d+ .+ issues?\./, - 'Please address the high-severity issues before merging.', - 'Consider addressing the suggestions in the comments.', - 'Minor suggestions noted in comments.' - ]; - - for (const pattern of ownPatterns) { - if (pattern instanceof RegExp) { - if (pattern.test(review.body)) return true; - } else { - if (review.body.includes(pattern)) return true; - } - } - - return false; + return review.body.includes(REVIEW_MARKER); } // Find an existing review posted by this action @@ -135,8 +117,8 @@ function findExistingReview() { for (const review of reviews) { const isDismissible = review.state === 'APPROVED' || review.state === 'CHANGES_REQUESTED'; - const isBot = review.user && review.user.type === 'Bot'; const isOwn = isOwnReview(review); + const isBot = review.user && review.user.type === 'Bot'; if (isBot && isDismissible && isOwn) { return review; @@ -293,7 +275,7 @@ async function run() { const highSeverityCount = analysisSummary.high_severity || 0; const reviewEvent = highSeverityCount > 0 ? 'REQUEST_CHANGES' : 'APPROVE'; - const reviewBody = buildReviewSummary(newFindings, prSummary, analysisSummary); + const reviewBody = `${buildReviewSummary(newFindings, prSummary, analysisSummary)}\n\n${REVIEW_MARKER}`; // Prepare review comments const reviewComments = [];