From 14a902aaccc53e94891e1b42cb0b60abd2e769ce Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 10 Mar 2026 11:50:55 +0000 Subject: [PATCH 01/12] chore: install ripgrep Signed-off-by: Mouad BANI --- scripts/services/docker/Dockerfile.git_integration | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/services/docker/Dockerfile.git_integration b/scripts/services/docker/Dockerfile.git_integration index 30a5890259..c2a8068d40 100644 --- a/scripts/services/docker/Dockerfile.git_integration +++ b/scripts/services/docker/Dockerfile.git_integration @@ -65,6 +65,7 @@ FROM base AS runner RUN apt-get update && apt-get install -y \ ca-certificates \ git \ + ripgrep \ --no-install-recommends \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean \ From b597c996255fd6ea4232e88c53d82307b0a45c31 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 10 Mar 2026 12:47:26 +0000 Subject: [PATCH 02/12] feat: leverage maintainersFile from db before falling back to regular detection Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 94 +++++++++++++++---- 1 file changed, 78 insertions(+), 16 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 6c4e532017..9c047818f3 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -390,9 +390,76 @@ async def find_maintainer_file(self, repo_path: str, owner: str, repo: str): return None, None, ai_cost - async def extract_maintainers(self, repo_path: str, owner: str, repo: str): + async def analyze_and_build_result(self, filename: str, content: str) -> MaintainerResult: + """ + Analyze file content with AI and return a MaintainerResult. + Raises MaintanerAnalysisError if no maintainers are found. 
+ """ + self.logger.info(f"Analyzing maintainer file: {filename}") + result = await self.analyze_file_content(filename, content) + + if not result.output.info: + raise MaintanerAnalysisError(ai_cost=result.cost) + + return MaintainerResult( + maintainer_file=filename, + maintainer_info=result.output.info, + total_cost=result.cost, + ) + + async def try_saved_maintainer_file( + self, repo_path: str, saved_maintainer_file: str + ) -> tuple[MaintainerResult | None, float]: + """ + Attempt to read and analyze the previously saved maintainer file. + Returns (result, cost) where result is None if the attempt failed. + """ + cost = 0.0 + file_path = os.path.join(repo_path, saved_maintainer_file) + + if not await aiofiles.os.path.isfile(file_path): + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' no longer exists on disk" + ) + return None, cost + + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + + result = await self.analyze_and_build_result(saved_maintainer_file, content) + cost += result.total_cost + return result, cost + except MaintanerAnalysisError as e: + cost += e.ai_cost + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' analysis failed: {e.error_message}" + ) + return None, cost + except Exception as e: + self.logger.warning( + f"Saved maintainer file '{saved_maintainer_file}' processing failed: {repr(e)}" + ) + return None, cost + + async def extract_maintainers( + self, + repo_path: str, + owner: str, + repo: str, + saved_maintainer_file: str | None = None, + ): total_cost = 0 + if saved_maintainer_file: + self.logger.info(f"Trying saved maintainer file: {saved_maintainer_file}") + result, cost = await self.try_saved_maintainer_file(repo_path, saved_maintainer_file) + total_cost += cost + if result: + result.total_cost = total_cost + return result + self.logger.info("Falling back to maintainer file detection") + self.logger.info("Looking for maintainer 
file...") maintainer_file, file_content, cost = await self.find_maintainer_file( repo_path, owner, repo @@ -404,21 +471,11 @@ async def extract_maintainers(self, repo_path: str, owner: str, repo: str): raise MaintainerFileNotFoundError(ai_cost=total_cost) decoded_content = base64.b64decode(file_content).decode("utf-8") + result = await self.analyze_and_build_result(maintainer_file, decoded_content) + total_cost += result.total_cost - self.logger.info(f"Analyzing maintainer file: {maintainer_file}") - result = await self.analyze_file_content(maintainer_file, decoded_content) - maintainer_info = result.output.info - total_cost += result.cost - - if not maintainer_info: - self.logger.error("Failed to analyze the maintainer file content.") - raise MaintanerAnalysisError(ai_cost=total_cost) - - return MaintainerResult( - maintainer_file=maintainer_file, - maintainer_info=maintainer_info, - total_cost=total_cost, - ) + result.total_cost = total_cost + return result async def check_if_interval_elapsed(self, repository: Repository) -> tuple[bool, float]: """ @@ -507,7 +564,12 @@ async def process_maintainers( ) self.logger.info(f"Starting maintainers processing for repo: {batch_info.remote}") - maintainers = await self.extract_maintainers(batch_info.repo_path, owner, repo_name) + maintainers = await self.extract_maintainers( + batch_info.repo_path, + owner, + repo_name, + saved_maintainer_file=repository.maintainer_file, + ) latest_maintainer_file = maintainers.maintainer_file ai_cost = maintainers.total_cost maintainers_found = len(maintainers.maintainer_info) From 5cb07fac0e61c59f9cdc746eaa8333d791d2e721 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 10 Mar 2026 15:31:06 +0000 Subject: [PATCH 03/12] feat: improve maintainers detection & analysis Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 244 ++++++++++++++---- 1 file changed, 192 insertions(+), 52 deletions(-) diff --git 
a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 9c047818f3..1dcad51975 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -1,5 +1,4 @@ import asyncio -import base64 import os import time as time_module from datetime import datetime, time, timezone @@ -20,6 +19,7 @@ ) from crowdgit.enums import ErrorCode, ExecutionStatus, OperationType from crowdgit.errors import ( + CommandExecutionError, CrowdGitError, MaintainerFileNotFoundError, MaintainerIntervalNotElapsedError, @@ -37,7 +37,7 @@ from crowdgit.models.service_execution import ServiceExecution from crowdgit.services.base.base_service import BaseService from crowdgit.services.maintainer.bedrock import invoke_bedrock -from crowdgit.services.utils import parse_repo_url +from crowdgit.services.utils import run_shell_command from crowdgit.settings import MAINTAINER_RETRY_INTERVAL_DAYS, MAINTAINER_UPDATE_INTERVAL_HOURS @@ -46,24 +46,60 @@ class MaintainerService(BaseService): MAX_CHUNK_SIZE = 5000 MAX_CONCURRENT_CHUNKS = 3 # Maximum concurrent chunk processing + MAX_AI_ANALYSIS_ATTEMPTS = 3 + # List of common maintainer file names MAINTAINER_FILES = [ "MAINTAINERS", "MAINTAINERS.md", "MAINTAINER.md", + "CODEOWNERS", "CODEOWNERS.md", "CONTRIBUTORS", "CONTRIBUTORS.md", - "docs/MAINTAINERS.md", "OWNERS", - "CODEOWNERS", + "OWNERS.md", + "AUTHORS", + "AUTHORS.md", + "docs/MAINTAINERS.md", ".github/MAINTAINERS.md", ".github/CONTRIBUTORS.md", + ".github/CODEOWNERS", "GOVERNANCE.md", - "README.md", - "SECURITY-INSIGHTS.md", ] + VALID_EXTENSIONS = { + "", + ".md", + ".markdown", + ".txt", + ".rst", + ".yaml", + ".yml", + ".toml", + ".adoc", + ".csv", + } + + CONTENT_VALIDATION_KEYWORDS = [ + "maintainer", + "codeowner", + "owner", + "contributor", + "author", 
+ "reviewer", + "governance", + "lead", + "approver", + "committer", + "credit", + "administrator", + "steward", + "emeritus", + ] + + EXCLUDED_FILENAMES = {"contributing.md", "contributing"} + def make_role(self, title: str): title = title.lower() title = ( @@ -351,44 +387,122 @@ async def find_maintainer_file_with_ai(self, file_names): else: return None, result.cost - async def find_maintainer_file(self, repo_path: str, owner: str, repo: str): - self.logger.info(f"Looking for maintainer files in {owner}/{repo}...") - - file_names = await aiofiles.os.listdir(repo_path) - - for file in self.MAINTAINER_FILES: - file_path = os.path.join(repo_path, file) - if await aiofiles.os.path.isfile(file_path): - self.logger.info(f"maintainer file: {file_path} found in repo") - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - content = await f.read() + async def _list_repo_files(self, repo_path: str) -> list[str]: + """List all files in the repo recursively, respecting .gitignore via rg.""" + try: + output = await run_shell_command( + ["rg", "--files", "--hidden", "--glob", "!.git/", "."], cwd=repo_path + ) + return [ + line[2:] if line.startswith("./") else line + for line in output.strip().split("\n") + if line.strip() + ] + except Exception as e: + self.logger.warning(f"rg --files failed, falling back to os.walk: {repr(e)}") + results = [] + for dirpath, dirnames, filenames in os.walk(repo_path): + dirnames[:] = [d for d in dirnames if d != ".git"] + for filename in filenames: + full_path = os.path.join(dirpath, filename) + results.append(os.path.relpath(full_path, repo_path)) + return results + + async def _ripgrep_search(self, repo_path: str) -> list[str]: + """Search for files containing maintainer-related keywords, filtered to valid extensions.""" + pattern = "|".join(self.CONTENT_VALIDATION_KEYWORDS) + + exclusion_globs = ["--glob", "!.git/"] + for name in self.EXCLUDED_FILENAMES: + exclusion_globs.extend(["--iglob", f"!{name}"]) - if file.lower() == 
"readme.md" and "maintainer" not in content.lower(): - self.logger.info(f"Skipping {file}: no maintainer-related content found") - continue + try: + output = await run_shell_command( + ["rg", "-l", "-i", "--hidden", pattern, *exclusion_globs, "."], cwd=repo_path + ) + except CommandExecutionError: + self.logger.info("Ripgrep found no files containing maintainer keywords") + return [] + except Exception as e: + self.logger.warning(f"Ripgrep search failed: {repr(e)}") + return [] - return file, base64.b64encode(content.encode()).decode(), 0 + results = [] + for line in output.strip().split("\n"): + line = line.strip() + if not line: + continue + if line.startswith("./"): + line = line[2:] + basename = os.path.basename(line).lower() + ext = os.path.splitext(basename)[1] + if ext not in self.VALID_EXTENSIONS: + self.logger.debug(f"Skipping '{line}': extension '{ext}' not in valid extensions") + continue + if ext == "" and not any(kw in basename for kw in self.CONTENT_VALIDATION_KEYWORDS): + self.logger.debug( + f"Skipping extensionless file '{line}': " + f"basename '{basename}' contains no governance keyword" + ) + continue + results.append(line) - self.logger.warning("No maintainer files found using the known file names.") + self.logger.info(f"Ripgrep found {len(results)} candidate files after filtering") + return results - file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str]]: + """ + Find all potential maintainer files using static list + dynamic ripgrep search. + Returns ordered list of (relative_path, content) tuples. + Static matches come first, then dynamic matches sorted by content keyword score. 
+ """ + candidates_static = [] + static_paths_lower = set() - if file_name: - file_path = os.path.join(repo_path, file_name) + for file in self.MAINTAINER_FILES: + file_path = os.path.join(repo_path, file) if await aiofiles.os.path.isfile(file_path): + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + candidates_static.append((file, content)) + static_paths_lower.add(file.lower()) + self.logger.info(f"Static match found: {file}") + except Exception as e: + self.logger.warning(f"Failed to read static match {file}: {repr(e)}") + + dynamic_paths = await self._ripgrep_search(repo_path) + + scored_dynamic = [] + for candidate_path in dynamic_paths: + if candidate_path.lower() in static_paths_lower: + continue + + file_path = os.path.join(repo_path, candidate_path) + try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() + except Exception as e: + self.logger.warning(f"Failed to read dynamic match {candidate_path}: {repr(e)}") + continue - if file_name.lower() == "readme.md" and "maintainer" not in content.lower(): - self.logger.info( - f"AI suggested {file_name}, but it has no maintainer-related content. Skipping." 
- ) - return None, None, ai_cost + content_lower = content.lower() + # Calculate score based on keywords matched in the content + score = sum(1 for kw in self.CONTENT_VALIDATION_KEYWORDS if kw in content_lower) + if score > 0: + scored_dynamic.append((candidate_path, content, score)) + self.logger.info( + f"Dynamic match validated: {candidate_path} (keyword score: {score})" + ) - self.logger.info(f"\nMaintainer file found: {file_name}") - return file_name, base64.b64encode(content.encode()).decode(), ai_cost + # Sort by score DESC + scored_dynamic.sort(key=lambda c: c[2], reverse=True) - return None, None, ai_cost + result = candidates_static + [(path, content) for path, content, _ in scored_dynamic] + self.logger.info( + f"Found {len(candidates_static)} static and {len(scored_dynamic)} dynamic candidates" + ) + return result async def analyze_and_build_result(self, filename: str, content: str) -> MaintainerResult: """ @@ -445,12 +559,11 @@ async def try_saved_maintainer_file( async def extract_maintainers( self, repo_path: str, - owner: str, - repo: str, saved_maintainer_file: str | None = None, ): total_cost = 0 + # Step 1: Try the previously saved maintainer file if saved_maintainer_file: self.logger.info(f"Trying saved maintainer file: {saved_maintainer_file}") result, cost = await self.try_saved_maintainer_file(repo_path, saved_maintainer_file) @@ -460,22 +573,53 @@ async def extract_maintainers( return result self.logger.info("Falling back to maintainer file detection") - self.logger.info("Looking for maintainer file...") - maintainer_file, file_content, cost = await self.find_maintainer_file( - repo_path, owner, repo - ) - total_cost += cost + # Step 2: Find candidates via static list + ripgrep dynamic search + candidates = await self.find_candidate_files(repo_path) + + # Step 3: Try AI analysis on candidates, stop on first success + if candidates: + attempts = min(len(candidates), self.MAX_AI_ANALYSIS_ATTEMPTS) + for filename, content in 
candidates[:attempts]: + try: + result = await self.analyze_and_build_result(filename, content) + total_cost += result.total_cost + result.total_cost = total_cost + return result + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") + except Exception as e: + self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") + + self.logger.warning( + f"AI analysis failed for all {attempts} candidate(s), trying AI file detection" + ) + else: + self.logger.warning("No candidate files found via search, trying AI file detection") - if not maintainer_file or not file_content: - self.logger.error("No maintainer file found") - raise MaintainerFileNotFoundError(ai_cost=total_cost) + # Step 4: AI file detection as last resort + file_names = await self._list_repo_files(repo_path) + ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + total_cost += ai_cost - decoded_content = base64.b64decode(file_content).decode("utf-8") - result = await self.analyze_and_build_result(maintainer_file, decoded_content) - total_cost += result.total_cost + if ai_file_name: + file_path = os.path.join(repo_path, ai_file_name) + if await aiofiles.os.path.isfile(file_path): + try: + async with aiofiles.open(file_path, "r", encoding="utf-8") as f: + content = await f.read() + result = await self.analyze_and_build_result(ai_file_name, content) + total_cost += result.total_cost + result.total_cost = total_cost + return result + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning( + f"AI-suggested file '{ai_file_name}' analysis failed: {e.error_message}" + ) - result.total_cost = total_cost - return result + self.logger.error("No maintainer file found") + raise MaintainerFileNotFoundError(ai_cost=total_cost) async def check_if_interval_elapsed(self, repository: Repository) -> tuple[bool, float]: """ @@ -553,8 +697,6 @@ async def process_maintainers( 
maintainers_skipped = 0 try: - owner, repo_name = parse_repo_url(batch_info.remote) - has_interval_elapsed, remaining_hours = await self.check_if_interval_elapsed( repository ) @@ -566,8 +708,6 @@ async def process_maintainers( self.logger.info(f"Starting maintainers processing for repo: {batch_info.remote}") maintainers = await self.extract_maintainers( batch_info.repo_path, - owner, - repo_name, saved_maintainer_file=repository.maintainer_file, ) latest_maintainer_file = maintainers.maintainer_file From dd7d2c62ee767405470464e7ef69766d9bfcea6c Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 10 Mar 2026 15:39:09 +0000 Subject: [PATCH 04/12] feat: track analyzed maintainers files in metrics Signed-off-by: Mouad BANI --- .../src/crowdgit/models/maintainer_info.py | 2 ++ .../services/maintainer/maintainer_service.py | 25 ++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py index 5a420567ae..6914059a2b 100644 --- a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py +++ b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py @@ -34,3 +34,5 @@ class MaintainerResult(BaseModel): maintainer_file: str | None = None maintainer_info: list[MaintainerInfoItem] | None = None total_cost: float = 0 + candidate_files: list[str] = [] + ai_suggested_file: str | None = None diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 1dcad51975..673bc0d6d4 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -562,6 +562,14 @@ async def extract_maintainers( saved_maintainer_file: str | None = None, ): total_cost = 0 
+ candidate_files: list[str] = [] + ai_suggested_file: str | None = None + + def _attach_metadata(result: MaintainerResult) -> MaintainerResult: + result.total_cost = total_cost + result.candidate_files = candidate_files + result.ai_suggested_file = ai_suggested_file + return result # Step 1: Try the previously saved maintainer file if saved_maintainer_file: @@ -569,12 +577,12 @@ async def extract_maintainers( result, cost = await self.try_saved_maintainer_file(repo_path, saved_maintainer_file) total_cost += cost if result: - result.total_cost = total_cost - return result + return _attach_metadata(result) self.logger.info("Falling back to maintainer file detection") # Step 2: Find candidates via static list + ripgrep dynamic search candidates = await self.find_candidate_files(repo_path) + candidate_files = [path for path, _ in candidates] # Step 3: Try AI analysis on candidates, stop on first success if candidates: @@ -583,8 +591,7 @@ async def extract_maintainers( try: result = await self.analyze_and_build_result(filename, content) total_cost += result.total_cost - result.total_cost = total_cost - return result + return _attach_metadata(result) except MaintanerAnalysisError as e: total_cost += e.ai_cost self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") @@ -600,6 +607,7 @@ async def extract_maintainers( # Step 4: AI file detection as last resort file_names = await self._list_repo_files(repo_path) ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + ai_suggested_file = ai_file_name total_cost += ai_cost if ai_file_name: @@ -610,8 +618,7 @@ async def extract_maintainers( content = await f.read() result = await self.analyze_and_build_result(ai_file_name, content) total_cost += result.total_cost - result.total_cost = total_cost - return result + return _attach_metadata(result) except MaintanerAnalysisError as e: total_cost += e.ai_cost self.logger.warning( @@ -695,6 +702,8 @@ async def process_maintainers( ai_cost = 
0.0 maintainers_found = 0 maintainers_skipped = 0 + candidate_files: list[str] = [] + ai_suggested_file: str | None = None try: has_interval_elapsed, remaining_hours = await self.check_if_interval_elapsed( @@ -713,6 +722,8 @@ async def process_maintainers( latest_maintainer_file = maintainers.maintainer_file ai_cost = maintainers.total_cost maintainers_found = len(maintainers.maintainer_info) + candidate_files = maintainers.candidate_files + ai_suggested_file = maintainers.ai_suggested_file if repository.parent_repo: filtered_maintainers = await self.exclude_parent_repo_maintainers( @@ -767,6 +778,8 @@ async def process_maintainers( "ai_cost": ai_cost, "maintainers_found": maintainers_found, "maintainers_skipped": maintainers_skipped, + "candidate_files": candidate_files, + "ai_suggested_file": ai_suggested_file, }, ) await save_service_execution(service_execution) From 1eb1483cd08563f36c8617e7ec2acb6d7beacc0c Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:32:20 +0000 Subject: [PATCH 05/12] feat: change candidate file detection to be more narrow Signed-off-by: Mouad BANI --- .../src/crowdgit/models/maintainer_info.py | 2 +- .../services/maintainer/maintainer_service.py | 272 ++++++++++-------- 2 files changed, 146 insertions(+), 128 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py index 6914059a2b..1752999e54 100644 --- a/services/apps/git_integration/src/crowdgit/models/maintainer_info.py +++ b/services/apps/git_integration/src/crowdgit/models/maintainer_info.py @@ -34,5 +34,5 @@ class MaintainerResult(BaseModel): maintainer_file: str | None = None maintainer_info: list[MaintainerInfoItem] | None = None total_cost: float = 0 - candidate_files: list[str] = [] + candidate_files: list[tuple[str, int]] = [] ai_suggested_file: str | None = None diff --git 
a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 673bc0d6d4..add752ba94 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -45,28 +45,51 @@ class MaintainerService(BaseService): """Service for processing maintainer data""" MAX_CHUNK_SIZE = 5000 - MAX_CONCURRENT_CHUNKS = 3 # Maximum concurrent chunk processing - MAX_AI_ANALYSIS_ATTEMPTS = 3 - - # List of common maintainer file names - MAINTAINER_FILES = [ - "MAINTAINERS", - "MAINTAINERS.md", - "MAINTAINER.md", - "CODEOWNERS", - "CODEOWNERS.md", - "CONTRIBUTORS", - "CONTRIBUTORS.md", - "OWNERS", - "OWNERS.md", - "AUTHORS", - "AUTHORS.md", - "docs/MAINTAINERS.md", - ".github/MAINTAINERS.md", - ".github/CONTRIBUTORS.md", - ".github/CODEOWNERS", - "GOVERNANCE.md", - ] + MAX_CONCURRENT_CHUNKS = 3 + + # Full paths that get the highest score bonus when matched exactly + KNOWN_PATHS = { + "maintainers", + "maintainers.md", + "maintainer.md", + "codeowners", + "codeowners.md", + "contributors", + "contributors.md", + "owners", + "owners.md", + "authors", + "authors.md", + "governance.md", + "docs/maintainers.md", + ".github/maintainers.md", + ".github/contributors.md", + ".github/codeowners", + } + + # Governance stems (basename without extension, lowercased) for filename search + GOVERNANCE_STEMS = { + "maintainers", + "maintainer", + "codeowners", + "codeowner", + "contributors", + "contributor", + "owners", + "owners_aliases", + "authors", + "committers", + "commiters", + "reviewers", + "approvers", + "administrators", + "stewards", + "credits", + "governance", + "core_team", + "code_owners", + "emeritus", + } VALID_EXTENSIONS = { "", @@ -79,26 +102,31 @@ class MaintainerService(BaseService): ".toml", ".adoc", ".csv", + ".rdoc", } - 
CONTENT_VALIDATION_KEYWORDS = [ + SCORING_KEYWORDS = [ "maintainer", "codeowner", "owner", "contributor", - "author", - "reviewer", "governance", - "lead", - "approver", - "committer", - "credit", - "administrator", "steward", "emeritus", + "approver", + "reviewer", ] - EXCLUDED_FILENAMES = {"contributing.md", "contributing"} + EXCLUDED_FILENAMES = { + "contributing.md", + "contributing", + "code_of_conduct.md", + "code-of-conduct.md", + } + + FULL_PATH_SCORE = 100 + STEM_MATCH_SCORE = 50 + PARTIAL_STEM_SCORE = 25 def make_role(self, title: str): title = title.lower() @@ -378,7 +406,7 @@ def get_maintainer_file_prompt(self, example_files: list[str], file_names: list[ async def find_maintainer_file_with_ai(self, file_names): self.logger.info("Using AI to find maintainer files...") - prompt = self.get_maintainer_file_prompt(self.MAINTAINER_FILES, file_names) + prompt = self.get_maintainer_file_prompt(sorted(self.KNOWN_PATHS), file_names) result = await invoke_bedrock(prompt, pydantic_model=MaintainerFile) if result.output.file_name is not None: @@ -388,40 +416,39 @@ async def find_maintainer_file_with_ai(self, file_names): return None, result.cost async def _list_repo_files(self, repo_path: str) -> list[str]: - """List all files in the repo recursively, respecting .gitignore via rg.""" - try: - output = await run_shell_command( - ["rg", "--files", "--hidden", "--glob", "!.git/", "."], cwd=repo_path - ) - return [ - line[2:] if line.startswith("./") else line - for line in output.strip().split("\n") - if line.strip() - ] - except Exception as e: - self.logger.warning(f"rg --files failed, falling back to os.walk: {repr(e)}") - results = [] - for dirpath, dirnames, filenames in os.walk(repo_path): - dirnames[:] = [d for d in dirnames if d != ".git"] - for filename in filenames: - full_path = os.path.join(dirpath, filename) - results.append(os.path.relpath(full_path, repo_path)) - return results + """List non-code files in the repo recursively, filtered by 
VALID_EXTENSIONS.""" + glob_args = ["--glob", "!.git/"] + for ext in self.VALID_EXTENSIONS: + glob_args.extend(["--iglob", f"*{ext}"]) - async def _ripgrep_search(self, repo_path: str) -> list[str]: - """Search for files containing maintainer-related keywords, filtered to valid extensions.""" - pattern = "|".join(self.CONTENT_VALIDATION_KEYWORDS) + output = await run_shell_command( + ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path + ) + return [ + line[2:] if line.startswith("./") else line + for line in output.strip().split("\n") + if line.strip() + ] - exclusion_globs = ["--glob", "!.git/"] - for name in self.EXCLUDED_FILENAMES: - exclusion_globs.extend(["--iglob", f"!{name}"]) + async def _ripgrep_search(self, repo_path: str) -> list[str]: + """Search for files whose basename matches a governance stem, at any depth.""" + glob_args = ["--glob", "!.git/"] + for stem in self.GOVERNANCE_STEMS: + glob_args.extend( + [ + "--iglob", + f"*{stem}*", + "--iglob", + f"*{stem}*.*", + ] + ) try: output = await run_shell_command( - ["rg", "-l", "-i", "--hidden", pattern, *exclusion_globs, "."], cwd=repo_path + ["rg", "--files", "--hidden", *glob_args, "."], cwd=repo_path ) except CommandExecutionError: - self.logger.info("Ripgrep found no files containing maintainer keywords") + self.logger.info("Ripgrep found no governance files by filename") return [] except Exception as e: self.logger.warning(f"Ripgrep search failed: {repr(e)}") @@ -435,74 +462,64 @@ async def _ripgrep_search(self, repo_path: str) -> list[str]: if line.startswith("./"): line = line[2:] basename = os.path.basename(line).lower() + if basename in self.EXCLUDED_FILENAMES: + continue ext = os.path.splitext(basename)[1] if ext not in self.VALID_EXTENSIONS: - self.logger.debug(f"Skipping '{line}': extension '{ext}' not in valid extensions") - continue - if ext == "" and not any(kw in basename for kw in self.CONTENT_VALIDATION_KEYWORDS): - self.logger.debug( - f"Skipping extensionless file '{line}': 
" - f"basename '{basename}' contains no governance keyword" - ) continue results.append(line) - self.logger.info(f"Ripgrep found {len(results)} candidate files after filtering") + self.logger.info(f"Ripgrep found {len(results)} governance files by filename") return results - async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str]]: + def _score_filename(self, candidate_path: str) -> int: + """Score by how closely the filename matches known governance patterns.""" + path = candidate_path.lower() + if path in self.KNOWN_PATHS: + return self.FULL_PATH_SCORE + stem = os.path.splitext(os.path.basename(path))[0].lstrip(".") + if stem in self.GOVERNANCE_STEMS: + return self.STEM_MATCH_SCORE + if any(known_stem in stem for known_stem in self.GOVERNANCE_STEMS): + return self.PARTIAL_STEM_SCORE + return 0 + + async def find_candidate_files(self, repo_path: str) -> list[tuple[str, str, int]]: """ - Find all potential maintainer files using static list + dynamic ripgrep search. - Returns ordered list of (relative_path, content) tuples. - Static matches come first, then dynamic matches sorted by content keyword score. + Find governance files by filename, score them, and return all candidates sorted by score. + Scoring: full known-path match (100) > exact stem (50) > partial stem (25) + content keywords (+1 each). 
""" - candidates_static = [] - static_paths_lower = set() - - for file in self.MAINTAINER_FILES: - file_path = os.path.join(repo_path, file) - if await aiofiles.os.path.isfile(file_path): - try: - async with aiofiles.open(file_path, "r", encoding="utf-8") as f: - content = await f.read() - candidates_static.append((file, content)) - static_paths_lower.add(file.lower()) - self.logger.info(f"Static match found: {file}") - except Exception as e: - self.logger.warning(f"Failed to read static match {file}: {repr(e)}") - - dynamic_paths = await self._ripgrep_search(repo_path) - - scored_dynamic = [] - for candidate_path in dynamic_paths: - if candidate_path.lower() in static_paths_lower: - continue + found_paths = await self._ripgrep_search(repo_path) + if not found_paths: + return [] + scored = [] + for candidate_path in found_paths: file_path = os.path.join(repo_path, candidate_path) try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() except Exception as e: - self.logger.warning(f"Failed to read dynamic match {candidate_path}: {repr(e)}") + self.logger.warning(f"Failed to read candidate {candidate_path}: {repr(e)}") continue - content_lower = content.lower() - # Calculate score based on keywords matched in the content - score = sum(1 for kw in self.CONTENT_VALIDATION_KEYWORDS if kw in content_lower) - if score > 0: - scored_dynamic.append((candidate_path, content, score)) - self.logger.info( - f"Dynamic match validated: {candidate_path} (keyword score: {score})" - ) + filename_score = self._score_filename(candidate_path) + content_score = sum(1 for kw in self.SCORING_KEYWORDS if kw in content.lower()) + total = filename_score + content_score - # Sort by score DESC - scored_dynamic.sort(key=lambda c: c[2], reverse=True) + scored.append((candidate_path, content, total)) + self.logger.info( + f"Candidate: {candidate_path} " + f"(filename: {filename_score}, content: {content_score}, total: {total})" + ) - result = 
candidates_static + [(path, content) for path, content, _ in scored_dynamic] - self.logger.info( - f"Found {len(candidates_static)} static and {len(scored_dynamic)} dynamic candidates" - ) - return result + scored.sort(key=lambda c: c[2], reverse=True) + + if scored: + self.logger.info(f"Top candidate: {scored[0][0]} (from {len(scored)} total)") + else: + self.logger.info("No valid candidates after scoring") + return scored async def analyze_and_build_result(self, filename: str, content: str) -> MaintainerResult: """ @@ -562,7 +579,7 @@ async def extract_maintainers( saved_maintainer_file: str | None = None, ): total_cost = 0 - candidate_files: list[str] = [] + candidate_files: list[tuple[str, int]] = [] ai_suggested_file: str | None = None def _attach_metadata(result: MaintainerResult) -> MaintainerResult: @@ -580,27 +597,24 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: return _attach_metadata(result) self.logger.info("Falling back to maintainer file detection") - # Step 2: Find candidates via static list + ripgrep dynamic search + # Step 2: Find top candidate via filename search + scoring candidates = await self.find_candidate_files(repo_path) - candidate_files = [path for path, _ in candidates] + candidate_files = [(path, score) for path, _, score in candidates] - # Step 3: Try AI analysis on candidates, stop on first success + # Step 3: Try AI analysis on top candidate if candidates: - attempts = min(len(candidates), self.MAX_AI_ANALYSIS_ATTEMPTS) - for filename, content in candidates[:attempts]: - try: - result = await self.analyze_and_build_result(filename, content) - total_cost += result.total_cost - return _attach_metadata(result) - except MaintanerAnalysisError as e: - total_cost += e.ai_cost - self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") - except Exception as e: - self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") + filename, content, _ = candidates[0] + try: + result = 
await self.analyze_and_build_result(filename, content) + total_cost += result.total_cost + return _attach_metadata(result) + except MaintanerAnalysisError as e: + total_cost += e.ai_cost + self.logger.warning(f"AI analysis failed for '{filename}': {e.error_message}") + except Exception as e: + self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") - self.logger.warning( - f"AI analysis failed for all {attempts} candidate(s), trying AI file detection" - ) + self.logger.warning("Top candidate failed, trying AI file detection") else: self.logger.warning("No candidate files found via search, trying AI file detection") @@ -612,7 +626,11 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: if ai_file_name: file_path = os.path.join(repo_path, ai_file_name) - if await aiofiles.os.path.isfile(file_path): + if not await aiofiles.os.path.isfile(file_path): + self.logger.warning( + f"AI suggested '{ai_file_name}' but file does not exist on disk" + ) + else: try: async with aiofiles.open(file_path, "r", encoding="utf-8") as f: content = await f.read() From b19c8b2c630478bff69cb81cb5666b813fc33e3a Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:37:33 +0000 Subject: [PATCH 06/12] fix: enable email fallback for identity lookup during maintainer update Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index add752ba94..24559f5512 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -188,14 +188,18 @@ async def compare_and_update_maintainers( for github_username, maintainer in new_maintainers_dict.items(): role = 
maintainer.normalized_title original_role = self.make_role(maintainer.title) - if github_username == "unknown": + if github_username == "unknown" and maintainer.email in ("unknown", None): self.logger.warning( - f"Skipping unkown github_username with title {maintainer.title}" + f"Skipping unknown github_username & email with title {maintainer.title}" ) continue elif github_username not in current_maintainers_dict: # New maintainer - identity_id = await find_github_identity(github_username) + identity_id = ( + await find_github_identity(github_username) + if github_username != "unknown" + else await find_maintainer_identity_by_email(maintainer.email) + ) self.logger.info(f"Found new maintainer {github_username} to be inserted") if identity_id: await upsert_maintainer( @@ -205,7 +209,7 @@ async def compare_and_update_maintainers( f"Successfully inserted new maintainer {github_username} with identity_id {identity_id}" ) else: - # will happend for new users if their identity isn't created yet but should fixed on the next iteration + # will happen for new users if their identity isn't created yet but should be fixed on the next iteration self.logger.warning(f"Identity not found for username: {github_username}") else: # Existing maintainer From 98ea9cecf5031d657c5a2c0940befc18ac156e60 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:45:05 +0000 Subject: [PATCH 07/12] chore: avoid building ai prompt with full content if batching is required Signed-off-by: Mouad BANI --- .../src/crowdgit/services/maintainer/maintainer_service.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 24559f5512..a050708b31 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++
b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -302,7 +302,6 @@ def get_extraction_prompt(self, filename: str, content_to_analyze: str) -> str: """ async def analyze_file_content(self, maintainer_filename: str, content: str): - prompt = self.get_extraction_prompt(maintainer_filename, content) if len(content) > self.MAX_CHUNK_SIZE: self.logger.info( "Maintainers file content exceeded max chunk size, splitting into chunks" @@ -346,7 +345,10 @@ async def process_chunk(chunk_index: int, chunk: str): aggregated_info.cost += chunk_info.cost maintainer_info = aggregated_info else: - maintainer_info = await invoke_bedrock(prompt, pydantic_model=MaintainerInfo) + maintainer_info = await invoke_bedrock( + self.get_extraction_prompt(maintainer_filename, content), + pydantic_model=MaintainerInfo, + ) self.logger.info("Maintainers file content analyzed by AI") self.logger.info(f"Maintainers response: {maintainer_info}") if maintainer_info.output.info is not None: From cdfc93dee5fa13194068df038347f8bbd117da1a Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:46:19 +0000 Subject: [PATCH 08/12] fix: remove duplicate rg pattern Signed-off-by: Mouad BANI --- .../crowdgit/services/maintainer/maintainer_service.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index a050708b31..da24923683 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -440,14 +440,7 @@ async def _ripgrep_search(self, repo_path: str) -> list[str]: """Search for files whose basename matches a governance stem, at any depth.""" glob_args = ["--glob", "!.git/"] for stem in self.GOVERNANCE_STEMS: - glob_args.extend( - [ - 
"--iglob", - f"*{stem}*", - "--iglob", - f"*{stem}*.*", - ] - ) + glob_args.extend(["--iglob", f"*{stem}*"]) try: output = await run_shell_command( From b4dd488928d13146d9656f2f7c154565f65786b5 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 13:54:24 +0000 Subject: [PATCH 09/12] chore: add extra validation for reamde files to have maintainer keyword in content Signed-off-by: Mouad BANI --- .../src/crowdgit/services/maintainer/maintainer_service.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index da24923683..419596f669 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -526,6 +526,11 @@ async def analyze_and_build_result(self, filename: str, content: str) -> Maintai Raises MaintanerAnalysisError if no maintainers are found. 
""" self.logger.info(f"Analyzing maintainer file: {filename}") + if "readme" in filename.lower() and "maintainer" not in content.lower(): + self.logger.warning( + f"Skipping README file '{filename}': no 'maintainer' keyword found in content" + ) + raise MaintanerAnalysisError(error_code=ErrorCode.NO_MAINTAINER_FOUND) result = await self.analyze_file_content(filename, content) if not result.output.info: From 3ae091ff145cb43d917731ffef2d8a27d505cb3b Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 14:32:41 +0000 Subject: [PATCH 10/12] feat: improve ai fallback detection by passing scored candidates and improve prompt Signed-off-by: Mouad BANI --- .../services/maintainer/maintainer_service.py | 56 +++++++++++++++---- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 419596f669..47f65d432a 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -46,6 +46,7 @@ class MaintainerService(BaseService): MAX_CHUNK_SIZE = 5000 MAX_CONCURRENT_CHUNKS = 3 + MAX_AI_FILE_LIST_SIZE = 300 # Full paths that get the highest score bonus when matched exactly KNOWN_PATHS = { @@ -369,33 +370,40 @@ async def process_chunk(chunk_index: int, chunk: str): ai_cost=maintainer_info.cost, ) - def get_maintainer_file_prompt(self, example_files: list[str], file_names: list[str]) -> str: + def get_maintainer_file_prompt( + self, example_files: list[str], candidates: list[tuple[str, int]] + ) -> str: """ Generates the prompt for the LLM to identify a maintainer file from a list. + candidates: list of (filename, score) where score reflects name-match strength. 
""" example_files_str = "\n".join(f"- {name}" for name in example_files) - file_names_str = "\n".join(f"- {name}" for name in file_names) + candidates_str = "\n".join(f"- {name} [score={score}]" for name, score in candidates) return f""" - You are an expert AI assistant specializing in identifying repository governance files. Your task is to find a maintainer file from a given list of filenames. + You are an expert AI assistant specializing in identifying repository governance files. Your task is to find the single best maintainer file from a given list of candidates. - 1. **Analyze the Input**: Carefully review the list of filenames provided in the `` tag. - 2. **Identify a Maintainer File**: Compare each filename against the characteristics of a maintainer file. These files typically define project ownership, governance, or code owners. Use the `` as a guide. - 3. **Apply Rules**: Follow all constraints listed in the `` section, especially the exclusion rule. - 4. **Select the First Match**: Scan the list and select the *first* filename that you identify as a maintainer file. You only need to find one. Once a match is found, stop searching. + 1. **Analyze the Input**: Carefully review the list of candidates in the `` tag. Each entry shows the file path and a pre-computed name-match score. + 2. **Identify the Best Maintainer File**: Compare each candidate against the characteristics of a maintainer file. These files typically define project ownership, governance, or code owners. Use the `` as a guide. + 3. **Use Signals to Rank**: When multiple candidates qualify, prefer: + - Higher **score** — stronger filename match against known governance patterns. + - Fewer path separators (`/`) in the path — files closer to the repo root apply to the whole project; deeply nested files are usually component-specific. + - When score and nesting conflict, prefer the file most likely to be the repo-wide governance file. + 4. 
**Apply Rules**: Follow all constraints listed in the `` section. 5. **Format the Output**: Return your answer as a single JSON object according to the `` specification, and nothing else. - **Definition**: A maintainer file's name usually contains keywords like `MAINTAINERS`, `CODEOWNERS`, or `OWNERS`. - **Exclusion**: The filename `CONTRIBUTING.md` must ALWAYS be ignored and never selected, even if it's the only file that seems relevant. + - **Third-party exclusion**: Do NOT select files that are inside directories associated with vendored dependencies, third-party libraries, or packages consumed by the project (e.g. paths containing `vendor/`, `node_modules/`, `third_party/`, `external/`, `.cache/`, `dist/`, `site-packages/`). These files belong to external projects, not this repository's own governance. - **No Match**: If no file in the list matches the criteria after checking all of them, you must return the 'not_found' error. - **Empty Input**: If the `` is empty or contains no filenames, you must return the 'not_found' error. - - **If a maintainer file is found**: Return a JSON object in the format `{{"file_name": ""}}`. + - **If a maintainer file is found**: Return a JSON object in the format `{{"file_name": ""}}`. - **If no maintainer file is found**: Return a JSON object in the format `{{"error": "not_found"}}`. @@ -404,15 +412,18 @@ def get_maintainer_file_prompt(self, example_files: list[str], file_names: list[ - {file_names_str} + {candidates_str} Return only the final JSON object. 
""" - async def find_maintainer_file_with_ai(self, file_names): + async def find_maintainer_file_with_ai( + self, candidates: list[tuple[str, int]] + ) -> tuple[str | None, float]: + """Ask AI to select the best maintainer file from scored candidates.""" self.logger.info("Using AI to find maintainer files...") - prompt = self.get_maintainer_file_prompt(sorted(self.KNOWN_PATHS), file_names) + prompt = self.get_maintainer_file_prompt(sorted(self.KNOWN_PATHS), candidates) result = await invoke_bedrock(prompt, pydantic_model=MaintainerFile) if result.output.file_name is not None: @@ -606,6 +617,7 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: candidate_files = [(path, score) for path, _, score in candidates] # Step 3: Try AI analysis on top candidate + failed_candidate: str | None = None if candidates: filename, content, _ = candidates[0] try: @@ -618,13 +630,33 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: except Exception as e: self.logger.warning(f"Unexpected error analyzing '{filename}': {repr(e)}") + failed_candidate = filename self.logger.warning("Top candidate failed, trying AI file detection") else: self.logger.warning("No candidate files found via search, trying AI file detection") # Step 4: AI file detection as last resort file_names = await self._list_repo_files(repo_path) - ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(file_names) + # Pre-filter to governance-scored files to keep the AI prompt within model limits. + # Fall back to a hard-capped slice of the full list if nothing scores. + # Exclude the already-failed top candidate to avoid re-suggesting it. 
+ scored_tuples = [ + (f, self._score_filename(f)) + for f in file_names + if self._score_filename(f) > 0 and f != failed_candidate + ] + ai_input_files: list[tuple[str, int]] = ( + scored_tuples + if scored_tuples + else [ + (f, 0) for f in file_names[: self.MAX_AI_FILE_LIST_SIZE] if f != failed_candidate + ] + ) + self.logger.info( + f"Passing {len(ai_input_files)} files to AI for maintainer file detection " + f"(total repo files: {len(file_names)})" + ) + ai_file_name, ai_cost = await self.find_maintainer_file_with_ai(ai_input_files) ai_suggested_file = ai_file_name total_cost += ai_cost From 59cff58ca65ed1c1beeb11fbe50f36512af827dc Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 14:34:35 +0000 Subject: [PATCH 11/12] chore: limit candidate_files saved in db to 100 Signed-off-by: Mouad BANI --- .../src/crowdgit/services/maintainer/maintainer_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index 47f65d432a..d366475f91 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -614,7 +614,7 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult: # Step 2: Find top candidate via filename search + scoring candidates = await self.find_candidate_files(repo_path) - candidate_files = [(path, score) for path, _, score in candidates] + candidate_files = [(path, score) for path, _, score in candidates][:100] # Step 3: Try AI analysis on top candidate failed_candidate: str | None = None From b38abdcb20d787690d84e90206cade76b74bab2c Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Wed, 11 Mar 2026 17:17:56 +0000 Subject: [PATCH 12/12] chore: add extra filename & stems Signed-off-by: Mouad BANI ---
.../src/crowdgit/services/maintainer/maintainer_service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py index d366475f91..808cf2e5e9 100644 --- a/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py +++ b/services/apps/git_integration/src/crowdgit/services/maintainer/maintainer_service.py @@ -66,6 +66,7 @@ class MaintainerService(BaseService): ".github/maintainers.md", ".github/contributors.md", ".github/codeowners", + "SECURITY-INSIGHTS.md", } # Governance stems (basename without extension, lowercased) for filename search @@ -90,6 +91,7 @@ class MaintainerService(BaseService): "core_team", "code_owners", "emeritus", + "workgroup", } VALID_EXTENSIONS = {