Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 71 additions & 7 deletions src/kit/code_searcher.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

import logging
import os
import re
from dataclasses import dataclass
from pathlib import Path
Expand Down Expand Up @@ -35,16 +37,78 @@ def __init__(self, repo_path: str) -> None:
self._gitignore_spec = self._load_gitignore() # Load gitignore spec

def _load_gitignore(self):
    """Load every .gitignore in the repository tree and merge them.

    Patterns from nested .gitignore files are rewritten so that they are
    expressed relative to the repository root, then all patterns are
    compiled into a single gitwildmatch PathSpec.

    Precedence: gitwildmatch is "last matching pattern wins", so files are
    processed shallowest first; a deeper .gitignore therefore overrides
    the rules of its ancestors.

    Returns:
        A ``pathspec.PathSpec`` covering all readable .gitignore files, or
        ``None`` when the tree has no .gitignore files or no usable
        patterns.
    """

    def _rebase(pattern, prefix):
        """Rewrite *pattern* from the .gitignore in *prefix* to be root-relative."""
        if not prefix:
            # Root .gitignore: already relative to the repository root.
            return pattern

        # The negation marker must stay at the very front of the pattern.
        negated = pattern.startswith("!")
        body = pattern[1:] if negated else pattern
        if not body.strip("/"):
            # Nothing meaningful to rebase ("!", "/"); pass through untouched.
            return pattern

        if body.startswith("/"):
            # Anchored to the .gitignore's own directory.
            rebased = f"{prefix}{body}"
        elif "/" in body.rstrip("/"):
            # An inner slash also anchors the pattern to that directory.
            rebased = f"{prefix}/{body}"
        else:
            # No slash: matches at any depth below that directory.
            rebased = f"{prefix}/**/{body}"
        return f"!{rebased}" if negated else rebased

    gitignore_files = []
    for dirpath, dirnames, filenames in os.walk(self.repo_path):
        # Prune in place so the walk never descends into git metadata.
        dirnames[:] = [name for name in dirnames if name != ".git"]
        if ".gitignore" in filenames:
            gitignore_files.append(Path(dirpath) / ".gitignore")

    if not gitignore_files:
        return None

    # Shallowest first (path as tie-breaker for a deterministic order) so
    # that deeper rules come later and win.
    gitignore_files.sort(key=lambda p: (len(p.parts), p.as_posix()))

    all_patterns = []
    for gitignore_path in gitignore_files:
        try:
            rel_base = gitignore_path.parent.relative_to(self.repo_path)
        except ValueError:
            # .gitignore outside the repo (should not happen, but be safe).
            continue

        # as_posix() keeps forward slashes on every platform, as gitwildmatch expects.
        prefix = rel_base.as_posix()
        if prefix == ".":
            prefix = ""

        try:
            with open(gitignore_path, "r", encoding="utf-8") as f:
                lines = f.read().splitlines()
        except (OSError, UnicodeDecodeError) as e:
            # One unreadable file must not discard the rules of the others.
            logging.warning("Could not load %s: %s", gitignore_path, e)
            continue

        for line in lines:
            pattern = line.strip()
            # Skip empty lines and comments.
            if not pattern or pattern.startswith("#"):
                continue
            all_patterns.append(_rebase(pattern, prefix))

    if not all_patterns:
        return None

    # Single merged spec for the whole repository.
    return pathspec.PathSpec.from_lines("gitwildmatch", all_patterns)

def _should_ignore(self, file: Path) -> bool:
"""Checks if a file should be ignored based on .gitignore rules."""
Expand Down
77 changes: 72 additions & 5 deletions src/kit/repo_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,78 @@ def __init__(self, repo_path: str) -> None:
self._gitignore_spec = self._load_gitignore()

def _load_gitignore(self):
    """Load every .gitignore in the repository tree and merge them.

    Patterns from nested .gitignore files are rewritten so that they are
    expressed relative to the repository root, then all patterns are
    compiled into a single gitwildmatch PathSpec.

    Precedence: gitwildmatch is "last matching pattern wins", so files are
    processed shallowest first; a deeper .gitignore therefore overrides
    the rules of its ancestors.

    Returns:
        A ``pathspec.PathSpec`` covering all readable .gitignore files, or
        ``None`` when the tree has no .gitignore files or no usable
        patterns.
    """
    # TODO(review): this duplicates _load_gitignore in code_searcher.py;
    # extract a shared helper so the two implementations cannot drift.

    def _rebase(pattern, prefix):
        """Rewrite *pattern* from the .gitignore in *prefix* to be root-relative."""
        if not prefix:
            # Root .gitignore: already relative to the repository root.
            return pattern

        # The negation marker must stay at the very front of the pattern.
        negated = pattern.startswith("!")
        body = pattern[1:] if negated else pattern
        if not body.strip("/"):
            # Nothing meaningful to rebase ("!", "/"); pass through untouched.
            return pattern

        if body.startswith("/"):
            # Anchored to the .gitignore's own directory.
            rebased = f"{prefix}{body}"
        elif "/" in body.rstrip("/"):
            # An inner slash also anchors the pattern to that directory.
            rebased = f"{prefix}/{body}"
        else:
            # No slash: matches at any depth below that directory.
            rebased = f"{prefix}/**/{body}"
        return f"!{rebased}" if negated else rebased

    gitignore_files = []
    for dirpath, dirnames, filenames in os.walk(self.repo_path):
        # Prune in place so the walk never descends into git metadata.
        dirnames[:] = [name for name in dirnames if name != ".git"]
        if ".gitignore" in filenames:
            gitignore_files.append(Path(dirpath) / ".gitignore")

    if not gitignore_files:
        return None

    # Shallowest first (path as tie-breaker for a deterministic order) so
    # that deeper rules come later and win.
    gitignore_files.sort(key=lambda p: (len(p.parts), p.as_posix()))

    all_patterns = []
    for gitignore_path in gitignore_files:
        try:
            rel_base = gitignore_path.parent.relative_to(self.repo_path)
        except ValueError:
            # .gitignore outside the repo (should not happen, but be safe).
            continue

        # as_posix() keeps forward slashes on every platform, as gitwildmatch expects.
        prefix = rel_base.as_posix()
        if prefix == ".":
            prefix = ""

        try:
            with open(gitignore_path, "r", encoding="utf-8") as f:
                lines = f.read().splitlines()
        except (OSError, UnicodeDecodeError) as e:
            # One unreadable file must not discard the rules of the others.
            logging.warning("Could not load %s: %s", gitignore_path, e)
            continue

        for line in lines:
            pattern = line.strip()
            # Skip empty lines and comments.
            if not pattern or pattern.startswith("#"):
                continue
            all_patterns.append(_rebase(pattern, prefix))

    if not all_patterns:
        return None

    # Single merged spec for the whole repository.
    return pathspec.PathSpec.from_lines("gitwildmatch", all_patterns)

def _should_ignore(self, file: Path) -> bool:
# Handle potential symlink resolution mismatches
Expand Down
44 changes: 44 additions & 0 deletions tests/integration/test_humanlayer_repo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pytest
from pathlib import Path
from kit.repo_mapper import RepoMapper
import subprocess


# Local checkout exercised by this test; the test is skipped when it is absent.
HUMANLAYER_REPO = Path("/home/selman/dev/humanlayer")


@pytest.mark.integration
@pytest.mark.skipif(
    not HUMANLAYER_REPO.exists(),
    reason="Requires humanlayer repository",
)
def test_humanlayer_repo_gitignore():
    """Integration test: compare kit's file tree with git's tracked files.

    Checks that the number of files kit reports is within 10% of what
    ``git ls-files`` reports, that the estimated size of the listing stays
    under the 25k budget, and that no ``node_modules`` path leaks through.
    """
    # -z gives NUL-separated, unquoted paths, so unusual file names are
    # counted correctly; check=True surfaces a failing git call directly
    # instead of as a confusing count mismatch.
    result = subprocess.run(
        ["git", "ls-files", "-z"],
        cwd=HUMANLAYER_REPO,
        capture_output=True,
        text=True,
        check=True,
    )
    git_files = {name for name in result.stdout.split("\0") if name}
    git_count = len(git_files)
    assert git_count > 0, "git ls-files returned no tracked files"

    # Get kit's file count
    mapper = RepoMapper(str(HUMANLAYER_REPO))
    tree = mapper.get_file_tree()
    kit_count = len(tree)
    kit_paths = {item["path"] for item in tree}

    # Should be approximately equal (within 10% tolerance for build artifacts)
    tolerance = 0.1
    assert abs(kit_count - git_count) / git_count < tolerance, \
        f"Kit returned {kit_count} files, Git tracks {git_count} files"

    # Should be well under token limit (assuming ~100 chars per file path)
    estimated_tokens = kit_count * 100
    assert estimated_tokens < 25000, \
        f"Estimated {estimated_tokens} tokens (exceeds 25k limit)"

    # Verify no node_modules files included
    node_modules_files = [p for p in kit_paths if "node_modules" in p]
    assert len(node_modules_files) == 0, \
        f"Found {len(node_modules_files)} node_modules files (should be 0)"
138 changes: 138 additions & 0 deletions tests/test_gitignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import pytest
from pathlib import Path
import tempfile
from kit.repo_mapper import RepoMapper


def test_root_gitignore_only():
    """A single root-level .gitignore filters the reported file tree."""
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)

        # Ignore rules live only at the repository root.
        root.joinpath(".gitignore").write_text("*.pyc\n__pycache__/\n")

        # One tracked source file, one ignored file, one ignored directory.
        cache_dir = root / "__pycache__"
        cache_dir.mkdir()
        for created in (root / "test.py", root / "test.pyc", cache_dir / "test.pyc"):
            created.touch()

        listing = RepoMapper(str(root)).get_file_tree()
        found = [entry["path"] for entry in listing]

        # Only test.py survives; the .pyc files and __pycache__ are filtered.
        assert "test.py" in found
        for hidden in ("test.pyc", "__pycache__/test.pyc"):
            assert hidden not in found


def test_subdirectory_gitignore():
    """Rules in a nested .gitignore are applied to that subdirectory."""
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)

        # The only .gitignore lives inside frontend/.
        frontend = root / "frontend"
        frontend.mkdir()
        frontend.joinpath(".gitignore").write_text("node_modules/\n*.log\n")

        # One kept file, one ignored log, one ignored dependency directory.
        vendored = frontend / "node_modules"
        vendored.mkdir()
        for created in (
            frontend / "app.js",
            frontend / "debug.log",
            vendored / "package.json",
        ):
            created.touch()

        listing = RepoMapper(str(root)).get_file_tree()
        found = [entry["path"] for entry in listing]

        # app.js is kept; debug.log and node_modules content are filtered.
        assert "frontend/app.js" in found
        for hidden in ("frontend/debug.log", "frontend/node_modules/package.json"):
            assert hidden not in found


def test_nested_gitignore_precedence():
    """A deeper .gitignore takes precedence over a shallower one."""
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)
        special = root / "special"
        special.mkdir()

        # The root ignores *.tmp; special/ re-includes it through a negation.
        root.joinpath(".gitignore").write_text("*.tmp\n")
        special.joinpath(".gitignore").write_text("!*.tmp\n")

        for created in (root / "root.tmp", special / "special.tmp"):
            created.touch()

        listing = RepoMapper(str(root)).get_file_tree()
        found = [entry["path"] for entry in listing]

        # The root-level .tmp stays ignored; the one under special/ is listed.
        assert "root.tmp" not in found
        assert "special/special.tmp" in found  # Negation pattern


def test_multiple_subdirectory_gitignores():
    """Sibling subdirectories each apply their own .gitignore."""
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)

        # frontend/ ignores its node_modules directory.
        frontend = root / "frontend"
        frontend_deps = frontend / "node_modules"
        frontend_deps.mkdir(parents=True)
        frontend.joinpath(".gitignore").write_text("node_modules/\n")

        # backend/ ignores its virtualenv and bytecode cache.
        backend = root / "backend"
        backend_env = backend / "venv"
        backend_env.mkdir(parents=True)
        backend.joinpath(".gitignore").write_text("venv/\n__pycache__/\n")

        for created in (
            frontend / "app.js",
            frontend_deps / "react.js",
            backend / "main.py",
            backend_env / "python",
        ):
            created.touch()

        listing = RepoMapper(str(root)).get_file_tree()
        found = [entry["path"] for entry in listing]

        # Source files are listed.
        for kept in ("frontend/app.js", "backend/main.py"):
            assert kept in found

        # Content of the ignored directories is not.
        for hidden in ("frontend/node_modules/react.js", "backend/venv/python"):
            assert hidden not in found


def test_no_gitignore_files():
    """Without any .gitignore, every created file is reported."""
    with tempfile.TemporaryDirectory() as workdir:
        root = Path(workdir)

        # Plain files at two levels and no ignore rules anywhere.
        src_dir = root / "src"
        src_dir.mkdir()
        for created in (root / "test.py", src_dir / "main.py"):
            created.touch()

        listing = RepoMapper(str(root)).get_file_tree()
        found = [entry["path"] for entry in listing]

        # Nothing is filtered out.
        for kept in ("test.py", "src/main.py"):
            assert kept in found
Loading