Skip to content

Commit 0df78c5

Browse files
IronAdamantclaude
andcommitted
Exclude test files from risk_map to fix coverage_gap noise
Test files always score coverage_gap=1.0 because edges go FROM test units, never TO test-file code units. Including them masks real coverage differences between source files. Default exclude_tests=True aligns risk_map with test_gaps behavior. Pass --no-exclude-tests to include. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7fc2db7 commit 0df78c5

9 files changed

Lines changed: 126 additions & 13 deletions

File tree

CLAUDE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ chisel/
5656
- **Inline coupling partners**: `risk_map` includes `"coupling_partners"` (top 3 by co-commit count) in each file entry alongside the breakdown. Data is already fetched in the batch query — no extra DB calls.
5757
- **Triage tool**: Composite `triage` runs `risk_map` (top-N) + `test_gaps` (filtered to top-N files) + `stale_tests` in a single read lock. Returns a dict, not a list, so `limit` is not injected. Summary includes `test_edge_count`, `test_result_count`, and `coupling_threshold` for data quality visibility.
5858
- **Risk-map `_meta` envelope**: `tool_risk_map()` returns `{"files": [...], "_meta": {...}}` instead of a bare list. `_meta` contains: `total_files`, `coupling_threshold`, `total_test_edges`, `total_test_results`, `effective_components` (list of components that vary across files), `uniform_components` (dict of components with identical values + diagnostic reason). This tells LLM agents which risk components are providing real signal vs noise. `_build_risk_meta()` and `_diagnose_uniform()` in `engine.py`. `dispatch_tool()` in `mcp_server.py` applies `limit` to `result["files"]` for dict-wrapped responses. CLI `_limit()` handles both formats.
59+
- **Risk-map test-file exclusion**: `risk_map` and `triage` exclude test files by default (`exclude_tests=True`). Test files always score `coverage_gap=1.0` because edges go FROM test units, never TO test-file code units — including them adds noise and masks real coverage differences between source files. `storage.get_test_file_paths()` fetches distinct test file paths from `test_units`. CLI flag: `--no-exclude-tests`. Aligns with `test_gaps` which already excludes test files by default.
5960

6061
## Dev Commands
6162

chisel/cli.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ def create_parser():
8383
help="Show risk scores for all files")
8484
p_risk.add_argument("directory", nargs="?", default=None,
8585
help="Directory to scope (default: all)")
86+
p_risk.add_argument("--no-exclude-tests", action="store_true", default=False,
87+
help="Include test files in risk map")
8688

8789
# stale-tests
8890
sub.add_parser("stale-tests", parents=[shared], help="Detect stale tests")
@@ -140,6 +142,8 @@ def create_parser():
140142
help="Directory to scope (default: all)")
141143
p_triage.add_argument("--top-n", type=int, default=10,
142144
help="Number of top-risk files (default: 10)")
145+
p_triage.add_argument("--no-exclude-tests", action="store_true", default=False,
146+
help="Include test files in risk ranking")
143147

144148
# serve
145149
p_serve = sub.add_parser("serve", parents=[shared],
@@ -297,7 +301,9 @@ def fmt(result, _args):
297301
print("\nDiagnostics (uniform components — not differentiating):")
298302
for comp, info in uniform.items():
299303
print(f" {comp}: {info['reason']}")
300-
return _run_tool(args, "tool_risk_map", {"directory": args.directory}, fmt)
304+
return _run_tool(args, "tool_risk_map",
305+
{"directory": args.directory,
306+
"exclude_tests": not args.no_exclude_tests}, fmt)
301307

302308

303309
def cmd_stale_tests(args):
@@ -386,7 +392,8 @@ def fmt(result, _args):
386392
else:
387393
print("\nNo stale tests found.")
388394
return _run_tool(args, "tool_triage",
389-
{"directory": args.directory, "top_n": args.top_n},
395+
{"directory": args.directory, "top_n": args.top_n,
396+
"exclude_tests": not args.no_exclude_tests},
390397
fmt, use_limit=False)
391398

392399

chisel/engine.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -284,18 +284,25 @@ def tool_coupling(self, file_path, min_count=3):
284284
return empty
285285
return self.storage.get_co_changes(file_path, min_count)
286286

287-
def tool_risk_map(self, directory=None):
287+
def tool_risk_map(self, directory=None, exclude_tests=True):
288288
"""MCP tool: risk scores for all files.
289289
290290
Returns ``{"files": [...], "_meta": {...}}`` so LLM agents can
291291
inspect which risk components are differentiating vs uniform noise.
292+
293+
Args:
294+
directory: Optional subdirectory to scope the risk map.
295+
exclude_tests: If True (default), exclude test files. Test
296+
files always score coverage_gap=1.0 (no edges point *to*
297+
test-file code units), which adds noise and masks real
298+
coverage signal.
292299
"""
293300
with self._process_lock.shared():
294301
with self.lock.read_lock():
295302
empty = self._check_analysis_data()
296303
if empty is not None:
297304
return empty
298-
files = self.impact.get_risk_map(directory)
305+
files = self.impact.get_risk_map(directory, exclude_tests)
299306
meta = self._build_risk_meta(files)
300307
return {"files": files, "_meta": meta}
301308

@@ -414,14 +421,16 @@ def tool_record_result(self, test_id, passed, duration_ms=None):
414421
result["heuristic_edges_created"] = edges_created
415422
return result
416423

417-
def tool_triage(self, directory=None, top_n=10):
424+
def tool_triage(self, directory=None, top_n=10, exclude_tests=True):
418425
"""MCP tool: combined risk_map + test_gaps + stale_tests triage."""
419426
with self._process_lock.shared():
420427
with self.lock.read_lock():
421428
empty = self._check_analysis_data()
422429
if empty is not None:
423430
return empty
424-
risk_map = self.impact.get_risk_map(directory)[:top_n]
431+
risk_map = self.impact.get_risk_map(
432+
directory, exclude_tests,
433+
)[:top_n]
425434
test_gaps = self.impact.get_test_gaps(directory=directory)
426435
stale = self.impact.detect_stale_tests()
427436
stats = self.storage.get_stats()

chisel/impact.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,20 +223,29 @@ def get_test_gaps(self, file_path=None, directory=None, exclude_tests=True):
223223
# Risk map
224224
# ------------------------------------------------------------------ #
225225

226-
def get_risk_map(self, directory=None):
226+
def get_risk_map(self, directory=None, exclude_tests=True):
227227
"""Compute risk scores for all tracked files (optionally in a directory).
228228
229229
Uses batch queries to avoid the N+1 pattern: fetches churn, coupling,
230230
code units, edges, and blame for all files in a small number of queries.
231231
232+
Args:
233+
directory: Optional subdirectory to scope the risk map.
234+
exclude_tests: If True (default), exclude test files from the
235+
risk map. Test files always score coverage_gap=1.0 (edges
236+
go *from* tests, never *to* test-file code units), which
237+
adds noise and masks real coverage differences.
238+
232239
Returns:
233240
List of dicts: {file_path, risk_score, breakdown}
234241
"""
235242
all_churn = self.storage.get_all_churn_stats()
236243
dir_prefix = directory.rstrip("/") + "/" if directory else ""
244+
test_files = self.storage.get_test_file_paths() if exclude_tests else set()
237245
files = sorted({
238246
stat["file_path"] for stat in all_churn
239-
if not directory or stat["file_path"].startswith(dir_prefix)
247+
if (not directory or stat["file_path"].startswith(dir_prefix))
248+
and stat["file_path"] not in test_files
240249
})
241250
if not files:
242251
return []

chisel/schemas.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,13 @@
137137
"type": "string",
138138
"description": "Optional subdirectory to scope the risk map.",
139139
},
140+
"exclude_tests": {
141+
"type": "boolean",
142+
"description": (
143+
"Exclude test files (default: true). Test files always "
144+
"score coverage_gap=1.0, adding noise."
145+
),
146+
},
140147
},
141148
"required": [],
142149
},
@@ -280,6 +287,12 @@
280287
"type": "integer",
281288
"description": "Number of top-risk files to include (default: 10).",
282289
},
290+
"exclude_tests": {
291+
"type": "boolean",
292+
"description": (
293+
"Exclude test files from risk ranking (default: true)."
294+
),
295+
},
283296
},
284297
"required": [],
285298
},
@@ -297,15 +310,15 @@
297310
"churn": ("tool_churn", ["file_path", "unit_name"]),
298311
"ownership": ("tool_ownership", ["file_path"]),
299312
"coupling": ("tool_coupling", ["file_path", "min_count"]),
300-
"risk_map": ("tool_risk_map", ["directory"]),
313+
"risk_map": ("tool_risk_map", ["directory", "exclude_tests"]),
301314
"stale_tests": ("tool_stale_tests", []),
302315
"history": ("tool_history", ["file_path"]),
303316
"who_reviews": ("tool_who_reviews", ["file_path"]),
304317
"diff_impact": ("tool_diff_impact", ["ref"]),
305318
"update": ("tool_update", []),
306319
"test_gaps": ("tool_test_gaps", ["file_path", "directory", "exclude_tests"]),
307320
"record_result": ("tool_record_result", ["test_id", "passed", "duration_ms"]),
308-
"triage": ("tool_triage", ["directory", "top_n"]),
321+
"triage": ("tool_triage", ["directory", "top_n", "exclude_tests"]),
309322
"stats": ("tool_stats", []),
310323
}
311324

chisel/storage.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,13 @@ def get_test_units_by_file(self, file_path):
264264
def get_all_test_units(self):
265265
return self._fetchall("SELECT * FROM test_units ORDER BY file_path, name")
266266

267+
def get_test_file_paths(self):
268+
"""Return the set of file paths that contain test units."""
269+
rows = self._fetchall(
270+
"SELECT DISTINCT file_path FROM test_units",
271+
)
272+
return {r["file_path"] for r in rows}
273+
267274
# --- test_edges ---
268275

269276
def upsert_test_edge(self, test_id, code_id, edge_type, weight=1.0):

tests/test_cli.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,7 @@ def test_main_risk_map(self, mock_cls):
725725

726726
main(["risk-map", "--project-dir", "/tmp/p"])
727727

728-
engine.tool_risk_map.assert_called_once_with(directory=None)
728+
engine.tool_risk_map.assert_called_once_with(directory=None, exclude_tests=True)
729729

730730
@patch("chisel.cli.ChiselEngine")
731731
def test_main_stale_tests(self, mock_cls):
@@ -831,7 +831,7 @@ def test_main_triage(self, mock_cls):
831831

832832
main(["triage", "--project-dir", "/tmp/p"])
833833

834-
engine.tool_triage.assert_called_once_with(directory=None, top_n=10)
834+
engine.tool_triage.assert_called_once_with(directory=None, top_n=10, exclude_tests=True)
835835

836836
@patch("chisel.cli.ChiselEngine")
837837
def test_main_triage_with_args(self, mock_cls):
@@ -844,7 +844,7 @@ def test_main_triage_with_args(self, mock_cls):
844844

845845
main(["triage", "--project-dir", "/tmp/p", "src/", "--top-n", "5"])
846846

847-
engine.tool_triage.assert_called_once_with(directory="src/", top_n=5)
847+
engine.tool_triage.assert_called_once_with(directory="src/", top_n=5, exclude_tests=True)
848848

849849
@patch("chisel.cli.ChiselEngine")
850850
def test_main_stats(self, mock_cls):

tests/test_engine.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,31 @@ def test_tool_risk_map(self, engine):
138138
assert isinstance(result["files"], list)
139139
assert len(result["files"]) > 0
140140

141+
def test_tool_risk_map_excludes_test_files(self, engine):
142+
engine.analyze()
143+
result = engine.tool_risk_map()
144+
files = [r["file_path"] for r in result["files"]]
145+
# test_app.py should be excluded by default
146+
assert not any("test_" in f for f in files)
147+
# Source file should be present
148+
assert "app.py" in files
149+
150+
def test_tool_risk_map_include_tests(self, engine):
151+
engine.analyze()
152+
result = engine.tool_risk_map(exclude_tests=False)
153+
files = [r["file_path"] for r in result["files"]]
154+
assert any("test_" in f for f in files)
155+
156+
def test_tool_risk_map_coverage_gap_with_edges(self, engine):
157+
"""coverage_gap should be < 1.0 for files with test edges."""
158+
engine.analyze()
159+
result = engine.tool_risk_map()
160+
app = next(r for r in result["files"] if r["file_path"] == "app.py")
161+
# app.py has 3 functions, 2 are tested (process_data, validate_input)
162+
# format_output is untested → coverage_gap = 1/3 ≈ 0.33
163+
assert app["breakdown"]["coverage_gap"] < 1.0
164+
assert abs(app["breakdown"]["coverage_gap"] - 0.3333) < 0.01
165+
141166
def test_tool_stale_tests(self, engine):
142167
engine.analyze()
143168
result = engine.tool_stale_tests()

tests/test_impact.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,48 @@ def test_risk_map_coupling_partners_empty_when_no_coupling(self, storage, analyz
209209
solo = next(r for r in risk_map if r["file_path"] == "solo.py")
210210
assert solo["coupling_partners"] == []
211211

212+
def test_coverage_gap_reflects_test_edges(self, storage, analyzer):
213+
"""Files with test edges should have coverage_gap < 1.0."""
214+
_seed_basic_data(storage)
215+
risk_map = analyzer.get_risk_map()
216+
app = next(r for r in risk_map if r["file_path"] == "app.py")
217+
lib = next(r for r in risk_map if r["file_path"] == "lib.py")
218+
# app.py: foo and bar both have edges → 0/2 gap → 0.0
219+
assert app["breakdown"]["coverage_gap"] == 0.0
220+
# lib.py: helper has edge → 0/1 gap → 0.0
221+
assert lib["breakdown"]["coverage_gap"] == 0.0
222+
223+
def test_coverage_gap_partial_coverage(self, storage, analyzer):
224+
"""File with some tested and some untested units."""
225+
storage.upsert_code_unit("m.py:a:func", "m.py", "a", "func", 1, 5)
226+
storage.upsert_code_unit("m.py:b:func", "m.py", "b", "func", 6, 10)
227+
storage.upsert_code_unit("m.py:c:func", "m.py", "c", "func", 11, 15)
228+
storage.upsert_test_unit("test_m.py:t1", "test_m.py", "t1", "pytest")
229+
storage.upsert_test_edge("test_m.py:t1", "m.py:a:func", "import")
230+
storage.upsert_churn_stat("m.py", "", churn_score=1.0)
231+
risk_map = analyzer.get_risk_map()
232+
entry = next(r for r in risk_map if r["file_path"] == "m.py")
233+
# 1 of 3 tested → coverage 0.333, gap 0.667
234+
assert abs(entry["breakdown"]["coverage_gap"] - 0.6667) < 0.01
235+
236+
def test_exclude_tests_filters_test_files(self, storage, analyzer):
237+
"""Test files should be excluded from risk_map by default."""
238+
_seed_basic_data(storage)
239+
# test_app.py has churn (from seed) — would appear without filtering
240+
storage.upsert_churn_stat("test_app.py", "", churn_score=1.0)
241+
risk_map = analyzer.get_risk_map()
242+
files = [r["file_path"] for r in risk_map]
243+
assert "test_app.py" not in files
244+
assert "app.py" in files
245+
246+
def test_exclude_tests_false_includes_test_files(self, storage, analyzer):
247+
"""exclude_tests=False includes test files."""
248+
_seed_basic_data(storage)
249+
storage.upsert_churn_stat("test_app.py", "", churn_score=1.0)
250+
risk_map = analyzer.get_risk_map(exclude_tests=False)
251+
files = [r["file_path"] for r in risk_map]
252+
assert "test_app.py" in files
253+
212254

213255
class TestGetOwnership:
214256
def test_returns_authors(self, storage, analyzer):

0 commit comments

Comments
 (0)