Exclude test files from risk_map to fix coverage_gap noise

IronAdamant · claude · IronAdamant · commit 0df78c573cd0 · 2026-03-25T06:42:50.000+11:00
Test files always score coverage_gap=1.0 because edges go FROM test
units, never TO test-file code units. Including them masks real coverage
differences between source files. The new default exclude_tests=True aligns
risk_map (and triage's risk ranking) with test_gaps behavior. Pass
--no-exclude-tests on the CLI to include test files again.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -56,6 +56,7 @@ chisel/
 - **Inline coupling partners**: `risk_map` includes `"coupling_partners"` (top 3 by co-commit count) in each file entry alongside the breakdown. Data is already fetched in the batch query — no extra DB calls.
 - **Triage tool**: Composite `triage` runs `risk_map` (top-N) + `test_gaps` (filtered to top-N files) + `stale_tests` in a single read lock. Returns a dict, not a list, so `limit` is not injected. Summary includes `test_edge_count`, `test_result_count`, and `coupling_threshold` for data quality visibility.
 - **Risk-map `_meta` envelope**: `tool_risk_map()` returns `{"files": [...], "_meta": {...}}` instead of a bare list. `_meta` contains: `total_files`, `coupling_threshold`, `total_test_edges`, `total_test_results`, `effective_components` (list of components that vary across files), `uniform_components` (dict of components with identical values + diagnostic reason). This tells LLM agents which risk components are providing real signal vs noise. `_build_risk_meta()` and `_diagnose_uniform()` in `engine.py`. `dispatch_tool()` in `mcp_server.py` applies `limit` to `result["files"]` for dict-wrapped responses. CLI `_limit()` handles both formats.
+- **Risk-map test-file exclusion**: `risk_map` and `triage` exclude test files by default (`exclude_tests=True`). Test files always score `coverage_gap=1.0` because edges go FROM test units, never TO test-file code units — including them adds noise and masks real coverage differences between source files. `storage.get_test_file_paths()` fetches distinct test file paths from `test_units`. CLI flag: `--no-exclude-tests`. Aligns with `test_gaps` which already excludes test files by default.
 
 ## Dev Commands
 
diff --git a/chisel/cli.py b/chisel/cli.py
@@ -83,6 +83,8 @@ def create_parser():
                             help="Show risk scores for all files")
     p_risk.add_argument("directory", nargs="?", default=None,
                         help="Directory to scope (default: all)")
+    p_risk.add_argument("--no-exclude-tests", action="store_true", default=False,
+                        help="Include test files in risk map")
 
     # stale-tests
     sub.add_parser("stale-tests", parents=[shared], help="Detect stale tests")
@@ -140,6 +142,8 @@ def create_parser():
                            help="Directory to scope (default: all)")
     p_triage.add_argument("--top-n", type=int, default=10,
                            help="Number of top-risk files (default: 10)")
+    p_triage.add_argument("--no-exclude-tests", action="store_true", default=False,
+                           help="Include test files in risk ranking")
 
     # serve
     p_serve = sub.add_parser("serve", parents=[shared],
@@ -297,7 +301,9 @@ def fmt(result, _args):
             print("\nDiagnostics (uniform components — not differentiating):")
             for comp, info in uniform.items():
                 print(f"  {comp}: {info['reason']}")
-    return _run_tool(args, "tool_risk_map", {"directory": args.directory}, fmt)
+    return _run_tool(args, "tool_risk_map",
+                     {"directory": args.directory,
+                      "exclude_tests": not args.no_exclude_tests}, fmt)
 
 
 def cmd_stale_tests(args):
@@ -386,7 +392,8 @@ def fmt(result, _args):
         else:
             print("\nNo stale tests found.")
     return _run_tool(args, "tool_triage",
-                     {"directory": args.directory, "top_n": args.top_n},
+                     {"directory": args.directory, "top_n": args.top_n,
+                      "exclude_tests": not args.no_exclude_tests},
                      fmt, use_limit=False)
 
 
diff --git a/chisel/engine.py b/chisel/engine.py
@@ -284,18 +284,25 @@ def tool_coupling(self, file_path, min_count=3):
                     return empty
                 return self.storage.get_co_changes(file_path, min_count)
 
-    def tool_risk_map(self, directory=None):
+    def tool_risk_map(self, directory=None, exclude_tests=True):
         """MCP tool: risk scores for all files.
 
         Returns ``{"files": [...], "_meta": {...}}`` so LLM agents can
         inspect which risk components are differentiating vs uniform noise.
+
+        Args:
+            directory: Optional subdirectory to scope the risk map.
+            exclude_tests: If True (default), exclude test files.  Test
+                files always score coverage_gap=1.0 (no edges point *to*
+                test-file code units), which adds noise and masks real
+                coverage signal.
         """
         with self._process_lock.shared():
             with self.lock.read_lock():
                 empty = self._check_analysis_data()
                 if empty is not None:
                     return empty
-                files = self.impact.get_risk_map(directory)
+                files = self.impact.get_risk_map(directory, exclude_tests)
                 meta = self._build_risk_meta(files)
                 return {"files": files, "_meta": meta}
 
@@ -414,14 +421,16 @@ def tool_record_result(self, test_id, passed, duration_ms=None):
                     result["heuristic_edges_created"] = edges_created
                 return result
 
-    def tool_triage(self, directory=None, top_n=10):
+    def tool_triage(self, directory=None, top_n=10, exclude_tests=True):
         """MCP tool: combined risk_map + test_gaps + stale_tests triage."""
         with self._process_lock.shared():
             with self.lock.read_lock():
                 empty = self._check_analysis_data()
                 if empty is not None:
                     return empty
-                risk_map = self.impact.get_risk_map(directory)[:top_n]
+                risk_map = self.impact.get_risk_map(
+                    directory, exclude_tests,
+                )[:top_n]
                 test_gaps = self.impact.get_test_gaps(directory=directory)
                 stale = self.impact.detect_stale_tests()
                 stats = self.storage.get_stats()
diff --git a/chisel/impact.py b/chisel/impact.py
@@ -223,20 +223,29 @@ def get_test_gaps(self, file_path=None, directory=None, exclude_tests=True):
     # Risk map
     # ------------------------------------------------------------------ #
 
-    def get_risk_map(self, directory=None):
+    def get_risk_map(self, directory=None, exclude_tests=True):
         """Compute risk scores for all tracked files (optionally in a directory).
 
         Uses batch queries to avoid the N+1 pattern: fetches churn, coupling,
         code units, edges, and blame for all files in a small number of queries.
 
+        Args:
+            directory: Optional subdirectory to scope the risk map.
+            exclude_tests: If True (default), exclude test files from the
+                risk map.  Test files always score coverage_gap=1.0 (edges
+                go *from* tests, never *to* test-file code units), which
+                adds noise and masks real coverage differences.
+
         Returns:
             List of dicts: {file_path, risk_score, breakdown}
         """
         all_churn = self.storage.get_all_churn_stats()
         dir_prefix = directory.rstrip("/") + "/" if directory else ""
+        test_files = self.storage.get_test_file_paths() if exclude_tests else set()
         files = sorted({
             stat["file_path"] for stat in all_churn
-            if not directory or stat["file_path"].startswith(dir_prefix)
+            if (not directory or stat["file_path"].startswith(dir_prefix))
+            and stat["file_path"] not in test_files
         })
         if not files:
             return []
diff --git a/chisel/schemas.py b/chisel/schemas.py
@@ -137,6 +137,13 @@
                     "type": "string",
                     "description": "Optional subdirectory to scope the risk map.",
                 },
+                "exclude_tests": {
+                    "type": "boolean",
+                    "description": (
+                        "Exclude test files (default: true). Test files always "
+                        "score coverage_gap=1.0, adding noise."
+                    ),
+                },
             },
             "required": [],
         },
@@ -280,6 +287,12 @@
                     "type": "integer",
                     "description": "Number of top-risk files to include (default: 10).",
                 },
+                "exclude_tests": {
+                    "type": "boolean",
+                    "description": (
+                        "Exclude test files from risk ranking (default: true)."
+                    ),
+                },
             },
             "required": [],
         },
@@ -297,15 +310,15 @@
     "churn": ("tool_churn", ["file_path", "unit_name"]),
     "ownership": ("tool_ownership", ["file_path"]),
     "coupling": ("tool_coupling", ["file_path", "min_count"]),
-    "risk_map": ("tool_risk_map", ["directory"]),
+    "risk_map": ("tool_risk_map", ["directory", "exclude_tests"]),
     "stale_tests": ("tool_stale_tests", []),
     "history": ("tool_history", ["file_path"]),
     "who_reviews": ("tool_who_reviews", ["file_path"]),
     "diff_impact": ("tool_diff_impact", ["ref"]),
     "update": ("tool_update", []),
     "test_gaps": ("tool_test_gaps", ["file_path", "directory", "exclude_tests"]),
     "record_result": ("tool_record_result", ["test_id", "passed", "duration_ms"]),
-    "triage": ("tool_triage", ["directory", "top_n"]),
+    "triage": ("tool_triage", ["directory", "top_n", "exclude_tests"]),
     "stats": ("tool_stats", []),
 }
 
diff --git a/chisel/storage.py b/chisel/storage.py
@@ -264,6 +264,13 @@ def get_test_units_by_file(self, file_path):
     def get_all_test_units(self):
         return self._fetchall("SELECT * FROM test_units ORDER BY file_path, name")
 
+    def get_test_file_paths(self):
+        """Return the set of file paths that contain test units."""
+        rows = self._fetchall(
+            "SELECT DISTINCT file_path FROM test_units",
+        )
+        return {r["file_path"] for r in rows}
+
     # --- test_edges ---
 
     def upsert_test_edge(self, test_id, code_id, edge_type, weight=1.0):
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -725,7 +725,7 @@ def test_main_risk_map(self, mock_cls):
 
         main(["risk-map", "--project-dir", "/tmp/p"])
 
-        engine.tool_risk_map.assert_called_once_with(directory=None)
+        engine.tool_risk_map.assert_called_once_with(directory=None, exclude_tests=True)
 
     @patch("chisel.cli.ChiselEngine")
     def test_main_stale_tests(self, mock_cls):
@@ -831,7 +831,7 @@ def test_main_triage(self, mock_cls):
 
         main(["triage", "--project-dir", "/tmp/p"])
 
-        engine.tool_triage.assert_called_once_with(directory=None, top_n=10)
+        engine.tool_triage.assert_called_once_with(directory=None, top_n=10, exclude_tests=True)
 
     @patch("chisel.cli.ChiselEngine")
     def test_main_triage_with_args(self, mock_cls):
@@ -844,7 +844,7 @@ def test_main_triage_with_args(self, mock_cls):
 
         main(["triage", "--project-dir", "/tmp/p", "src/", "--top-n", "5"])
 
-        engine.tool_triage.assert_called_once_with(directory="src/", top_n=5)
+        engine.tool_triage.assert_called_once_with(directory="src/", top_n=5, exclude_tests=True)
 
     @patch("chisel.cli.ChiselEngine")
     def test_main_stats(self, mock_cls):
diff --git a/tests/test_engine.py b/tests/test_engine.py
@@ -138,6 +138,31 @@ def test_tool_risk_map(self, engine):
         assert isinstance(result["files"], list)
         assert len(result["files"]) > 0
 
+    def test_tool_risk_map_excludes_test_files(self, engine):
+        engine.analyze()
+        result = engine.tool_risk_map()
+        files = [r["file_path"] for r in result["files"]]
+        # test_app.py should be excluded by default
+        assert not any("test_" in f for f in files)
+        # Source file should be present
+        assert "app.py" in files
+
+    def test_tool_risk_map_include_tests(self, engine):
+        engine.analyze()
+        result = engine.tool_risk_map(exclude_tests=False)
+        files = [r["file_path"] for r in result["files"]]
+        assert any("test_" in f for f in files)
+
+    def test_tool_risk_map_coverage_gap_with_edges(self, engine):
+        """coverage_gap should be < 1.0 for files with test edges."""
+        engine.analyze()
+        result = engine.tool_risk_map()
+        app = next(r for r in result["files"] if r["file_path"] == "app.py")
+        # app.py has 3 functions, 2 are tested (process_data, validate_input)
+        # format_output is untested → coverage_gap = 1/3 ≈ 0.33
+        assert app["breakdown"]["coverage_gap"] < 1.0
+        assert abs(app["breakdown"]["coverage_gap"] - 0.3333) < 0.01
+
     def test_tool_stale_tests(self, engine):
         engine.analyze()
         result = engine.tool_stale_tests()
diff --git a/tests/test_impact.py b/tests/test_impact.py
@@ -209,6 +209,48 @@ def test_risk_map_coupling_partners_empty_when_no_coupling(self, storage, analyz
         solo = next(r for r in risk_map if r["file_path"] == "solo.py")
         assert solo["coupling_partners"] == []
 
+    def test_coverage_gap_reflects_test_edges(self, storage, analyzer):
+        """Files with test edges should have coverage_gap < 1.0."""
+        _seed_basic_data(storage)
+        risk_map = analyzer.get_risk_map()
+        app = next(r for r in risk_map if r["file_path"] == "app.py")
+        lib = next(r for r in risk_map if r["file_path"] == "lib.py")
+        # app.py: foo and bar both have edges → 0/2 gap → 0.0
+        assert app["breakdown"]["coverage_gap"] == 0.0
+        # lib.py: helper has edge → 0/1 gap → 0.0
+        assert lib["breakdown"]["coverage_gap"] == 0.0
+
+    def test_coverage_gap_partial_coverage(self, storage, analyzer):
+        """File with some tested and some untested units."""
+        storage.upsert_code_unit("m.py:a:func", "m.py", "a", "func", 1, 5)
+        storage.upsert_code_unit("m.py:b:func", "m.py", "b", "func", 6, 10)
+        storage.upsert_code_unit("m.py:c:func", "m.py", "c", "func", 11, 15)
+        storage.upsert_test_unit("test_m.py:t1", "test_m.py", "t1", "pytest")
+        storage.upsert_test_edge("test_m.py:t1", "m.py:a:func", "import")
+        storage.upsert_churn_stat("m.py", "", churn_score=1.0)
+        risk_map = analyzer.get_risk_map()
+        entry = next(r for r in risk_map if r["file_path"] == "m.py")
+        # 1 of 3 tested → coverage 0.333, gap 0.667
+        assert abs(entry["breakdown"]["coverage_gap"] - 0.6667) < 0.01
+
+    def test_exclude_tests_filters_test_files(self, storage, analyzer):
+        """Test files should be excluded from risk_map by default."""
+        _seed_basic_data(storage)
+        # test_app.py has churn (from seed) — would appear without filtering
+        storage.upsert_churn_stat("test_app.py", "", churn_score=1.0)
+        risk_map = analyzer.get_risk_map()
+        files = [r["file_path"] for r in risk_map]
+        assert "test_app.py" not in files
+        assert "app.py" in files
+
+    def test_exclude_tests_false_includes_test_files(self, storage, analyzer):
+        """exclude_tests=False includes test files."""
+        _seed_basic_data(storage)
+        storage.upsert_churn_stat("test_app.py", "", churn_score=1.0)
+        risk_map = analyzer.get_risk_map(exclude_tests=False)
+        files = [r["file_path"] for r in risk_map]
+        assert "test_app.py" in files
+
 
 class TestGetOwnership:
     def test_returns_authors(self, storage, analyzer):