Skip to content

Commit 3b3e10e

Browse files
authored
Merge pull request #26 from codellm-devkit/25-codeql-edge-startline-join-fallback
Fix CodeQL call-graph edges dropped on (file, start_line) join miss (#25)
2 parents eae4e9c + 95ec64a commit 3b3e10e

3 files changed

Lines changed: 115 additions & 28 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.1.15] - 2026-05-15
9+
10+
### Fixed
11+
- **CodeQL call-graph edges silently dropped on `(file, start_line)` join miss** ([#25](https://github.com/codellm-devkit/codeanalyzer-python/issues/25)). CodeQL endpoints were matched back into Jedi's `PyCallable` signature space by an exact `(absolute_file_path, start_line)` key; when CodeQL and Jedi disagreed on a definition's start line (commonly with decorated functions), the caller lookup missed and the entire edge was discarded (callee misses degraded to ghost nodes). Replaced the exact-only index with a resolution ladder: exact `(file, start_line)` → candidates sharing `(file, short_name)` (single candidate taken directly, else nearest `start_line` among those whose parameter count matches the CodeQL positional arity) → no match (caller skipped / callee ghost, as before). The CodeQL query now emits `Function.getName()` and positional arity for both endpoints. Jedi's parameter count includes `*args`/`**kwargs`/keyword-only slots while CodeQL's arity is positional only, so the arity filter is exact for plain signatures and yields to the nearest-line tiebreak otherwise.
12+
813
## [0.1.14] - 2026-05-13
914

1015
### Added

codeanalyzer/semantic_analysis/codeql/codeql_analysis.py

Lines changed: 109 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,67 @@
3232
from codeanalyzer.utils import logger
3333

3434

35+
class _CallableResolver:
    """Map a CodeQL endpoint ``(file, start_line, name, arity)`` to a Jedi
    ``PyCallable``.

    Resolution ladder:

    1. exact ``(abs_path, start_line)`` -- the precise join;
    2. on miss, candidates sharing ``(abs_path, short_name)``: a single
       candidate is taken directly; otherwise prefer those whose parameter
       count equals the CodeQL positional arity, then the nearest
       ``start_line``;
    3. no name match -> ``None`` (caller row skipped / callee becomes a
       ghost node).

    Step 2 recovers edges the ``(file, line)`` join silently drops when
    CodeQL and Jedi disagree on a definition's start line (e.g. decorator
    handling). Jedi's ``parameters`` counts every declared slot (incl.
    ``*args``/``**kwargs``/keyword-only) whereas CodeQL's arity is
    positional only, so the arity filter is exact for plain signatures and
    otherwise yields to the nearest-line tiebreak.

    Index keys are normalised with :meth:`_abs`. Lookups try the path
    exactly as given first and then its normalised spelling, so an
    endpoint whose path is spelled differently from the indexed one
    (symlink, relative path, ``..`` segments) still joins.
    """

    def __init__(self) -> None:
        # (normalised path, start_line) -> callable; the exact join.
        self._by_loc: Dict[Tuple[str, int], Any] = {}
        # (normalised path, short name) -> callables, in insertion order.
        self._by_name: Dict[Tuple[str, str], List[Any]] = {}
        # Raw path -> normalised path, so each distinct file costs one
        # filesystem resolution no matter how many rows mention it.
        self._abs_cache: Dict[str, str] = {}

    @staticmethod
    def _abs(path: str) -> str:
        """Return ``path`` resolved to an absolute path string.

        Falls back to ``path`` unchanged if resolution raises ``OSError``
        or ``RuntimeError``.
        """
        try:
            return str(Path(path).resolve())
        except (OSError, RuntimeError):
            return path

    @classmethod
    def from_symbol_table(
        cls, symbol_table: "Dict[str, PyModule]"
    ) -> "_CallableResolver":
        """Build a resolver indexing every callable in ``symbol_table``.

        Each callable is registered under its normalised path twice: by
        ``start_line`` (a later callable at the same location replaces an
        earlier one) and by short name (all callables are kept).
        """
        resolver = cls()
        for c in iter_callables_in_symbol_table(symbol_table):
            abs_path = cls._abs(c.path)
            resolver._by_loc[(abs_path, c.start_line)] = c
            resolver._by_name.setdefault((abs_path, c.name), []).append(c)
        return resolver

    def _lookup_paths(self, file: Any) -> Tuple[Any, ...]:
        """Return the path spellings to try for ``file``, raw one first.

        Non-string or empty values (e.g. a missing DataFrame cell) are
        passed through untouched so the lookup simply misses.
        """
        if not isinstance(file, str) or not file:
            return (file,)
        normalised = self._abs_cache.get(file)
        if normalised is None:
            try:
                normalised = self._abs(file)
            except (TypeError, ValueError):
                # Unresolvable spelling (e.g. embedded NUL); keep it raw.
                normalised = file
            self._abs_cache[file] = normalised
        if normalised == file:
            return (file,)
        return (file, normalised)

    def resolve(
        self, file: str, start_line: int, name: str, arity: int
    ) -> Any:
        """Resolve one CodeQL endpoint to an indexed callable.

        Args:
            file: Endpoint file path as reported by CodeQL.
            start_line: Endpoint definition start line.
            name: Endpoint short name; an empty name disables the
                name-based fallback.
            arity: Endpoint positional parameter count, used only to
                narrow multiple same-named candidates.

        Returns:
            The matching callable, or ``None`` when neither the exact
            location nor the name-based fallback finds one.
        """
        paths = self._lookup_paths(file)

        # Step 1: exact (path, start_line) join.
        for path in paths:
            exact = self._by_loc.get((path, start_line))
            if exact is not None:
                return exact

        if not name:
            return None

        # Step 2: same file + same short name.
        candidates = None
        for path in paths:
            candidates = self._by_name.get((path, name))
            if candidates:
                break
        if not candidates:
            return None
        if len(candidates) == 1:
            return candidates[0]

        # Several same-named callables: prefer an arity match, then the
        # definition closest to the reported line. ``or ()`` tolerates a
        # callable whose ``parameters`` is None.
        arity_matched = [
            c for c in candidates if len(c.parameters or ()) == arity
        ]
        pool = arity_matched or candidates
        return min(pool, key=lambda c: abs(c.start_line - start_line))
94+
95+
3596
class CodeQL:
3697
"""A class for building the application view of a Python application using CodeQL.
3798
@@ -99,9 +160,14 @@ def _query_call_edges(self) -> DataFrame:
99160
# codeql/python-all 7.x — it returns the ``CallNode`` (CFG)
100161
# whose target was resolved to that ``Value``. Cleaner than
101162
# poking at ``pointsTo`` directly.
102-
"from CallNode call, Function caller, FunctionValue calleeVal",
163+
# ``callee`` is bound to the FunctionValue's scope so the
164+
# endpoint emits the same Function-level facts (name, arity,
165+
# location) the post-processor needs for the name+arity
166+
# fallback when the (file, start_line) join misses.
167+
"from CallNode call, Function caller, FunctionValue calleeVal, Function callee",
103168
"where",
104169
" call.getScope() = caller and",
170+
" callee = calleeVal.getScope() and",
105171
" (",
106172
# Direct function / bound-method call: foo() or obj.foo()
107173
" call = calleeVal.getACall()",
@@ -115,15 +181,20 @@ def _query_call_edges(self) -> DataFrame:
115181
" )",
116182
" )",
117183
"select",
118-
# --- Caller endpoint --- (joins to PyCallable via file + start_line)
184+
# --- Caller endpoint --- (joins to PyCallable: exact by
185+
# (file, start_line), else by (file, name) + arity)
119186
" caller.getLocation().getFile().getAbsolutePath(),",
120187
" caller.getLocation().getStartLine(),",
121188
" caller.getQualifiedName(),",
189+
" caller.getName(),",
190+
" count(caller.getArg(_)),",
122191
# --- Callee endpoint --- (file/line may live in a library stub;
123192
# post-processor classifies as in-source or ghost)
124-
" calleeVal.getScope().getLocation().getFile().getAbsolutePath(),",
125-
" calleeVal.getScope().getLocation().getStartLine(),",
193+
" callee.getLocation().getFile().getAbsolutePath(),",
194+
" callee.getLocation().getStartLine(),",
126195
" calleeVal.getQualifiedName(),",
196+
" callee.getName(),",
197+
" count(callee.getArg(_)),",
127198
# --- Call-site location --- (for PyCallsite augmentation)
128199
" call.getLocation().getStartLine(),",
129200
" call.getLocation().getStartColumn(),",
@@ -149,9 +220,13 @@ def _query_call_edges(self) -> DataFrame:
149220
"caller_file",
150221
"caller_start_line",
151222
"caller_qname",
223+
"caller_name",
224+
"caller_arity",
152225
"callee_file",
153226
"callee_start_line",
154227
"callee_qname",
228+
"callee_name",
229+
"callee_arity",
155230
"call_start_line",
156231
"call_start_column",
157232
"call_end_line",
@@ -162,24 +237,15 @@ def _query_call_edges(self) -> DataFrame:
162237
return df
163238

164239
@staticmethod
165-
def _build_callable_location_index(
240+
def _build_callable_resolver(
166241
symbol_table: Dict[str, PyModule],
167-
) -> Dict[Tuple[str, int], "PyCallable"]:
168-
"""Build ``(absolute_file_path, start_line) -> PyCallable`` from Jedi.
242+
) -> _CallableResolver:
243+
"""Build the endpoint -> ``PyCallable`` resolver from Jedi.
169244
170245
Paths are resolved so they match CodeQL's ``getAbsolutePath()``
171246
regardless of symlinks or the current working directory.
172247
"""
173-
from codeanalyzer.schema.py_schema import PyCallable # local to avoid cycle
174-
175-
index: Dict[Tuple[str, int], PyCallable] = {}
176-
for c in iter_callables_in_symbol_table(symbol_table):
177-
try:
178-
abs_path = str(Path(c.path).resolve())
179-
except (OSError, RuntimeError):
180-
abs_path = c.path
181-
index[(abs_path, c.start_line)] = c
182-
return index
248+
return _CallableResolver.from_symbol_table(symbol_table)
183249

184250
def _iter_resolved_rows(
185251
self, symbol_table: Dict[str, PyModule]
@@ -194,19 +260,27 @@ def _iter_resolved_rows(
194260
df = self._query_call_edges()
195261
if df.empty:
196262
return
197-
location_index = self._build_callable_location_index(symbol_table)
263+
resolver = self._build_callable_resolver(symbol_table)
198264

199265
skipped_unknown_caller = 0
200266
ghost_callees = 0
201267
for row in df.itertuples(index=False):
202-
caller_key = (row.caller_file, int(row.caller_start_line))
203-
caller = location_index.get(caller_key)
268+
caller = resolver.resolve(
269+
row.caller_file,
270+
int(row.caller_start_line),
271+
row.caller_name,
272+
int(row.caller_arity),
273+
)
204274
if caller is None:
205275
skipped_unknown_caller += 1
206276
continue
207277

208-
callee_key = (row.callee_file, int(row.callee_start_line))
209-
callee = location_index.get(callee_key)
278+
callee = resolver.resolve(
279+
row.callee_file,
280+
int(row.callee_start_line),
281+
row.callee_name,
282+
int(row.callee_arity),
283+
)
210284
if callee is not None:
211285
target_sig = callee.signature
212286
else:
@@ -267,20 +341,28 @@ def augment_call_sites(self, symbol_table: Dict[str, PyModule]) -> int:
267341
Returns:
268342
Number of ``PyCallsite`` entries augmented.
269343
"""
270-
location_index = self._build_callable_location_index(symbol_table)
344+
resolver = self._build_callable_resolver(symbol_table)
271345
df = self._query_call_edges()
272346
if df.empty:
273347
return 0
274348

275349
augmented = 0
276350
for row in df.itertuples(index=False):
277-
caller_key = (row.caller_file, int(row.caller_start_line))
278-
caller = location_index.get(caller_key)
351+
caller = resolver.resolve(
352+
row.caller_file,
353+
int(row.caller_start_line),
354+
row.caller_name,
355+
int(row.caller_arity),
356+
)
279357
if caller is None:
280358
continue
281359

282-
callee_key = (row.callee_file, int(row.callee_start_line))
283-
callee = location_index.get(callee_key)
360+
callee = resolver.resolve(
361+
row.callee_file,
362+
int(row.callee_start_line),
363+
row.callee_name,
364+
int(row.callee_arity),
365+
)
284366
resolved_sig = callee.signature if callee is not None else row.callee_qname
285367

286368
call_start = int(row.call_start_line)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "codeanalyzer-python"
3-
version = "0.1.14"
3+
version = "0.1.15"
44
description = "Static Analysis on Python source code using Jedi, CodeQL and Treesitter."
55
readme = "README.md"
66
authors = [

0 commit comments

Comments
 (0)