-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeatures.py
More file actions
304 lines (258 loc) · 13.1 KB
/
features.py
File metadata and controls
304 lines (258 loc) · 13.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
"""
CodeSense - Feature Extraction
Extracts 33 features for the ML model from static analysis results.
All features are normalized to [0, 1] or standard ranges for ML consumption.
"""
import ast
import math
import re
from typing import Any, Dict, List, Optional
import numpy as np
from constants import FEATURE_NAMES, NUM_FEATURES
from logger import get_logger
# Module-level logger, obtained from the project's `logger.get_logger` helper
# and keyed by this module's import name.
logger = get_logger(__name__)
class FeatureExtractor:
"""
Extracts NUM_FEATURES numerical features from code and analysis results.
Features are ordered to match FEATURE_NAMES in constants.py.
"""
def extract(self, code: str, language: str,
analysis: Dict[str, Any],
dsa: Dict[str, Any]) -> Dict[str, float]:
"""
Compute all 33 features.
Args:
code: Raw source code.
language: Detected language.
analysis: Output from StaticAnalyzer.analyze().
dsa: Output from DSADetector.detect().
Returns:
Dict mapping feature name → float value.
"""
lines = code.splitlines()
metrics = analysis.get("metrics", {})
cx = analysis.get("complexity", {})
style = analysis.get("style", {})
sec = analysis.get("security", {})
doc = analysis.get("documentation", {})
struct = analysis.get("structure", {})
total_lines = max(1, metrics.get("total_lines", len(lines)))
num_fns = max(1, struct.get("num_functions", 1))
features: Dict[str, float] = {}
# ── 1–8: Basic metrics ───────────────────────────────────────────────
features["lines_of_code"] = float(metrics.get("code_lines", len(lines)))
features["blank_lines"] = float(metrics.get("blank_lines", 0))
features["comment_lines"] = float(metrics.get("comment_lines", 0))
features["comment_ratio"] = float(metrics.get("comment_ratio", 0))
features["avg_line_length"] = float(metrics.get("avg_line_len", 0))
features["max_line_length"] = float(metrics.get("max_line_len", 0))
features["num_functions"] = float(num_fns - 1) # ≥0
features["num_classes"] = float(struct.get("num_classes", 0))
# ── 9–14: Complexity ─────────────────────────────────────────────────
features["avg_cyclomatic_complexity"] = float(cx.get("avg_complexity", 1))
features["max_cyclomatic_complexity"] = float(cx.get("max_complexity", 1))
features["avg_cognitive_complexity"] = self._cognitive_complexity(code, language)
features["max_nesting_depth"] = float(cx.get("max_nesting", 0))
features["avg_function_length"] = self._avg_fn_length(cx)
features["max_function_length"] = self._max_fn_length(cx)
# ── 15–19: Style ─────────────────────────────────────────────────────
features["naming_consistency_score"] = float(style.get("naming_score", 100)) / 100.0
features["long_line_ratio"] = (
style.get("long_line_count", 0) / total_lines
)
features["magic_number_count"] = float(min(style.get("magic_number_count", 0), 50))
features["docstring_ratio"] = float(
doc.get("docstring_ratio", doc.get("comment_ratio", 0))
)
features["avg_parameters"] = self._avg_parameters(cx)
# ── 20–23: Security ──────────────────────────────────────────────────
features["security_issue_count"] = float(min(sec.get("total_issues", 0), 20))
features["critical_security_count"] = float(
min(sec.get("counts", {}).get("CRITICAL", 0), 10)
)
features["high_security_count"] = float(
min(sec.get("counts", {}).get("HIGH", 0), 10)
)
features["has_input_validation"] = self._has_input_validation(code, language)
# ── 24–26: DSA ───────────────────────────────────────────────────────
dsa_summary = dsa.get("summary", {})
features["dsa_complexity_score"] = float(dsa_summary.get("complexity_score", 0)) / 100.0
features["algorithm_count"] = float(min(dsa_summary.get("algorithm_count", 0), 10))
features["data_structure_count"] = float(min(dsa_summary.get("data_structure_count", 0), 10))
# ── 27–33: Contextual (NEW) ──────────────────────────────────────────
features["code_duplication_score"] = self._duplication_score(code, lines)
features["exception_handling_coverage"] = self._exception_coverage(code, language)
features["test_coverage_estimate"] = self._test_coverage_estimate(code, language)
features["code_reusability_score"] = self._reusability_score(code, language, num_fns)
features["design_pattern_usage"] = self._design_patterns(code, language)
features["code_smell_count"] = float(min(self._code_smells(code, cx, style, language), 20))
features["technical_debt_minutes"] = self._technical_debt(features)
assert len(features) == NUM_FEATURES, \
f"Expected {NUM_FEATURES} features, got {len(features)}"
return features
def to_array(self, features: Dict[str, float]) -> np.ndarray:
"""Return features as ordered numpy array matching FEATURE_NAMES."""
return np.array([features[name] for name in FEATURE_NAMES], dtype=np.float32)
# ─── Feature Helpers ─────────────────────────────────────────────────────
@staticmethod
def _avg_fn_length(cx: Dict) -> float:
fns = cx.get("functions", [])
if not fns:
return 0.0
return sum(f.get("length", 0) for f in fns) / len(fns)
@staticmethod
def _max_fn_length(cx: Dict) -> float:
fns = cx.get("functions", [])
if not fns:
return 0.0
return max((f.get("length", 0) for f in fns), default=0)
@staticmethod
def _avg_parameters(cx: Dict) -> float:
fns = cx.get("functions", [])
if not fns:
return 0.0
return sum(f.get("params", 0) for f in fns) / len(fns)
@staticmethod
def _cognitive_complexity(code: str, language: str) -> float:
"""
Simplified cognitive complexity heuristic:
nested control structures score higher than unnested ones.
"""
nesting = 0
score = 0.0
increment_kws = re.compile(
r"\b(if|else|elif|for|while|do|catch|case|switch|&&|\|\|)\b"
)
lines = code.splitlines()
for line in lines:
stripped = line.strip()
nesting += stripped.count("{")
nesting -= stripped.count("}")
nesting = max(0, nesting)
if increment_kws.search(stripped):
score += 1 + nesting * 0.5
return round(min(score, 200), 2)
@staticmethod
def _has_input_validation(code: str, language: str) -> float:
"""1.0 if code validates user input, 0.0 otherwise."""
patterns = {
"python": [r"isinstance\(|type\(|re\.(match|search|fullmatch)|\.strip\(\)|len\(.*\)\s*[<>]=?",
r"try:.*except|validate|sanitize"],
"java": [r"\.isEmpty\(\)|\.matches\(|Objects\.requireNonNull|Validate\.",
r"try\s*\{.*catch"],
"cpp": [r"if\s*\(.*cin|if\s*\(.*strlen|if\s*\(.*strcmp|isdigit|isalpha"],
}
for pattern in patterns.get(language, []):
if re.search(pattern, code, re.DOTALL | re.IGNORECASE):
return 1.0
return 0.0
@staticmethod
def _duplication_score(code: str, lines: List[str]) -> float:
"""
Approximate duplication ratio using identical non-trivial line hashing.
Returns 0.0 (no duplication) to 1.0 (entirely duplicated).
"""
non_trivial = [l.strip() for l in lines
if len(l.strip()) > 10 and not l.strip().startswith(("#", "//"))]
if len(non_trivial) < 5:
return 0.0
unique = len(set(non_trivial))
dup_ratio = 1.0 - (unique / len(non_trivial))
return round(max(0.0, min(1.0, dup_ratio)), 3)
@staticmethod
def _exception_coverage(code: str, language: str) -> float:
"""
Ratio of try/except blocks to total function count.
Clamped to [0, 1].
"""
if language == "python":
try_count = len(re.findall(r"\btry\s*:", code))
fn_count = max(1, len(re.findall(r"\bdef\s+\w+", code)))
else:
try_count = len(re.findall(r"\btry\s*\{", code))
fn_count = max(1, len(re.findall(r"\b\w+\s+\w+\s*\([^)]*\)\s*\{", code)))
return round(min(1.0, try_count / fn_count), 2)
@staticmethod
def _test_coverage_estimate(code: str, language: str) -> float:
"""Estimate test coverage based on test function presence."""
if language == "python":
test_fns = len(re.findall(r"\bdef\s+test_\w+|class\s+Test\w+", code))
all_fns = max(1, len(re.findall(r"\bdef\s+\w+", code)))
elif language == "java":
test_fns = len(re.findall(r"@Test|void\s+test\w+\(", code))
all_fns = max(1, len(re.findall(r"\bpublic\s+\w+\s+\w+\s*\(", code)))
else:
test_fns = len(re.findall(r"\bTEST\s*\(|void\s+test_\w+", code))
all_fns = max(1, len(re.findall(r"\b\w+\s+\w+\s*\([^)]*\)\s*\{", code)))
return round(min(1.0, test_fns / all_fns), 2)
@staticmethod
def _reusability_score(code: str, language: str, num_fns: int) -> float:
"""
Score based on: function count, parameter usage, and absence of global state.
"""
score = 0.5
# Reward having functions
if num_fns > 3:
score += 0.2
# Penalize global variables
global_count = len(re.findall(r"\bglobal\b", code))
score -= min(0.3, global_count * 0.05)
# Reward type hints (Python)
if language == "python":
typed = len(re.findall(r"def\s+\w+\s*\([^)]*:\s*\w+", code))
score += min(0.2, typed * 0.05)
return round(max(0.0, min(1.0, score)), 2)
@staticmethod
def _design_patterns(code: str, language: str) -> float:
"""Detect common design pattern usage; score 0–1."""
patterns = [
r"Singleton|singleton|_instance",
r"Factory|factory.*method|create_\w+",
r"Observer|observer|subscribe|notify|listener",
r"Decorator|@\w+\s*\ndef|decorator",
r"Strategy|strategy.*pattern|Context.*strategy",
r"Iterator|__iter__|__next__",
r"Builder|build\(\)|with_\w+\(",
]
found = sum(1 for p in patterns if re.search(p, code, re.IGNORECASE))
return round(min(1.0, found * 0.15), 2)
@staticmethod
def _code_smells(code: str, cx: Dict, style: Dict, language: str) -> int:
"""Count code smells."""
smells = 0
# Long methods
smells += len([f for f in cx.get("functions", []) if f.get("length", 0) > 50])
# Large class (many methods in one class context)
if cx.get("num_functions", 0) > 20:
smells += 2
# Long parameter list
smells += len([f for f in cx.get("functions", []) if f.get("params", 0) > 7])
# Magic numbers
smells += min(3, style.get("magic_number_count", 0) // 5)
# Deeply nested code
smells += len(cx.get("deeply_nested_fns", []))
# High complexity
smells += len(cx.get("high_complexity_fns", []))
return smells
@staticmethod
def _technical_debt(features: Dict[str, float]) -> float:
"""
Estimate technical debt in minutes using SQALE-like model.
"""
debt = 0.0
# Complexity debt: 30 min per very complex function
debt += features.get("max_cyclomatic_complexity", 0) * 2
# Security debt: critical issues are expensive
debt += features.get("critical_security_count", 0) * 60
debt += features.get("high_security_count", 0) * 30
debt += features.get("security_issue_count", 0) * 10
# Style debt
debt += features.get("magic_number_count", 0) * 5
debt += (1.0 - features.get("naming_consistency_score", 1)) * 30
# Documentation debt
doc_ratio = features.get("docstring_ratio", 0)
if doc_ratio < 0.2:
debt += (0.2 - doc_ratio) * 100
# Duplication debt
debt += features.get("code_duplication_score", 0) * 120
return round(min(debt, 9999), 1)