LongCodeUnderstanding/data_collection.py at main · Wangsq37/LongCodeUnderstanding · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
# # -*- coding: utf-8 -*-

# import ast
# import os
# import json
# import copy
# import sys

# # 尝试导入 astunparse，如果失败则提示用户安装
# try:
#     import astunparse
# except ImportError:
#     print("错误: 'astunparse' 库未安装。", file=sys.stderr)
#     print("请使用以下命令进行安装: pip install astunparse", file=sys.stderr)
#     sys.exit(1)

# # 导入数据质量过滤器
# try:
#     from data_quality_filter import create_quality_filter
# except ImportError:
#     print("警告: 数据质量过滤器模块未找到，将使用基础过滤。", file=sys.stderr)
#     create_quality_filter = None

# # ==============================================================================
# # 核心数据提取代码 (源自旧版本，保持不变)
# # ==============================================================================

# class TaskIdGenerator:
#     """一个简单的线程不安全的计数器，用于生成唯一的task ID。"""
#     def __init__(self):
#         self.current_id = 0

#     def next(self) -> int:
#         """返回当前ID并递增计数器。"""
#         task_id = self.current_id
#         self.current_id += 1
#         return task_id

# class AssertionTransformer(ast.NodeTransformer):
#     """
#     一个AST转换器，其功能是：
#     1. 寻找由行号和列号指定的 *单个* `assert` 语句。
#     2. 如果该语句包含等式检查 (`==`)，则将其右侧替换为 '???'。
#     3. 将原始右侧内容记录为 ground truth（字符串形式）。
#     """
#     def __init__(self, target_lineno: int, target_col_offset: int):
#         self.target_lineno = target_lineno
#         self.target_col_offset = target_col_offset
#         self.transformed = False
#         self.ground_truth = None

#     def visit_Assert(self, node: ast.Assert) -> ast.Assert:
#         if not (node.lineno == self.target_lineno and node.col_offset == self.target_col_offset):
#             return node

#         if not (isinstance(node.test, ast.Compare) and
#                 node.test.ops and isinstance(node.test.ops[0], ast.Eq)):
#             return node

#         original_node = node.test.comparators[0]
#         self.ground_truth = astunparse.unparse(original_node).strip()
#         self.transformed = True

#         # 兼容不同 Python 版本的 AST 节点
#         try:
#             node.test.comparators[0] = ast.Constant(value='???', kind=None)
#         except AttributeError:
#             node.test.comparators[0] = ast.Name(id='???', ctx=ast.Load())

#         return node

# def process_test_file(file_path: str, repo_root: str, task_id_gen: TaskIdGenerator):
#     """
#     解析一个 Python 测试文件，找到包含等式断言的测试函数，
#     并为 *每一个* 这样的断言创建一条结构化数据。
#     现在集成了数据质量过滤功能。
#     """
#     try:
#         with open(file_path, 'r', encoding='utf-8') as f:
#             source_code = f.read()
#         tree = ast.parse(source_code)
#     except (SyntaxError, UnicodeDecodeError, PermissionError, FileNotFoundError) as e:
#         print(f"警告: 无法解析 {file_path}。原因: {e}。已跳过。", file=sys.stderr)
#         return []

#     imports_list = [astunparse.unparse(node).strip() for node in tree.body if isinstance(node, (ast.Import, ast.ImportFrom))]

#     all_data_entries = []
#     reponame = os.path.basename(repo_root)
#     testpath = os.path.relpath(file_path, repo_root)
#     testname = os.path.basename(file_path)

#     # 初始化质量过滤器
#     quality_filter = create_quality_filter() if create_quality_filter else None

#     for node in ast.walk(tree):
#         if isinstance(node, ast.FunctionDef) and node.name.startswith("test_"):
#             # 使用质量过滤器筛选断言
#             if quality_filter:
#                 quality_asserts = quality_filter.filter_test_function(node)
#                 candidate_asserts = [item["assert_node"] for item in quality_asserts]
#             else:
#                 # 回退到基础过滤
#                 candidate_asserts = []
#                 for sub_node in ast.walk(node):
#                     if (isinstance(sub_node, ast.Assert) and
#                         isinstance(sub_node.test, ast.Compare) and
#                         sub_node.test.ops and isinstance(sub_node.test.ops[0], ast.Eq)):
#                         candidate_asserts.append(sub_node)

#             for assert_to_mask in candidate_asserts:
#                 func_copy = copy.deepcopy(node)
#                 transformer = AssertionTransformer(
#                     target_lineno=assert_to_mask.lineno,
#                     target_col_offset=assert_to_mask.col_offset
#                 )
#                 masked_function_node = transformer.visit(func_copy)

#                 if transformer.transformed and transformer.ground_truth is not None:
#                     task_id_str = f"{reponame}_{task_id_gen.next()}"

#                     # 获取质量分析信息
#                     quality_analysis = None
#                     if quality_filter:
#                         analysis = quality_filter.analyze_assertion_complexity(assert_to_mask)
#                         quality_analysis = analysis

#                     data_entry = {
#                         "task_id": task_id_str,
#                         "reponame": reponame,
#                         "testpath": testpath.replace('\\', '/'),
#                         "testname": testname,
#                         "funcname": node.name,
#                         "imports": imports_list,
#                         "code": astunparse.unparse(node).strip(),
#                         "masked_code": astunparse.unparse(masked_function_node).strip(),
#                         "ground_truth": transformer.ground_truth
#                     }

#                     # 添加质量分析信息
#                     if quality_analysis:
#                         data_entry["quality_analysis"] = quality_analysis

#                     all_data_entries.append(data_entry)

#     return all_data_entries

# def process_single_repo(repo_path: str, output_file: str):
#     """
#     遍历单个仓库，处理所有测试文件，并将结构化数据保存到JSONL文件中。
#     """
#     repo_abs_path = os.path.abspath(repo_path)
#     if not os.path.isdir(repo_abs_path):
#         print(f"错误: 仓库路径 '{repo_path}' 不存在或不是一个目录。已跳过。", file=sys.stderr)
#         return

#     task_id_gen = TaskIdGenerator()

#     print(f"\n--- 正在处理仓库: {os.path.basename(repo_abs_path)} ---")
#     all_processed_data = []
#     # 忽略常见的虚拟环境、Git和构建目录
#     excluded_dirs = {'.venv', 'venv', '.git', 'node_modules', 'build', 'dist', '__pycache__'}

#     for root, dirs, files in os.walk(repo_abs_path):
#         dirs[:] = [d for d in dirs if d not in excluded_dirs]
#         for file in files:
#             if file.startswith("test_") and file.endswith(".py"):
#                 file_path = os.path.join(root, file)
#                 processed_data = process_test_file(file_path, repo_abs_path, task_id_gen)
#                 all_processed_data.extend(processed_data)

#     if not all_processed_data:
#         print(f"在 {os.path.basename(repo_abs_path)} 中未找到带有等式断言的测试函数。")
#         return

#     # 确保输出目录存在
#     os.makedirs(os.path.dirname(output_file), exist_ok=True)
#     with open(output_file, 'w', encoding='utf-8') as f:
#         for entry in all_processed_data:
#             json_line = json.dumps(entry, ensure_ascii=False)
#             f.write(json_line + '\n')

#     print(f"✅ 成功! 找到 {len(all_processed_data)} 个断言。数据集已保存至: {output_file}")

# def process_multiple_repos(repo_paths: list, output_dir: str):
#     """
#     协调处理多个仓库。
#     """
#     os.makedirs(output_dir, exist_ok=True)
#     print(f"所有输出将被保存到: {os.path.abspath(output_dir)}")

#     for repo_path in repo_paths:
#         reponame = os.path.basename(os.path.abspath(repo_path))
#         output_file = os.path.join(output_dir, f"{reponame}.jsonl")
#         process_single_repo(repo_path, output_file)

# # ==============================================================================
# # 主逻辑 (融合了筛选条件)
# # ==============================================================================

# def main():
#     """
#     主函数，执行筛选和数据提取的融合流程。
#     """
#     current_dir = os.getcwd()
#     python_repos_dir = os.path.join(current_dir, 'python_repos')
#     output_results_dir = os.path.join(current_dir, 'output_results')
#     data_collection_dir = os.path.join(current_dir, 'data_collection')

#     # --- 阶段 1: 根据 report_functions.jsonl 的存在性来筛选仓库 ---
#     print("--- 阶段 1: 开始筛选要处理的仓库 ---")

#     # 检查所需目录是否存在
#     if not os.path.isdir(output_results_dir):
#         print(f"错误: 报告目录 '{output_results_dir}' 未找到。脚本中止。", file=sys.stderr)
#         return
#     if not os.path.isdir(python_repos_dir):
#         print(f"错误: 源码目录 '{python_repos_dir}' 未找到。脚本中止。", file=sys.stderr)
#         return

#     filtered_repo_names = []
#     # 假设 output_results 下的每个子目录对应一个仓库
#     repo_subdirs = [d for d in os.listdir(output_results_dir) if os.path.isdir(os.path.join(output_results_dir, d))]

#     for reponame in repo_subdirs:
#         report_file = os.path.join(output_results_dir, reponame, 'report_functions.jsonl')

#         # 核心筛选条件：报告文件必须存在
#         if os.path.exists(report_file):
#             # 并且，对应的代码仓库也必须存在
#             if os.path.isdir(os.path.join(python_repos_dir, reponame)):
#                 filtered_repo_names.append(reponame)
#                 print(f"找到报告 '{report_file}', 将仓库 '{reponame}' 添加到处理列表。")
#             else:
#                 print(f"警告: 找到了仓库 '{reponame}' 的报告文件, 但在 'python_repos' 目录中未找到其源代码。已跳过。")
#         else:
#             print(f"信息: 在仓库 '{reponame}' 中未找到报告文件，跳过处理。")


#     if not filtered_repo_names:
#         print("\n未找到任何满足条件的仓库（即同时存在报告文件和源代码）。脚本退出。")
#         return

#     print(f"\n✅ 筛选完成! 共找到 {len(filtered_repo_names)} 个仓库准备进行数据提取。")

#     # --- 阶段 2: 从筛选出的仓库中提取数据 ---
#     print("\n--- 阶段 2: 开始从筛选后的仓库中提取断言数据 ---")

#     # 构建需要处理的仓库的完整路径列表
#     repo_paths_to_process = [os.path.join(python_repos_dir, name) for name in filtered_repo_names]

#     # 调用数据提取流程
#     process_multiple_repos(repo_paths_to_process, data_collection_dir)

#     print("\n--- 所有任务已完成 ---")

# if __name__ == '__main__':
#     main()

# -*- coding: utf-8 -*-

import ast
import os
import json
import copy
import sys

# 尝试导入 astunparse，如果失败则提示用户安装
try:
    import astunparse
except ImportError:
    print("错误: 'astunparse' 库未安装。", file=sys.stderr)
    print("请使用以下命令进行安装: pip install astunparse", file=sys.stderr)
    sys.exit(1)

# 导入数据质量过滤器
try:
    from data_quality_filter import create_quality_filter
except ImportError:
    print("警告: 数据质量过滤器模块未找到，将使用基础过滤。", file=sys.stderr)
    create_quality_filter = None

# ==============================================================================
# [NEW] AST 访问器，用于查找测试并记录其类上下文
# ==============================================================================
class TestFinder(ast.NodeVisitor):
    """
    一个 AST 访问器，用于遍历代码树，找到所有测试函数
    并记录下它们自身以及它们所在的类（如果有的话）。
    """
    def __init__(self):
        self.found_tests = []
        self._current_class_name = None

    def visit_ClassDef(self, node: ast.ClassDef):
        # 进入一个类定义，记录下类名
        original_class_name = self._current_class_name
        self._current_class_name = node.name
        # 继续遍历类内部的节点
        self.generic_visit(node)
        # 离开类定义，恢复之前的上下文（支持嵌套类）
        self._current_class_name = original_class_name

    def visit_FunctionDef(self, node: ast.FunctionDef):
        # 访问一个函数定义
        if node.name.startswith("test_"):
            # 如果是测试函数，则保存其节点和当前的类名
            self.found_tests.append({
                "node": node,
                "classname": self._current_class_name  # 如果不在类中，此值为 None
            })
        # 继续遍历函数内部的节点（虽然我们这里不需要）
        self.generic_visit(node)

# ==============================================================================
# 核心数据提取代码 (已修改以使用 TestFinder)
# ==============================================================================
class TaskIdGenerator:
    """A simple, non-thread-safe counter that hands out unique task IDs."""
    def __init__(self):
        # The ID that the next call to next() will return.
        self.current_id = 0

    def next(self) -> int:
        """Return the current ID and advance the counter by one."""
        issued, self.current_id = self.current_id, self.current_id + 1
        return issued

class AssertionTransformer(ast.NodeTransformer):
    """AST转换器，用于遮蔽断言语句的右侧。"""
    def __init__(self, target_lineno: int, target_col_offset: int):
        self.target_lineno = target_lineno
        self.target_col_offset = target_col_offset
        self.transformed = False
        self.ground_truth = None

    def visit_Assert(self, node: ast.Assert) -> ast.Assert:
        if not (node.lineno == self.target_lineno and node.col_offset == self.target_col_offset):
            return node
        if not (isinstance(node.test, ast.Compare) and
                node.test.ops and isinstance(node.test.ops[0], ast.Eq)):
            return node
        original_node = node.test.comparators[0]
        self.ground_truth = astunparse.unparse(original_node).strip()
        self.transformed = True
        try:
            node.test.comparators[0] = ast.Constant(value='???', kind=None)
        except AttributeError:
            node.test.comparators[0] = ast.Name(id='???', ctx=ast.Load())
        return node

def process_test_file(file_path: str, repo_root: str, task_id_gen: TaskIdGenerator):
    """
    Parse one Python test file and build one record per maskable assertion.

    Every ``test_``-prefixed function found by ``TestFinder`` is scanned for
    candidate ``assert`` statements. When the optional quality filter is
    available its ``filter_test_function`` result is used (each item's
    ``"assert_node"``); otherwise every assert whose test is a comparison
    starting with ``==`` is a candidate. For each candidate, a deep copy of
    the function is masked with ``AssertionTransformer`` and a record is
    emitted containing the original code, the masked code and the ground truth.

    Args:
        file_path: Path of the test file to read (decoded as UTF-8).
        repo_root: Repository root; used for ``reponame`` and the relative
            ``testpath`` stored in each record.
        task_id_gen: Counter supplying the numeric suffix of each ``task_id``.

    Returns:
        A list of dict records (possibly empty). A file that cannot be read
        or parsed yields ``[]`` after a warning on stderr.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            source_code = f.read()
        tree = ast.parse(source_code)
    except (SyntaxError, ValueError, OSError, RecursionError) as e:
        # ValueError covers UnicodeDecodeError and the "null bytes" error that
        # ast.parse raises on older interpreters; OSError covers
        # PermissionError/FileNotFoundError and other I/O failures. One
        # unreadable file must not abort the whole repository scan.
        print(f"警告: 无法解析 {file_path}。原因: {e}。已跳过。", file=sys.stderr)
        return []

    # Only top-level import statements of the module are recorded.
    imports_list = [astunparse.unparse(node).strip() for node in tree.body if isinstance(node, (ast.Import, ast.ImportFrom))]
    all_data_entries = []
    reponame = os.path.basename(repo_root)
    testpath = os.path.relpath(file_path, repo_root)
    testname = os.path.basename(file_path)

    # create_quality_filter is None when the optional module failed to import.
    quality_filter = create_quality_filter() if create_quality_filter else None

    finder = TestFinder()
    finder.visit(tree)

    for test_info in finder.found_tests:
        node = test_info["node"]
        classname = test_info["classname"]  # None for module-level tests

        if quality_filter:
            quality_asserts = quality_filter.filter_test_function(node)
            candidate_asserts = [item["assert_node"] for item in quality_asserts]
        else:
            # Basic fallback: every assert of the form `assert <lhs> == ...`.
            candidate_asserts = [
                sn for sn in ast.walk(node)
                if isinstance(sn, ast.Assert)
                and isinstance(sn.test, ast.Compare)
                and sn.test.ops
                and isinstance(sn.test.ops[0], ast.Eq)
            ]

        # The unmasked source is identical for every assertion of this
        # function (masking happens on a deep copy), so unparse it at most
        # once, and only if at least one record is actually produced.
        original_code = None

        for assert_to_mask in candidate_asserts:
            func_copy = copy.deepcopy(node)
            transformer = AssertionTransformer(assert_to_mask.lineno, assert_to_mask.col_offset)
            masked_function_node = transformer.visit(func_copy)

            if not (transformer.transformed and transformer.ground_truth is not None):
                continue

            task_id_str = f"{reponame}_{task_id_gen.next()}"
            quality_analysis = quality_filter.analyze_assertion_complexity(assert_to_mask) if quality_filter else None

            if original_code is None:
                original_code = astunparse.unparse(node).strip()

            data_entry = {
                "task_id": task_id_str,
                "reponame": reponame,
                "testpath": testpath.replace('\\', '/'),  # normalise Windows separators
                "testname": testname,
                "classname": classname,
                "funcname": node.name,
                "imports": imports_list,
                "code": original_code,
                "masked_code": astunparse.unparse(masked_function_node).strip(),
                "ground_truth": transformer.ground_truth
            }

            if quality_analysis:
                data_entry["quality_analysis"] = quality_analysis

            all_data_entries.append(data_entry)

    return all_data_entries

# ... process_single_repo, process_multiple_repos 和 main 函数保持不变 ...
def process_single_repo(repo_path: str, output_file: str):
    """
    Walk one repository, process every ``test_*.py`` file and write the
    collected records to ``output_file`` as JSON Lines (UTF-8).

    Args:
        repo_path: Path of the repository to scan. If it is not an existing
            directory an error is printed to stderr and nothing is written.
        output_file: Destination ``.jsonl`` path. Its parent directory is
            created when needed; a bare filename (no directory part) is
            written relative to the current working directory.

    Returns:
        None. No file is written when no records were produced.
    """
    repo_abs_path = os.path.abspath(repo_path)
    if not os.path.isdir(repo_abs_path):
        print(f"错误: 仓库路径 '{repo_path}' 不存在或不是一个目录。已跳过。", file=sys.stderr)
        return
    task_id_gen = TaskIdGenerator()
    print(f"\n--- 正在处理仓库: {os.path.basename(repo_abs_path)} ---")
    all_processed_data = []
    # Skip common virtual-environment, VCS and build directories.
    excluded_dirs = {'.venv', 'venv', '.git', 'node_modules', 'build', 'dist', '__pycache__'}
    for root, dirs, files in os.walk(repo_abs_path):
        # Prune in place so os.walk does not descend into excluded dirs.
        # Sorting makes traversal order (and therefore the task IDs)
        # independent of the filesystem's directory listing order.
        dirs[:] = sorted(d for d in dirs if d not in excluded_dirs)
        for file in sorted(files):
            if file.startswith("test_") and file.endswith(".py"):
                file_path = os.path.join(root, file)
                processed_data = process_test_file(file_path, repo_abs_path, task_id_gen)
                all_processed_data.extend(processed_data)
    if not all_processed_data:
        print(f"在 {os.path.basename(repo_abs_path)} 中未找到带有等式断言的测试函数。")
        return
    # os.path.dirname returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in all_processed_data:
            json_line = json.dumps(entry, ensure_ascii=False)
            f.write(json_line + '\n')
    print(f"✅ 成功! 找到 {len(all_processed_data)} 个断言。数据集已保存至: {output_file}")

def process_multiple_repos(repo_paths: list, output_dir: str):
    """
    Process several repositories, writing one ``<reponame>.jsonl`` file per
    repository into ``output_dir`` (created if it does not exist).
    """
    os.makedirs(output_dir, exist_ok=True)
    print(f"所有输出将被保存到: {os.path.abspath(output_dir)}")
    for single_repo in repo_paths:
        # The output file is named after the repository's directory name.
        repo_label = os.path.basename(os.path.abspath(single_repo))
        destination = os.path.join(output_dir, f"{repo_label}.jsonl")
        process_single_repo(single_repo, destination)

def main():
    """
    Entry point: select repositories, then extract assertion data from them.

    Phase 1 looks at each sub-directory of ``./output_results`` and keeps the
    ones that contain ``report_functions.jsonl`` and also have a directory of
    the same name under ``./python_repos``. Phase 2 passes the selected
    repositories to ``process_multiple_repos`` with ``./data_collection`` as
    the output directory. All paths are relative to the current working
    directory.
    """
    base_dir = os.getcwd()
    python_repos_dir = os.path.join(base_dir, 'python_repos')
    output_results_dir = os.path.join(base_dir, 'output_results')
    data_collection_dir = os.path.join(base_dir, 'data_collection')

    # --- Phase 1: choose repositories based on the presence of a report ---
    print("--- 阶段 1: 开始筛选要处理的仓库 ---")
    if not os.path.isdir(output_results_dir):
        print(f"错误: 报告目录 '{output_results_dir}' 未找到。脚本中止。", file=sys.stderr)
        return
    if not os.path.isdir(python_repos_dir):
        print(f"错误: 源码目录 '{python_repos_dir}' 未找到。脚本中止。", file=sys.stderr)
        return

    # Each sub-directory of output_results is treated as one repository.
    repo_subdirs = [
        entry for entry in os.listdir(output_results_dir)
        if os.path.isdir(os.path.join(output_results_dir, entry))
    ]

    filtered_repo_names = []
    for reponame in repo_subdirs:
        report_file = os.path.join(output_results_dir, reponame, 'report_functions.jsonl')
        if not os.path.exists(report_file):
            print(f"信息: 在仓库 '{reponame}' 中未找到报告文件，跳过处理。")
            continue
        # The report exists; the matching source checkout must exist as well.
        if not os.path.isdir(os.path.join(python_repos_dir, reponame)):
            print(f"警告: 找到了仓库 '{reponame}' 的报告文件, 但在 'python_repos' 目录中未找到其源代码。已跳过。")
            continue
        filtered_repo_names.append(reponame)
        print(f"找到报告 '{report_file}', 将仓库 '{reponame}' 添加到处理列表。")

    if not filtered_repo_names:
        print("\n未找到任何满足条件的仓库（即同时存在报告文件和源代码）。脚本退出。")
        return
    print(f"\n✅ 筛选完成! 共找到 {len(filtered_repo_names)} 个仓库准备进行数据提取。")

    # --- Phase 2: extract assertion data from the selected repositories ---
    print("\n--- 阶段 2: 开始从筛选后的仓库中提取断言数据 ---")
    repo_paths_to_process = []
    for selected in filtered_repo_names:
        repo_paths_to_process.append(os.path.join(python_repos_dir, selected))
    process_multiple_repos(repo_paths_to_process, data_collection_dir)
    print("\n--- 所有任务已完成 ---")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()