github_issues_crawler.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GitHub Issues 爬虫工具(JSON Lines优化版)
特性:
1. 线性爬取:单线程顺序爬取,避免并发问题
2. JSON Lines格式:逐行写入单个issue数据,大幅降低内存占用
3. 过滤PR:默认排除Pull Request(仅保留纯Issue)
4. 连接优化:复用GitHub连接,减少资源消耗
5. 速率控制:合理设置请求间隔,避免触发API限制
6. 断点续爬:支持从上次中断处继续爬取
"""
import os
import sys
import json
import time
import logging
import psutil
from markdown import markdown
from bs4 import BeautifulSoup
import github3
from dotenv import load_dotenv
from tqdm import tqdm
# Logging setup: ERROR goes to the console, DEBUG goes to the log file
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
date_format = '%Y-%m-%d %H:%M:%S'
file_handler = logging.FileHandler('crawler.log', encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter(log_format, date_format))
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.ERROR)
console_handler.setFormatter(logging.Formatter(log_format, date_format))
logging.basicConfig(
level=logging.DEBUG,
handlers=[file_handler, console_handler]
)
logger = logging.getLogger(__name__)
# Load environment variables
load_dotenv()
# Import the token manager
try:
    from token_manager import singleton_token_manager
except ImportError as e:
    logger.error("token_manager module not found; make sure token_manager.py is in the current directory")
    raise ImportError(
        "Please create token_manager.py and implement singleton_token_manager (with a get_token() method)"
    ) from e
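# NOTE: token_manager.py ships separately from this file. A minimal sketch of the expected
# interface, assuming tokens come from a comma-separated GITHUB_TOKENS environment variable
# and are handed out round-robin (names below are hypothetical):
#
#   # token_manager.py
#   import itertools
#   import os
#
#   class _TokenManager:
#       def __init__(self):
#           tokens = [t.strip() for t in os.getenv('GITHUB_TOKENS', '').split(',') if t.strip()]
#           self._cycle = itertools.cycle(tokens) if tokens else None
#
#       def get_token(self):
#           # Return the next token, rotating through the configured pool
#           if self._cycle is None:
#               raise RuntimeError('No GitHub tokens configured in GITHUB_TOKENS')
#           return next(self._cycle)
#
#   singleton_token_manager = _TokenManager()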
class GitHubIssuesCrawler:
    def __init__(self, repo_owner='tensorflow', repo_name='tensorflow',
                 max_retries=5, backoff_factor=2, exclude_bots=False):
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        self.max_retries = max_retries
        self.backoff_factor = backoff_factor
        self.exclude_bots = exclude_bots
        self.gh = None    # GitHub connection instance
        self.repo = None  # repository instance
    def _log_memory_usage(self):
        """Log current memory usage, for debugging"""
        process = psutil.Process(os.getpid())
        mem_mb = process.memory_info().rss / 1024 / 1024
        logger.debug(f"Memory usage: {mem_mb:.2f} MB")
    def _connect_to_github(self, token=None):
        """Establish and verify the GitHub connection"""
        if not token:
            token = singleton_token_manager.get_token()
            logger.info('Fetched token from the token manager')
        try:
            self.gh = github3.login(token=token)
            if not self.gh:
                raise ValueError("GitHub login failed, got an empty instance")
            self.repo = self.gh.repository(self.repo_owner, self.repo_name)
            if not self.repo:
                raise ValueError(f"Cannot access repository: {self.repo_owner}/{self.repo_name}")
            logger.info(f'Connected to repository: {self.repo_owner}/{self.repo_name}')
            return True
        except Exception as e:
            logger.error(f'Failed to connect to GitHub: {str(e)}')
            raise
    def _remove_markdown(self, markdown_text):
        """Convert markdown text to plain text"""
        if not markdown_text:
            return ''
        html = markdown(markdown_text)
        soup = BeautifulSoup(html, 'html.parser')
        return soup.get_text(separator=' ', strip=True)
    def _get_reactions(self, issue):
        """Extract reaction data for an issue"""
        try:
            if not hasattr(issue, 'reactions'):
                return {}
            reactions = issue.reactions() if callable(issue.reactions) else issue.reactions
            # Handle the different shapes reaction data can take
            if hasattr(reactions, 'plus_one'):
                return {
                    '+1': getattr(reactions, 'plus_one', 0),
                    '-1': getattr(reactions, 'minus_one', 0),
                    'laugh': getattr(reactions, 'laugh', 0),
                    'confused': getattr(reactions, 'confused', 0),
                    'heart': getattr(reactions, 'heart', 0),
                    'hooray': getattr(reactions, 'hooray', 0),
                    'rocket': getattr(reactions, 'rocket', 0),
                    'eyes': getattr(reactions, 'eyes', 0)
                }
            elif isinstance(reactions, dict):
                return {k: v for k, v in reactions.items()
                        if k in ['+1', '-1', 'laugh', 'confused', 'heart', 'hooray', 'rocket', 'eyes']}
            return {}
        except Exception as e:
            logger.error(f'Failed to get reaction data: {str(e)}')
            return {}
    def _extract_issue_data(self, issue):
        """Extract the core data of an issue"""
        try:
            # Basic information
            issue_data = {
                'id': issue.id,
                'number': issue.number,
                'title': issue.title,
                'html_url': issue.html_url,
                'url': issue.url,
                'state': issue.state,
                'created_at': str(issue.created_at) if issue.created_at else None,
                'updated_at': str(issue.updated_at) if issue.updated_at else None,
                'closed_at': str(issue.closed_at) if issue.closed_at else None,
                'state_reason': issue.state_reason,
                'body': issue.body,
                'body_text': self._remove_markdown(issue.body),
                'labels': [{'id': label.id, 'name': label.name, 'color': label.color}
                           for label in issue.labels()],
                'reactions': self._get_reactions(issue),
                'comments': [],
                'pull_request': None,
                'user': {'login': issue.user.login, 'id': issue.user.id}
                        if hasattr(issue, 'user') and issue.user else None
            }
            # Check whether this issue is actually a PR
            if hasattr(issue, 'pull_request'):
                pull_request = issue.pull_request
                if pull_request:
                    if callable(pull_request):
                        try:
                            pr_data = pull_request()
                            if isinstance(pr_data, dict):
                                issue_data['pull_request'] = pr_data
                            else:
                                issue_data['pull_request'] = {'url': str(pr_data)}
                        except Exception:
                            issue_data['pull_request'] = None
                    elif isinstance(pull_request, dict):
                        issue_data['pull_request'] = pull_request
                    else:
                        issue_data['pull_request'] = str(pull_request)
            # Extract comments
            comments = []
            for comment in issue.comments():
                comments.append({
                    'id': comment.id,
                    'body': comment.body,
                    'body_text': self._remove_markdown(comment.body),
                    'created_at': str(comment.created_at) if comment.created_at else None,
                    'updated_at': str(comment.updated_at) if comment.updated_at else None,
                    'html_url': comment.html_url,
                    'user': {'login': comment.user.login, 'id': comment.user.id}
                })
            issue_data['comments'] = comments
            # Skip comment-less issues created by bots
            if (self.exclude_bots and
                    issue_data['user'] and
                    issue_data['user']['login'].endswith('[bot]') and
                    len(comments) == 0):
                logger.info(f'Skipping comment-less bot issue #{issue.number} (user: {issue_data["user"]["login"]})')
                return None
            return issue_data
        except Exception as e:
            logger.error(f'Failed to extract issue data (ID: {issue.id}): {str(e)}')
            return None
    def _write_issue_to_jsonl(self, output_file, issue_data):
        """Append a single issue record to the JSON Lines file (one line per issue)"""
        try:
            # Make sure the output directory exists
            dir_path = os.path.dirname(output_file)
            if dir_path and not os.path.exists(dir_path):
                os.makedirs(dir_path, exist_ok=True)
            # Append one JSON record per line
            with open(output_file, 'a', encoding='utf-8') as f:
                json.dump(issue_data, f, ensure_ascii=False)
                f.write('\n')  # line separator
            return True
        except Exception as e:
            logger.error(f"Failed to write JSON Lines record: {str(e)}")
            return False
    def _load_crawled_issues(self, output_file):
        """Load the numbers of already-crawled issues from the JSON Lines file"""
        crawled_numbers = set()
        if not os.path.exists(output_file):
            return crawled_numbers
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        issue = json.loads(line)
                        if 'number' in issue:
                            crawled_numbers.add(issue['number'])
                    except json.JSONDecodeError:
                        logger.warning(f"Malformed JSON on line {line_num} of the JSON Lines file, skipped")
            logger.info(f"Recovered {len(crawled_numbers)} already-crawled issues from {output_file}")
            return crawled_numbers
        except Exception as e:
            logger.error(f"Failed to read previously crawled data: {str(e)}")
            return set()
    def crawl(self, token=None, limit=None, output_file='issues.jsonl', resume=False):
        """Main crawl method: crawl linearly and write JSON Lines"""
        # Startup info
        print("===== GitHub Issues crawler started =====")
        print(f"Repository: {self.repo_owner}/{self.repo_name}")
        print(f"Output file: {os.path.abspath(output_file)}")
        print(f"Resume mode: {'on' if resume else 'off'}")
        print(f"Crawl limit: {limit or 'unlimited'}")
        print(f"Skip comment-less bot issues: {'on' if self.exclude_bots else 'off'}")
        # 1. Establish the GitHub connection
        try:
            self._connect_to_github(token)
        except Exception as e:
            logger.error(f"Connection initialization failed, aborting crawl: {str(e)}")
            return
        # 2. Initialize crawl state
        crawled_numbers = self._load_crawled_issues(output_file) if resume else set()
        total_written = len(crawled_numbers)
        total_processed = 0  # total number of issues handled (including skipped ones)
        # 3. Clear the output file (non-resume mode)
        if not resume and os.path.exists(output_file):
            try:
                os.remove(output_file)
                logger.info(f"Removed previous output file: {output_file}")
            except Exception as e:
                logger.warning(f"Failed to remove previous output file: {str(e)}, will keep appending")
        # 4. Start linear crawling
        print("\nCrawling issues...")
        try:
            # Issue iterator (sorted by comment count, descending, so active issues come first)
            issues_iterator = self.repo.issues(
                state='all',
                sort='comments',
                direction='desc'
            )
            # Progress bar
            with tqdm(desc="Crawl progress", unit="issue") as pbar:
                for issue in issues_iterator:
                    # Check whether the crawl limit has been reached
                    if limit is not None and (total_written >= limit or total_processed >= limit):
                        print(f"\nCrawl limit reached ({limit} issues), stopping")
                        break
                    total_processed += 1
                    # Skip issues that were already crawled
                    if issue.number in crawled_numbers:
                        pbar.update(1)
                        continue
                    # Extract and process issue data
                    issue_data = None
                    for attempt in range(self.max_retries):
                        try:
                            issue_data = self._extract_issue_data(issue)
                            break  # extraction succeeded, leave the retry loop
                        except Exception as e:
                            # Handle rate limiting
                            if 'rate limit' in str(e).lower() or '403' in str(e):
                                logger.warning("API rate limit detected, switching token and retrying...")
                                self._connect_to_github()  # reconnect (switch token)
                            # Exponential backoff before retrying
                            wait_time = self.backoff_factor * (2 ** attempt)
                            logger.warning(
                                f"Failed to extract issue #{issue.number} (attempt {attempt + 1}/{self.max_retries}), "
                                f"error: {str(e)}, waiting {wait_time}s"
                            )
                            if attempt < self.max_retries - 1:
                                time.sleep(wait_time)
                    # Write valid data
                    if issue_data:
                        if self._write_issue_to_jsonl(output_file, issue_data):
                            total_written += 1
                            crawled_numbers.add(issue.number)
                            logger.info(f"Wrote #{issue.number} (total: {total_written})")
                    # Rate control (avoid triggering API limits)
                    time.sleep(0.05)  # base delay, adjust to match API limits
                    pbar.update(1)
                    # Periodically log memory usage
                    if total_processed % 100 == 0:
                        self._log_memory_usage()
        except Exception as e:
            logger.error(f"Crawl terminated unexpectedly: {str(e)}", exc_info=True)
        # 5. Final statistics
        print("\n===== Crawl finished =====")
        print(f"Total issues processed: {total_processed}")
        print(f"Issues successfully written: {total_written}")
        print(f"Output file: {os.path.abspath(output_file)}")
        # Verify the output file
        if os.path.exists(output_file):
            try:
                with open(output_file, 'r', encoding='utf-8') as f:
                    line_count = sum(1 for line in f if line.strip())
                print(f"Actual records in file: {line_count}")
                logger.info(f"Crawl finished, file contains {line_count} records")
            except Exception as e:
                print(f"File verification failed: {str(e)}")
    def test_crawl(self, limit=10, output_file='test_issues.jsonl'):
        """Test crawl helper"""
        logger.info(f"Starting test crawl, limited to {limit} issues")
        self.crawl(limit=limit, output_file=output_file, resume=False)
        logger.info("Test crawl finished")
if __name__ == '__main__':
    # Example: crawl the issues of the vue repository
    vue_crawler = GitHubIssuesCrawler(
        repo_owner='vuejs',
        repo_name='vue',
        exclude_bots=True,
        max_retries=3,
        backoff_factor=1
    )
    # Test crawl (uncomment to use)
    # vue_crawler.test_crawl(limit=10)
    # Full crawl
    vue_crawler.crawl(
        limit=11000,
        output_file='vue_issues.jsonl',
        resume=False  # set to True to resume an interrupted crawl
    )
    # Example for another repository
    # elastic_crawler = GitHubIssuesCrawler(
    #     repo_owner='elastic',
    #     repo_name='elasticsearch',
    #     exclude_bots=True
    # )
    # elastic_crawler.crawl(limit=40000, output_file='elastic_issues.jsonl', resume=False)
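# Reading the output back stays memory-friendly because each line is an independent record;
# a minimal sketch (the 'vue_issues.jsonl' path matches the crawl call above):
#
#   with open('vue_issues.jsonl', encoding='utf-8') as f:
#       issues = [json.loads(line) for line in f if line.strip()]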