-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgithub_adapter.py
More file actions
161 lines (133 loc) · 5.41 KB
/
github_adapter.py
File metadata and controls
161 lines (133 loc) · 5.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
GitHub Adapter — Phase 1 (GitHub variant)
Runs AdvancedGitHubScraper and converts its rich output into the same
(person_name, chunks) format the rest of the pipeline expects.
No changes needed to vector_store.py, generator.py, or tts.py.
"""
import re
from github_scraper import AdvancedGitHubScraper
from scraper import chunk_text # reuse the same chunker
def _clean(text: str) -> str:
    """Strip markdown noise from *text* and collapse whitespace.

    Steps, in order:

    1. ``[label](url)`` links and ``![alt](url)`` images are reduced to their
       label / alt text.
    2. Bare ``http://`` and ``https://`` URLs are removed.
    3. Runs of two or more markdown symbols (hash, asterisk, backtick,
       underscore, ``>``, hyphen) are replaced by a single space.
    4. Whitespace runs collapse to one space and both ends are trimmed.

    Links and URLs are handled before symbol runs on purpose: a URL such as
    ``https://host/my--repo`` would otherwise be split at ``--`` and its tail
    (``repo``) would survive the URL removal step.

    Args:
        text: Raw markdown or plain text. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned single-line string, or ``""`` for empty input.
    """
    if not text:
        return ""
    # The optional "!" consumes the image marker so no stray "!" is left behind.
    text = re.sub(r"!?\[([^\]]+)\]\([^\)]+\)", r"\1", text)
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"[#*`_>\-]{2,}", " ", text)
    return re.sub(r"\s+", " ", text).strip()
def _profile_to_text(profile: dict) -> str:
    """Render the profile dict as a short readable paragraph.

    Reads the keys ``name``, ``username``, ``bio``, ``company``, ``location``,
    ``followers`` and ``hireable``. Any key that is missing or falsy is
    skipped, so a sparse profile simply yields fewer sentences.

    Args:
        profile: Mapping holding the profile fields listed above.

    Returns:
        The sentences joined by single spaces, or ``""`` when no field is
        usable.
    """
    parts = []

    name = profile.get("name")
    if name:
        username = profile.get("username")
        # Mention the handle only when there is one; otherwise the sentence
        # would end in a dangling "(@)".
        handle = f" (@{username})" if username else ""
        parts.append(f"{name} is a developer on GitHub{handle}.")

    bio = profile.get("bio")
    if bio:
        cleaned_bio = _clean(bio)
        # A bio made only of markdown symbols or URLs cleans down to "";
        # appending it would leave a double space in the joined paragraph.
        if cleaned_bio:
            parts.append(cleaned_bio)

    company = profile.get("company")
    if company:
        parts.append(f"They work at {company}.")

    location = profile.get("location")
    if location:
        parts.append(f"Based in {location}.")

    followers = profile.get("followers")
    if followers:
        # NOTE(review): the "," format spec assumes a numeric count - confirm
        # the scraper never returns an abbreviated string such as "1.2k".
        parts.append(f"They have {followers:,} followers on GitHub.")

    if profile.get("hireable"):
        parts.append("They are open to hire.")

    return " ".join(parts)
def _stats_to_text(stats: dict, languages: dict) -> str:
    """Render repository statistics as a short readable paragraph.

    Args:
        stats: Mapping that may contain ``total_stars``, ``total_forks`` and
            ``most_starred_repo`` (itself a mapping with ``name`` and
            ``stars``). ``None`` is treated as empty.
        languages: Mapping keyed by language name. Only the first five keys,
            in the mapping's own iteration order, are mentioned.
            NOTE(review): presumably the caller orders this by usage - confirm.
            ``None`` or an empty mapping is skipped.

    Returns:
        The sentences joined by single spaces, or ``""`` when nothing is
        available.
    """
    stats = stats or {}
    parts = []

    total_stars = stats.get("total_stars")
    if total_stars:
        parts.append(f"Their repositories have earned {total_stars:,} stars in total.")

    total_forks = stats.get("total_forks")
    if total_forks:
        parts.append(f"Their work has been forked {total_forks:,} times.")

    top_repo = stats.get("most_starred_repo")
    # Skip the sentence when the repo has no name, and fall back to 0 stars
    # rather than raising KeyError on a partially filled entry.
    if top_repo and top_repo.get("name"):
        star_count = top_repo.get("stars") or 0
        parts.append(
            f"Their most starred project is '{top_repo['name']}' with {star_count:,} stars."
        )

    if languages:
        top_langs = list(languages)[:5]
        parts.append(f"Top programming languages: {', '.join(top_langs)}.")

    return " ".join(parts)
def _repo_to_text(repo: dict) -> str:
    """Describe one repository in a sentence or two.

    The headline sentence is emitted only when the repo has a name; it carries
    the cleaned description, the star count and the language when each is
    present. It is followed by up to six topics and up to three README
    features that still contain text after cleaning.
    """
    repo_name = repo.get("name", "")
    description = _clean(repo.get("description") or "")
    star_count = repo.get("stars", 0)
    language = repo.get("language", "")
    topic_list = repo.get("topics", [])
    readme_info = repo.get("readme_extracted_info") or {}
    feature_list = readme_info.get("features", [])

    sentences = []

    if repo_name:
        headline = f"Project '{repo_name}'"
        headline += f": {description}" if description else ""
        headline += f" ({star_count:,} stars)" if star_count else ""
        headline += f", built in {language}" if language else ""
        sentences.append(f"{headline}.")

    if topic_list:
        shown_topics = ", ".join(topic_list[:6])
        sentences.append(f"Topics: {shown_topics}.")

    # Keep only the features that still have text once cleaned.
    sentences.extend(text for text in map(_clean, feature_list[:3]) if text)

    return " ".join(sentences)
def _readme_to_text(readme_data: dict) -> str:
    """Extract the most useful text from the profile README.

    The result is built from three sources, in this order:

    1. up to 20 distinct entries of ``extracted_info["skills"]``,
    2. up to 8 cleaned entries of ``extracted_info["work_experience"]``,
    3. the cleaned raw ``content``, capped at 300 words.

    Args:
        readme_data: Mapping with optional ``extracted_info`` and ``content``
            keys. ``None`` or an empty mapping yields ``""``. Keys that are
            present but hold ``None`` are treated as missing.

    Returns:
        The pieces joined by single spaces, or ``""`` when nothing is usable.
    """
    if not readme_data:
        return ""

    parts = []
    info = readme_data.get("extracted_info") or {}

    # dict.fromkeys removes duplicates while keeping first-seen order. Going
    # through set() made both the order and the choice of the 20 kept skills
    # vary between interpreter runs (string hashes are randomized).
    skills = list(dict.fromkeys(info.get("skills") or []))[:20]
    if skills:
        parts.append(f"Skills and technologies: {', '.join(skills)}.")

    for exp in (info.get("work_experience") or [])[:8]:
        cleaned = _clean(exp)
        if cleaned:
            parts.append(cleaned)

    # Also use the raw README, trimmed to avoid drowning the section in noise.
    content = readme_data.get("content")
    if content:
        raw = _clean(content)
        if raw:
            parts.append(" ".join(raw.split()[:300]))  # cap at 300 words

    return " ".join(parts)
def scrape_github_and_chunk(username: str, token: str = None) -> tuple[str, list[dict]]:
    """
    Full Phase 1 (GitHub variant).

    Runs AdvancedGitHubScraper for *username*, turns its output into named
    text sections (profile, statistics, profile README, and one section per
    top repository) and splits every section with chunk_text().

    Args:
        username: GitHub login to scrape.
        token: Optional API token; an empty string is passed on as None.

    Returns:
        (person_name, list of {text, section, chunk_id}) - identical shape
        to scrape_and_chunk() in scraper.py. person_name is the profile's
        display name, falling back to *username*.

    Raises:
        ValueError: when scrape_all() reports failure.

    Side effects:
        Prints a one-line summary of the section and chunk counts.
    """
    scraper = AdvancedGitHubScraper(username, token=token or None)
    if not scraper.scrape_all(repo_limit=30):
        raise ValueError(f"GitHub user '{username}' not found or API rate limit hit.")

    data = scraper.data
    # "or {}" / "or []" also covers keys that are present but hold None,
    # which a plain .get(key, default) does not.
    profile = data.get("profile") or {}
    person_name = profile.get("name") or username

    # Build named sections -> text
    sections = {}

    profile_text = _profile_to_text(profile)
    if profile_text:
        sections["Profile"] = profile_text

    stats_text = _stats_to_text(
        data.get("statistics") or {},
        data.get("languages") or {},
    )
    if stats_text:
        sections["Statistics"] = stats_text

    readme_text = _readme_to_text(data.get("profile_readme") or {})
    if readme_text:
        sections["Profile README"] = readme_text

    # Top repos - each gets its own section so retrieval can target them
    top_repos = (data.get("top_repositories") or [])[:10]
    for position, repo in enumerate(top_repos, start=1):
        repo_text = _repo_to_text(repo)
        if not repo_text:
            continue
        # _repo_to_text tolerates a repo without a name (topics or features
        # alone still produce text), so the section key must tolerate it too.
        repo_label = repo.get("name") or f"unnamed_{position}"
        sections[f"repo_{repo_label}"] = repo_text

    # Chunk everything (reuse same chunker as Wikipedia path)
    all_chunks = []
    for section, text in sections.items():
        all_chunks.extend(
            {"text": chunk, "section": section, "chunk_id": f"{section}_{i}"}
            for i, chunk in enumerate(chunk_text(text))
        )

    print(f"[GitHub] '{person_name}' → {len(sections)} sections → {len(all_chunks)} chunks")
    return person_name, all_chunks