-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_repo.py
More file actions
78 lines (63 loc) · 2.41 KB
/
get_repo.py
File metadata and controls
78 lines (63 loc) · 2.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import json
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse
# Configuration
JSON_FILE = 'task_instance.json'  # input: JSON list of records carrying repository_info.url
DOWNLOAD_DIR = 'main_repo'        # root directory that clones are placed under
MAX_WORKERS = 8                   # number of concurrent git clone workers
def load_repo_urls(json_file):
    """Read the task JSON file and return its unique repository URLs.

    Expects a top-level list of records, each optionally holding a
    ``repository_info`` dict with a ``url`` key. Returns an empty list
    when the file is missing or is not valid JSON (after printing an
    error); duplicates are collapsed via a set.
    """
    # Keep the try body limited to the two calls that can actually raise.
    try:
        with open(json_file, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        print(f"❌ Error: File {json_file} not found.")
        return []
    except json.JSONDecodeError:
        print(f"❌ Error: {json_file} is not valid JSON.")
        return []

    unique_urls = set()
    if isinstance(payload, list):
        for record in payload:
            candidate = record.get('repository_info', {}).get('url')
            if candidate:
                unique_urls.add(candidate)

    print(f"✅ Successfully parsed {len(unique_urls)} unique repository URLs.")
    return list(unique_urls)
def clone_repository(url):
    """Clone one repository into DOWNLOAD_DIR and return a status string.

    The target directory is named ``owner_repo`` from the last two URL
    path segments to avoid collisions between same-named repos under
    different owners. A pre-existing target directory is treated as
    already cloned and skipped. Never raises: every outcome (skip,
    success, git failure, unexpected error) is reported via the
    returned message.
    """
    parsed_url = urlparse(url)
    path_parts = parsed_url.path.strip('/').split('/')
    # Bug fix: normalize away a trailing ".git" so the common clone form
    # https://host/owner/repo.git and https://host/owner/repo map to the
    # same target directory — otherwise the "already exists" check fails
    # to deduplicate them.
    if path_parts and path_parts[-1].endswith('.git'):
        path_parts[-1] = path_parts[-1][:-len('.git')]
    # Use 'owner_repo' naming convention to avoid collisions
    if len(path_parts) >= 2:
        repo_name = f"{path_parts[-2]}_{path_parts[-1]}"
    else:
        repo_name = path_parts[-1]
    target_path = os.path.join(DOWNLOAD_DIR, repo_name)
    if os.path.exists(target_path):
        return f"⏭️ Skipped: {repo_name} (Already exists)"
    try:
        # List-form argv (shell=False) — the URL is never interpreted by a shell.
        subprocess.run(['git', 'clone', url, target_path], check=True, capture_output=True)
        return f"⬇️ Completed: {repo_name}"
    except subprocess.CalledProcessError:
        return f"❌ Failed: {repo_name} (Git error)"
    except Exception as e:
        return f"❌ Error: {repo_name} ({str(e)})"
def main():
    """Entry point: prepare the download directory, then clone every repo concurrently."""
    # Ensure the destination root exists before any worker tries to write into it.
    if not os.path.exists(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)
        print(f"📂 Created directory: {DOWNLOAD_DIR}")

    repo_urls = load_repo_urls(JSON_FILE)
    if not repo_urls:
        return

    print(f"🚀 Starting concurrent download (Workers: {MAX_WORKERS})...\n")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Report each clone's status as soon as it completes, not in submit order.
        pending = {pool.submit(clone_repository, target): target for target in repo_urls}
        for finished in as_completed(pending):
            print(finished.result())

    print("\n✨ All tasks finished.")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()