From 11f64c1e2fb0d22d0cc39408c4924b79789b4d7d Mon Sep 17 00:00:00 2001 From: Leo Gao <54557097+leogao2@users.noreply.github.com> Date: Wed, 23 Sep 2020 18:22:51 -0600 Subject: [PATCH 1/2] download_repo_info.py: pass repo_list as an explicit argument instead of a global --- download_repo_info.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/download_repo_info.py b/download_repo_info.py index c7dce71..4017b24 100644 --- a/download_repo_info.py +++ b/download_repo_info.py @@ -28,17 +28,15 @@ -def save_ckpt(lower_bound: int, upper_bound: int): - global repo_list +def save_ckpt(lower_bound: int, upper_bound: int, repo_list): repo_list = list(set(repo_list)) # remove duplicates print(f"Saving checkpoint {lower_bound, upper_bound}...") with open('repo_ckpt.pkl', 'wb') as f: pickle.dump((lower_bound, upper_bound, repo_list), f) -def get_request(lower_bound: int, upper_bound: int, page: int = 1): +def get_request(lower_bound: int, upper_bound: int, page: int = 1, repo_list): # Returns a request object from querying GitHub # for repos in-between size lower_bound and size upper_bound with over 100 stars. - global REMAINING_REQUESTS, USER, TOKEN, repo_list r = requests.get( f'https://api.github.com/search/repositories?q=size:{lower_bound}..{upper_bound}+stars:>100&per_page=100&page={page}', auth = (USER, TOKEN) @@ -67,16 +65,15 @@ def get_request(lower_bound: int, upper_bound: int, page: int = 1): if REMAINING_REQUESTS == 0: print("Sleeping 60 seconds to stay under GitHub API rate limit...") time.sleep(60) - save_ckpt(lower_bound, upper_bound) + save_ckpt(lower_bound, upper_bound, repo_list) REMAINING_REQUESTS = 30 return r -def download_range(lower_bound, upper_bound): +def download_range(lower_bound, upper_bound, repo_list): # Saves the names of repositories on GitHub to repo_list # in-between size minimum and maximum with over 100 stars. - global repo_list # Github page options start at index 1. 
for page in range(1, 11): r = get_request(lower_bound=lower_bound, upper_bound=upper_bound, page=page) @@ -147,7 +144,7 @@ def download_range(lower_bound, upper_bound): break print(f"Downloading repositories in size range {lower_bound}..{upper_bound}") - download_range(lower_bound, upper_bound) + download_range(lower_bound, upper_bound, repo_list) lower_bound = upper_bound + 1 save_ckpt(lower_bound, upper_bound) From 631d69a7dcbfd97decc7d8535a717b740f6afee3 Mon Sep 17 00:00:00 2001 From: Leo Gao <54557097+leogao2@users.noreply.github.com> Date: Wed, 23 Sep 2020 18:25:08 -0600 Subject: [PATCH 2/2] download_repo_info.py: fix get_request parameter order and pass repo_list at its call sites --- download_repo_info.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/download_repo_info.py b/download_repo_info.py index 4017b24..2c0a7e1 100644 --- a/download_repo_info.py +++ b/download_repo_info.py @@ -34,7 +34,7 @@ def save_ckpt(lower_bound: int, upper_bound: int, repo_list): with open('repo_ckpt.pkl', 'wb') as f: pickle.dump((lower_bound, upper_bound, repo_list), f) -def get_request(lower_bound: int, upper_bound: int, page: int = 1, repo_list): +def get_request(lower_bound: int, upper_bound: int, repo_list, page: int = 1): # Returns a request object from querying GitHub # for repos in-between size lower_bound and size upper_bound with over 100 stars. r = requests.get( @@ -76,7 +76,7 @@ def download_range(lower_bound, upper_bound, repo_list): # in-between size minimum and maximum with over 100 stars. # Github page options start at index 1. 
for page in range(1, 11): - r = get_request(lower_bound=lower_bound, upper_bound=upper_bound, page=page) + r = get_request(lower_bound=lower_bound, upper_bound=upper_bound, repo_list=repo_list, page=page) if page == 1: n_results = r.json()['total_count'] @@ -113,7 +113,7 @@ def download_range(lower_bound, upper_bound, repo_list): exit() - r = get_request(lower_bound, upper_bound) + r = get_request(lower_bound, upper_bound, repo_list) # Initial number of results n_results = r.json()['total_count']