-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathautomatic_update.py
More file actions
128 lines (108 loc) · 4.96 KB
/
automatic_update.py
File metadata and controls
128 lines (108 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# This file automatically updates the bibliography with GitHub Actions
import logging
import requests
import feedparser
from bs4 import BeautifulSoup
import pybtex.scanner
from src.scraper import get_publications_from_issue, enrich_publications
from pybtex.database import BibliographyData
from pybtex.database import parse_string as bibtex_parse_string
from unidecode import unidecode
from src.utils import MATHONCO_BIB_FILE, get_parsed_bibliography
logging.basicConfig(level=logging.INFO)

### --- Initialize Feed --- ###
# Download and parse the newsletter feed, then log a handful of diagnostic
# attributes of the parsed result. Everything except `bozo` is read through
# getattr with a fallback, so a missing attribute is logged as None (or as
# zero entries) instead of raising.
mathonco_feed = feedparser.parse("https://thisweekmathonco.substack.com/feed")

feed_bozo = mathonco_feed.bozo
feed_status = getattr(mathonco_feed, 'status', None)
feed_entry_count = len(getattr(mathonco_feed, 'entries', []))
feed_version = getattr(mathonco_feed, 'version', None)
feed_bozo_exception = getattr(mathonco_feed, 'bozo_exception', None)

logging.info(f"bozo: {feed_bozo}")
logging.info(f"status: {feed_status}")
logging.info(f"entries: {feed_entry_count}")
logging.info(f"version: {feed_version}")
logging.info(f"bozo_exception: {feed_bozo_exception}")
### --- Read Latest Saved Issue Number --- ###
# Only the first line of the bibliography file is needed: its last
# space-separated token is parsed as the number of the latest saved issue.
with open(MATHONCO_BIB_FILE, "r") as bib_file:
    first_line = bib_file.readline()
latest_issue = first_line.strip()
latest_issue_tokens = latest_issue.split(" ")
latest_issue_number = int(latest_issue_tokens[-1])
### --- Get parsed bibliography --- ###
# Load the bibliography once up front; its entries are used further down to
# detect labels that already exist.
full_bib_content_parsed = get_parsed_bibliography()
loaded_entry_count = len(full_bib_content_parsed.entries)
logging.info(f"Loaded {loaded_entry_count} entries from the bibliography.")
### --- Find new issues, if any --- ###
# An issue number is read from the last space-separated token of the entry
# title. Feed entries whose title does not end with an integer cannot be
# matched to an issue, so they are skipped with a warning instead of aborting
# the whole update with a ValueError.
numbered_new_issues = []
for issue in mathonco_feed.entries:
    issue_title = getattr(issue, "title", "")
    try:
        issue_number = int(issue_title.split(" ")[-1])
    except ValueError:
        logging.warning(f"Skipping feed entry without an issue number in its title: {issue_title!r}")
        continue
    if issue_number > latest_issue_number:
        numbered_new_issues.append((issue_number, issue))
        logging.info(f"New issue found: {issue_number}")

# Inform the user if no new issue is found
if not numbered_new_issues:
    logging.info(f"No new issue found. Latest issue is {latest_issue_number}.")

# Sort so that processing starts from the oldest issue. The number parsed
# above is reused as the sort key; the sort is stable, so entries with equal
# numbers keep their feed order.
numbered_new_issues.sort(key=lambda numbered: numbered[0])
new_issues = [issue for _, issue in numbered_new_issues]
### --- Iterate on issues until you find the last --- ###
# For every new issue: scrape its publications, build a BibTeX block with
# normalised labels, save that block as a stand-alone file and prepend it to
# the complete bibliography file.

# Labels written during this run. `full_bib_content_parsed` is loaded once
# before this loop and is not refreshed here, so on its own it cannot reveal
# that a label was already emitted for an earlier publication of the same run.
labels_added_this_run = set()

for issue in new_issues:
    ### --- Get issue number --- ###
    # The issue number is the last space-separated token of the title.
    new_issue_number = int(issue.title.split(" ")[-1])
    logging.info(f"Processing issue {new_issue_number}...")

    ### --- If new issue exist, extract publications --- ###
    mathonco_issue_html = issue.content[0].value  # get html
    html_soup = BeautifulSoup(mathonco_issue_html, 'html.parser')
    new_issue_dict = get_publications_from_issue(html_soup, new_issue_number)
    new_issue_dict = enrich_publications(new_issue_dict, new_issue_number)

    ### --- Create string with formatted bibliography --- ###
    # Pieces are collected in a list and joined once at the end.
    bib_chunks = [f"//MathOnco Issue {new_issue_number}\n"]
    for pub in new_issue_dict[new_issue_number]:
        # get bibtex; publications without one are skipped
        pub_bib = pub.get("bibtex")
        if pub_bib is None:
            continue

        # parse
        try:
            parsed_bibtex = bibtex_parse_string(pub_bib, "bibtex")
        except pybtex.scanner.TokenRequired:
            logging.error(f"Something wrong with the entry: {pub_bib}")
            # Re-raise the caught exception unchanged so the parser's own
            # error details and traceback are preserved; raising the bare
            # class would create a new exception without those details.
            raise

        # get the first parsed entry; a string that yields no entry at all
        # is skipped instead of failing on an empty list
        bib_entry = next(iter(parsed_bibtex.entries.values()), None)
        if bib_entry is None:
            logging.warning(f"No BibTeX entry could be parsed from: {pub_bib}")
            continue

        # create label in the format surnameYEARfirstword, where:
        # - surname is the surname of the first author
        # - YEAR is the year of the paper
        # - first word is the first word of the title

        # get surname (entries without any person are skipped)
        authors_list = list(bib_entry.persons.values())
        if not authors_list:
            continue
        first_author_surname = authors_list[0][0].last_names[0]
        first_author_surname = unidecode(first_author_surname.lower())

        # get year (a missing "year" field raises KeyError, as before)
        year = bib_entry.fields["year"]

        # get first_word; when it has three characters or fewer the second
        # word is appended to it. One- and two-word titles are handled
        # explicitly: unpacking a fixed number of pieces from the split
        # would raise ValueError for them.
        title_words = bib_entry.fields["title"].split(" ", 2)
        first_word = title_words[0]
        if len(first_word) <= 3 and len(title_words) > 1:
            first_word = f"{first_word}{title_words[1]}"
        first_word = first_word.lower()

        # set label of the bibtex
        bibtex_label = f"{first_author_surname}{year}{first_word}"

        # check if already in the bibliography (either loaded from file or
        # added earlier in this run), if so, skip
        if (
            bibtex_label in full_bib_content_parsed.entries
            or bibtex_label in labels_added_this_run
        ):
            logging.warning(f"Entry {bibtex_label} already exists in the bibliography. Skipping.")
            continue
        labels_added_this_run.add(bibtex_label)

        # else, add to the bibliography under the new label
        parsed_bibtex = BibliographyData({
            bibtex_label: bib_entry
        })
        bib_chunks.append(parsed_bibtex.to_string('bibtex'))
        bib_chunks.append("\n")
    formatted_bib = "".join(bib_chunks)

    ### --- Write formatted bibliography to file --- ###
    # write single issue to file
    single_issue_file = f"res/single_issues/issue_{new_issue_number}.bib"
    with open(single_issue_file, "w") as f:
        f.write(formatted_bib)

    # prepend to the complete file: read the current content, then rewrite
    # the file with the new issue block first
    with open(MATHONCO_BIB_FILE, "r") as f:
        big_bib = f.read()
    with open(MATHONCO_BIB_FILE, "w") as f:
        f.write(formatted_bib)
        f.write(big_bib)