Skip to content

Commit 2483f10

Browse files
author
Ang
committed
fix: Crossref User-Agent/mailto headers (#21) and author name field (#22)
- EnricherModule: add _crossref_headers with proper User-Agent and mailto
- _get_crossref_metadata: use shared headers/mailto for all requests
- _format_authors: handle name field (org authors) and given-only authors
- Add regression tests for both fixes

Closes #21
Closes #22
1 parent 737f404 commit 2483f10

2 files changed

Lines changed: 51 additions & 8 deletions

File tree

onecite/pipeline.py

Lines changed: 16 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2359,6 +2359,11 @@ def __init__(self, use_google_scholar: bool = False):
23592359
self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
23602360
self.use_google_scholar = use_google_scholar
23612361
self._used_keys: set = set()
2362+
self._crossref_headers = {
2363+
'Accept': 'application/json',
2364+
'User-Agent': f'OneCite/0.1.0 (https://github.com/HzaCode/OneCite; mailto:onecite@users.noreply.github.com)',
2365+
}
2366+
self._crossref_mailto = 'onecite@users.noreply.github.com'
23622367

23632368
def enrich(self, identified_entries: List[IdentifiedEntry],
23642369
template: Dict, raw_entries: List[RawEntry] = None) -> List[CompletedEntry]:
@@ -2584,9 +2589,8 @@ def _get_crossref_metadata(self, doi: str) -> Optional[Dict]:
25842589
"""Get metadata from the Crossref API"""
25852590
try:
25862591
url = f"{self.crossref_base_url}/{doi}"
2587-
headers = {'Accept': 'application/json'}
2588-
2589-
response = requests.get(url, headers=headers, timeout=10)
2592+
params = {'mailto': self._crossref_mailto}
2593+
response = requests.get(url, headers=self._crossref_headers, params=params, timeout=10)
25902594
response.raise_for_status()
25912595

25922596
data = response.json()
@@ -2800,13 +2804,17 @@ def _format_authors(self, authors: List[Dict]) -> str:
28002804
"""Format the author list"""
28012805
formatted_authors = []
28022806
for author in authors:
2803-
given = author.get('given', '')
2804-
family = author.get('family', '')
2805-
if family:
2806-
if given:
2807+
if author.get('name'):
2808+
formatted_authors.append(author['name'])
2809+
else:
2810+
given = author.get('given', '')
2811+
family = author.get('family', '')
2812+
if family and given:
28072813
formatted_authors.append(f"{family}, {given}")
2808-
else:
2814+
elif family:
28092815
formatted_authors.append(family)
2816+
elif given:
2817+
formatted_authors.append(given)
28102818

28112819
return ' and '.join(formatted_authors)
28122820

tests/test_pipeline_unit.py

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -681,6 +681,41 @@ def test_strip_html(self):
681681
assert e._strip_html_tags("Human-level <i>control</i> &amp; learning") == \
682682
"Human-level control & learning"
683683

684+
def test_crossref_request_has_user_agent_and_mailto(self):
685+
"""fix #21: _get_crossref_metadata must send User-Agent and mailto."""
686+
e = EnricherModule(use_google_scholar=False)
687+
captured = {}
688+
689+
def fake_get(url, *a, **kw):
690+
captured['headers'] = kw.get('headers', {})
691+
captured['params'] = kw.get('params', {})
692+
return DummyResponse(json_data={"message": {
693+
"DOI": "10.1234/x", "title": ["T"],
694+
"published-print": {"date-parts": [[2020]]},
695+
}})
696+
697+
with patch("onecite.pipeline.requests.get", side_effect=fake_get):
698+
e._get_crossref_metadata("10.1234/x")
699+
700+
assert "User-Agent" in captured["headers"], "User-Agent header missing"
701+
assert "OneCite" in captured["headers"]["User-Agent"]
702+
assert captured["params"].get("mailto"), "mailto param missing"
703+
704+
def test_format_authors_name_field(self):
705+
"""fix #22: org authors with 'name' field must not be dropped."""
706+
e = EnricherModule(use_google_scholar=False)
707+
authors = [
708+
{"given": "John", "family": "Doe"},
709+
{"name": "World Health Organization"},
710+
{"family": "Smith"},
711+
{"given": "Alice"},
712+
]
713+
result = e._format_authors(authors)
714+
assert "Doe, John" in result
715+
assert "World Health Organization" in result
716+
assert "Smith" in result
717+
assert "Alice" in result
718+
684719
def test_google_scholar_disabled_returns_none(self):
685720
e = EnricherModule(use_google_scholar=False)
686721
assert e._fetch_missing_field("pages", ["google_scholar_scraper"], {"title": "T"}) is None

0 commit comments

Comments
 (0)