Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/), and this

## [Unreleased]

### Added
- Abstract enrichment: extract abstracts from CrossRef metadata, with a PubMed (E-utilities) fallback, and a `journal_article_with_abstract` template

### Changed
- Unify CrossRef request and parsing methods in pipeline (#26)

## [0.1.0] - 2025-02-09

### Added
Expand Down
145 changes: 123 additions & 22 deletions onecite/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,21 +142,25 @@ def _parse_text(self, text_content: str) -> List[RawEntry]:

# If no DOI or URL found, build a concise query string from title/author/year
if not raw_entry['doi'] and not raw_entry['url']:
lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
title_text = lines[0] if lines else block
authors_text = lines[1] if len(lines) > 1 else ''
year_match = re.search(r'(19|20)\d{2}', block)
year_text = year_match.group(0) if year_match else ''

query_parts: List[str] = []
if title_text:
query_parts.append(title_text)
if authors_text:
query_parts.append(authors_text)
if year_text:
query_parts.append(year_text)

raw_entry['query_string'] = ' '.join(query_parts) or block
# Check if block is a bare PMID (7-8 digits, optionally prefixed with "PMID:")
if re.match(r'^(PMID:?\s*)?\d{7,8}$', block.strip(), re.IGNORECASE):
raw_entry['query_string'] = block.strip()
else:
lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
title_text = lines[0] if lines else block
authors_text = lines[1] if len(lines) > 1 else ''
year_match = re.search(r'(19|20)\d{2}', block)
year_text = year_match.group(0) if year_match else ''

query_parts: List[str] = []
if title_text:
query_parts.append(title_text)
if authors_text:
query_parts.append(authors_text)
if year_text:
query_parts.append(year_text)

raw_entry['query_string'] = ' '.join(query_parts) or block

entries.append(raw_entry)

Expand Down Expand Up @@ -2352,6 +2356,7 @@ def __init__(self, use_google_scholar: bool = False):
"""
self.logger = logging.getLogger(__name__)
self.crossref_base_url = "https://api.crossref.org/works"
self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
self.use_google_scholar = use_google_scholar
self._used_keys: set = set()

Expand Down Expand Up @@ -2560,15 +2565,15 @@ def _enrich_single_entry(self, identified_entry: IdentifiedEntry,
}

def _strip_html_tags(self, text: str) -> str:
"""Strip HTML tags from text and convert to plain text."""
"""Strip HTML/XML tags from text and convert to plain text."""
if not text:
return text

# Unescape HTML entities first (e.g., & -> &)
text = unescape(text)
# Remove all HTML/XML tags first (replace with space to avoid word concatenation)
clean_text = re.sub(r'<[^>]+>', ' ', text)

# Remove all HTML tags using regex
clean_text = re.sub(r'<[^>]+>', '', text)
# Unescape HTML entities (may need multiple passes for double-escaped entities)
clean_text = unescape(unescape(clean_text))

# Clean up extra spaces that may result
clean_text = re.sub(r'\s+', ' ', clean_text).strip()
Expand Down Expand Up @@ -2603,9 +2608,14 @@ def _get_crossref_metadata(self, doi: str) -> Optional[Dict]:
'pages': work.get('page'),
'publisher': work.get('publisher'),
'url': work.get('URL'),
'type': work.get('type', '')
'type': work.get('type', ''),
'abstract': work.get('abstract') # Extract abstract if available
}

# Clean up abstract HTML tags if present
if metadata['abstract']:
metadata['abstract'] = self._strip_html_tags(metadata['abstract'])

# Book specific fields
if work.get('type') in ['book', 'monograph', 'edited-book', 'reference-book']:
metadata['is_book'] = True
Expand Down Expand Up @@ -2866,8 +2876,14 @@ def _fetch_missing_field(self, field_name: str, source_priority: List[str], base
"""Try each source in priority order to fill a missing field."""
for source in source_priority:
if source == 'crossref_api':
# Already got fromCrossref, skip
# Already got from Crossref, skip
continue
elif source == 'pubmed_api':
# Try PubMed for abstract
if field_name == 'abstract':
value = self._get_pubmed_abstract(base_record)
if value:
return value
elif source == 'google_scholar_scraper':
# Only use Google Scholar if enabled
if self.use_google_scholar:
Expand All @@ -2882,6 +2898,91 @@ def _fetch_missing_field(self, field_name: str, source_priority: List[str], base

return None

def _get_pubmed_abstract(self, base_record: Dict) -> Optional[str]:
    """Fetch an article abstract from PubMed via the NCBI E-utilities API.

    Resolution order:
      1. ``esearch`` by DOI (``<doi>[DOI]``) to obtain a PMID (most precise).
      2. If that fails, ``esearch`` by the record title, restricted to the
         Title field so a generic phrase cannot match an unrelated article.
      3. ``efetch`` the PubMed XML for the PMID and extract the abstract,
         joining structured sections as ``LABEL: text``.

    Args:
        base_record: Partially enriched record; the ``doi`` and/or ``title``
            keys are consulted for the lookup.

    Returns:
        The abstract text, or ``None`` if no PMID or abstract could be found
        or any network/parsing step failed (failures are logged, not raised).
    """
    try:
        doi = base_record.get('doi')
        title = base_record.get('title', '')
        if not doi and not title:
            # Nothing to search with -- avoid pointless API calls and a
            # misleading "Could not find PMID" warning.
            return None

        pmid = None

        # Step 1: Try to find PMID by DOI
        if doi:
            url = f"{self.pubmed_base}/esearch.fcgi"
            params = {
                'db': 'pubmed',
                'term': f'{doi}[DOI]',
                'retmode': 'json',
                'retmax': 1
            }
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            idlist = data.get('esearchresult', {}).get('idlist', [])
            if idlist:
                pmid = idlist[0]
                self.logger.info(f"Found PMID {pmid} for DOI {doi}")

        # Step 2: If no PMID found by DOI, try searching by title.
        if not pmid and title:
            # Strip punctuation (it breaks PubMed term parsing) and collapse
            # the resulting runs of whitespace into single spaces.
            search_title = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', ' ', title)).strip()
            if search_title:
                url = f"{self.pubmed_base}/esearch.fcgi"
                params = {
                    'db': 'pubmed',
                    # [Title] field tag keeps the match on the article title,
                    # reducing the risk of returning an unrelated record.
                    'term': f'{search_title}[Title]',
                    'retmode': 'json',
                    'retmax': 3
                }
                response = requests.get(url, params=params, timeout=10)
                response.raise_for_status()
                data = response.json()
                idlist = data.get('esearchresult', {}).get('idlist', [])
                if idlist:
                    pmid = idlist[0]
                    self.logger.info(f"Found PMID {pmid} by title search")

        # Step 3: Fetch abstract by PMID
        if pmid:
            url = f"{self.pubmed_base}/efetch.fcgi"
            params = {
                'db': 'pubmed',
                'id': pmid,
                'retmode': 'xml'
            }
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()

            # Parse XML to get abstract
            import xml.etree.ElementTree as ET
            root = ET.fromstring(response.content)

            # Structured abstracts carry several <AbstractText Label=...>
            # sections; plain abstracts have a single unlabeled element.
            abstract_elems = root.findall('.//Abstract/AbstractText')
            parts = []
            for elem in abstract_elems:
                label = elem.get('Label', '')
                # itertext() keeps text nested in inline markup (<i>, <sub>, ...).
                text = ''.join(elem.itertext()).strip()
                if text:
                    parts.append(f"{label}: {text}" if label else text)
            if parts:
                abstract = ' '.join(parts)
                self.logger.info(f"Successfully retrieved abstract from PubMed (PMID: {pmid})")
                return abstract

            self.logger.warning(f"No abstract found in PubMed record (PMID: {pmid})")
        else:
            self.logger.warning(f"Could not find PMID for DOI {doi} or title")

    except Exception as e:
        # Best-effort enrichment: log and fall through to None rather than
        # aborting the whole pipeline on a network/parse failure.
        self.logger.warning(f"Failed to get abstract from PubMed: {str(e)}")

    return None

def _fetch_from_google_scholar(self, field_name: str, base_record: Dict) -> Optional[str]:
"""Get field value from Google Scholar (with improved timeout protection)"""
try:
Expand Down
40 changes: 40 additions & 0 deletions onecite/templates/journal_article_with_abstract.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Template: journal article that additionally captures the paper's abstract.
# Identical to the standard journal_article template except for the optional
# "abstract" field, which is sourced from the Crossref API first and falls
# back to the PubMed API (E-utilities) when Crossref provides no abstract.
name: journal_article_with_abstract
entry_type: "@article"  # BibTeX entry type emitted for matching records
fields:
  # Core required fields.
  - name: author
    required: true
  - name: title
    required: true
  - name: journal
    required: true
  - name: year
    required: true
  # Optional abstract: Crossref metadata first, then a PubMed lookup.
  - name: abstract
    required: false
    source_priority:
      - crossref_api
      - pubmed_api
  - name: volume
    required: false
    source_priority:
      - crossref_api
      - user_prompt
  - name: number
    required: false
    source_priority:
      - crossref_api
      - user_prompt
  - name: pages
    required: false
    source_priority:
      - crossref_api
      - google_scholar_scraper
  - name: publisher
    required: false
    source_priority:
      - crossref_api
      - user_prompt
  - name: doi
    required: false
    source_priority:
      - crossref_api
123 changes: 123 additions & 0 deletions tests/test_pipeline_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,129 @@ def test_google_scholar_worker_error(self):
patch("time.sleep"), patch("time.time", return_value=1000.0):
assert e._fetch_from_google_scholar("pages", {"title": "T"}) is None

def test_strip_html_jats_and_entities(self):
    """JATS tags replaced with space (no word merging); double-escaped entities decoded."""
    enricher = EnricherModule(use_google_scholar=False)

    # Tag stripping: adjacent JATS elements must not fuse into one word.
    stripped = enricher._strip_html_tags(
        "<jats:title>Background</jats:title><jats:p>The treatment.</jats:p>"
    )
    assert "Background" in stripped
    assert "The treatment" in stripped
    assert "BackgroundThe" not in stripped

    # Entity decoding: double-escaped &amp;gt; / &amp;lt; resolve fully.
    decoded = enricher._strip_html_tags("p &amp;gt; 0.05 and p &amp;lt; 0.01")
    assert "&gt;" not in decoded
    assert ">" in decoded
    assert "&lt;" not in decoded
    assert "<" in decoded

def test_crossref_metadata_abstract(self):
    """Abstract extracted and JATS-cleaned when present; absent when Crossref omits it."""
    enricher = EnricherModule(use_google_scholar=False)

    # Crossref record carrying a JATS-marked abstract.
    message_with_abstract = {
        "DOI": "10.1234/test",
        "title": ["Test Article"],
        "author": [{"given": "Jane", "family": "Doe"}],
        "container-title": ["Test Journal"],
        "published-print": {"date-parts": [[2023]]},
        "abstract": "<jats:p>This is the <jats:italic>abstract</jats:italic> text.</jats:p>",
    }
    response = DummyResponse(json_data={"message": message_with_abstract})
    with patch("onecite.pipeline.requests.get", return_value=response):
        meta = enricher._get_crossref_metadata("10.1234/test")
    assert "abstract" in meta
    assert "abstract" in meta["abstract"]
    assert "<jats:" not in meta["abstract"]

    # Crossref record without an abstract key at all.
    message_without_abstract = {
        "DOI": "10.1234/noabs",
        "title": ["No Abstract"],
        "author": [{"given": "A", "family": "B"}],
        "container-title": ["J"],
        "published-print": {"date-parts": [[2020]]},
    }
    response = DummyResponse(json_data={"message": message_without_abstract})
    with patch("onecite.pipeline.requests.get", return_value=response):
        meta = enricher._get_crossref_metadata("10.1234/noabs")
    assert "abstract" not in meta

def test_get_pubmed_abstract_via_doi(self):
    """_get_pubmed_abstract resolves DOI → PMID → fetches abstract."""
    enricher = EnricherModule(use_google_scholar=False)
    pubmed_xml = b"""<?xml version="1.0"?>
<PubmedArticleSet><PubmedArticle><MedlineCitation>
<Article><Abstract>
<AbstractText>This is the PubMed abstract.</AbstractText>
</Abstract></Article>
</MedlineCitation></PubmedArticle></PubmedArticleSet>"""

    def routed_get(url, *args, **kwargs):
        # esearch resolves the DOI to a PMID; efetch returns the record XML.
        if "esearch" in url:
            return DummyResponse(json_data={"esearchresult": {"idlist": ["12345678"]}})
        if "efetch" in url:
            return DummyResponse(content=pubmed_xml)
        return DummyResponse(status_code=404, json_data={})

    with patch("onecite.pipeline.requests.get", side_effect=routed_get):
        abstract = enricher._get_pubmed_abstract({"doi": "10.1234/test"})

    assert abstract == "This is the PubMed abstract."

def test_get_pubmed_abstract_structured(self):
    """Structured abstracts (multiple AbstractText with Label) are joined."""
    enricher = EnricherModule(use_google_scholar=False)
    pubmed_xml = b"""<?xml version="1.0"?>
<PubmedArticleSet><PubmedArticle><MedlineCitation>
<Article><Abstract>
<AbstractText Label="BACKGROUND">Background text.</AbstractText>
<AbstractText Label="METHODS">Methods text.</AbstractText>
<AbstractText Label="RESULTS">Results text.</AbstractText>
</Abstract></Article>
</MedlineCitation></PubmedArticle></PubmedArticleSet>"""

    def routed_get(url, *args, **kwargs):
        if "esearch" in url:
            return DummyResponse(json_data={"esearchresult": {"idlist": ["99999"]}})
        if "efetch" in url:
            return DummyResponse(content=pubmed_xml)
        return DummyResponse(status_code=404, json_data={})

    with patch("onecite.pipeline.requests.get", side_effect=routed_get):
        abstract = enricher._get_pubmed_abstract({"doi": "10.1234/struct"})

    # Every labeled section appears as "LABEL: text" in the joined result.
    assert abstract is not None
    assert "BACKGROUND: Background text." in abstract
    assert "METHODS: Methods text." in abstract
    assert "RESULTS: Results text." in abstract

def test_get_pubmed_abstract_returns_none(self):
    """Returns None when PMID not found, or when PubMed record has no Abstract."""
    enricher = EnricherModule(use_google_scholar=False)

    # Case 1: esearch yields no PMIDs at all.
    empty_search = DummyResponse(json_data={"esearchresult": {"idlist": []}})
    with patch("onecite.pipeline.requests.get", return_value=empty_search):
        assert enricher._get_pubmed_abstract({"doi": "10.9999/notinpubmed"}) is None

    # Case 2: PMID resolves but the record carries no <Abstract> element.
    xml_without_abstract = b"""<?xml version="1.0"?>
<PubmedArticleSet><PubmedArticle><MedlineCitation>
<Article><ArticleTitle>No abstract here</ArticleTitle></Article>
</MedlineCitation></PubmedArticle></PubmedArticleSet>"""

    def routed_get(url, *args, **kwargs):
        if "esearch" in url:
            return DummyResponse(json_data={"esearchresult": {"idlist": ["77777"]}})
        return DummyResponse(content=xml_without_abstract)

    with patch("onecite.pipeline.requests.get", side_effect=routed_get):
        assert enricher._get_pubmed_abstract({"doi": "10.1234/noabs"}) is None

def test_fetch_missing_field_abstract_sources(self):
    """pubmed_api delegates to _get_pubmed_abstract; crossref_api is always skipped."""
    enricher = EnricherModule(use_google_scholar=False)
    record = {"doi": "10.1/x"}

    # pubmed_api source hands the record straight to the PubMed helper.
    with patch.object(enricher, "_get_pubmed_abstract", return_value="Mocked abstract") as mocked:
        assert enricher._fetch_missing_field("abstract", ["pubmed_api"], record) == "Mocked abstract"
        mocked.assert_called_once_with(record)

    # crossref_api was already consulted upstream, so it is skipped entirely.
    with patch.object(enricher, "_get_pubmed_abstract", return_value=None) as mocked:
        assert enricher._fetch_missing_field("abstract", ["crossref_api"], record) is None
        mocked.assert_not_called()


# ===================================================================
# FormatterModule
Expand Down
Loading