diff --git a/CHANGELOG.md b/CHANGELOG.md
index 817c5f6..2c988ac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/), and this
## [Unreleased]
+### Changed
+- Unify CrossRef request and parsing methods in pipeline (#26)
+
## [0.1.0] - 2025-02-09
### Added
diff --git a/onecite/pipeline.py b/onecite/pipeline.py
index 13ca8a3..d0d299b 100644
--- a/onecite/pipeline.py
+++ b/onecite/pipeline.py
@@ -142,21 +142,25 @@ def _parse_text(self, text_content: str) -> List[RawEntry]:
# If no DOI or URL found, build a concise query string from title/author/year
if not raw_entry['doi'] and not raw_entry['url']:
- lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
- title_text = lines[0] if lines else block
- authors_text = lines[1] if len(lines) > 1 else ''
- year_match = re.search(r'(19|20)\d{2}', block)
- year_text = year_match.group(0) if year_match else ''
-
- query_parts: List[str] = []
- if title_text:
- query_parts.append(title_text)
- if authors_text:
- query_parts.append(authors_text)
- if year_text:
- query_parts.append(year_text)
-
- raw_entry['query_string'] = ' '.join(query_parts) or block
+ # Check if block is a bare PMID (7-8 digits, optionally prefixed with "PMID:")
+ if re.match(r'^(PMID:?\s*)?\d{7,8}$', block.strip(), re.IGNORECASE):
+ raw_entry['query_string'] = block.strip()
+ else:
+ lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
+ title_text = lines[0] if lines else block
+ authors_text = lines[1] if len(lines) > 1 else ''
+ year_match = re.search(r'(19|20)\d{2}', block)
+ year_text = year_match.group(0) if year_match else ''
+
+ query_parts: List[str] = []
+ if title_text:
+ query_parts.append(title_text)
+ if authors_text:
+ query_parts.append(authors_text)
+ if year_text:
+ query_parts.append(year_text)
+
+ raw_entry['query_string'] = ' '.join(query_parts) or block
entries.append(raw_entry)
@@ -2352,6 +2356,7 @@ def __init__(self, use_google_scholar: bool = False):
"""
self.logger = logging.getLogger(__name__)
self.crossref_base_url = "https://api.crossref.org/works"
+ self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
self.use_google_scholar = use_google_scholar
self._used_keys: set = set()
@@ -2560,15 +2565,15 @@ def _enrich_single_entry(self, identified_entry: IdentifiedEntry,
}
def _strip_html_tags(self, text: str) -> str:
- """Strip HTML tags from text and convert to plain text."""
+ """Strip HTML/XML tags from text and convert to plain text."""
if not text:
return text
- # Unescape HTML entities first (e.g., &amp;amp; -> &amp;)
- text = unescape(text)
+ # Remove all HTML/XML tags first (replace with space to avoid word concatenation)
+ clean_text = re.sub(r'<[^>]+>', ' ', text)
- # Remove all HTML tags using regex
- clean_text = re.sub(r'<[^>]+>', '', text)
+ # Unescape HTML entities (may need multiple passes for double-escaped entities)
+ clean_text = unescape(unescape(clean_text))
# Clean up extra spaces that may result
clean_text = re.sub(r'\s+', ' ', clean_text).strip()
@@ -2603,9 +2608,14 @@ def _get_crossref_metadata(self, doi: str) -> Optional[Dict]:
'pages': work.get('page'),
'publisher': work.get('publisher'),
'url': work.get('URL'),
- 'type': work.get('type', '')
+ 'type': work.get('type', ''),
+ 'abstract': work.get('abstract') # Extract abstract if available
}
+ # Clean up abstract HTML tags if present
+ if metadata['abstract']:
+ metadata['abstract'] = self._strip_html_tags(metadata['abstract'])
+
# Book specific fields
if work.get('type') in ['book', 'monograph', 'edited-book', 'reference-book']:
metadata['is_book'] = True
@@ -2866,8 +2876,14 @@ def _fetch_missing_field(self, field_name: str, source_priority: List[str], base
"""Try each source in priority order to fill a missing field."""
for source in source_priority:
if source == 'crossref_api':
- # Already got fromCrossref, skip
+ # Already got from Crossref, skip
continue
+ elif source == 'pubmed_api':
+ # Try PubMed for abstract
+ if field_name == 'abstract':
+ value = self._get_pubmed_abstract(base_record)
+ if value:
+ return value
elif source == 'google_scholar_scraper':
# Only use Google Scholar if enabled
if self.use_google_scholar:
@@ -2882,6 +2898,91 @@ def _fetch_missing_field(self, field_name: str, source_priority: List[str], base
return None
+ def _get_pubmed_abstract(self, base_record: Dict) -> Optional[str]:
+ """Get abstract from PubMed using DOI or title/author search."""
+ try:
+ doi = base_record.get('doi')
+ pmid = None
+
+ # Step 1: Try to find PMID by DOI
+ if doi:
+ url = f"{self.pubmed_base}/esearch.fcgi"
+ params = {
+ 'db': 'pubmed',
+ 'term': f'{doi}[DOI]',
+ 'retmode': 'json',
+ 'retmax': 1
+ }
+ response = requests.get(url, params=params, timeout=10)
+ response.raise_for_status()
+ data = response.json()
+ idlist = data.get('esearchresult', {}).get('idlist', [])
+ if idlist:
+ pmid = idlist[0]
+ self.logger.info(f"Found PMID {pmid} for DOI {doi}")
+
+ # Step 2: If no PMID found by DOI, try searching by title
+ if not pmid:
+ title = base_record.get('title', '')
+ if title:
+ # Clean title for search
+ search_title = re.sub(r'[^\w\s]', ' ', title).strip()
+ url = f"{self.pubmed_base}/esearch.fcgi"
+ params = {
+ 'db': 'pubmed',
+ 'term': search_title,
+ 'retmode': 'json',
+ 'retmax': 3
+ }
+ response = requests.get(url, params=params, timeout=10)
+ response.raise_for_status()
+ data = response.json()
+ idlist = data.get('esearchresult', {}).get('idlist', [])
+ if idlist:
+ pmid = idlist[0]
+ self.logger.info(f"Found PMID {pmid} by title search")
+
+ # Step 3: Fetch abstract by PMID
+ if pmid:
+ url = f"{self.pubmed_base}/efetch.fcgi"
+ params = {
+ 'db': 'pubmed',
+ 'id': pmid,
+ 'retmode': 'xml'
+ }
+ response = requests.get(url, params=params, timeout=10)
+ response.raise_for_status()
+
+ # Parse XML to get abstract
+ import xml.etree.ElementTree as ET
+ root = ET.fromstring(response.content)
+
+ # Handle structured abstracts (multiple AbstractText with Label)
+ abstract_elems = root.findall('.//Abstract/AbstractText')
+ if abstract_elems:
+ parts = []
+ for elem in abstract_elems:
+ label = elem.get('Label', '')
+ text = ''.join(elem.itertext()).strip()
+ if text:
+ if label:
+ parts.append(f"{label}: {text}")
+ else:
+ parts.append(text)
+ if parts:
+ abstract = ' '.join(parts)
+ self.logger.info(f"Successfully retrieved abstract from PubMed (PMID: {pmid})")
+ return abstract
+
+ self.logger.warning(f"No abstract found in PubMed record (PMID: {pmid})")
+ else:
+ self.logger.warning(f"Could not find PMID for DOI {doi} or title")
+
+ except Exception as e:
+ self.logger.warning(f"Failed to get abstract from PubMed: {str(e)}")
+
+ return None
+
def _fetch_from_google_scholar(self, field_name: str, base_record: Dict) -> Optional[str]:
"""Get field value from Google Scholar (with improved timeout protection)"""
try:
diff --git a/onecite/templates/journal_article_with_abstract.yaml b/onecite/templates/journal_article_with_abstract.yaml
new file mode 100644
index 0000000..980e297
--- /dev/null
+++ b/onecite/templates/journal_article_with_abstract.yaml
@@ -0,0 +1,40 @@
+name: journal_article_with_abstract
+entry_type: "@article"
+fields:
+ - name: author
+ required: true
+ - name: title
+ required: true
+ - name: journal
+ required: true
+ - name: year
+ required: true
+ - name: abstract
+ required: false
+ source_priority:
+ - crossref_api
+ - pubmed_api
+ - name: volume
+ required: false
+ source_priority:
+ - crossref_api
+ - user_prompt
+ - name: number
+ required: false
+ source_priority:
+ - crossref_api
+ - user_prompt
+ - name: pages
+ required: false
+ source_priority:
+ - crossref_api
+ - google_scholar_scraper
+ - name: publisher
+ required: false
+ source_priority:
+ - crossref_api
+ - user_prompt
+ - name: doi
+ required: false
+ source_priority:
+ - crossref_api
diff --git a/tests/test_pipeline_unit.py b/tests/test_pipeline_unit.py
index 65b81cf..a88e96d 100644
--- a/tests/test_pipeline_unit.py
+++ b/tests/test_pipeline_unit.py
@@ -777,6 +777,129 @@ def test_google_scholar_worker_error(self):
patch("time.sleep"), patch("time.time", return_value=1000.0):
assert e._fetch_from_google_scholar("pages", {"title": "T"}) is None
+ def test_strip_html_jats_and_entities(self):
+ """JATS tags replaced with space (no word merging); double-escaped entities decoded."""
+ e = EnricherModule(use_google_scholar=False)
+ jats = "<jats:title>Background</jats:title><jats:p>The treatment.</jats:p>"
+ result = e._strip_html_tags(jats)
+ assert "Background" in result and "The treatment" in result
+ assert "BackgroundThe" not in result
+ text = "p &amp;amp;gt; 0.05 and p &amp;amp;lt; 0.01"
+ result2 = e._strip_html_tags(text)
+ assert "&amp;gt;" not in result2 and ">" in result2
+ assert "&amp;lt;" not in result2 and "<" in result2
+
+ def test_crossref_metadata_abstract(self):
+ """Abstract extracted and JATS-cleaned when present; absent when Crossref omits it."""
+ e = EnricherModule(use_google_scholar=False)
+ payload_with = {"message": {
+ "DOI": "10.1234/test", "title": ["Test Article"],
+ "author": [{"given": "Jane", "family": "Doe"}],
+ "container-title": ["Test Journal"],
+ "published-print": {"date-parts": [[2023]]},
+ "abstract": "<jats:p>This is the abstract text.</jats:p>",
+ }}
+ with patch("onecite.pipeline.requests.get",
+ return_value=DummyResponse(json_data=payload_with)):
+ meta = e._get_crossref_metadata("10.1234/test")
+ assert "abstract" in meta and "abstract" in meta["abstract"]
+ assert "<" not in meta["abstract"]
+
+ payload_without = {"message": {
+ "DOI": "10.1234/test2", "title": ["No Abstract Article"],
+ "author": [{"given": "Jane", "family": "Doe"}],
+ "container-title": ["Test Journal"],
+ "published-print": {"date-parts": [[2023]]},
+ }}
+ with patch("onecite.pipeline.requests.get",
+ return_value=DummyResponse(json_data=payload_without)):
+ meta2 = e._get_crossref_metadata("10.1234/test2")
+ assert meta2.get("abstract") is None
+
+ def test_get_pubmed_abstract_by_doi(self):
+ """Abstract fetched via DOI -> PMID (esearch), then efetch XML parse."""
+ e = EnricherModule(use_google_scholar=False)
+ xml_content = b"""<PubmedArticleSet><PubmedArticle><MedlineCitation><Article>
+ <Abstract>
+ <AbstractText>This is the PubMed abstract.</AbstractText>
+ </Abstract>
+ </Article></MedlineCitation></PubmedArticle></PubmedArticleSet>"""
+
+ def fake_get(url, *a, **kw):
+ params = kw.get("params", {})
+ if "esearch" in url:
+ return DummyResponse(json_data={
+ "esearchresult": {"idlist": ["12345678"]}
+ })
+ if "efetch" in url:
+ return DummyResponse(content=xml_content)
+ return DummyResponse(status_code=404, json_data={})
+
+ with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+ result = e._get_pubmed_abstract({"doi": "10.1234/test"})
+
+ assert result == "This is the PubMed abstract."
+
+ def test_get_pubmed_abstract_structured(self):
+ """Structured abstracts (multiple AbstractText with Label) are joined."""
+ e = EnricherModule(use_google_scholar=False)
+ xml_content = b"""<PubmedArticleSet><PubmedArticle><MedlineCitation><Article>
+ <Abstract>
+ <AbstractText Label="BACKGROUND">Background text.</AbstractText>
+ <AbstractText Label="METHODS">Methods text.</AbstractText>
+ <AbstractText Label="RESULTS">Results text.</AbstractText>
+ </Abstract>
+ </Article></MedlineCitation></PubmedArticle></PubmedArticleSet>"""
+
+ def fake_get(url, *a, **kw):
+ if "esearch" in url:
+ return DummyResponse(json_data={"esearchresult": {"idlist": ["99999"]}})
+ if "efetch" in url:
+ return DummyResponse(content=xml_content)
+ return DummyResponse(status_code=404, json_data={})
+
+ with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+ result = e._get_pubmed_abstract({"doi": "10.1234/struct"})
+
+ assert result is not None
+ assert "BACKGROUND: Background text." in result
+ assert "METHODS: Methods text." in result
+ assert "RESULTS: Results text." in result
+
+ def test_get_pubmed_abstract_returns_none(self):
+ """Returns None when PMID not found, or when PubMed record has no Abstract."""
+ e = EnricherModule(use_google_scholar=False)
+
+ with patch("onecite.pipeline.requests.get",
+ return_value=DummyResponse(json_data={"esearchresult": {"idlist": []}})):
+ assert e._get_pubmed_abstract({"doi": "10.9999/notinpubmed"}) is None
+
+ xml_no_abstract = b"""<PubmedArticleSet><PubmedArticle><MedlineCitation><Article>
+ <ArticleTitle>No abstract here</ArticleTitle>
+ </Article></MedlineCitation></PubmedArticle></PubmedArticleSet>"""
+
+ def fake_get(url, *a, **kw):
+ if "esearch" in url:
+ return DummyResponse(json_data={"esearchresult": {"idlist": ["77777"]}})
+ return DummyResponse(content=xml_no_abstract)
+
+ with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+ assert e._get_pubmed_abstract({"doi": "10.1234/noabs"}) is None
+
+ def test_fetch_missing_field_abstract_sources(self):
+ """pubmed_api delegates to _get_pubmed_abstract; crossref_api is always skipped."""
+ e = EnricherModule(use_google_scholar=False)
+ with patch.object(e, "_get_pubmed_abstract", return_value="Mocked abstract") as m:
+ val = e._fetch_missing_field("abstract", ["pubmed_api"], {"doi": "10.1/x"})
+ assert val == "Mocked abstract"
+ m.assert_called_once_with({"doi": "10.1/x"})
+
+ with patch.object(e, "_get_pubmed_abstract", return_value=None) as m:
+ assert e._fetch_missing_field("abstract", ["crossref_api"], {"doi": "10.1/x"}) is None
+ m.assert_not_called()
+
# ===================================================================
# FormatterModule