From 6b43b3cb807da8bfd62e3cf8119d52280112cbd6 Mon Sep 17 00:00:00 2001 From: Ang Date: Fri, 20 Feb 2026 02:12:15 +0800 Subject: [PATCH 1/2] docs: add changelog entry for CrossRef unification (#26) --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 817c5f6..2c988ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/), and this ## [Unreleased] +### Changed +- Unify CrossRef request and parsing methods in pipeline (#26) + ## [0.1.0] - 2025-02-09 ### Added From 3dc8904ca62bf36a99039fccbe79fe500a94ad45 Mon Sep 17 00:00:00 2001 From: Ang Date: Mon, 6 Apr 2026 18:56:13 +0800 Subject: [PATCH 2/2] feat: add abstract extraction via Crossref and PubMed fallback - Add journal_article_with_abstract template with pubmed_api source - Extract abstract from Crossref API response with JATS cleanup - Add _get_pubmed_abstract method (DOI->PMID->efetch path) - Add pubmed_api source support in _fetch_missing_field - Fix _strip_html_tags: replace tags with space, double-unescape entities - Fix parser to preserve bare PMID inputs as query_string - Add 6 unit tests covering all new abstract-related code paths --- onecite/pipeline.py | 145 +++++++++++++++--- .../journal_article_with_abstract.yaml | 40 +++++ tests/test_pipeline_unit.py | 123 +++++++++++++++ 3 files changed, 286 insertions(+), 22 deletions(-) create mode 100644 onecite/templates/journal_article_with_abstract.yaml diff --git a/onecite/pipeline.py b/onecite/pipeline.py index 13ca8a3..d0d299b 100644 --- a/onecite/pipeline.py +++ b/onecite/pipeline.py @@ -142,21 +142,25 @@ def _parse_text(self, text_content: str) -> List[RawEntry]: # If no DOI or URL found, build a concise query string from title/author/year if not raw_entry['doi'] and not raw_entry['url']: - lines = [ln.strip() for ln in block.splitlines() if ln.strip()] - title_text = lines[0] if lines else block - authors_text = lines[1] if 
len(lines) > 1 else '' - year_match = re.search(r'(19|20)\d{2}', block) - year_text = year_match.group(0) if year_match else '' - - query_parts: List[str] = [] - if title_text: - query_parts.append(title_text) - if authors_text: - query_parts.append(authors_text) - if year_text: - query_parts.append(year_text) - - raw_entry['query_string'] = ' '.join(query_parts) or block + # Check if block is a bare PMID (7-8 digits, optionally prefixed with "PMID:") + if re.match(r'^(PMID:?\s*)?\d{7,8}$', block.strip(), re.IGNORECASE): + raw_entry['query_string'] = block.strip() + else: + lines = [ln.strip() for ln in block.splitlines() if ln.strip()] + title_text = lines[0] if lines else block + authors_text = lines[1] if len(lines) > 1 else '' + year_match = re.search(r'(19|20)\d{2}', block) + year_text = year_match.group(0) if year_match else '' + + query_parts: List[str] = [] + if title_text: + query_parts.append(title_text) + if authors_text: + query_parts.append(authors_text) + if year_text: + query_parts.append(year_text) + + raw_entry['query_string'] = ' '.join(query_parts) or block entries.append(raw_entry) @@ -2352,6 +2356,7 @@ def __init__(self, use_google_scholar: bool = False): """ self.logger = logging.getLogger(__name__) self.crossref_base_url = "https://api.crossref.org/works" + self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" self.use_google_scholar = use_google_scholar self._used_keys: set = set() @@ -2560,15 +2565,15 @@ def _enrich_single_entry(self, identified_entry: IdentifiedEntry, } def _strip_html_tags(self, text: str) -> str: - """Strip HTML tags from text and convert to plain text.""" + """Strip HTML/XML tags from text and convert to plain text.""" if not text: return text - # Unescape HTML entities first (e.g., & -> &) - text = unescape(text) + # Remove all HTML/XML tags first (replace with space to avoid word concatenation) + clean_text = re.sub(r'<[^>]+>', ' ', text) - # Remove all HTML tags using regex - clean_text = 
re.sub(r'<[^>]+>', '', text) + # Unescape HTML entities (may need multiple passes for double-escaped entities) + clean_text = unescape(unescape(clean_text)) # Clean up extra spaces that may result clean_text = re.sub(r'\s+', ' ', clean_text).strip() @@ -2603,9 +2608,14 @@ def _get_crossref_metadata(self, doi: str) -> Optional[Dict]: 'pages': work.get('page'), 'publisher': work.get('publisher'), 'url': work.get('URL'), - 'type': work.get('type', '') + 'type': work.get('type', ''), + 'abstract': work.get('abstract') # Extract abstract if available } + # Clean up abstract HTML tags if present + if metadata['abstract']: + metadata['abstract'] = self._strip_html_tags(metadata['abstract']) + # Book specific fields if work.get('type') in ['book', 'monograph', 'edited-book', 'reference-book']: metadata['is_book'] = True @@ -2866,8 +2876,14 @@ def _fetch_missing_field(self, field_name: str, source_priority: List[str], base """Try each source in priority order to fill a missing field.""" for source in source_priority: if source == 'crossref_api': - # Already got fromCrossref, skip + # Already got from Crossref, skip continue + elif source == 'pubmed_api': + # Try PubMed for abstract + if field_name == 'abstract': + value = self._get_pubmed_abstract(base_record) + if value: + return value elif source == 'google_scholar_scraper': # Only use Google Scholar if enabled if self.use_google_scholar: @@ -2882,6 +2898,91 @@ def _fetch_missing_field(self, field_name: str, source_priority: List[str], base return None + def _get_pubmed_abstract(self, base_record: Dict) -> Optional[str]: + """Get abstract from PubMed using DOI or title/author search.""" + try: + doi = base_record.get('doi') + pmid = None + + # Step 1: Try to find PMID by DOI + if doi: + url = f"{self.pubmed_base}/esearch.fcgi" + params = { + 'db': 'pubmed', + 'term': f'{doi}[DOI]', + 'retmode': 'json', + 'retmax': 1 + } + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = 
response.json() + idlist = data.get('esearchresult', {}).get('idlist', []) + if idlist: + pmid = idlist[0] + self.logger.info(f"Found PMID {pmid} for DOI {doi}") + + # Step 2: If no PMID found by DOI, try searching by title + if not pmid: + title = base_record.get('title', '') + if title: + # Clean title for search + search_title = re.sub(r'[^\w\s]', ' ', title).strip() + url = f"{self.pubmed_base}/esearch.fcgi" + params = { + 'db': 'pubmed', + 'term': search_title, + 'retmode': 'json', + 'retmax': 3 + } + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + idlist = data.get('esearchresult', {}).get('idlist', []) + if idlist: + pmid = idlist[0] + self.logger.info(f"Found PMID {pmid} by title search") + + # Step 3: Fetch abstract by PMID + if pmid: + url = f"{self.pubmed_base}/efetch.fcgi" + params = { + 'db': 'pubmed', + 'id': pmid, + 'retmode': 'xml' + } + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + + # Parse XML to get abstract + import xml.etree.ElementTree as ET + root = ET.fromstring(response.content) + + # Handle structured abstracts (multiple AbstractText with Label) + abstract_elems = root.findall('.//Abstract/AbstractText') + if abstract_elems: + parts = [] + for elem in abstract_elems: + label = elem.get('Label', '') + text = ''.join(elem.itertext()).strip() + if text: + if label: + parts.append(f"{label}: {text}") + else: + parts.append(text) + if parts: + abstract = ' '.join(parts) + self.logger.info(f"Successfully retrieved abstract from PubMed (PMID: {pmid})") + return abstract + + self.logger.warning(f"No abstract found in PubMed record (PMID: {pmid})") + else: + self.logger.warning(f"Could not find PMID for DOI {doi} or title") + + except Exception as e: + self.logger.warning(f"Failed to get abstract from PubMed: {str(e)}") + + return None + def _fetch_from_google_scholar(self, field_name: str, base_record: Dict) -> Optional[str]: """Get field 
value from Google Scholar (with improved timeout protection)""" try: diff --git a/onecite/templates/journal_article_with_abstract.yaml b/onecite/templates/journal_article_with_abstract.yaml new file mode 100644 index 0000000..980e297 --- /dev/null +++ b/onecite/templates/journal_article_with_abstract.yaml @@ -0,0 +1,40 @@ +name: journal_article_with_abstract +entry_type: "@article" +fields: + - name: author + required: true + - name: title + required: true + - name: journal + required: true + - name: year + required: true + - name: abstract + required: false + source_priority: + - crossref_api + - pubmed_api + - name: volume + required: false + source_priority: + - crossref_api + - user_prompt + - name: number + required: false + source_priority: + - crossref_api + - user_prompt + - name: pages + required: false + source_priority: + - crossref_api + - google_scholar_scraper + - name: publisher + required: false + source_priority: + - crossref_api + - user_prompt + - name: doi + required: false + source_priority: + - crossref_api diff --git a/tests/test_pipeline_unit.py b/tests/test_pipeline_unit.py index 65b81cf..a88e96d 100644 --- a/tests/test_pipeline_unit.py +++ b/tests/test_pipeline_unit.py @@ -777,6 +777,129 @@ def test_google_scholar_worker_error(self): patch("time.sleep"), patch("time.time", return_value=1000.0): assert e._fetch_from_google_scholar("pages", {"title": "T"}) is None + def test_strip_html_jats_and_entities(self): + """JATS tags replaced with space (no word merging); double-escaped entities decoded.""" + e = EnricherModule(use_google_scholar=False) + jats = "BackgroundThe treatment." 
+        result = e._strip_html_tags(jats)
+        assert "Background" in result and "The treatment" in result
+        assert "BackgroundThe" not in result
+        text = "p &amp;gt; 0.05 and p &amp;lt; 0.01"
+        result2 = e._strip_html_tags(text)
+        assert "&gt;" not in result2 and ">" in result2
+        assert "&lt;" not in result2 and "<" in result2
+
+    def test_crossref_metadata_abstract(self):
+        """Abstract extracted and JATS-cleaned when present; absent when Crossref omits it."""
+        e = EnricherModule(use_google_scholar=False)
+        payload_with = {"message": {
+            "DOI": "10.1234/test", "title": ["Test Article"],
+            "author": [{"given": "Jane", "family": "Doe"}],
+            "container-title": ["Test Journal"],
+            "published-print": {"date-parts": [[2023]]},
+            "abstract": "<jats:p>This is the abstract text.</jats:p>",
+        }}
+        with patch("onecite.pipeline.requests.get",
+                   return_value=DummyResponse(json_data=payload_with)):
+            meta = e._get_crossref_metadata("10.1234/test")
+        assert "abstract" in meta and "abstract" in meta["abstract"]
+        assert "<jats:p>" not in meta["abstract"]
+
+        payload_without = {"message": {
+            "DOI": "10.1234/test", "title": ["Test Article"],
+            "author": [{"given": "Jane", "family": "Doe"}],
+            "container-title": ["Test Journal"],
+            "published-print": {"date-parts": [[2023]]},
+        }}
+        with patch("onecite.pipeline.requests.get",
+                   return_value=DummyResponse(json_data=payload_without)):
+            meta2 = e._get_crossref_metadata("10.1234/test")
+        assert meta2["abstract"] is None
+
+    def test_get_pubmed_abstract_via_doi(self):
+        """Abstract retrieved via the DOI -> PMID -> efetch path."""
+        e = EnricherModule(use_google_scholar=False)
+        xml_content = b"""<?xml version="1.0"?>
+        <PubmedArticleSet>
+            <PubmedArticle>
+                <MedlineCitation>
+                    <Article>
+                        <Abstract>
+                            <AbstractText>This is the PubMed abstract.</AbstractText>
+                        </Abstract>
+                    </Article>
+                </MedlineCitation>
+            </PubmedArticle>
+        </PubmedArticleSet>
+        """
+
+        def fake_get(url, *a, **kw):
+            params = kw.get("params", {})
+            if "esearch" in url:
+                return DummyResponse(json_data={
+                    "esearchresult": {"idlist": ["12345678"]}
+                })
+            if "efetch" in url:
+                return DummyResponse(content=xml_content)
+            return DummyResponse(status_code=404, json_data={})
+
+        with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+            result = e._get_pubmed_abstract({"doi": "10.1234/test"})
+
+        assert result == "This is the PubMed abstract."
+
+    def test_get_pubmed_abstract_structured(self):
+        """Structured abstracts (multiple AbstractText with Label) are joined."""
+        e = EnricherModule(use_google_scholar=False)
+        xml_content = b"""<?xml version="1.0"?>
+        <PubmedArticleSet>
+            <PubmedArticle>
+                <Abstract>
+                    <AbstractText Label="BACKGROUND">Background text.</AbstractText>
+                    <AbstractText Label="METHODS">Methods text.</AbstractText>
+                    <AbstractText Label="RESULTS">Results text.</AbstractText>
+                </Abstract>
+            </PubmedArticle>
+        </PubmedArticleSet>
+        """
+
+        def fake_get(url, *a, **kw):
+            if "esearch" in url:
+                return DummyResponse(json_data={"esearchresult": {"idlist": ["99999"]}})
+            if "efetch" in url:
+                return DummyResponse(content=xml_content)
+            return DummyResponse(status_code=404, json_data={})
+
+        with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+            result = e._get_pubmed_abstract({"doi": "10.1234/struct"})
+
+        assert result is not None
+        assert "BACKGROUND: Background text." in result
+        assert "METHODS: Methods text." in result
+        assert "RESULTS: Results text." in result
+
+    def test_get_pubmed_abstract_returns_none(self):
+        """Returns None when PMID not found, or when PubMed record has no Abstract."""
+        e = EnricherModule(use_google_scholar=False)
+
+        with patch("onecite.pipeline.requests.get",
+                   return_value=DummyResponse(json_data={"esearchresult": {"idlist": []}})):
+            assert e._get_pubmed_abstract({"doi": "10.9999/notinpubmed"}) is None
+
+        xml_no_abstract = b"""<?xml version="1.0"?>
+        <PubmedArticleSet>
+            <PubmedArticle>No abstract here</PubmedArticle>
+        </PubmedArticleSet>
+        """
+
+        def fake_get(url, *a, **kw):
+            if "esearch" in url:
+                return DummyResponse(json_data={"esearchresult": {"idlist": ["77777"]}})
+            return DummyResponse(content=xml_no_abstract)
+
+        with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+            assert e._get_pubmed_abstract({"doi": "10.1234/noabs"}) is None
+
+    def test_fetch_missing_field_abstract_sources(self):
+        """pubmed_api delegates to _get_pubmed_abstract; crossref_api is always skipped."""
+        e = EnricherModule(use_google_scholar=False)
+        with patch.object(e, "_get_pubmed_abstract", return_value="Mocked abstract") as m:
+            val = e._fetch_missing_field("abstract", ["pubmed_api"], {"doi": "10.1/x"})
+            assert val == "Mocked abstract"
+            m.assert_called_once_with({"doi": "10.1/x"})
+
+        with patch.object(e, "_get_pubmed_abstract", return_value=None) as m:
+            assert e._fetch_missing_field("abstract", ["crossref_api"], {"doi": "10.1/x"}) is None
+            m.assert_not_called()
+
 # ===================================================================
 # FormatterModule