From 6b43b3cb807da8bfd62e3cf8119d52280112cbd6 Mon Sep 17 00:00:00 2001 From: Ang Date: Fri, 20 Feb 2026 02:12:15 +0800 Subject: [PATCH 1/2] docs: add changelog entry for CrossRef unification (#26) --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 817c5f6..2c988ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/), and this ## [Unreleased] +### Changed +- Unify CrossRef request and parsing methods in pipeline (#26) + ## [0.1.0] - 2025-02-09 ### Added From 3dc8904ca62bf36a99039fccbe79fe500a94ad45 Mon Sep 17 00:00:00 2001 From: Ang Date: Mon, 6 Apr 2026 18:56:13 +0800 Subject: [PATCH 2/2] feat: add abstract extraction via Crossref and PubMed fallback - Add journal_article_with_abstract template with pubmed_api source - Extract abstract from Crossref API response with JATS cleanup - Add _get_pubmed_abstract method (DOI->PMID->efetch path) - Add pubmed_api source support in _fetch_missing_field - Fix _strip_html_tags: replace tags with space, double-unescape entities - Fix parser to preserve bare PMID inputs as query_string - Add 6 unit tests covering all new abstract-related code paths --- onecite/pipeline.py | 145 +++++++++++++++--- .../journal_article_with_abstract.yaml | 40 +++++ tests/test_pipeline_unit.py | 123 +++++++++++++++ 3 files changed, 286 insertions(+), 22 deletions(-) create mode 100644 onecite/templates/journal_article_with_abstract.yaml diff --git a/onecite/pipeline.py b/onecite/pipeline.py index 13ca8a3..d0d299b 100644 --- a/onecite/pipeline.py +++ b/onecite/pipeline.py @@ -142,21 +142,25 @@ def _parse_text(self, text_content: str) -> List[RawEntry]: # If no DOI or URL found, build a concise query string from title/author/year if not raw_entry['doi'] and not raw_entry['url']: - lines = [ln.strip() for ln in block.splitlines() if ln.strip()] - title_text = lines[0] if lines else block - authors_text = lines[1] if 
len(lines) > 1 else '' - year_match = re.search(r'(19|20)\d{2}', block) - year_text = year_match.group(0) if year_match else '' - - query_parts: List[str] = [] - if title_text: - query_parts.append(title_text) - if authors_text: - query_parts.append(authors_text) - if year_text: - query_parts.append(year_text) - - raw_entry['query_string'] = ' '.join(query_parts) or block + # Check if block is a bare PMID (7-8 digits, optionally prefixed with "PMID:") + if re.match(r'^(PMID:?\s*)?\d{7,8}$', block.strip(), re.IGNORECASE): + raw_entry['query_string'] = block.strip() + else: + lines = [ln.strip() for ln in block.splitlines() if ln.strip()] + title_text = lines[0] if lines else block + authors_text = lines[1] if len(lines) > 1 else '' + year_match = re.search(r'(19|20)\d{2}', block) + year_text = year_match.group(0) if year_match else '' + + query_parts: List[str] = [] + if title_text: + query_parts.append(title_text) + if authors_text: + query_parts.append(authors_text) + if year_text: + query_parts.append(year_text) + + raw_entry['query_string'] = ' '.join(query_parts) or block entries.append(raw_entry) @@ -2352,6 +2356,7 @@ def __init__(self, use_google_scholar: bool = False): """ self.logger = logging.getLogger(__name__) self.crossref_base_url = "https://api.crossref.org/works" + self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" self.use_google_scholar = use_google_scholar self._used_keys: set = set() @@ -2560,15 +2565,15 @@ def _enrich_single_entry(self, identified_entry: IdentifiedEntry, } def _strip_html_tags(self, text: str) -> str: - """Strip HTML tags from text and convert to plain text.""" + """Strip HTML/XML tags from text and convert to plain text.""" if not text: return text - # Unescape HTML entities first (e.g., & -> &) - text = unescape(text) + # Remove all HTML/XML tags first (replace with space to avoid word concatenation) + clean_text = re.sub(r'<[^>]+>', ' ', text) - # Remove all HTML tags using regex - clean_text = 
re.sub(r'<[^>]+>', '', text) + # Unescape HTML entities (may need multiple passes for double-escaped entities) + clean_text = unescape(unescape(clean_text)) # Clean up extra spaces that may result clean_text = re.sub(r'\s+', ' ', clean_text).strip() @@ -2603,9 +2608,14 @@ def _get_crossref_metadata(self, doi: str) -> Optional[Dict]: 'pages': work.get('page'), 'publisher': work.get('publisher'), 'url': work.get('URL'), - 'type': work.get('type', '') + 'type': work.get('type', ''), + 'abstract': work.get('abstract') # Extract abstract if available } + # Clean up abstract HTML tags if present + if metadata['abstract']: + metadata['abstract'] = self._strip_html_tags(metadata['abstract']) + # Book specific fields if work.get('type') in ['book', 'monograph', 'edited-book', 'reference-book']: metadata['is_book'] = True @@ -2866,8 +2876,14 @@ def _fetch_missing_field(self, field_name: str, source_priority: List[str], base """Try each source in priority order to fill a missing field.""" for source in source_priority: if source == 'crossref_api': - # Already got fromCrossref, skip + # Already got from Crossref, skip continue + elif source == 'pubmed_api': + # Try PubMed for abstract + if field_name == 'abstract': + value = self._get_pubmed_abstract(base_record) + if value: + return value elif source == 'google_scholar_scraper': # Only use Google Scholar if enabled if self.use_google_scholar: @@ -2882,6 +2898,91 @@ def _fetch_missing_field(self, field_name: str, source_priority: List[str], base return None + def _get_pubmed_abstract(self, base_record: Dict) -> Optional[str]: + """Get abstract from PubMed using DOI or title/author search.""" + try: + doi = base_record.get('doi') + pmid = None + + # Step 1: Try to find PMID by DOI + if doi: + url = f"{self.pubmed_base}/esearch.fcgi" + params = { + 'db': 'pubmed', + 'term': f'{doi}[DOI]', + 'retmode': 'json', + 'retmax': 1 + } + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = 
response.json() + idlist = data.get('esearchresult', {}).get('idlist', []) + if idlist: + pmid = idlist[0] + self.logger.info(f"Found PMID {pmid} for DOI {doi}") + + # Step 2: If no PMID found by DOI, try searching by title + if not pmid: + title = base_record.get('title', '') + if title: + # Clean title for search + search_title = re.sub(r'[^\w\s]', ' ', title).strip() + url = f"{self.pubmed_base}/esearch.fcgi" + params = { + 'db': 'pubmed', + 'term': search_title, + 'retmode': 'json', + 'retmax': 3 + } + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + idlist = data.get('esearchresult', {}).get('idlist', []) + if idlist: + pmid = idlist[0] + self.logger.info(f"Found PMID {pmid} by title search") + + # Step 3: Fetch abstract by PMID + if pmid: + url = f"{self.pubmed_base}/efetch.fcgi" + params = { + 'db': 'pubmed', + 'id': pmid, + 'retmode': 'xml' + } + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + + # Parse XML to get abstract + import xml.etree.ElementTree as ET + root = ET.fromstring(response.content) + + # Handle structured abstracts (multiple AbstractText with Label) + abstract_elems = root.findall('.//Abstract/AbstractText') + if abstract_elems: + parts = [] + for elem in abstract_elems: + label = elem.get('Label', '') + text = ''.join(elem.itertext()).strip() + if text: + if label: + parts.append(f"{label}: {text}") + else: + parts.append(text) + if parts: + abstract = ' '.join(parts) + self.logger.info(f"Successfully retrieved abstract from PubMed (PMID: {pmid})") + return abstract + + self.logger.warning(f"No abstract found in PubMed record (PMID: {pmid})") + else: + self.logger.warning(f"Could not find PMID for DOI {doi} or title") + + except Exception as e: + self.logger.warning(f"Failed to get abstract from PubMed: {str(e)}") + + return None + def _fetch_from_google_scholar(self, field_name: str, base_record: Dict) -> Optional[str]: """Get field 
value from Google Scholar (with improved timeout protection)""" try: diff --git a/onecite/templates/journal_article_with_abstract.yaml b/onecite/templates/journal_article_with_abstract.yaml new file mode 100644 index 0000000..980e297 --- /dev/null +++ b/onecite/templates/journal_article_with_abstract.yaml @@ -0,0 +1,40 @@ +name: journal_article_with_abstract +entry_type: "@article" +fields: + - name: author + required: true + - name: title + required: true + - name: journal + required: true + - name: year + required: true + - name: abstract + required: false + source_priority: + - crossref_api + - pubmed_api + - name: volume + required: false + source_priority: + - crossref_api + - user_prompt + - name: number + required: false + source_priority: + - crossref_api + - user_prompt + - name: pages + required: false + source_priority: + - crossref_api + - google_scholar_scraper + - name: publisher + required: false + source_priority: + - crossref_api + - user_prompt + - name: doi + required: false + source_priority: + - crossref_api diff --git a/tests/test_pipeline_unit.py b/tests/test_pipeline_unit.py index 65b81cf..a88e96d 100644 --- a/tests/test_pipeline_unit.py +++ b/tests/test_pipeline_unit.py @@ -777,6 +777,129 @@ def test_google_scholar_worker_error(self): patch("time.sleep"), patch("time.time", return_value=1000.0): assert e._fetch_from_google_scholar("pages", {"title": "T"}) is None + def test_strip_html_jats_and_entities(self): + """JATS tags replaced with space (no word merging); double-escaped entities decoded.""" + e = EnricherModule(use_google_scholar=False) + jats = "BackgroundThe treatment." 
+        result = e._strip_html_tags(jats)
+        assert "Background" in result and "The treatment" in result
+        assert "BackgroundThe" not in result
+        text = "p &amp;gt; 0.05 and p &amp;lt; 0.01"
+        result2 = e._strip_html_tags(text)
+        assert "&gt;" not in result2 and ">" in result2
+        assert "&lt;" not in result2 and "<" in result2
+
+    def test_crossref_metadata_abstract(self):
+        """Abstract extracted and JATS-cleaned when present; absent when Crossref omits it."""
+        e = EnricherModule(use_google_scholar=False)
+        payload_with = {"message": {
+            "DOI": "10.1234/test", "title": ["Test Article"],
+            "author": [{"given": "Jane", "family": "Doe"}],
+            "container-title": ["Test Journal"],
+            "published-print": {"date-parts": [[2023]]},
+            "abstract": "<jats:p>This is the abstract text.</jats:p>",
+        }}
+        with patch("onecite.pipeline.requests.get",
+                   return_value=DummyResponse(json_data=payload_with)):
+            meta = e._get_crossref_metadata("10.1234/test")
+        assert "abstract" in meta and "abstract" in meta["abstract"]
+        assert "<jats:p>" not in meta["abstract"]
+
+        payload_without = {"message": {
+            "DOI": "10.1234/test", "title": ["Test Article"],
+            "author": [{"given": "Jane", "family": "Doe"}],
+            "container-title": ["Test Journal"],
+            "published-print": {"date-parts": [[2023]]},
+        }}
+        with patch("onecite.pipeline.requests.get",
+                   return_value=DummyResponse(json_data=payload_without)):
+            meta2 = e._get_crossref_metadata("10.1234/test")
+        assert meta2["abstract"] is None
+
+    def test_get_pubmed_abstract_via_doi(self):
+        """Abstract retrieved via the DOI -> PMID -> efetch path."""
+        e = EnricherModule(use_google_scholar=False)
+        xml_content = b"""<?xml version="1.0"?>
+        <PubmedArticleSet>
+            <PubmedArticle>
+                <MedlineCitation>
+                    <Article>
+                        <Abstract>
+                            <AbstractText>This is the PubMed abstract.</AbstractText>
+                        </Abstract>
+                    </Article>
+                </MedlineCitation>
+            </PubmedArticle>
+        </PubmedArticleSet>
+        """
+
+        def fake_get(url, *a, **kw):
+            params = kw.get("params", {})
+            if "esearch" in url:
+                return DummyResponse(json_data={
+                    "esearchresult": {"idlist": ["12345678"]}
+                })
+            if "efetch" in url:
+                return DummyResponse(content=xml_content)
+            return DummyResponse(status_code=404, json_data={})
+
+        with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+            result = e._get_pubmed_abstract({"doi": "10.1234/test"})
+
+        assert result == "This is the PubMed abstract."
+
+    def test_get_pubmed_abstract_structured(self):
+        """Structured abstracts (multiple AbstractText with Label) are joined."""
+        e = EnricherModule(use_google_scholar=False)
+        xml_content = b"""<?xml version="1.0"?>
+        <PubmedArticleSet>
+            <PubmedArticle>
+                <Abstract>
+                    <AbstractText Label="BACKGROUND">Background text.</AbstractText>
+                    <AbstractText Label="METHODS">Methods text.</AbstractText>
+                    <AbstractText Label="RESULTS">Results text.</AbstractText>
+                </Abstract>
+            </PubmedArticle>
+        </PubmedArticleSet>
+        """
+
+        def fake_get(url, *a, **kw):
+            if "esearch" in url:
+                return DummyResponse(json_data={"esearchresult": {"idlist": ["99999"]}})
+            if "efetch" in url:
+                return DummyResponse(content=xml_content)
+            return DummyResponse(status_code=404, json_data={})
+
+        with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+            result = e._get_pubmed_abstract({"doi": "10.1234/struct"})
+
+        assert result is not None
+        assert "BACKGROUND: Background text." in result
+        assert "METHODS: Methods text." in result
+        assert "RESULTS: Results text." in result
+
+    def test_get_pubmed_abstract_returns_none(self):
+        """Returns None when PMID not found, or when PubMed record has no Abstract."""
+        e = EnricherModule(use_google_scholar=False)
+
+        with patch("onecite.pipeline.requests.get",
+                   return_value=DummyResponse(json_data={"esearchresult": {"idlist": []}})):
+            assert e._get_pubmed_abstract({"doi": "10.9999/notinpubmed"}) is None
+
+        xml_no_abstract = b"""<?xml version="1.0"?>
+        <PubmedArticleSet>
+            <PubmedArticle>No abstract here</PubmedArticle>
+        </PubmedArticleSet>
+        """
+
+        def fake_get(url, *a, **kw):
+            if "esearch" in url:
+                return DummyResponse(json_data={"esearchresult": {"idlist": ["77777"]}})
+            return DummyResponse(content=xml_no_abstract)
+
+        with patch("onecite.pipeline.requests.get", side_effect=fake_get):
+            assert e._get_pubmed_abstract({"doi": "10.1234/noabs"}) is None
+
+    def test_fetch_missing_field_abstract_sources(self):
+        """pubmed_api delegates to _get_pubmed_abstract; crossref_api is always skipped."""
+        e = EnricherModule(use_google_scholar=False)
+        with patch.object(e, "_get_pubmed_abstract", return_value="Mocked abstract") as m:
+            val = e._fetch_missing_field("abstract", ["pubmed_api"], {"doi": "10.1/x"})
+            assert val == "Mocked abstract"
+            m.assert_called_once_with({"doi": "10.1/x"})
+
+        with patch.object(e, "_get_pubmed_abstract", return_value=None) as m:
+            assert e._fetch_missing_field("abstract", ["crossref_api"], {"doi": "10.1/x"}) is None
+            m.assert_not_called()
+
 # ===================================================================
 # FormatterModule