Add RDKit Mol parsing to SDF extractor with partial sanitisation

Copilot · sfluegel05 · Copilot · commit b2e747a26a04 · 2026-02-25T09:27:13.000Z
Co-authored-by: sfluegel05 <43573433+sfluegel05@users.noreply.github.com>
diff --git a/chebi_utils/sdf_extractor.py b/chebi_utils/sdf_extractor.py
@@ -3,9 +3,56 @@
 from __future__ import annotations
 
 import gzip
+import warnings
 from pathlib import Path
 
 import pandas as pd
+from rdkit import Chem
+
+
+def _update_mol_valences(mol: Chem.Mol) -> Chem.Mol:
+    """Flag every atom of *mol* as carrying no implicit hydrogens.
+
+    The intent is to keep the valences as written in the molfile. The
+    molecule is modified in place through ``Atom.SetNoImplicit(True)`` and the
+    same object is handed back so the call can be used in an assignment.
+    """
+    for rd_atom in mol.GetAtoms():
+        rd_atom.SetNoImplicit(True)
+    return mol
+
+
+def _parse_molblock(molblock: str, chebi_id: str | None = None) -> Chem.Mol | None:
+    """Parse a V2000/V3000 molblock into an RDKit Mol object.
+
+    Uses partial sanitisation to handle ChEBI molecules with unusual valences
+    or radicals: the molblock is read with ``sanitize=False``, every atom is
+    marked as having no implicit hydrogens, and only a selected subset of
+    RDKit sanitisation operations is applied afterwards.
+
+    Parameters
+    ----------
+    molblock : str
+        The molblock string (header + atom/bond table + ``M  END``).
+    chebi_id : str or None
+        Used only for the warning messages emitted by this function.
+
+    Returns
+    -------
+    Chem.Mol or None
+        Parsed molecule, or ``None`` if parsing failed.  If one of the
+        sanitisation operations fails, a warning is issued and the molecule is
+        still returned in whatever state sanitisation left it.
+    """
+    mol = Chem.MolFromMolBlock(molblock, sanitize=False, removeHs=False)
+    if mol is None:
+        warnings.warn(f"Failed to parse molblock for {chebi_id}", stacklevel=2)
+        return None
+    mol = _update_mol_valences(mol)
+    # With catchErrors=True, SanitizeMol does not raise; it returns the
+    # operation that failed (SANITIZE_NONE when everything succeeded).
+    failed_op = Chem.SanitizeMol(
+        mol,
+        sanitizeOps=(
+            Chem.SanitizeFlags.SANITIZE_FINDRADICALS
+            | Chem.SanitizeFlags.SANITIZE_KEKULIZE
+            | Chem.SanitizeFlags.SANITIZE_SETAROMATICITY
+            | Chem.SanitizeFlags.SANITIZE_SETCONJUGATION
+            | Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION
+            | Chem.SanitizeFlags.SANITIZE_SYMMRINGS
+        ),
+        catchErrors=True,
+    )
+    if failed_op != Chem.SanitizeFlags.SANITIZE_NONE:
+        # Surface the failure instead of silently returning a half-sanitised Mol.
+        warnings.warn(
+            f"Partial sanitisation failed for {chebi_id} at step {failed_op}",
+            stacklevel=2,
+        )
+    return mol
 
 
 def _iter_sdf_records(filepath: str | Path):
@@ -21,19 +68,39 @@ def _iter_sdf_records(filepath: str | Path):
                 current_record = []
 
 
-def _parse_sdf_record(record: str) -> dict[str, str]:
-    """Parse a single SDF record into a dict of data-item properties."""
-    props: dict[str, str] = {}
-    lines = record.splitlines()
+def _parse_sdf_record(record: str) -> tuple[dict[str, str], str]:
+    """Parse a single SDF record.
 
-    if lines:
-        props["mol_name"] = lines[0].strip()
-
-    i = 0
+    Returns
+    -------
+    tuple[dict[str, str], str]
+        ``(props, molblock)`` where *props* is a dict of data-item key/values
+        and *molblock* is the raw connection-table string.
+    """
+    props: dict[str, str] = {}
+    lines = record.splitlines(keepends=True)
+
+    # Collect molblock: everything up to (but not including) the first "> <" tag
+    molblock_lines: list[str] = []
+    data_start = len(lines)
+    for idx, line in enumerate(lines):
+        stripped = line.strip()
+        if stripped.startswith("> <") or stripped == "$$$$":
+            data_start = idx
+            break
+        molblock_lines.append(line)
+    molblock = "".join(molblock_lines)
+
+    # Extract header name (first line of molblock)
+    if molblock_lines:
+        props["mol_name"] = molblock_lines[0].strip()
+
+    # Parse data items
+    i = data_start
     while i < len(lines):
-        line = lines[i]
-        if line.startswith("> <") and line.rstrip().endswith(">"):
-            key = line.strip()[3:-1]
+        line = lines[i].strip()
+        if line.startswith("> <") and line.endswith(">"):
+            key = line[3:-1]
             value_lines: list[str] = []
             i += 1
             while i < len(lines) and lines[i].strip() not in ("", "$$$$"):
@@ -43,13 +110,15 @@ def _parse_sdf_record(record: str) -> dict[str, str]:
         else:
             i += 1
 
-    return props
+    return props, molblock
 
 
 def extract_molecules(filepath: str | Path) -> pd.DataFrame:
     """Extract molecule data from a ChEBI SDF file.
 
     Supports both plain (``.sdf``) and gzip-compressed (``.sdf.gz``) files.
+    Each molecule is parsed into an RDKit ``Mol`` object stored in the ``mol``
+    column.  Molecules that cannot be parsed result in ``None`` in that column.
 
     Parameters
     ----------
@@ -61,14 +130,19 @@ def extract_molecules(filepath: str | Path) -> pd.DataFrame:
     pd.DataFrame
         DataFrame with one row per molecule. Columns depend on the properties
         present in the file. Common columns (renamed for convenience):
-        chebi_id, name, inchi, inchikey, smiles, formula, charge, mass.
+        chebi_id, name, inchi, inchikey, smiles, formula, charge, mass, mol.
     """
-    records = [_parse_sdf_record(r) for r in _iter_sdf_records(filepath)]
-
-    if not records:
+    rows = []
+    molblocks = []
+    for record in _iter_sdf_records(filepath):
+        props, molblock = _parse_sdf_record(record)
+        rows.append(props)
+        molblocks.append(molblock)
+
+    if not rows:
         return pd.DataFrame()
 
-    df = pd.DataFrame(records)
+    df = pd.DataFrame(rows)
 
     rename_map = {
         "ChEBI ID": "chebi_id",
@@ -82,4 +156,7 @@ def extract_molecules(filepath: str | Path) -> pd.DataFrame:
     }
     df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})
 
+    chebi_ids = df["chebi_id"].tolist() if "chebi_id" in df.columns else [None] * len(df)
+    df["mol"] = [_parse_molblock(mb, cid) for mb, cid in zip(molblocks, chebi_ids, strict=False)]
+
     return df
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "networkx>=3.0",
     "numpy>=1.24",
     "pandas>=2.0",
+    "rdkit>=2022.09",
 ]
 
 [project.optional-dependencies]
diff --git a/tests/fixtures/sample.sdf b/tests/fixtures/sample.sdf
@@ -1,6 +1,8 @@
 compound_a
+     RDKit          
 
-  0  0  0  0  0  0  0  0  0  0999 V2000
+  1  0  0  0  0  0  0  0  0  0999 V2000
+    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
 M  END
 > <ChEBI ID>
 CHEBI:1
@@ -28,8 +30,12 @@ CH4
 
 $$$$
 compound_b
+     RDKit          
 
-  0  0  0  0  0  0  0  0  0  0999 V2000
+  2  1  0  0  0  0  0  0  0  0999 V2000
+    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.5400    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
+  1  2  1  0
 M  END
 > <ChEBI ID>
 CHEBI:2
diff --git a/tests/test_sdf_extractor.py b/tests/test_sdf_extractor.py
@@ -5,6 +5,8 @@
 import gzip
 from pathlib import Path
 
+from rdkit.Chem import rdchem
+
 from chebi_utils.sdf_extractor import extract_molecules
 
 FIXTURES = Path(__file__).parent / "fixtures"
@@ -44,6 +46,28 @@ def test_inchikey_column_present(self):
         df = extract_molecules(SAMPLE_SDF)
         assert "inchikey" in df.columns
 
+    def test_mol_column_present(self):
+        """The extracted DataFrame exposes a ``mol`` column."""
+        columns = extract_molecules(SAMPLE_SDF).columns
+        assert "mol" in columns
+
+    def test_mol_objects_are_rdkit_mol(self):
+        """Every entry of the ``mol`` column is an RDKit ``Mol`` instance."""
+        parsed = extract_molecules(SAMPLE_SDF)["mol"]
+        assert all(isinstance(entry, rdchem.Mol) for entry in parsed)
+
+    def test_mol_atom_counts(self):
+        """Atom counts of the parsed fixtures match their molblocks."""
+        df = extract_molecules(SAMPLE_SDF)
+        # First matching row per ChEBI ID, as in a plain boolean-mask lookup.
+        mol_one = df.loc[df["chebi_id"] == "CHEBI:1", "mol"].iloc[0]
+        mol_two = df.loc[df["chebi_id"] == "CHEBI:2", "mol"].iloc[0]
+        assert mol_one.GetNumAtoms() == 1  # methane: 1 C
+        assert mol_two.GetNumAtoms() == 2  # ethane: 2 C
+
+    def test_mol_sanitized(self):
+        """Parsing plus partial sanitisation yields a molecule for each record."""
+        parsed = extract_molecules(SAMPLE_SDF)["mol"]
+        # Only checks that a molecule came back for every row; sanitisation
+        # results (e.g. aromaticity flags) are not inspected here.
+        assert all(entry is not None for entry in parsed)
+
     def test_molecule_properties(self):
         df = extract_molecules(SAMPLE_SDF)
         row = df[df["chebi_id"] == "CHEBI:1"].iloc[0]
@@ -58,9 +82,19 @@ def test_gzipped_sdf(self, tmp_path):
         df = extract_molecules(gz_path)
         assert len(df) == 2
         assert set(df["chebi_id"]) == {"CHEBI:1", "CHEBI:2"}
+        assert all(isinstance(m, rdchem.Mol) for m in df["mol"])
 
     def test_empty_sdf_returns_empty_dataframe(self, tmp_path):
         """A zero-byte SDF file produces an empty DataFrame."""
         sdf_path = tmp_path / "empty.sdf"
         sdf_path.write_text("")
         assert extract_molecules(sdf_path).empty
+
+    def test_unparseable_molblock_gives_none(self, tmp_path, recwarn):
+        """A record whose molblock cannot be parsed yields ``None`` and a warning."""
+        bad_sdf = tmp_path / "bad.sdf"
+        # The header is one line short, so the counts line is not where a
+        # molfile reader expects it.
+        bad_sdf.write_text(
+            "bad_mol\n\n  0  0  0  0  0  0  0  0  0  0999 V2000\nM  END\n"
+            "> <ChEBI ID>\nCHEBI:99\n\n$$$$\n"
+        )
+        df = extract_molecules(bad_sdf)
+        assert df.iloc[0]["mol"] is None
+        # The failure must be reported, not just swallowed: check the captured
+        # warnings mention the offending ChEBI ID.
+        assert any("CHEBI:99" in str(w.message) for w in recwarn)

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,7 @@ dependencies = [`
`14`	`14`	`"networkx>=3.0",`
`15`	`15`	`"numpy>=1.24",`
`16`	`16`	`"pandas>=2.0",`
	`17`	`+ "rdkit>=2022.09",`
`17`	`18`	`]`
`18`	`19`
`19`	`20`	`[project.optional-dependencies]`