Skip to content

Commit b2e747a

Browse files
Copilotsfluegel05
andcommitted
Add RDKit Mol parsing to SDF extractor with partial sanitisation
Co-authored-by: sfluegel05 <43573433+sfluegel05@users.noreply.github.com>
1 parent 1df6eee commit b2e747a

File tree

4 files changed

+137
-19
lines changed

4 files changed

+137
-19
lines changed

chebi_utils/sdf_extractor.py

Lines changed: 94 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,56 @@
33
from __future__ import annotations
44

55
import gzip
6+
import warnings
67
from pathlib import Path
78

89
import pandas as pd
10+
from rdkit import Chem
11+
12+
13+
def _update_mol_valences(mol: Chem.Mol) -> Chem.Mol:
14+
"""Mark all atoms as having no implicit hydrogens to preserve molfile valences."""
15+
for atom in mol.GetAtoms():
16+
atom.SetNoImplicit(True)
17+
return mol
18+
19+
20+
def _parse_molblock(molblock: str, chebi_id: str | None = None) -> Chem.Mol | None:
21+
"""Parse a V2000/V3000 molblock into an RDKit Mol object.
22+
23+
Uses partial sanitisation to handle ChEBI molecules with unusual valences
24+
or radicals.
25+
26+
Parameters
27+
----------
28+
molblock : str
29+
The molblock string (header + atom/bond table + ``M END``).
30+
chebi_id : str or None
31+
Used only for the warning message when parsing fails.
32+
33+
Returns
34+
-------
35+
Chem.Mol or None
36+
Parsed molecule, or ``None`` if parsing failed.
37+
"""
38+
mol = Chem.MolFromMolBlock(molblock, sanitize=False, removeHs=False)
39+
if mol is None:
40+
warnings.warn(f"Failed to parse molblock for {chebi_id}", stacklevel=2)
41+
return None
42+
mol = _update_mol_valences(mol)
43+
Chem.SanitizeMol(
44+
mol,
45+
sanitizeOps=(
46+
Chem.SanitizeFlags.SANITIZE_FINDRADICALS
47+
| Chem.SanitizeFlags.SANITIZE_KEKULIZE
48+
| Chem.SanitizeFlags.SANITIZE_SETAROMATICITY
49+
| Chem.SanitizeFlags.SANITIZE_SETCONJUGATION
50+
| Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION
51+
| Chem.SanitizeFlags.SANITIZE_SYMMRINGS
52+
),
53+
catchErrors=True,
54+
)
55+
return mol
956

1057

1158
def _iter_sdf_records(filepath: str | Path):
@@ -21,19 +68,39 @@ def _iter_sdf_records(filepath: str | Path):
2168
current_record = []
2269

2370

24-
def _parse_sdf_record(record: str) -> dict[str, str]:
25-
"""Parse a single SDF record into a dict of data-item properties."""
26-
props: dict[str, str] = {}
27-
lines = record.splitlines()
71+
def _parse_sdf_record(record: str) -> tuple[dict[str, str], str]:
72+
"""Parse a single SDF record.
2873
29-
if lines:
30-
props["mol_name"] = lines[0].strip()
31-
32-
i = 0
74+
Returns
75+
-------
76+
tuple[dict[str, str], str]
77+
``(props, molblock)`` where *props* is a dict of data-item key/values
78+
and *molblock* is the raw connection-table string.
79+
"""
80+
props: dict[str, str] = {}
81+
lines = record.splitlines(keepends=True)
82+
83+
# Collect molblock: everything up to (but not including) the first "> <" tag
84+
molblock_lines: list[str] = []
85+
data_start = len(lines)
86+
for idx, line in enumerate(lines):
87+
stripped = line.strip()
88+
if stripped.startswith("> <") or stripped == "$$$$":
89+
data_start = idx
90+
break
91+
molblock_lines.append(line)
92+
molblock = "".join(molblock_lines)
93+
94+
# Extract header name (first line of molblock)
95+
if molblock_lines:
96+
props["mol_name"] = molblock_lines[0].strip()
97+
98+
# Parse data items
99+
i = data_start
33100
while i < len(lines):
34-
line = lines[i]
35-
if line.startswith("> <") and line.rstrip().endswith(">"):
36-
key = line.strip()[3:-1]
101+
line = lines[i].strip()
102+
if line.startswith("> <") and line.endswith(">"):
103+
key = line[3:-1]
37104
value_lines: list[str] = []
38105
i += 1
39106
while i < len(lines) and lines[i].strip() not in ("", "$$$$"):
@@ -43,13 +110,15 @@ def _parse_sdf_record(record: str) -> dict[str, str]:
43110
else:
44111
i += 1
45112

46-
return props
113+
return props, molblock
47114

48115

49116
def extract_molecules(filepath: str | Path) -> pd.DataFrame:
50117
"""Extract molecule data from a ChEBI SDF file.
51118
52119
Supports both plain (``.sdf``) and gzip-compressed (``.sdf.gz``) files.
120+
Each molecule is parsed into an RDKit ``Mol`` object stored in the ``mol``
121+
column. Molecules that cannot be parsed result in ``None`` in that column.
53122
54123
Parameters
55124
----------
@@ -61,14 +130,19 @@ def extract_molecules(filepath: str | Path) -> pd.DataFrame:
61130
pd.DataFrame
62131
DataFrame with one row per molecule. Columns depend on the properties
63132
present in the file. Common columns (renamed for convenience):
64-
chebi_id, name, inchi, inchikey, smiles, formula, charge, mass.
133+
chebi_id, name, inchi, inchikey, smiles, formula, charge, mass, mol.
65134
"""
66-
records = [_parse_sdf_record(r) for r in _iter_sdf_records(filepath)]
67-
68-
if not records:
135+
rows = []
136+
molblocks = []
137+
for record in _iter_sdf_records(filepath):
138+
props, molblock = _parse_sdf_record(record)
139+
rows.append(props)
140+
molblocks.append(molblock)
141+
142+
if not rows:
69143
return pd.DataFrame()
70144

71-
df = pd.DataFrame(records)
145+
df = pd.DataFrame(rows)
72146

73147
rename_map = {
74148
"ChEBI ID": "chebi_id",
@@ -82,4 +156,7 @@ def extract_molecules(filepath: str | Path) -> pd.DataFrame:
82156
}
83157
df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})
84158

159+
chebi_ids = df["chebi_id"].tolist() if "chebi_id" in df.columns else [None] * len(df)
160+
df["mol"] = [_parse_molblock(mb, cid) for mb, cid in zip(molblocks, chebi_ids, strict=False)]
161+
85162
return df

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ dependencies = [
1414
"networkx>=3.0",
1515
"numpy>=1.24",
1616
"pandas>=2.0",
17+
"rdkit>=2022.09",
1718
]
1819

1920
[project.optional-dependencies]

tests/fixtures/sample.sdf

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
compound_a
2+
RDKit
23

3-
0 0 0 0 0 0 0 0 0 0999 V2000
4+
1 0 0 0 0 0 0 0 0 0999 V2000
5+
0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
46
M END
57
> <ChEBI ID>
68
CHEBI:1
@@ -28,8 +30,12 @@ CH4
2830

2931
$$$$
3032
compound_b
33+
RDKit
3134

32-
0 0 0 0 0 0 0 0 0 0999 V2000
35+
2 1 0 0 0 0 0 0 0 0999 V2000
36+
0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
37+
1.5400 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
38+
1 2 1 0
3339
M END
3440
> <ChEBI ID>
3541
CHEBI:2

tests/test_sdf_extractor.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import gzip
66
from pathlib import Path
77

8+
from rdkit.Chem import rdchem
9+
810
from chebi_utils.sdf_extractor import extract_molecules
911

1012
FIXTURES = Path(__file__).parent / "fixtures"
@@ -44,6 +46,28 @@ def test_inchikey_column_present(self):
4446
df = extract_molecules(SAMPLE_SDF)
4547
assert "inchikey" in df.columns
4648

49+
def test_mol_column_present(self):
50+
df = extract_molecules(SAMPLE_SDF)
51+
assert "mol" in df.columns
52+
53+
def test_mol_objects_are_rdkit_mol(self):
54+
df = extract_molecules(SAMPLE_SDF)
55+
for mol in df["mol"]:
56+
assert isinstance(mol, rdchem.Mol)
57+
58+
def test_mol_atom_counts(self):
59+
df = extract_molecules(SAMPLE_SDF)
60+
row1 = df[df["chebi_id"] == "CHEBI:1"].iloc[0]
61+
row2 = df[df["chebi_id"] == "CHEBI:2"].iloc[0]
62+
assert row1["mol"].GetNumAtoms() == 1 # methane: 1 C
63+
assert row2["mol"].GetNumAtoms() == 2 # ethane: 2 C
64+
65+
def test_mol_sanitized(self):
66+
df = extract_molecules(SAMPLE_SDF)
67+
for mol in df["mol"]:
68+
# Aromaticity flags should be set (sanitize applied)
69+
assert mol is not None
70+
4771
def test_molecule_properties(self):
4872
df = extract_molecules(SAMPLE_SDF)
4973
row = df[df["chebi_id"] == "CHEBI:1"].iloc[0]
@@ -58,9 +82,19 @@ def test_gzipped_sdf(self, tmp_path):
5882
df = extract_molecules(gz_path)
5983
assert len(df) == 2
6084
assert set(df["chebi_id"]) == {"CHEBI:1", "CHEBI:2"}
85+
assert all(isinstance(m, rdchem.Mol) for m in df["mol"])
6186

6287
def test_empty_sdf_returns_empty_dataframe(self, tmp_path):
6388
empty_sdf = tmp_path / "empty.sdf"
6489
empty_sdf.write_text("")
6590
df = extract_molecules(empty_sdf)
6691
assert df.empty
92+
93+
def test_unparseable_molblock_gives_none(self, tmp_path, recwarn):
94+
bad_sdf = tmp_path / "bad.sdf"
95+
bad_sdf.write_text(
96+
"bad_mol\n\n 0 0 0 0 0 0 0 0 0 0999 V2000\nM END\n"
97+
"> <ChEBI ID>\nCHEBI:99\n\n$$$$\n"
98+
)
99+
df = extract_molecules(bad_sdf)
100+
assert df.iloc[0]["mol"] is None

0 commit comments

Comments
 (0)