33from __future__ import annotations
44
55import gzip
6+ import warnings
67from pathlib import Path
78
89import pandas as pd
10+ from rdkit import Chem
11+
12+
13+ def _update_mol_valences (mol : Chem .Mol ) -> Chem .Mol :
14+ """Mark all atoms as having no implicit hydrogens to preserve molfile valences."""
15+ for atom in mol .GetAtoms ():
16+ atom .SetNoImplicit (True )
17+ return mol
18+
19+
20+ def _parse_molblock (molblock : str , chebi_id : str | None = None ) -> Chem .Mol | None :
21+ """Parse a V2000/V3000 molblock into an RDKit Mol object.
22+
23+ Uses partial sanitisation to handle ChEBI molecules with unusual valences
24+ or radicals.
25+
26+ Parameters
27+ ----------
28+ molblock : str
29+ The molblock string (header + atom/bond table + ``M END``).
30+ chebi_id : str or None
31+ Used only for the warning message when parsing fails.
32+
33+ Returns
34+ -------
35+ Chem.Mol or None
36+ Parsed molecule, or ``None`` if parsing failed.
37+ """
38+ mol = Chem .MolFromMolBlock (molblock , sanitize = False , removeHs = False )
39+ if mol is None :
40+ warnings .warn (f"Failed to parse molblock for { chebi_id } " , stacklevel = 2 )
41+ return None
42+ mol = _update_mol_valences (mol )
43+ Chem .SanitizeMol (
44+ mol ,
45+ sanitizeOps = (
46+ Chem .SanitizeFlags .SANITIZE_FINDRADICALS
47+ | Chem .SanitizeFlags .SANITIZE_KEKULIZE
48+ | Chem .SanitizeFlags .SANITIZE_SETAROMATICITY
49+ | Chem .SanitizeFlags .SANITIZE_SETCONJUGATION
50+ | Chem .SanitizeFlags .SANITIZE_SETHYBRIDIZATION
51+ | Chem .SanitizeFlags .SANITIZE_SYMMRINGS
52+ ),
53+ catchErrors = True ,
54+ )
55+ return mol
956
1057
1158def _iter_sdf_records (filepath : str | Path ):
@@ -21,19 +68,39 @@ def _iter_sdf_records(filepath: str | Path):
2168 current_record = []
2269
2370
24- def _parse_sdf_record (record : str ) -> dict [str , str ]:
25- """Parse a single SDF record into a dict of data-item properties."""
26- props : dict [str , str ] = {}
27- lines = record .splitlines ()
71+ def _parse_sdf_record (record : str ) -> tuple [dict [str , str ], str ]:
72+ """Parse a single SDF record.
2873
29- if lines :
30- props ["mol_name" ] = lines [0 ].strip ()
31-
32- i = 0
74+ Returns
75+ -------
76+ tuple[dict[str, str], str]
77+ ``(props, molblock)`` where *props* is a dict of data-item key/values
78+ and *molblock* is the raw connection-table string.
79+ """
80+ props : dict [str , str ] = {}
81+ lines = record .splitlines (keepends = True )
82+
83+ # Collect molblock: everything up to (but not including) the first "> <" tag
84+ molblock_lines : list [str ] = []
85+ data_start = len (lines )
86+ for idx , line in enumerate (lines ):
87+ stripped = line .strip ()
88+ if stripped .startswith ("> <" ) or stripped == "$$$$" :
89+ data_start = idx
90+ break
91+ molblock_lines .append (line )
92+ molblock = "" .join (molblock_lines )
93+
94+ # Extract header name (first line of molblock)
95+ if molblock_lines :
96+ props ["mol_name" ] = molblock_lines [0 ].strip ()
97+
98+ # Parse data items
99+ i = data_start
33100 while i < len (lines ):
34- line = lines [i ]
35- if line .startswith ("> <" ) and line .rstrip (). endswith (">" ):
36- key = line . strip () [3 :- 1 ]
101+ line = lines [i ]. strip ()
102+ if line .startswith ("> <" ) and line .endswith (">" ):
103+ key = line [3 :- 1 ]
37104 value_lines : list [str ] = []
38105 i += 1
39106 while i < len (lines ) and lines [i ].strip () not in ("" , "$$$$" ):
@@ -43,13 +110,15 @@ def _parse_sdf_record(record: str) -> dict[str, str]:
43110 else :
44111 i += 1
45112
46- return props
113+ return props , molblock
47114
48115
49116def extract_molecules (filepath : str | Path ) -> pd .DataFrame :
50117 """Extract molecule data from a ChEBI SDF file.
51118
52119 Supports both plain (``.sdf``) and gzip-compressed (``.sdf.gz``) files.
120+ Each molecule is parsed into an RDKit ``Mol`` object stored in the ``mol``
121+ column. Molecules that cannot be parsed result in ``None`` in that column.
53122
54123 Parameters
55124 ----------
@@ -61,14 +130,19 @@ def extract_molecules(filepath: str | Path) -> pd.DataFrame:
61130 pd.DataFrame
62131 DataFrame with one row per molecule. Columns depend on the properties
63132 present in the file. Common columns (renamed for convenience):
64- chebi_id, name, inchi, inchikey, smiles, formula, charge, mass.
133+ chebi_id, name, inchi, inchikey, smiles, formula, charge, mass, mol .
65134 """
66- records = [_parse_sdf_record (r ) for r in _iter_sdf_records (filepath )]
67-
68- if not records :
135+ rows = []
136+ molblocks = []
137+ for record in _iter_sdf_records (filepath ):
138+ props , molblock = _parse_sdf_record (record )
139+ rows .append (props )
140+ molblocks .append (molblock )
141+
142+ if not rows :
69143 return pd .DataFrame ()
70144
71- df = pd .DataFrame (records )
145+ df = pd .DataFrame (rows )
72146
73147 rename_map = {
74148 "ChEBI ID" : "chebi_id" ,
@@ -82,4 +156,7 @@ def extract_molecules(filepath: str | Path) -> pd.DataFrame:
82156 }
83157 df = df .rename (columns = {k : v for k , v in rename_map .items () if k in df .columns })
84158
159+ chebi_ids = df ["chebi_id" ].tolist () if "chebi_id" in df .columns else [None ] * len (df )
160+ df ["mol" ] = [_parse_molblock (mb , cid ) for mb , cid in zip (molblocks , chebi_ids , strict = False )]
161+
85162 return df
0 commit comments