only calculate extended molecule graph if needed, sanitize molecule with custom method in fg rules

sfluegel05 · sfluegel05 · commit 0ba4f1115e53 · 2026-03-02T14:13:41.000+01:00
diff --git a/chebai_graph/preprocessing/datasets/chebi.py b/chebai_graph/preprocessing/datasets/chebi.py
@@ -127,45 +127,53 @@ def enc_if_not_none(encode, value):
                 if value is not None and len(value) > 0
                 else None
             )
-        
-        # augment molecule graph if possible (this would also happen for the properties if needed, but this avoids redundancy)
-        if isinstance(self.reader, _AugmentorReader):
-            returned_results = []
-            for mol in features:
-                try:
-                    r = self.reader._create_augmented_graph(mol)
-                except Exception as e:
-                    r = None
-                returned_results.append(r)
-            mols = [augmented_mol[1] for augmented_mol in returned_results if augmented_mol is not None]
-        else:
-            mols = features
 
-        for property in self.properties:
-            if not os.path.isfile(self.get_property_path(property)):
-                rank_zero_info(f"Processing property {property.name}")
-                # read all property values first, then encode
-                rank_zero_info(f"\tReading property values of {property.name}...")
-                property_values = [
-                    self.reader.read_property(mol, property)
-                    for mol in tqdm.tqdm(mols)
-                ]
-                rank_zero_info(f"\tEncoding property values of {property.name}...")
-                property.encoder.on_start(property_values=property_values)
-                encoded_values = [
-                    enc_if_not_none(property.encoder.encode, value)
-                    for value in tqdm.tqdm(property_values)
+        if any(
+            not os.path.isfile(self.get_property_path(property))
+            for property in self.properties
+        ):
+            # augment molecule graph if possible (this would also happen for the properties if needed, but this avoids redundancy)
+            if isinstance(self.reader, _AugmentorReader):
+                returned_results = []
+                for mol in features:
+                    try:
+                        r = self.reader._create_augmented_graph(mol)
+                    except Exception as e:
+                        r = None
+                    returned_results.append(r)
+                mols = [
+                    augmented_mol[1]
+                    for augmented_mol in returned_results
+                    if augmented_mol is not None
                 ]
-
-                torch.save(
-                    [
-                        {property.name: torch.cat(feat), "ident": id}
-                        for feat, id in zip(encoded_values, idents)
-                        if feat is not None
-                    ],
-                    self.get_property_path(property),
-                )
-                property.on_finish()
+            else:
+                mols = features
+
+            for property in self.properties:
+                if not os.path.isfile(self.get_property_path(property)):
+                    rank_zero_info(f"Processing property {property.name}")
+                    # read all property values first, then encode
+                    rank_zero_info(f"\tReading property values of {property.name}...")
+                    property_values = [
+                        self.reader.read_property(mol, property)
+                        for mol in tqdm.tqdm(mols)
+                    ]
+                    rank_zero_info(f"\tEncoding property values of {property.name}...")
+                    property.encoder.on_start(property_values=property_values)
+                    encoded_values = [
+                        enc_if_not_none(property.encoder.encode, value)
+                        for value in tqdm.tqdm(property_values)
+                    ]
+
+                    torch.save(
+                        [
+                            {property.name: torch.cat(feat), "ident": id}
+                            for feat, id in zip(encoded_values, idents)
+                            if feat is not None
+                        ],
+                        self.get_property_path(property),
+                    )
+                    property.on_finish()
 
     @property
     def processed_properties_dir(self) -> str:
@@ -268,7 +276,9 @@ def __init__(
             assert (
                 distribution is not None
                 and distribution in RandomFeatureInitializationReader.DISTRIBUTIONS
-            ), "When using padding for features, a valid distribution must be specified."
+            ), (
+                "When using padding for features, a valid distribution must be specified."
+            )
             self.distribution = distribution
             if self.pad_node_features:
                 print(
@@ -297,7 +307,9 @@ def _merge_props_into_base(self, row: pd.Series | dict) -> GeomData:
             A GeomData object with merged features.
         """
         if isinstance(row["features"], tuple):
-            geom_data, _ = row["features"] # ignore additional returned data from _read_data (e.g. augmented molecule dict)
+            geom_data, _ = row[
+                "features"
+            ]  # ignore additional returned data from _read_data (e.g. augmented molecule dict)
         else:
             geom_data = row["features"]
         assert isinstance(geom_data, GeomData)
@@ -560,7 +572,9 @@ def _merge_props_into_base(
         if geom_data is None:
             return None
         if isinstance(geom_data, tuple):
-            geom_data = geom_data[0]  # ignore additional returned data from _read_data (e.g. augmented molecule dict)
+            geom_data = geom_data[
+                0
+            ]  # ignore additional returned data from _read_data (e.g. augmented molecule dict)
         assert isinstance(geom_data, GeomData)
 
         is_atom_node = geom_data.is_atom_node
@@ -573,9 +587,9 @@ def _merge_props_into_base(
         edge_attr = geom_data.edge_attr
 
         # Initialize node feature matrix
-        assert (
-            max_len_node_properties is not None
-        ), "Maximum len of node properties should not be None"
+        assert max_len_node_properties is not None, (
+            "Maximum len of node properties should not be None"
+        )
         x = torch.zeros((num_nodes, max_len_node_properties))
 
         # Track column offsets for each node type
@@ -630,9 +644,9 @@ def _merge_props_into_base(
                 raise TypeError(f"Unsupported property type: {type(property).__name__}")
 
             total_used_columns = max(atom_offset, fg_offset, graph_offset)
-            assert (
-                total_used_columns <= max_len_node_properties
-            ), f"Used {total_used_columns} columns, but max allowed is {max_len_node_properties}"
+            assert total_used_columns <= max_len_node_properties, (
+                f"Used {total_used_columns} columns, but max allowed is {max_len_node_properties}"
+            )
 
         return GeomData(
             x=x,
@@ -833,4 +847,4 @@ class ChEBI100_WFGE_WGN_AsPerNodeType(GraphPropAsPerNodeType, ChEBIOver100):
 class ChEBI25_WFGE_WGN_AsPerNodeType(GraphPropAsPerNodeType, ChEBIOverX):
     READER = AtomFGReader_WithFGEdges_WithGraphNode
 
-    THRESHOLD = 25
+    THRESHOLD = 25
diff --git a/chebai_graph/preprocessing/fg_detection/fg_aware_rule_based.py b/chebai_graph/preprocessing/fg_detection/fg_aware_rule_based.py
@@ -8,6 +8,8 @@
 from rdkit.Chem import AllChem
 from rdkit.Chem import MolToSmiles as m2s
 
+from chebi_utils.sdf_extractor import _sanitize_molecule
+
 from .fg_constants import ELEMENTS, FLAG_NO_FG
 
 
@@ -1911,7 +1913,11 @@ def get_structure(mol):
         structure[frag] = {"atom": atom_idx, "is_ring_fg": False}
 
         # Convert fragment SMILES back to mol to match with fused ring atom indices
-        frag_mol = Chem.MolFromSmiles(frag)
+        frag_mol = Chem.MolFromSmiles(frag, sanitize=False)
+        try:
+            frag_mol = _sanitize_molecule(frag_mol)
+        except:
+            pass
         frag_rings = frag_mol.GetRingInfo().AtomRings()
         if len(frag_rings) >= 1:
             structure[frag]["is_ring_fg"] = True