Skip to content

Commit 7547057

Browse files
committed
Squashed commit of the following:
commit efac8aa Merge: 0a66ef4 2ead405 Author: Simon Flügel <43573433+sfluegel05@users.noreply.github.com> Date: Thu Jan 22 16:40:22 2026 +0100 Merge pull request #138 from ChEB-AI/fix/read_data Raise error for invalid smiles and return None commit 2ead405 Author: aditya0by0 <aditya0by0@gmail.com> Date: Thu Jan 22 16:34:51 2026 +0100 avoid repetition of smiles-mol conv commit 0a66ef4 Merge: b32e6c5 203b2b3 Author: Simon Flügel <43573433+sfluegel05@users.noreply.github.com> Date: Thu Jan 22 09:43:01 2026 +0100 Merge pull request #143 from schnamo/dev tidy up config files for loss, fix missing labels issue, etc commit 203b2b3 Merge: f034269 b32e6c5 Author: Charlotte Tumescheit <18518966+schnamo@users.noreply.github.com> Date: Tue Jan 20 13:18:09 2026 +0100 Merge branch 'ChEB-AI:dev' into dev commit f034269 Author: schnamo <ch.tumescheit@gmail.com> Date: Tue Jan 20 13:05:33 2026 +0100 tidy up config files for loss, fix missing labels issue, fix a number of other small issues commit b32e6c5 Merge: c9c08dc a5ea56a Author: Simon Flügel <43573433+sfluegel05@users.noreply.github.com> Date: Mon Jan 19 10:35:01 2026 +0100 Merge pull request #141 from ChEB-AI/fix/file_not_found_for_loss BCE Loss unable to locate processed files commit a5ea56a Author: aditya0by0 <aditya0by0@gmail.com> Date: Thu Jan 15 15:54:15 2026 +0100 docstring commit 89cb005 Author: aditya0by0 <aditya0by0@gmail.com> Date: Fri Jan 9 16:21:26 2026 +0100 pre-commit format commit 0094e6c Author: aditya0by0 <aditya0by0@gmail.com> Date: Fri Jan 9 16:06:18 2026 +0100 File not found error for loss commit 9052aca Author: aditya0by0 <aditya0by0@gmail.com> Date: Thu Dec 18 15:14:24 2025 +0100 Update error msg
1 parent 096ab3d commit 7547057

11 files changed

Lines changed: 56 additions & 30 deletions

File tree

chebai/models/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ def _execute(
298298
loss_kwargs = dict()
299299
if self.pass_loss_kwargs:
300300
loss_kwargs = loss_kwargs_candidates
301+
loss_kwargs["current_epoch"] = self.trainer.current_epoch
301302
loss = self.criterion(loss_data, loss_labels, **loss_kwargs)
302303
if isinstance(loss, tuple):
303304
unnamed_loss_index = 1

chebai/models/electra.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,6 @@ def __init__(
241241
self.config = ElectraConfig(**config, output_attentions=True)
242242
self.word_dropout = nn.Dropout(config.get("word_dropout", 0))
243243
self.model_type = model_type
244-
self.pass_loss_kwargs = True
245244

246245
in_d = self.config.hidden_size
247246
self.output = nn.Sequential(

chebai/preprocessing/datasets/base.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,9 @@ def __init__(
9696
self.prediction_kind = prediction_kind
9797
self.data_limit = data_limit
9898
self.label_filter = label_filter
99-
assert (balance_after_filter is not None) or (self.label_filter is None), (
100-
"Filter balancing requires a filter"
101-
)
99+
assert (balance_after_filter is not None) or (
100+
self.label_filter is None
101+
), "Filter balancing requires a filter"
102102
self.balance_after_filter = balance_after_filter
103103
self.num_workers = num_workers
104104
self.persistent_workers: bool = bool(persistent_workers)
@@ -108,13 +108,13 @@ def __init__(
108108
self.use_inner_cross_validation = (
109109
inner_k_folds > 1
110110
) # only use cv if there are at least 2 folds
111-
assert fold_index is None or self.use_inner_cross_validation is not None, (
112-
"fold_index can only be set if cross validation is used"
113-
)
111+
assert (
112+
fold_index is None or self.use_inner_cross_validation is not None
113+
), "fold_index can only be set if cross validation is used"
114114
if fold_index is not None and self.inner_k_folds is not None:
115-
assert fold_index < self.inner_k_folds, (
116-
"fold_index can't be larger than the total number of folds"
117-
)
115+
assert (
116+
fold_index < self.inner_k_folds
117+
), "fold_index can't be larger than the total number of folds"
118118
self.fold_index = fold_index
119119
self._base_dir = base_dir
120120
self.n_token_limit = n_token_limit
@@ -137,9 +137,9 @@ def num_of_labels(self):
137137

138138
@property
139139
def feature_vector_size(self):
140-
assert self._feature_vector_size is not None, (
141-
"size of feature vector must be set"
142-
)
140+
assert (
141+
self._feature_vector_size is not None
142+
), "size of feature vector must be set"
143143
return self._feature_vector_size
144144

145145
@property
@@ -1322,7 +1322,20 @@ def load_processed_data(
13221322
# If filename is provided
13231323
return self.load_processed_data_from_file(filename)
13241324

1325-
def load_processed_data_from_file(self, filename):
1325+
def load_processed_data_from_file(self, filename: str) -> list[dict[str, Any]]:
1326+
"""Load processed data from a file.
1327+
1328+
The full path is not required; only the filename is needed, as it will be joined with the processed directory.
1329+
1330+
Args:
1331+
filename (str): The name of the file to load the processed data from.
1332+
1333+
Returns:
1334+
List[Dict[str, Any]]: The loaded processed data.
1335+
1336+
Example:
1337+
data = self.load_processed_data_from_file('data.pt')
1338+
"""
13261339
return torch.load(
13271340
os.path.join(self.processed_dir, filename), weights_only=False
13281341
)

chebai/preprocessing/reader.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -199,26 +199,26 @@ def _read_data(self, raw_data: str) -> List[int]:
199199
Returns:
200200
List[int]: A list of integers representing the indices of the SMILES tokens.
201201
"""
202-
if self.canonicalize_smiles:
203-
try:
204-
mol = Chem.MolFromSmiles(raw_data.strip())
205-
if mol is not None:
206-
raw_data = Chem.MolToSmiles(mol, canonical=True)
207-
except Exception as e:
208-
print(f"RDKit failed to process {raw_data}")
209-
print(f"\t{e}")
210202
try:
211203
mol = Chem.MolFromSmiles(raw_data.strip())
212204
if mol is None:
213205
raise ValueError(f"Invalid SMILES: {raw_data}")
214-
return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]
215206
except ValueError as e:
216207
print(f"could not process {raw_data}")
217208
print(f"\tError: {e}")
218209
return None
219210

220-
def _back_to_smiles(self, smiles_encoded):
211+
if self.canonicalize_smiles:
212+
try:
213+
raw_data = Chem.MolToSmiles(mol, canonical=True)
214+
except Exception as e:
215+
print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
216+
print(f"\t{e}")
217+
return None
221218

219+
return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]
220+
221+
def _back_to_smiles(self, smiles_encoded):
222222
token_file = self.reader.token_path
223223
token_coding = {}
224224
counter = 0

configs/loss/bce_new.yml

Lines changed: 0 additions & 1 deletion
This file was deleted.

configs/loss/bce_try.yml

Lines changed: 0 additions & 1 deletion
This file was deleted.

configs/loss/bce_unweighted.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
class_path: torch.nn.BCEWithLogitsLoss
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
class_path: chebai.loss.bce_weighted.BCEWeighted
22
init_args:
3-
beta: 1000
3+
beta: 0.99
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
class_path: chebai.loss.focal_loss.FocalLoss
22
init_args:
33
task_type: multi-label
4-
num_classes: 12

configs/model/electra.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
class_path: chebai.models.Electra
22
init_args:
3-
model_type: regression
3+
model_type: classification
44
optimizer_kwargs:
55
lr: 1e-4
66
config:
@@ -9,4 +9,4 @@ init_args:
99
num_attention_heads: 8
1010
num_hidden_layers: 6
1111
type_vocab_size: 1
12-
hidden_size: 256
12+
hidden_size: 256

0 commit comments

Comments (0)