basf · JochenSiegWork · May 17, 2024 · May 16, 2024 · May 16, 2024 · May 16, 2024
diff --git a/molpipeline/mol2any/mol2rdkit_phys_chem.py b/molpipeline/mol2any/mol2rdkit_phys_chem.py
@@ -15,6 +15,7 @@
 
 import numpy as np
 import numpy.typing as npt
+from loguru import logger
 from rdkit import Chem
 from rdkit.Chem import Descriptors
 from sklearn.preprocessing import StandardScaler
@@ -43,7 +44,9 @@ class MolToRDKitPhysChem(MolToDescriptorPipelineElement):
     def __init__(
         self,
         descriptor_list: Optional[list[str]] = None,
+        return_with_errors: bool = False,
         standardizer: Optional[AnyTransformer] = StandardScaler(),
+        log_exceptions: bool = True,
         name: str = "Mol2RDKitPhysChem",
         n_jobs: int = 1,
         uuid: Optional[str] = None,
@@ -54,16 +57,31 @@ def __init__(
         ----------
         descriptor_list: Optional[list[str]], optional (default=None)
             List of descriptor names to calculate. If None, DEFAULT_DESCRIPTORS are used.
+        return_with_errors: bool, optional (default = False)
+            If False, the pipeline element will return an InvalidInstance if any error occurs during calculations.
+            If True, the pipeline element will return a vector with NaN values for failed descriptor calculations.
         standardizer: Optional[AnyTransformer], optional (default=StandardScaler())
             Standardizer to use.
+        log_exceptions: bool, optional (default=True)
+            If True, traceback of exceptions occurring during descriptor calculation are logged.
         name: str, optional (default="Mol2RDKitPhysChem")
             Name of the PipelineElement.
         n_jobs: int, optional (default=1)
             Number of jobs to use for parallelization.
         uuid: Optional[str], optional (default=None)
             UUID of the PipelineElement. If None, a new UUID is generated.
         """
-        self._descriptor_list = descriptor_list or DEFAULT_DESCRIPTORS
+        if descriptor_list is not None:
+            for descriptor_name in descriptor_list:
+                if descriptor_name not in RDKIT_DESCRIPTOR_DICT:
+                    raise ValueError(
+                        f"Unknown descriptor function with name: {descriptor_name}"
+                    )
+            self._descriptor_list = descriptor_list
+        else:
+            self._descriptor_list = DEFAULT_DESCRIPTORS
+        self._return_with_errors = return_with_errors
+        self._log_exceptions = log_exceptions
         super().__init__(
             standardizer=standardizer,
             name=name,
@@ -108,10 +126,15 @@ def pretransform_single(
         Optional[npt.NDArray[np.float_]]
             Descriptor vector for given molecule. None if calculation failed.
         """
-        vec = np.array(
-            [RDKIT_DESCRIPTOR_DICT[name](value) for name in self._descriptor_list]
-        )
-        if np.any(np.isnan(vec)):
+        vec = np.full((len(self._descriptor_list),), np.nan)
+        for i, name in enumerate(self._descriptor_list):
+            descriptor_func = RDKIT_DESCRIPTOR_DICT[name]
+            try:
+                vec[i] = descriptor_func(value)
+            except Exception:  # pylint: disable=broad-except
+                if self._log_exceptions:
+                    logger.exception(f"Failed calculating descriptor: {name}")
+        if not self._return_with_errors and np.any(np.isnan(vec)):
             return InvalidInstance(self.uuid, "NaN in descriptor vector", self.name)
         return vec
 
@@ -131,8 +154,12 @@ def get_params(self, deep: bool = True) -> dict[str, Any]:
         parent_dict = dict(super().get_params(deep=deep))
         if deep:
             parent_dict["descriptor_list"] = copy.deepcopy(self._descriptor_list)
+            parent_dict["return_with_errors"] = copy.deepcopy(self._return_with_errors)
+            parent_dict["log_exceptions"] = copy.deepcopy(self._log_exceptions)
         else:
             parent_dict["descriptor_list"] = self._descriptor_list
+            parent_dict["return_with_errors"] = self._return_with_errors
+            parent_dict["log_exceptions"] = self._log_exceptions
         return parent_dict
 
     def set_params(self, **parameters: dict[str, Any]) -> Self:
@@ -149,8 +176,10 @@ def set_params(self, **parameters: dict[str, Any]) -> Self:
             Self
         """
         parameters_shallow_copy = dict(parameters)
-        descriptor_list = parameters_shallow_copy.pop("descriptor_list", None)
-        if descriptor_list is not None:
-            self._descriptor_list = descriptor_list  # type: ignore
+        params_list = ["descriptor_list", "return_with_errors", "log_exceptions"]
+        for param_name in params_list:
+            if param_name in parameters:
+                setattr(self, f"_{param_name}", parameters[param_name])
+                parameters_shallow_copy.pop(param_name)
         super().set_params(**parameters_shallow_copy)
         return self
diff --git a/tests/test_elements/test_mol2any/test_mol2rdkit_phys_chem.py b/tests/test_elements/test_mol2any/test_mol2rdkit_phys_chem.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from sklearn.preprocessing import StandardScaler
 
-from molpipeline import Pipeline
+from molpipeline import ErrorFilter, FilterReinserter, Pipeline
 from molpipeline.any2mol import SmilesToMol
 from molpipeline.mol2any import MolToRDKitPhysChem
 from molpipeline.mol2any.mol2rdkit_phys_chem import DEFAULT_DESCRIPTORS
@@ -262,9 +262,7 @@ def test_descriptor_calculation(self) -> None:
         property_vector = expected_df[descriptor_names].values
 
         output = pipeline.fit_transform(smiles)
-        difference = output - property_vector
-
-        self.assertTrue(difference.max() < 0.0001)  # add assertion here
+        self.assertTrue(np.allclose(output, property_vector))  # add assertion here
 
     def test_descriptor_normalization(self) -> None:
         """Test if the normalization of RDKitPhysChem Descriptors works as expected.
@@ -297,10 +295,109 @@ def test_descriptor_normalization(self) -> None:
         output = pipeline.fit_transform(smiles)
         non_zero_descriptors = output[:, (np.abs(output).sum(axis=0) != 0)]
         self.assertTrue(
-            non_zero_descriptors.mean(axis=0).max() < 0.0000001
+            np.allclose(non_zero_descriptors.mean(axis=0), 0.0)
         )  # add assertion here
-        self.assertTrue(non_zero_descriptors.std(axis=0).max() < 1.0000001)
-        self.assertTrue(non_zero_descriptors.std(axis=0).min() > 0.9999999)
+        self.assertTrue(np.allclose(non_zero_descriptors.std(axis=0), 1.0))
+
+    def test_optional_nan_value_handling(self) -> None:
+        """Test the handling of partly failed descriptor calculations."""
+
+        ok_smiles_list = [
+            "CC",
+            "C(C)CCO",
+        ]
+        bad_smiles_list = [
+            "F[P-](F)(F)(F)(F)F.CCCC[N+]1=CC=CC=C1C",
+        ]
+
+        # test with return_with_errors=False
+        property_element = MolToRDKitPhysChem(
+            standardizer=None, return_with_errors=False
+        )
+
+        error_filter = ErrorFilter.from_element_list([property_element])
+        error_replacer = FilterReinserter.from_error_filter(
+            error_filter, fill_value=np.nan
+        )
+
+        # note that we need the error filter and replacer here. Otherwise, the pipeline would fail on any error
+        # irrespective of the return_with_errors parameter
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("property_element", property_element),
+                ("error_filter", error_filter),
+                ("error_replacer", error_replacer),
+            ]
+        )
+
+        output = pipeline.fit_transform(bad_smiles_list + ok_smiles_list)
+        self.assertEqual(len(output), len(bad_smiles_list + ok_smiles_list))
+        # check expect-to-fail rows are ALL nan values
+        self.assertTrue(
+            np.equal(np.isnan(output).all(axis=1), [True, False, False]).all()
+        )
+        # check expected-not-to-fail rows contain zero nan values
+        self.assertTrue(
+            np.equal(np.isnan(output).any(axis=1), [True, False, False]).all()
+        )
+
+        # test with return_with_errors=True
+        property_element2 = MolToRDKitPhysChem(
+            standardizer=None, return_with_errors=True
+        )
+
+        error_filter2 = ErrorFilter.from_element_list([property_element2])
+        filter_reinserter = FilterReinserter.from_error_filter(
+            error_filter2, fill_value=np.nan
+        )
+        pipeline2 = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("property_element", property_element2),
+                ("error_filter", error_filter2),
+                ("error_replacer", filter_reinserter),
+            ]
+        )
+
+        output2 = pipeline2.fit_transform(bad_smiles_list + ok_smiles_list)
+        self.assertEqual(len(output2), len(bad_smiles_list + ok_smiles_list))
+        # check expect-to-fail rows are ALL nan values
+        self.assertTrue(
+            np.equal(np.isnan(output2).all(axis=1), [False, False, False]).all()
+        )
+        # check expected-not-to-fail rows contain zero nan values
+        self.assertTrue(
+            np.equal(np.isnan(output2).any(axis=1), [True, False, False]).all()
+        )
+
+    def test_unknown_descriptor_name(self) -> None:
+        """Test the handling of unknown descriptor names."""
+
+        self.assertRaises(
+            ValueError,
+            MolToRDKitPhysChem,
+            **{"descriptor_list": ["__NotADescriptor11Name4374737834hggghgddd"]},
+        )
+
+    def test_exception_handling(self) -> None:
+        """Test exception handling during descriptor calculation."""
+
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                (
+                    "property_element",
+                    MolToRDKitPhysChem(
+                        standardizer=None, return_with_errors=True, log_exceptions=False
+                    ),
+                ),
+            ]
+        )
+
+        # Without exception handling [HH] would raise a division-by-zero exception because it has 0 heavy atoms
+        output = pipeline.fit_transform(["[HH]"])
+        self.assertTrue(output.shape == (1, len(DEFAULT_DESCRIPTORS)))
 
 
 if __name__ == "__main__":