diff --git a/molpipeline/mol2any/mol2rdkit_phys_chem.py b/molpipeline/mol2any/mol2rdkit_phys_chem.py index 9b9d7255..2682fdfa 100644 --- a/molpipeline/mol2any/mol2rdkit_phys_chem.py +++ b/molpipeline/mol2any/mol2rdkit_phys_chem.py @@ -15,6 +15,7 @@ import numpy as np import numpy.typing as npt +from loguru import logger from rdkit import Chem from rdkit.Chem import Descriptors from sklearn.preprocessing import StandardScaler @@ -43,7 +44,9 @@ class MolToRDKitPhysChem(MolToDescriptorPipelineElement): def __init__( self, descriptor_list: Optional[list[str]] = None, + return_with_errors: bool = False, standardizer: Optional[AnyTransformer] = StandardScaler(), + log_exceptions: bool = True, name: str = "Mol2RDKitPhysChem", n_jobs: int = 1, uuid: Optional[str] = None, @@ -54,8 +57,13 @@ def __init__( ---------- descriptor_list: Optional[list[str]], optional (default=None) List of descriptor names to calculate. If None, DEFAULT_DESCRIPTORS are used. + return_with_errors: bool, optional (default = False) + If False, the pipeline element will return an InvalidInstance if any error occurs during calculations. + If True, the pipeline element will return a vector with NaN values for failed descriptor calculations. standardizer: Optional[AnyTransformer], optional (default=StandardScaler()) Standardizer to use. + log_exceptions: bool, optional (default=True) + If True, traceback of exceptions occurring during descriptor calculation are logged. name: str, optional (default="Mol2RDKitPhysChem") Name of the PipelineElement. n_jobs: int, optional (default=1) @@ -63,7 +71,9 @@ def __init__( uuid: Optional[str], optional (default=None) UUID of the PipelineElement. If None, a new UUID is generated. """ - self._descriptor_list = descriptor_list or DEFAULT_DESCRIPTORS + self.descriptor_list = descriptor_list # type: ignore + self._return_with_errors = return_with_errors + self._log_exceptions = log_exceptions super().__init__( standardizer=standardizer, name=name, @@ -93,6 +103,32 @@ def descriptor_list(self) -> list[str]: """ return self._descriptor_list[:] + @descriptor_list.setter + def descriptor_list(self, descriptor_list: list[str] | None) -> None: + """Set the descriptor list. + + Parameters + ---------- + descriptor_list: list[str] | None + List of descriptor names to calculate. If None, DEFAULT_DESCRIPTORS are used. + + Raises + ------ + ValueError + If an unknown descriptor name is used. + """ + if descriptor_list is None or descriptor_list is DEFAULT_DESCRIPTORS: + # if None or DEFAULT_DESCRIPTORS are used, set the default descriptors + self._descriptor_list = DEFAULT_DESCRIPTORS + else: + # check all user defined descriptors are valid + for descriptor_name in descriptor_list: + if descriptor_name not in RDKIT_DESCRIPTOR_DICT: + raise ValueError( + f"Unknown descriptor function with name: {descriptor_name}" + ) + self._descriptor_list = descriptor_list + def pretransform_single( self, value: RDKitMol ) -> Union[npt.NDArray[np.float_], InvalidInstance]: @@ -108,10 +144,15 @@ def pretransform_single( Optional[npt.NDArray[np.float_]] Descriptor vector for given molecule. None if calculation failed. """ - vec = np.array( - [RDKIT_DESCRIPTOR_DICT[name](value) for name in self._descriptor_list] - ) - if np.any(np.isnan(vec)): + vec = np.full((len(self._descriptor_list),), np.nan) + for i, name in enumerate(self._descriptor_list): + descriptor_func = RDKIT_DESCRIPTOR_DICT[name] + try: + vec[i] = descriptor_func(value) + except Exception: # pylint: disable=broad-except + if self._log_exceptions: + logger.exception(f"Failed calculating descriptor: {name}") + if not self._return_with_errors and np.any(np.isnan(vec)): return InvalidInstance(self.uuid, "NaN in descriptor vector", self.name) return vec @@ -131,8 +172,12 @@ def get_params(self, deep: bool = True) -> dict[str, Any]: parent_dict = dict(super().get_params(deep=deep)) if deep: parent_dict["descriptor_list"] = copy.deepcopy(self._descriptor_list) + parent_dict["return_with_errors"] = copy.deepcopy(self._return_with_errors) + parent_dict["log_exceptions"] = copy.deepcopy(self._log_exceptions) else: parent_dict["descriptor_list"] = self._descriptor_list + parent_dict["return_with_errors"] = self._return_with_errors + parent_dict["log_exceptions"] = self._log_exceptions return parent_dict def set_params(self, **parameters: dict[str, Any]) -> Self: @@ -149,8 +194,10 @@ def set_params(self, **parameters: dict[str, Any]) -> Self: Self """ parameters_shallow_copy = dict(parameters) - descriptor_list = parameters_shallow_copy.pop("descriptor_list", None) - if descriptor_list is not None: - self._descriptor_list = descriptor_list # type: ignore + params_list = ["descriptor_list", "return_with_errors", "log_exceptions"] + for param_name in params_list: + if param_name in parameters: + setattr(self, f"_{param_name}", parameters[param_name]) + parameters_shallow_copy.pop(param_name) super().set_params(**parameters_shallow_copy) return self diff --git a/tests/test_elements/test_mol2any/test_mol2rdkit_phys_chem.py b/tests/test_elements/test_mol2any/test_mol2rdkit_phys_chem.py index 2b9c7c95..8f9e2090 100644 --- a/tests/test_elements/test_mol2any/test_mol2rdkit_phys_chem.py +++ b/tests/test_elements/test_mol2any/test_mol2rdkit_phys_chem.py @@ -7,7 +7,7 @@ import pandas as pd from sklearn.preprocessing import StandardScaler -from molpipeline import Pipeline +from molpipeline import ErrorFilter, FilterReinserter, Pipeline from molpipeline.any2mol import SmilesToMol from molpipeline.mol2any import MolToRDKitPhysChem from molpipeline.mol2any.mol2rdkit_phys_chem import DEFAULT_DESCRIPTORS @@ -262,9 +262,7 @@ def test_descriptor_calculation(self) -> None: property_vector = expected_df[descriptor_names].values output = pipeline.fit_transform(smiles) - difference = output - property_vector - - self.assertTrue(difference.max() < 0.0001) # add assertion here + self.assertTrue(np.allclose(output, property_vector)) # add assertion here def test_descriptor_normalization(self) -> None: """Test if the normalization of RDKitPhysChem Descriptors works as expected. @@ -297,10 +295,109 @@ def test_descriptor_normalization(self) -> None: output = pipeline.fit_transform(smiles) non_zero_descriptors = output[:, (np.abs(output).sum(axis=0) != 0)] self.assertTrue( - non_zero_descriptors.mean(axis=0).max() < 0.0000001 + np.allclose(non_zero_descriptors.mean(axis=0), 0.0) ) # add assertion here - self.assertTrue(non_zero_descriptors.std(axis=0).max() < 1.0000001) - self.assertTrue(non_zero_descriptors.std(axis=0).min() > 0.9999999) + self.assertTrue(np.allclose(non_zero_descriptors.std(axis=0), 1.0)) + + def test_optional_nan_value_handling(self) -> None: + """Test the handling of partly failed descriptor calculations.""" + + ok_smiles_list = [ + "CC", + "C(C)CCO", + ] + bad_smiles_list = [ + "F[P-](F)(F)(F)(F)F.CCCC[N+]1=CC=CC=C1C", + ] + + # test with return_with_errors=False + property_element = MolToRDKitPhysChem( + standardizer=None, return_with_errors=False + ) + + error_filter = ErrorFilter.from_element_list([property_element]) + error_replacer = FilterReinserter.from_error_filter( + error_filter, fill_value=np.nan + ) + + # note that we need the error filter and replacer here. Otherwise, the pipeline would fail on any error + # irrespective of the return_with_errors parameter + pipeline = Pipeline( + [ + ("smi2mol", SmilesToMol()), + ("property_element", property_element), + ("error_filter", error_filter), + ("error_replacer", error_replacer), + ] + ) + + output = pipeline.fit_transform(bad_smiles_list + ok_smiles_list) + self.assertEqual(len(output), len(bad_smiles_list + ok_smiles_list)) + # check expect-to-fail rows are ALL nan values + self.assertTrue( + np.equal(np.isnan(output).all(axis=1), [True, False, False]).all() + ) + # check expected-not-to-fail rows contain zero nan values + self.assertTrue( + np.equal(np.isnan(output).any(axis=1), [True, False, False]).all() + ) + + # test with return_with_errors=True + property_element2 = MolToRDKitPhysChem( + standardizer=None, return_with_errors=True + ) + + error_filter2 = ErrorFilter.from_element_list([property_element2]) + filter_reinserter = FilterReinserter.from_error_filter( + error_filter2, fill_value=np.nan + ) + pipeline2 = Pipeline( + [ + ("smi2mol", SmilesToMol()), + ("property_element", property_element2), + ("error_filter", error_filter2), + ("error_replacer", filter_reinserter), + ] + ) + + output2 = pipeline2.fit_transform(bad_smiles_list + ok_smiles_list) + self.assertEqual(len(output2), len(bad_smiles_list + ok_smiles_list)) + # check expect-to-fail rows are ALL nan values + self.assertTrue( + np.equal(np.isnan(output2).all(axis=1), [False, False, False]).all() + ) + # check expected-not-to-fail rows contain zero nan values + self.assertTrue( + np.equal(np.isnan(output2).any(axis=1), [True, False, False]).all() + ) + + def test_unknown_descriptor_name(self) -> None: + """Test the handling of unknown descriptor names.""" + + self.assertRaises( + ValueError, + MolToRDKitPhysChem, + **{"descriptor_list": ["__NotADescriptor11Name:)"]}, + ) + + def test_exception_handling(self) -> None: + """Test exception handling during descriptor calculation.""" + + pipeline = Pipeline( + [ + ("smi2mol", SmilesToMol()), + ( + "property_element", + MolToRDKitPhysChem( + standardizer=None, return_with_errors=True, log_exceptions=False + ), + ), + ] + ) + + # Without exception handling [HH] would raise a division-by-zero exception because it has 0 heavy atoms + output = pipeline.fit_transform(["[HH]"]) + self.assertTrue(output.shape == (1, len(DEFAULT_DESCRIPTORS))) if __name__ == "__main__":