Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mol2rdkit_phys_chem: make failure on any error optional #11

Merged
merged 7 commits into from
May 17, 2024
45 changes: 37 additions & 8 deletions molpipeline/mol2any/mol2rdkit_phys_chem.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import numpy as np
import numpy.typing as npt
from loguru import logger
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler
Expand Down Expand Up @@ -43,7 +44,9 @@ class MolToRDKitPhysChem(MolToDescriptorPipelineElement):
def __init__(
self,
descriptor_list: Optional[list[str]] = None,
return_with_errors: bool = False,
standardizer: Optional[AnyTransformer] = StandardScaler(),
log_exceptions: bool = True,
name: str = "Mol2RDKitPhysChem",
n_jobs: int = 1,
uuid: Optional[str] = None,
Expand All @@ -54,16 +57,31 @@ def __init__(
----------
descriptor_list: Optional[list[str]], optional (default=None)
List of descriptor names to calculate. If None, DEFAULT_DESCRIPTORS are used.
return_with_errors: bool, optional (default = False)
If False, the pipeline element will return an InvalidInstance if any error occurs during calculations.
If True, the pipeline element will return a vector with NaN values for failed descriptor calculations.
standardizer: Optional[AnyTransformer], optional (default=StandardScaler())
Standardizer to use.
log_exceptions: bool, optional (default=True)
If True, traceback of exceptions occurring during descriptor calculation are logged.
name: str, optional (default="Mol2RDKitPhysChem")
Name of the PipelineElement.
n_jobs: int, optional (default=1)
Number of jobs to use for parallelization.
uuid: Optional[str], optional (default=None)
UUID of the PipelineElement. If None, a new UUID is generated.
"""
self._descriptor_list = descriptor_list or DEFAULT_DESCRIPTORS
if descriptor_list is not None:
for descriptor_name in descriptor_list:
if descriptor_name not in RDKIT_DESCRIPTOR_DICT:
raise ValueError(
f"Unknown descriptor function with name: {descriptor_name}"
)
self._descriptor_list = descriptor_list
JochenSiegWork marked this conversation as resolved.
Show resolved Hide resolved
else:
self._descriptor_list = DEFAULT_DESCRIPTORS
self._return_with_errors = return_with_errors
self._log_exceptions = log_exceptions
super().__init__(
standardizer=standardizer,
name=name,
Expand Down Expand Up @@ -108,10 +126,15 @@ def pretransform_single(
Optional[npt.NDArray[np.float_]]
Descriptor vector for given molecule. None if calculation failed.
"""
vec = np.array(
[RDKIT_DESCRIPTOR_DICT[name](value) for name in self._descriptor_list]
)
if np.any(np.isnan(vec)):
vec = np.full((len(self._descriptor_list),), np.nan)
for i, name in enumerate(self._descriptor_list):
descriptor_func = RDKIT_DESCRIPTOR_DICT[name]
try:
vec[i] = descriptor_func(value)
except Exception: # pylint: disable=broad-except
if self._log_exceptions:
logger.exception(f"Failed calculating descriptor: {name}")
if not self._return_with_errors and np.any(np.isnan(vec)):
return InvalidInstance(self.uuid, "NaN in descriptor vector", self.name)
return vec

Expand All @@ -131,8 +154,12 @@ def get_params(self, deep: bool = True) -> dict[str, Any]:
parent_dict = dict(super().get_params(deep=deep))
if deep:
parent_dict["descriptor_list"] = copy.deepcopy(self._descriptor_list)
parent_dict["return_with_errors"] = copy.deepcopy(self._return_with_errors)
parent_dict["log_exceptions"] = copy.deepcopy(self._log_exceptions)
else:
parent_dict["descriptor_list"] = self._descriptor_list
parent_dict["return_with_errors"] = self._return_with_errors
parent_dict["log_exceptions"] = self._log_exceptions
return parent_dict

def set_params(self, **parameters: dict[str, Any]) -> Self:
Expand All @@ -149,8 +176,10 @@ def set_params(self, **parameters: dict[str, Any]) -> Self:
Self
"""
parameters_shallow_copy = dict(parameters)
descriptor_list = parameters_shallow_copy.pop("descriptor_list", None)
if descriptor_list is not None:
self._descriptor_list = descriptor_list # type: ignore
params_list = ["descriptor_list", "return_with_errors", "log_exceptions"]
for param_name in params_list:
if param_name in parameters:
setattr(self, f"_{param_name}", parameters[param_name])
c-w-feldmann marked this conversation as resolved.
Show resolved Hide resolved
parameters_shallow_copy.pop(param_name)
super().set_params(**parameters_shallow_copy)
return self
111 changes: 104 additions & 7 deletions tests/test_elements/test_mol2any/test_mol2rdkit_phys_chem.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler

from molpipeline import Pipeline
from molpipeline import ErrorFilter, FilterReinserter, Pipeline
from molpipeline.any2mol import SmilesToMol
from molpipeline.mol2any import MolToRDKitPhysChem
from molpipeline.mol2any.mol2rdkit_phys_chem import DEFAULT_DESCRIPTORS
Expand Down Expand Up @@ -262,9 +262,7 @@ def test_descriptor_calculation(self) -> None:
property_vector = expected_df[descriptor_names].values

output = pipeline.fit_transform(smiles)
difference = output - property_vector

self.assertTrue(difference.max() < 0.0001) # add assertion here
self.assertTrue(np.allclose(output, property_vector)) # add assertion here

def test_descriptor_normalization(self) -> None:
"""Test if the normalization of RDKitPhysChem Descriptors works as expected.
Expand Down Expand Up @@ -297,10 +295,109 @@ def test_descriptor_normalization(self) -> None:
output = pipeline.fit_transform(smiles)
non_zero_descriptors = output[:, (np.abs(output).sum(axis=0) != 0)]
self.assertTrue(
non_zero_descriptors.mean(axis=0).max() < 0.0000001
np.allclose(non_zero_descriptors.mean(axis=0), 0.0)
) # add assertion here
self.assertTrue(non_zero_descriptors.std(axis=0).max() < 1.0000001)
self.assertTrue(non_zero_descriptors.std(axis=0).min() > 0.9999999)
self.assertTrue(np.allclose(non_zero_descriptors.std(axis=0), 1.0))

def test_optional_nan_value_handling(self) -> None:
"""Test the handling of partly failed descriptor calculations."""

ok_smiles_list = [
"CC",
"C(C)CCO",
]
bad_smiles_list = [
"F[P-](F)(F)(F)(F)F.CCCC[N+]1=CC=CC=C1C",
]

# test with return_with_errors=False
property_element = MolToRDKitPhysChem(
standardizer=None, return_with_errors=False
)

error_filter = ErrorFilter.from_element_list([property_element])
error_replacer = FilterReinserter.from_error_filter(
error_filter, fill_value=np.nan
)

# note that we need the error filter and replacer here. Otherwise, the pipeline would fail on any error
# irrespective of the return_with_errors parameter
pipeline = Pipeline(
[
("smi2mol", SmilesToMol()),
("property_element", property_element),
("error_filter", error_filter),
("error_replacer", error_replacer),
]
)

output = pipeline.fit_transform(bad_smiles_list + ok_smiles_list)
self.assertEqual(len(output), len(bad_smiles_list + ok_smiles_list))
# check expect-to-fail rows are ALL nan values
self.assertTrue(
np.equal(np.isnan(output).all(axis=1), [True, False, False]).all()
)
# check expected-not-to-fail rows contain zero nan values
self.assertTrue(
np.equal(np.isnan(output).any(axis=1), [True, False, False]).all()
)

# test with return_with_errors=True
property_element2 = MolToRDKitPhysChem(
standardizer=None, return_with_errors=True
)

error_filter2 = ErrorFilter.from_element_list([property_element2])
filter_reinserter = FilterReinserter.from_error_filter(
error_filter2, fill_value=np.nan
)
pipeline2 = Pipeline(
[
("smi2mol", SmilesToMol()),
("property_element", property_element2),
("error_filter", error_filter2),
("error_replacer", filter_reinserter),
]
)

output2 = pipeline2.fit_transform(bad_smiles_list + ok_smiles_list)
self.assertEqual(len(output2), len(bad_smiles_list + ok_smiles_list))
# check expect-to-fail rows are ALL nan values
self.assertTrue(
np.equal(np.isnan(output2).all(axis=1), [False, False, False]).all()
)
# check expected-not-to-fail rows contain zero nan values
self.assertTrue(
np.equal(np.isnan(output2).any(axis=1), [True, False, False]).all()
)

def test_unknown_descriptor_name(self) -> None:
"""Test the handling of unknown descriptor names."""

self.assertRaises(
ValueError,
MolToRDKitPhysChem,
**{"descriptor_list": ["__NotADescriptor11Name4374737834hggghgddd"]},
JochenSiegWork marked this conversation as resolved.
Show resolved Hide resolved
)

def test_exception_handling(self) -> None:
"""Test exception handling during descriptor calculation."""

pipeline = Pipeline(
[
("smi2mol", SmilesToMol()),
(
"property_element",
MolToRDKitPhysChem(
standardizer=None, return_with_errors=True, log_exceptions=False
),
),
]
)

# Without exception handling [HH] would raise a division-by-zero exception because it has 0 heavy atoms
output = pipeline.fit_transform(["[HH]"])
self.assertTrue(output.shape == (1, len(DEFAULT_DESCRIPTORS)))


if __name__ == "__main__":
Expand Down
Loading