From 96daff0ded90920e83c84cb242319a4f7a88d081 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Fri, 22 Nov 2024 11:11:15 +0100 Subject: [PATCH 01/24] Added draft support for multi-output prediction and AD estimation --- scikit_mol/adapters.py | 175 +++++++++++++++++++++++ scikit_mol/applicability.py | 277 ++++++++++++++++++++++++++++++++++++ 2 files changed, 452 insertions(+) create mode 100644 scikit_mol/adapters.py create mode 100644 scikit_mol/applicability.py diff --git a/scikit_mol/adapters.py b/scikit_mol/adapters.py new file mode 100644 index 0000000..e5451f9 --- /dev/null +++ b/scikit_mol/adapters.py @@ -0,0 +1,175 @@ +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from sklearn.utils._pprint import _EstimatorPrettyPrinter +from sklearn.utils._param_validation import validate_params +from sklearn.utils.metaestimators import available_if +from sklearn.utils.validation import check_is_fitted, check_array +from sklearn.utils._estimator_html_repr import _VisualBlock +from sklearn.utils._set_output import _safe_set_output +from scipy.stats import chi2 + + +class EstimatorUnion(BaseEstimator, TransformerMixin): + def __init__(self, estimators): + self.estimators = estimators + + def fit(self, X, y=None): + self.fitted_estimators_ = [] + for name, estimator in self.estimators: + if hasattr(estimator, "fit"): + fitted_estimator = estimator.fit(X, y) + self.fitted_estimators_.append((name, fitted_estimator)) + else: + self.fitted_estimators_.append((name, estimator)) + return self + + def transform(self, X): + check_is_fitted(self) + results = [] + for name, estimator in self.fitted_estimators_: + if hasattr(estimator, "predict"): + results.append(estimator.predict(X)) + elif hasattr(estimator, "transform"): + results.append(estimator.transform(X)) + return np.column_stack(results) + + def predict(self, X): + return self.transform(X) + + @available_if(lambda self: hasattr(self, "fitted_estimators_")) + def 
get_feature_names_out(self, input_features=None): + feature_names = [] + for name, estimator in self.fitted_estimators_: + if hasattr(estimator, "get_feature_names_out"): + feature_names.extend(estimator.get_feature_names_out()) + else: + feature_names.append(name) + return np.array(feature_names) + + def set_output(self, *, transform=None): + """Set output container for all estimators. + + Parameters + ---------- + transform : {"default", "pandas"}, default=None + Configure output of `transform` and `fit_transform`. + + Returns + ------- + self : estimator instance + Estimator instance. + """ + for _, estimator in self.estimators: + _safe_set_output(estimator, transform=transform) + return super().set_output(transform=transform) + + def __repr__(self): + class_name = self.__class__.__name__ + estimator_reprs = [] + for name, estimator in self.estimators: + estimator_repr = f"{name}={estimator.__repr__()}" + estimator_reprs.append(estimator_repr) + estimators_str = ",\n".join(estimator_reprs) + return f"{class_name}([\n{estimators_str}\n])" + + def _sk_visual_block_(self): + names, transformers = zip(*self.estimators) + return _VisualBlock("parallel", transformers, names=names) + + +class SigmoidThresholdTransformer(BaseEstimator, TransformerMixin): + def __init__(self, threshold, steepness=1, feature_name="Sigmoid_", prefix=True): + self.threshold = threshold + self.steepness = steepness + self.feature_name = feature_name + self.prefix = prefix + + def fit(self, X, y=None): + return self + + def transform(self, X): + return 1 / (1 + np.exp(self.steepness * (X - self.threshold))) + + def predict(self, X): + return self.transform(X) + + @available_if(lambda self: hasattr(self, "fitted_estimators_")) + def get_feature_names_out(self, input_features=None): + check_is_fitted(self) + + if input_features is None: + if ( + hasattr(self, "feature_names_in_") + and self.feature_names_in_ is not None + ): + input_features = self.feature_names_in_ + else: + input_features = 
[f"x{i}" for i in range(self.n_features_in_)] + + if self.feature_name: + if self.prefix: + return np.array( + [f"{self.feature_name}{feature}" for feature in input_features] + ) + else: + if len(input_features) > 1: + return np.array( + [f"{self.feature_name}{i}" for i in range(len(input_features))] + ) + else: + return np.array([self.feature_name]) + else: + return np.array(input_features) + + +class NullEstimator(BaseEstimator, TransformerMixin, OneToOneFeatureMixin): + def __init__( + self, + accept_sparse=False, + ): + self.accept_sparse = accept_sparse + + def fit(self, X, y=None): + # Check and store the input + self.X_ = check_array( + X, accept_sparse=self.accept_sparse, force_all_finite="allow-nan" + ) + self.n_features_in_ = self.X_.shape[1] + self.feature_names_in_ = getattr(X, "columns", None) + return self + + def transform(self, X): + check_is_fitted(self) + X = check_array( + X, accept_sparse=self.accept_sparse, force_all_finite="allow-nan" + ) + + # Check that the input is of the same shape as the one passed during fit. + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"Shape of input is different from what was seen in `fit`" + f" Expected {self.n_features_in_} features, got {X.shape[1]}" + ) + return X + + def predict(self, X): + return self.transform(X) + + @available_if(lambda self: hasattr(self, "fitted_estimators_")) + def get_feature_names_out(self, input_features=None): + check_is_fitted(self) + + # Do I need to heck that the size of input_features is correct? 
+ # if len(input_features) != self.n_features_out_: + # raise ValueError(f"Expected {self.n_features_in_} features, got {len(input_features)}") + + if input_features: + return input_features + else: + return np.array([f"x{i}" for i in range(self.n_features_in_)]) + + def _more_tags(self): + return { + "allow_nan": True, + "X_types": ["2darray"] + (["sparse"] if self.accept_sparse else []), + } diff --git a/scikit_mol/applicability.py b/scikit_mol/applicability.py new file mode 100644 index 0000000..8cfda1b --- /dev/null +++ b/scikit_mol/applicability.py @@ -0,0 +1,277 @@ +import numpy as np +from scipy.sparse import csr_matrix +from sklearn.neighbors import NearestNeighbors +from sklearn.base import BaseEstimator, TransformerMixin, check_array, check_is_fitted +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted, check_array +import numpy as np +from scipy import linalg + + +class NearestNeighborsDistance(BaseEstimator, TransformerMixin): + def __init__(self, n_neighbors=1): + self.n_neighbors = n_neighbors + self.feature_name = "nn_distance" + + def fit(self, X, y=None): + self.X_sparse = csr_matrix(X) + self.nn = NearestNeighbors(n_neighbors=self.n_neighbors, metric="cosine") + self.nn.fit(self.X_sparse) + return self + + def transform(self, X): + X_sparse = csr_matrix(X) + distances, _ = self.nn.kneighbors(X_sparse) + avg_distances = np.mean(distances, axis=1) + return avg_distances.reshape(-1, 1) # Return 2D array for consistency + + def predict(self, X): + return self.transform(X) + + def get_feature_names_out(self, input_features=None): + return np.array([self.feature_name]) + + +class LeverageDistanceSlow(BaseEstimator, TransformerMixin): + """Calculate leverage-based distances for applicability domain assessment. + + The leverage approach measures how far a sample is from the center of the + X variable space. It's based on the hat matrix H = X(X'X)^(-1)X'. 
+ + Parameters + ---------- + threshold_factor : float, default=3 + Factor used in calculating the leverage threshold h* = threshold_factor * (p+1)/n + where p is the number of features and n is the number of samples. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during fit. + X_fit_ : ndarray + Training data used in fit. + leverage_threshold_ : float + Calculated leverage threshold (h*). + """ + + def __init__(self, threshold_factor=3): + self.threshold_factor = threshold_factor + + def fit(self, X, y=None): + """Fit the model using X as training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : Ignored + Not used, present here for API consistency by convention. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = check_array(X, accept_sparse=False) + self.n_features_in_ = X.shape[1] + self.X_fit_ = X + + # Calculate leverage threshold h* + n_samples = X.shape[0] + self.leverage_threshold_ = ( + self.threshold_factor * (self.n_features_in_ + 1) / n_samples + ) + + # Store (X'X)^(-1) for later use + self.xtx_inv_ = np.linalg.inv(X.T @ X) + + return self + + def transform(self, X): + """Calculate leverage-based distances for X. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to calculate leverage distances for. + + Returns + ------- + h : ndarray of shape (n_samples, 1) + The leverage values for each sample. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse=False) + + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but LeverageDistance " + f"was fitted with {self.n_features_in_} features." 
+ ) + + # Calculate leverage values h = diag(X(X'X)^(-1)X') + # Slighlty different implementation (from another package) + # hat_matrix = X @ self.xtx_inv_ @ X.T + # leverages = np.diag(hat_matrix) + + h = np.sum(X @ self.xtx_inv_ * X, axis=1) + + return h.reshape(-1, 1) + + def predict(self, X): + """Alias for transform, following scikit-learn conventions.""" + return self.transform(X) + + def get_feature_names_out(self, input_features=None): + """Get output feature names. + + Parameters + ---------- + input_features : None + Ignored as the transformer generates new feature names. + + Returns + ------- + feature_names_out : ndarray of str objects + Leverage distance feature name. + """ + check_is_fitted(self) + return np.array(["leverage_distance"]) + + +# Faster but gives some _very_ large distances for some compounds! +class LeverageDistance(BaseEstimator, TransformerMixin): + """Calculate leverage-based distances for applicability domain assessment. + + Parameters + ---------- + threshold_factor : float, default=3 + Factor used in calculating the leverage threshold h* = threshold_factor * (p+1)/n + """ + + def __init__(self, threshold_factor=3): + self.threshold_factor = threshold_factor + + def fit(self, X, y=None): + X = check_array(X, accept_sparse=False) + self.n_features_in_ = X.shape[1] + n_samples = X.shape[0] + + # Calculate leverage threshold h* + self.leverage_threshold_ = ( + self.threshold_factor * (self.n_features_in_ + 1) / n_samples + ) + + # Use more efficient matrix operations + # Calculate (X'X)^(-1) using SVD which is more stable + U, s, Vh = linalg.svd(X, full_matrices=False) + + # Store components for faster transform + self.s_inv_ = 1 / s + self.U_ = U + self.Vh_ = Vh + + return self + + def transform(self, X): + check_is_fitted(self) + X = check_array(X, accept_sparse=False) + + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but LeverageDistance " + f"was fitted with {self.n_features_in_} 
features." + ) + + # Efficient leverage calculation using stored SVD components + # This avoids explicit matrix inversion + Z = X @ self.Vh_.T * self.s_inv_ + h = np.sum(Z * Z, axis=1) + + return h.reshape(-1, 1) + + def predict(self, X): + return self.transform(X) + + def get_feature_names_out(self, input_features=None): + check_is_fitted(self) + return np.array(["leverage_distance"]) + + +class MahalanobisDistance(BaseEstimator, TransformerMixin): + """Calculate Mahalanobis distances for applicability domain assessment. + + Parameters + ---------- + threshold_quantile : float, default=0.975 + Quantile of chi-square distribution to use as threshold. + threshold_strategy : str, default='chi2' + Strategy to compute threshold. Options: + - 'chi2': Use chi-square distribution (theoretical) + - 'empirical': Use empirical distribution from training data + - None: Don't compute threshold (useful for CV) + """ + + def __init__(self, threshold_quantile=0.975, threshold_strategy="chi2"): + self.threshold_quantile = threshold_quantile + self.threshold_strategy = threshold_strategy + + def fit(self, X, y=None): + X = check_array(X) + self.n_features_in_ = X.shape[1] + + # Compute mean and covariance + self.mean_ = np.mean(X, axis=0) + self.covariance_ = np.cov(X, rowvar=False) + self.inv_covariance_ = np.linalg.inv(self.covariance_) + + # Calculate distances for training set + train_distances = self._mahalanobis(X) + self.train_distances_ = train_distances + + # Set threshold based on strategy + if self.threshold_strategy == "chi2": + self.threshold_ = chi2.ppf(self.threshold_quantile, df=self.n_features_in_) + elif self.threshold_strategy == "empirical": + self.threshold_ = np.quantile(train_distances, self.threshold_quantile) + elif self.threshold_strategy is None: + self.threshold_ = None + else: + raise ValueError(f"Unknown threshold_strategy: {self.threshold_strategy}") + + return self + + def _mahalanobis(self, X): + """Calculate Mahalanobis distances.""" + X_centered = 
X - self.mean_ + return np.sqrt(np.sum(X_centered @ self.inv_covariance_ * X_centered, axis=1)) + + def transform(self, X): + check_is_fitted(self) + X = check_array(X) + + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but {self.__class__.__name__} " + f"was fitted with {self.n_features_in_} features." + ) + + distances = self._mahalanobis(X) + return distances.reshape(-1, 1) + + def set_threshold(self, threshold): + """Set threshold manually, e.g., from cross-validation.""" + self.threshold_ = threshold + return self + + def get_feature_names_out(self, input_features=None): + check_is_fitted(self) + return np.array(["mahalanobis_distance"]) + + def _more_tags(self): + return { + "requires_fit": True, + "X_types": ["2darray"], + "poor_score": False, + "allow_nan": False, + } From 83cde658225718f848cd21844ab716954c2ed206 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Thu, 9 Jan 2025 16:03:52 +0100 Subject: [PATCH 02/24] Fixed type on Readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7c6255f..72117ed 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,8 @@ The first draft for the project was created at the [RDKIT UGM 2022 hackathon](ht - Standardizer
-- safeinference + +- Safeinference - SafeInferenceWrapper - set_safe_inference_mode From 7c48055e6b5e0cc79c8e1d35939d37df1201aeb0 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 8 Feb 2025 08:47:54 +0100 Subject: [PATCH 03/24] Added AD domain estimators from MLChemAD, need some work on consistent API --- scikit_mol/applicability/LICENSE.MIT | 19 ++ scikit_mol/applicability/README.md | 22 +++ scikit_mol/applicability/__init__.py | 25 +++ scikit_mol/applicability/bounding_box.py | 139 ++++++++++++++ scikit_mol/applicability/convex_hull.py | 113 ++++++++++++ scikit_mol/applicability/hotelling.py | 169 ++++++++++++++++++ scikit_mol/applicability/isolation_forest.py | 154 ++++++++++++++++ scikit_mol/applicability/kernel_density.py | 143 +++++++++++++++ scikit_mol/applicability/knn.py | 150 ++++++++++++++++ scikit_mol/applicability/leverage.py | 103 +++++++++++ scikit_mol/applicability/local_outlier.py | 119 ++++++++++++ scikit_mol/applicability/mahalanobis.py | 158 ++++++++++++++++ scikit_mol/applicability/standardization.py | 165 +++++++++++++++++ scikit_mol/applicability/topkat.py | 148 +++++++++++++++ ...{applicability.py => applicability_old.py} | 11 +- 15 files changed, 1632 insertions(+), 6 deletions(-) create mode 100644 scikit_mol/applicability/LICENSE.MIT create mode 100644 scikit_mol/applicability/README.md create mode 100644 scikit_mol/applicability/__init__.py create mode 100644 scikit_mol/applicability/bounding_box.py create mode 100644 scikit_mol/applicability/convex_hull.py create mode 100644 scikit_mol/applicability/hotelling.py create mode 100644 scikit_mol/applicability/isolation_forest.py create mode 100644 scikit_mol/applicability/kernel_density.py create mode 100644 scikit_mol/applicability/knn.py create mode 100644 scikit_mol/applicability/leverage.py create mode 100644 scikit_mol/applicability/local_outlier.py create mode 100644 scikit_mol/applicability/mahalanobis.py create mode 100644 scikit_mol/applicability/standardization.py 
create mode 100644 scikit_mol/applicability/topkat.py rename scikit_mol/{applicability.py => applicability_old.py} (97%) diff --git a/scikit_mol/applicability/LICENSE.MIT b/scikit_mol/applicability/LICENSE.MIT new file mode 100644 index 0000000..5d7ac46 --- /dev/null +++ b/scikit_mol/applicability/LICENSE.MIT @@ -0,0 +1,19 @@ +Copyright (c) 2023 Olivier J. M. Béquignon + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/scikit_mol/applicability/README.md b/scikit_mol/applicability/README.md new file mode 100644 index 0000000..3fd2bfb --- /dev/null +++ b/scikit_mol/applicability/README.md @@ -0,0 +1,22 @@ +# Applicability Domain Estimators + +This module contains applicability domain estimators for chemical modeling. + +## License Information + +Files in this module are licensed under LGPL as part of scikit-mol, with some files containing code adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD). 
+ +- Files containing the following header are adapted from MLChemAD (originally MIT licensed): + + ```python + """ + This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) + Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) + Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) + See LICENSE.MIT in this directory for the original MIT license. + """ + ``` + +- All other files are original implementations under scikit-mol's GPL/LGPL license. + +The original MLChemAD MIT license is preserved in LICENSE.MIT for reference. diff --git a/scikit_mol/applicability/__init__.py b/scikit_mol/applicability/__init__.py new file mode 100644 index 0000000..003c3be --- /dev/null +++ b/scikit_mol/applicability/__init__.py @@ -0,0 +1,25 @@ +from .bounding_box import BoundingBoxApplicabilityDomain +from .convex_hull import ConvexHullApplicabilityDomain +from .hotelling import HotellingT2ApplicabilityDomain +from .isolation_forest import IsolationForestApplicabilityDomain +from .kernel_density import KernelDensityApplicabilityDomain +from .knn import KNNApplicabilityDomain +from .leverage import LeverageApplicabilityDomain +from .local_outlier import LocalOutlierFactorApplicabilityDomain +from .mahalanobis import MahalanobisApplicabilityDomain +from .standardization import StandardizationApplicabilityDomain +from .topkat import TopkatApplicabilityDomain + +__all__ = [ + "BoundingBoxApplicabilityDomain", + "ConvexHullApplicabilityDomain", + "HotellingT2ApplicabilityDomain", + "IsolationForestApplicabilityDomain", + "KNNApplicabilityDomain", + "KernelDensityApplicabilityDomain", + "LeverageApplicabilityDomain", + "LocalOutlierFactorApplicabilityDomain", + "MahalanobisApplicabilityDomain", + "StandardizationApplicabilityDomain", + "TopkatApplicabilityDomain", +] diff --git a/scikit_mol/applicability/bounding_box.py b/scikit_mol/applicability/bounding_box.py new file mode 100644 index 0000000..4c946fe --- /dev/null +++ 
b/scikit_mol/applicability/bounding_box.py @@ -0,0 +1,139 @@ +""" +Bounding box applicability domain. + +This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) +Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) +See LICENSE.MIT in this directory for the original MIT license. +""" + +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_array, check_is_fitted + + +class BoundingBoxApplicabilityDomain(BaseEstimator, TransformerMixin): + """Applicability domain defined by feature value ranges. + + Samples falling outside the allowed range for any feature are considered + outside the domain. + + Parameters + ---------- + percentile : float or tuple of float, default=(0.1, 99.9) + Percentile(s) of the training set distribution used to define + the bounding box. If float, uses (percentile, 100-percentile). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during fit. + min_ : ndarray of shape (n_features,) + Minimum allowed value for each feature. + max_ : ndarray of shape (n_features,) + Maximum allowed value for each feature. + + Examples + -------- + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.decomposition import PCA + >>> from scikit_mol.applicability import BoundingBoxApplicabilityDomain + + Basic usage: + >>> ad = BoundingBoxApplicabilityDomain(percentile=1) + >>> ad.fit(X_train) + >>> predictions = ad.predict(X_test) + + With preprocessing: + >>> pipe = make_pipeline( + ... StandardScaler(), + ... BoundingBoxApplicabilityDomain(percentile=1) + ... ) + >>> pipe.fit(X_train) + >>> predictions = pipe.predict(X_test) + + With PCA preprocessing: + >>> pipe = make_pipeline( + ... StandardScaler(), + ... PCA(n_components=0.9), + ... BoundingBoxApplicabilityDomain(percentile=1) + ... 
) + >>> pipe.fit(X_train) + >>> predictions = pipe.predict(X_test) + """ + + def __init__(self, percentile=(0.1, 99.9)): + if isinstance(percentile, (int, float)): + if not 0 <= percentile <= 100: + raise ValueError("percentile must be between 0 and 100") + self.percentile = (percentile, 100 - percentile) + else: + if not all(0 <= p <= 100 for p in percentile): + raise ValueError("percentiles must be between 0 and 100") + if len(percentile) != 2: + raise ValueError("percentile must be a float or tuple of 2 floats") + if percentile[0] >= percentile[1]: + raise ValueError("first percentile must be less than second") + self.percentile = percentile + + def fit(self, X, y=None): + """Fit the bounding box applicability domain. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : Ignored + Not used, present for API consistency. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = check_array(X) + self.n_features_in_ = X.shape[1] + + # Calculate bounds + self.min_ = np.percentile(X, self.percentile[0], axis=0) + self.max_ = np.percentile(X, self.percentile[1], axis=0) + + return self + + def transform(self, X): + """Calculate the number of features outside their bounds for each sample. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + violations : ndarray of shape (n_samples, 1) + Number of features outside their bounds for each sample. + Zero indicates all features within bounds. + """ + check_is_fitted(self) + X = check_array(X) + + violations = np.sum((X < self.min_) | (X > self.max_), axis=1) + return violations.reshape(-1, 1) + + def predict(self, X): + """Predict whether samples are within the applicability domain. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The samples to predict. 
+ + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Returns 1 for samples inside the domain and -1 for samples outside + (following scikit-learn's convention for outlier detection). + """ + violations = self.transform(X).ravel() + return np.where(violations == 0, 1, -1) diff --git a/scikit_mol/applicability/convex_hull.py b/scikit_mol/applicability/convex_hull.py new file mode 100644 index 0000000..4f46eb9 --- /dev/null +++ b/scikit_mol/applicability/convex_hull.py @@ -0,0 +1,113 @@ +""" +Convex hull applicability domain. + +This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) +Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) +See LICENSE.MIT in this directory for the original MIT license. +""" + +import numpy as np +from scipy import optimize +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_array, check_is_fitted + + +class ConvexHullApplicabilityDomain(BaseEstimator, TransformerMixin): + """Applicability domain defined as the convex hull of the training data. + + The convex hull approach determines if a point belongs to the convex hull of the + training set by checking if it can be represented as a convex combination of + training points. + + The method is based on the `highs` solver from the `scipy.optimize` module, but is still + slow at inference time. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during fit. + points_ : ndarray of shape (n_features + 1, n_samples) + Transformed training points used for convex hull calculations. + """ + + def fit(self, X, y=None): + """Fit the convex hull applicability domain. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : Ignored + Not used, present for API consistency. + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + X = check_array(X) + self.n_features_in_ = X.shape[1] + + # Add ones column and transpose for convex hull calculations + self.points_ = np.r_[X.T, np.ones((1, X.shape[0]))].astype(np.float32) + + return self + + def transform(self, X): + """Calculate distance from convex hull for each sample. + + A distance of 0 indicates the sample lies within the convex hull. + Positive values indicate distance outside the hull. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + distances : ndarray of shape (n_samples, 1) + Distance from convex hull. Zero for points inside the hull, + positive for points outside. + """ + check_is_fitted(self) + X = check_array(X) + + # Calculate distances + if X.ndim == 1: + X = X.reshape(1, -1) + + distances = [] + for sample in X: + # Append 1 to sample vector + sample_ext = np.r_[sample, 1].astype(np.float16) + + # Try to solve the linear programming problem + result = optimize.linprog( + np.ones(self.points_.shape[1], dtype=np.float32), + A_eq=self.points_, + b_eq=sample_ext, + method="highs", + ) + # Distance is positive if no solution found, 0 if solution exists + distances.append(0.0 if result.success else 1.0) + + return np.array(distances).reshape(-1, 1) + + def predict(self, X): + """Predict whether samples are within the applicability domain. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The samples to predict. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Returns 1 for samples inside the domain and -1 for samples outside + (following scikit-learn's convention for outlier detection). 
+ """ + scores = self.transform(X).ravel() + return np.where(scores == 0, 1, -1) diff --git a/scikit_mol/applicability/hotelling.py b/scikit_mol/applicability/hotelling.py new file mode 100644 index 0000000..1494d48 --- /dev/null +++ b/scikit_mol/applicability/hotelling.py @@ -0,0 +1,169 @@ +""" +Hotelling T² applicability domain. + +This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) +Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) +See LICENSE.MIT in this directory for the original MIT license. +""" + +import numpy as np +from scipy.stats import f as f_dist +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_array, check_is_fitted + + +class HotellingT2ApplicabilityDomain(BaseEstimator, TransformerMixin): + """Applicability domain based on Hotelling's T² statistic. + + Uses Hotelling's T² statistic to define an elliptical confidence region + around the training data. The threshold can be set using either the + F-distribution (statistical approach) or adjusted using a validation set. + + Lower volume protrusion scores indicate samples closer to the training + data center. By default, the threshold is set using the F-distribution + with a significance level of 0.05 (95% confidence). When using fit_threshold, + a target_percentile of 95 means that 95% of the validation samples with + the lowest protrusion scores will be considered inside the domain. + + Parameters + ---------- + significance : float, default=0.05 + Significance level for F-distribution threshold. + Only used if fit_threshold is not called. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during fit. + t2_ : ndarray of shape (n_features,) + Hotelling T² ellipse parameters. + threshold_ : float + Current threshold for volume protrusions. 
+ + Examples + -------- + >>> from scikit_mol.applicability import HotellingT2ApplicabilityDomain + >>> ad = HotellingT2ApplicabilityDomain() + >>> # Using F-distribution threshold (default) + >>> ad.fit(X_train) + >>> predictions = ad.predict(X_test) + >>> + >>> # Adjusting threshold using validation set + >>> ad.fit_threshold(X_val, target_percentile=95) + >>> predictions = ad.predict(X_test) + + References + ---------- + .. [1] Hotelling, H. (1931). The generalization of Student's ratio. + The Annals of Mathematical Statistics, 2(3), 360-378. + """ + + def __init__(self, significance=0.05): + if not 0 < significance < 1: + raise ValueError("significance must be between 0 and 1") + self.significance = significance + + def fit(self, X, y=None): + """Fit the Hotelling T² applicability domain. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : Ignored + Not used, present for API consistency. + + Returns + ------- + self : object + Returns the instance itself. + """ + X = check_array(X) + self.n_features_in_ = X.shape[1] + n_samples = X.shape[0] + + # Determine the Hotelling T² ellipse + self.t2_ = np.sqrt((1 / n_samples) * (X**2).sum(axis=0)) + + # Set initial threshold using F-distribution + f_stat = ( + (n_samples - 1) + / n_samples + * self.n_features_in_ + * (n_samples**2 - 1) + / (n_samples * (n_samples - self.n_features_in_)) + ) + f_stat *= f_dist.ppf( + 1 - self.significance, self.n_features_in_, n_samples - self.n_features_in_ + ) + self.threshold_ = f_stat + + return self + + def fit_threshold(self, X, target_percentile=95): + """Update the threshold using new data without refitting the model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Data to compute threshold from. + target_percentile : float, default=95 + Target percentile of samples to include within domain. + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + check_is_fitted(self) + X = check_array(X) + + if not 0 <= target_percentile <= 100: + raise ValueError("target_percentile must be between 0 and 100") + + # Calculate volume protrusions for validation set + scores = self.transform(X).ravel() + + # Set threshold to achieve desired percentile (lower scores = inside domain) + self.threshold_ = np.percentile(scores, 100 - target_percentile) + + return self + + def transform(self, X): + """Calculate volume protrusion scores for samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data to transform. + + Returns + ------- + scores : ndarray of shape (n_samples, 1) + The volume protrusion scores. Higher values indicate samples + further from the training data center. + """ + check_is_fitted(self) + X = check_array(X) + + # Calculate volume protrusions + protrusions = (X**2 / self.t2_**2).sum(axis=1) + return protrusions.reshape(-1, 1) + + def predict(self, X): + """Predict whether samples are within the applicability domain. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The samples to predict. + + Returns + ------- + y_pred : ndarray of shape (n_samples,) + Returns 1 for samples inside the domain and -1 for samples outside + (following scikit-learn's convention for outlier detection). + """ + scores = self.transform(X).ravel() + return np.where(scores <= self.threshold_, 1, -1) diff --git a/scikit_mol/applicability/isolation_forest.py b/scikit_mol/applicability/isolation_forest.py new file mode 100644 index 0000000..3ef23d3 --- /dev/null +++ b/scikit_mol/applicability/isolation_forest.py @@ -0,0 +1,154 @@ +""" +Isolation Forest applicability domain. + +This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) +Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) +See LICENSE.MIT in this directory for the original MIT license. 
+""" + +import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.ensemble import IsolationForest +from sklearn.utils.validation import check_array, check_is_fitted + + +class IsolationForestApplicabilityDomain(BaseEstimator, TransformerMixin): + """Applicability domain based on Isolation Forest. + + Uses Isolation Forest to identify outliers based on the isolation depth + of samples in random decision trees. + + Parameters + ---------- + n_estimators : int, default=100 + Number of trees in the forest. + contamination : float, default=0.01 + Expected proportion of outliers in the training data. + random_state : int or RandomState, default=None + Controls the randomness of the forest. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during fit. + iforest_ : IsolationForest + Fitted isolation forest model. + + Examples + -------- + >>> from scikit_mol.applicability import IsolationForestApplicabilityDomain + >>> ad = IsolationForestApplicabilityDomain(contamination=0.1) + >>> ad.fit(X_train) + >>> predictions = ad.predict(X_test) + + References + ---------- + .. [1] Liu, F. T., Ting, K. M., & Zhou, Z. H. (2008). Isolation forest. + In 2008 Eighth IEEE International Conference on Data Mining (pp. 413-422). + """ + + def __init__(self, n_estimators=100, contamination=0.01, random_state=None): + if not 0 < contamination < 1: + raise ValueError("contamination must be between 0 and 1") + + self.n_estimators = n_estimators + self.contamination = contamination + self.random_state = random_state + + def fit(self, X, y=None): + """Fit the isolation forest applicability domain. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : Ignored + Not used, present for API consistency. + + Returns + ------- + self : object + Returns the instance itself. 
def predict(self, X):
    """Label samples as inside (1) or outside (-1) the applicability domain.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The samples to predict.

    Returns
    -------
    y_pred : ndarray of shape (n_samples,)
        1 for samples inside the domain and -1 for samples outside.
    """
    # Scores are computed first, exactly as before, so any error raised by
    # transform() surfaces regardless of which branch is taken below.
    flat_scores = self.transform(X).ravel()

    # Without a calibrated threshold, defer to the wrapped forest's own rule.
    if not hasattr(self, "threshold_"):
        return self.iforest_.predict(X)

    # Strictly above the threshold counts as inside.
    inside = flat_scores > self.threshold_
    return np.where(inside, 1, -1)
def predict(self, X):
    """Label samples as inside (1) or outside (-1) the applicability domain.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The samples to predict.

    Returns
    -------
    y_pred : ndarray of shape (n_samples,)
        1 for samples whose log density is at least ``threshold_``,
        -1 for all others.
    """
    log_density = self.transform(X).ravel()

    # Start from "outside" everywhere, then flip the samples that reach
    # the density threshold (the comparison is inclusive).
    labels = np.full(log_density.shape, -1)
    labels[log_density >= self.threshold_] = 1
    return labels
def fit_threshold(self, X):
    """Recompute ``threshold_`` from ``X`` without refitting the neighbor model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to compute the threshold from.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    check_is_fitted(self)
    X = check_array(X, accept_sparse=True)

    # Distances from every row of X to its nearest training points.
    neighbor_dist = self.nn_.kneighbors(X)[0]

    # The first column is discarded before averaging; fit() requests
    # n_neighbors + 1 neighbors because a training point is its own
    # nearest neighbor.
    # NOTE(review): transform() averages *all* returned columns, so the
    # scores compared in predict() are not the same statistic as the one
    # this threshold is derived from. Also, for data other than the
    # training set, column 0 is presumably a genuine neighbor rather
    # than the point itself. Confirm which convention is intended.
    per_sample_mean = np.mean(neighbor_dist[:, 1:], axis=1)

    self.threshold_ = np.percentile(per_sample_mean, self.percentile)
    return self
def fit(self, X, y=None):
    """Fit the leverage applicability domain.

    Stores the (pseudo-)inverse of the Gram matrix ``X'X`` and the leverage
    cut-off ``h* = threshold_factor * (p + 1) / n``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data.
    y : Ignored
        Not used, present for API consistency.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    X = check_array(X)
    n_samples, self.n_features_in_ = X.shape

    # (X'X)^-1 from the hat matrix H = X (X'X)^-1 X'. The pseudo-inverse
    # agrees with the ordinary inverse when X'X is full rank and stays
    # defined when it is not (constant or collinear columns), where
    # np.linalg.inv raised LinAlgError.
    # The attribute keeps its historical name for compatibility, although
    # the matrix is an inverse Gram matrix rather than a covariance.
    self.var_covar_ = np.linalg.pinv(X.T.dot(X))

    # Leverage threshold h* = threshold_factor * (p + 1) / n.
    self.threshold_ = self.threshold_factor * (self.n_features_in_ + 1) / n_samples

    return self
def predict(self, X):
    """Label samples as inside (1) or outside (-1) the applicability domain.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The samples to predict.

    Returns
    -------
    y_pred : ndarray of shape (n_samples,)
        1 for samples whose leverage is strictly below ``threshold_``,
        -1 for all others.
    """
    leverages = self.transform(X).ravel()
    # Strict comparison: a leverage equal to the threshold is outside.
    below_cutoff = np.less(leverages, self.threshold_)
    return np.where(below_cutoff, 1, -1)
def transform(self, X):
    """Calculate LOF scores for samples.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to transform.

    Returns
    -------
    scores : ndarray of shape (n_samples, 1)
        The LOF scores of the samples. Higher scores indicate samples
        that are more likely to be outliers.
    """
    check_is_fitted(self)
    X_checked = check_array(X)

    # Per scikit-learn's documentation, score_samples() returns the
    # *opposite* of the LOF (lower = more abnormal). Flipping the sign
    # gives a score where higher means more outlying.
    opposite_lof = self.lof_.score_samples(X_checked)
    return (-opposite_lof).reshape(-1, 1)
def fit_threshold(self, X, target_percentile=95):
    """Recalibrate the decision threshold on new data without refitting.

    Only ``threshold_`` is replaced; ``mean_`` and ``inv_covariance_``
    are left untouched.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to compute the threshold from (e.g. a validation set).
    target_percentile : float, default=95
        Percentage of the samples in ``X`` that should fall inside the
        domain. Must be between 0 and 100.

    Returns
    -------
    self : object
        Returns the instance itself.

    Raises
    ------
    ValueError
        If ``target_percentile`` is outside the range [0, 100].
    """
    check_is_fitted(self)
    X = check_array(X)

    if not 0 <= target_percentile <= 100:
        raise ValueError("target_percentile must be between 0 and 100")

    # Distance scores of the calibration samples, as returned by transform().
    scores = self.transform(X).ravel()

    # predict() labels ``score <= threshold_`` as inside the domain, so the
    # cut-off that keeps ``target_percentile`` % of X inside is that same
    # percentile of the scores. The previous ``100 - target_percentile``
    # kept only the complementary fraction (5 % for the default).
    self.threshold_ = np.percentile(scores, target_percentile)

    return self
Béquignon (MIT License) +Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) +See LICENSE.MIT in this directory for the original MIT license. +""" + +import numpy as np +from scipy import stats +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.preprocessing import StandardScaler +from sklearn.utils.validation import check_array, check_is_fitted + + +class StandardizationApplicabilityDomain(BaseEstimator, TransformerMixin): + """Applicability domain based on standardized feature values. + + Samples are considered within the domain if their standardized features + have a mean + z * std <= threshold, or if their maximum standardized + value <= threshold, where z corresponds to the specified percentile + assuming a normal distribution. + + Parameters + ---------- + percentile : float, default=95.0 + Percentile for the confidence interval (0-100). + Default 95.0 corresponds to ~2 standard deviations. + + Attributes + ---------- + n_features_in_ : int + Number of features seen during fit. + scaler_ : StandardScaler + Fitted standard scaler. + threshold_ : float + Current threshold for standardized values. + + Examples + -------- + >>> from scikit_mol.applicability import StandardizationApplicabilityDomain + >>> ad = StandardizationApplicabilityDomain(percentile=95) + >>> ad.fit(X_train) + >>> # Optionally adjust threshold using validation set + >>> ad.fit_threshold(X_val, target_percentile=95) + >>> predictions = ad.predict(X_test) + + References + ---------- + .. [1] Roy, K., Kar, S., & Ambure, P. (2015). On a simple approach for + determining applicability domain of QSAR models. Chemometrics and + Intelligent Laboratory Systems, 145, 22-29. + """ + + def __init__(self, percentile=95.0): + if not 0 <= percentile <= 100: + raise ValueError("percentile must be between 0 and 100") + self.percentile = percentile + + def fit(self, X, y=None): + """Fit the standardization applicability domain. 
def transform(self, X):
    """Calculate standardized feature statistics for samples.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to transform.

    Returns
    -------
    scores : ndarray of shape (n_samples, 1)
        Element-wise maximum of two per-sample quantities:
        1. the largest absolute standardized feature value, and
        2. ``mean + z * std`` of the standardized feature values, with
           ``z = -norm.ppf(percentile / 100)``.
        Higher values indicate samples further from the training data.
    """
    check_is_fitted(self)
    X = check_array(X)

    # Scale with the statistics learned in fit().
    standardized = self.scaler_.transform(X)

    # NOTE(review): z is negated here, so for percentile > 50 the second
    # criterion is ``mean - |z| * std``. That can never exceed the largest
    # absolute value, which makes the maximum below equal to ``largest_abs``.
    # The docstrings describe ``mean + z * std``; confirm whether the sign
    # is intentional.
    z = -stats.norm.ppf(self.percentile / 100)

    largest_abs = np.abs(standardized).max(axis=1)
    spread_term = standardized.mean(axis=1) + z * standardized.std(axis=1)

    return np.maximum(largest_abs, spread_term).reshape(-1, 1)
def transform(self, X):
    """Calculate OPS distance scores for samples.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to transform.

    Returns
    -------
    distances : ndarray of shape (n_samples, 1)
        OPS distance scores. Higher values indicate samples further
        from the training data.
    """
    check_is_fitted(self)
    X = check_array(X)

    # P-space -> S-space, using the training min/max. Features that were
    # constant in training (zero range) are divided by 1 instead of 0.
    feature_range = self.X_max_ - self.X_min_
    safe_range = np.where(feature_range != 0, feature_range, 1)
    S = (2 * X - self.X_max_ - self.X_min_) / safe_range

    # Prepend the column of ones. check_array() only returns 2-D input
    # (it rejects 1-D arrays by default), so the former ``X.ndim == 1``
    # special case could never run and has been removed.
    S = np.c_[np.ones(X.shape[0]), S]

    # Project onto the eigenvectors obtained in fit().
    OPS = S.dot(self.eigen_vec_)

    # Reciprocal eigenvalues; components with an exactly zero eigenvalue
    # contribute nothing instead of dividing by zero.
    inv_eigen_val = np.divide(
        np.ones_like(self.eigen_val_, dtype=float),
        self.eigen_val_,
        out=np.zeros_like(self.eigen_val_, dtype=float),
        where=self.eigen_val_ != 0,
    )
    distances = (OPS * OPS).dot(inv_eigen_val)

    return distances.reshape(-1, 1)
+ """ + scores = self.transform(X).ravel() + threshold = 5 * (self.n_features_in_ + 1) / (2 * self.n_features_in_) + return np.where(scores < threshold, 1, -1) diff --git a/scikit_mol/applicability.py b/scikit_mol/applicability_old.py similarity index 97% rename from scikit_mol/applicability.py rename to scikit_mol/applicability_old.py index 8cfda1b..e7d8ecf 100644 --- a/scikit_mol/applicability.py +++ b/scikit_mol/applicability_old.py @@ -1,11 +1,8 @@ import numpy as np +from scipy import linalg, stats from scipy.sparse import csr_matrix -from sklearn.neighbors import NearestNeighbors from sklearn.base import BaseEstimator, TransformerMixin, check_array, check_is_fitted -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_is_fitted, check_array -import numpy as np -from scipy import linalg +from sklearn.neighbors import NearestNeighbors class NearestNeighborsDistance(BaseEstimator, TransformerMixin): @@ -231,7 +228,9 @@ def fit(self, X, y=None): # Set threshold based on strategy if self.threshold_strategy == "chi2": - self.threshold_ = chi2.ppf(self.threshold_quantile, df=self.n_features_in_) + self.threshold_ = stats.chi2.ppf( + self.threshold_quantile, df=self.n_features_in_ + ) elif self.threshold_strategy == "empirical": self.threshold_ = np.quantile(train_distances, self.threshold_quantile) elif self.threshold_strategy is None: From 2f3abfd96f001855e5ccbad64239704b1aa639df Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Mon, 10 Feb 2025 09:39:11 +0100 Subject: [PATCH 04/24] Updated README with a reference --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 34920b4..7769378 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ Scikit-Mol has been featured in blog-posts or used in research, some examples wh - [WAE-DTI: Ensemble-based architecture for drug–target interaction prediction using descriptors and 
embeddings](https://www.sciencedirect.com/science/article/pii/S2352914824001618) - [Data Driven Estimation of Molecular Log-Likelihood using Fingerprint Key Counting](https://chemrxiv.org/engage/chemrxiv/article-details/661402ee21291e5d1d646651) - [AUTONOMOUS DRUG DISCOVERY](https://www.proquest.com/openview/3e830e36bc618f263905a99e787c66c6/1?pq-origsite=gscholar&cbl=18750&diss=y) +- [DrugGym: A testbed for the economics of autonomous drug discovery](https://www.biorxiv.org/content/10.1101/2024.05.28.596296v1.abstract) ## Roadmap and Contributing From 198cfba1a62f84ecc65825aac7a8b38f0cc12682 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Mon, 10 Feb 2025 10:36:15 +0100 Subject: [PATCH 05/24] Developed a base_class for making AD estimators consistent --- scikit_mol/applicability/base.py | 253 +++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 scikit_mol/applicability/base.py diff --git a/scikit_mol/applicability/base.py b/scikit_mol/applicability/base.py new file mode 100644 index 0000000..9e52ff2 --- /dev/null +++ b/scikit_mol/applicability/base.py @@ -0,0 +1,253 @@ +"""Base class for applicability domain estimators.""" + +from abc import ABC, abstractmethod +from typing import Any, ClassVar, Optional, Union + +import numpy as np +import pandas as pd +from numpy.typing import ArrayLike, NDArray +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_array +from sklearn.utils._set_output import _SetOutputMixin, _wrap_method_output +from sklearn.utils.validation import check_is_fitted + + +class _ADOutputMixin(_SetOutputMixin): + """Extends sklearn's _SetOutputMixin to handle predict and score_transform methods.""" + + def __init_subclass__(cls, **kwargs): + # First handle transform/fit_transform via parent + super().__init_subclass__(auto_wrap_output_keys=("transform",), **kwargs) + + # Add our additional methods + for method in ["predict", "score_transform"]: + if method not in 
cls.__dict__: + continue + wrapped_method = _wrap_method_output(getattr(cls, method), "transform") + setattr(cls, method, wrapped_method) + + +class BaseApplicabilityDomain(BaseEstimator, TransformerMixin, _ADOutputMixin, ABC): + """Base class for applicability domain estimators. + + Parameters + ---------- + percentile : float or None, default=None + Percentile of samples to consider within domain (0-100). + If None: + - For methods with statistical thresholds: use statistical method + - For percentile-only methods: use 99.0 (include 99% of training samples) + + Notes + ----- + Subclasses must define _scoring_convention as either: + - 'high_outside': Higher scores indicate samples outside domain (e.g., distances) + - 'high_inside': Higher scores indicate samples inside domain (e.g., likelihoods) + + The raw scores from transform() should maintain their natural interpretation, + while predict() will handle the conversion to ensure consistent output + (1 = inside domain, -1 = outside domain). + + Attributes + ---------- + n_features_in_ : int + Number of features seen during fit. + threshold_ : float + Current threshold for domain membership. + """ + + _supports_threshold_fitting: ClassVar[bool] = True + _scoring_convention: ClassVar[str] # Must be set by subclasses + + def __init__( + self, percentile: Optional[float] = None, feature_prefix: str = "AD_estimator" + ) -> None: + if not hasattr(self, "_scoring_convention"): + raise TypeError( + f"Class {self.__class__.__name__} must define _scoring_convention " + "as either 'high_outside' or 'high_inside'" + ) + if self._scoring_convention not in ["high_outside", "high_inside"]: + raise ValueError( + f"Invalid _scoring_convention '{self._scoring_convention}'. 
" + "Must be either 'high_outside' or 'high_inside'" + ) + if percentile is not None and not 0 <= percentile <= 100: + raise ValueError("percentile must be between 0 and 100") + self.percentile = percentile + self.feature_prefix = feature_prefix + self._check_params = { + "estimator": self, + "accept_sparse": False, + "dtype": None, + "force_all_finite": True, + "ensure_2d": True, + } + + @abstractmethod + def fit(self, X: ArrayLike, y: Optional[Any] = None) -> "BaseApplicabilityDomain": + """Fit the applicability domain estimator. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : Any, optional (default=None) + Not used, present for API consistency. + + Returns + ------- + self : BaseApplicabilityDomain + Returns the instance itself. + """ + raise NotImplementedError("Subclasses should implement fit") + + def fit_threshold( + self, X: ArrayLike, target_percentile: Optional[float] = None + ) -> "BaseApplicabilityDomain": + """Update threshold estimation using new data. + + Parameters + ---------- + X : array-like + Data to compute threshold from. + target_percentile : float, optional (default=None) + If provided: Use this percentile and update self.percentile + If None: Use current self.percentile setting + - For methods with statistical thresholds: use statistical method if percentile=None + - For percentile-only methods: use 99.0 if percentile=None + + Returns + ------- + self : BaseApplicabilityDomain + Returns the instance itself. 
+ """ + check_is_fitted(self) + X = check_array(X, **self._check_params) + + if target_percentile is not None: + if not 0 <= target_percentile <= 100: + raise ValueError("target_percentile must be between 0 and 100") + self.percentile = target_percentile + + # Use statistical threshold if available and percentile is None + if self.percentile is None: + if hasattr(self, "_set_statistical_threshold"): + self._set_statistical_threshold(X) + else: + # Use 99th percentile for methods without statistical thresholds + scores = self.transform(X).ravel() + if self._scoring_convention == "high_outside": + self.threshold_ = np.percentile( + scores, 99.0 + ) # Only 1% above threshold (outside) + else: # high_inside + self.threshold_ = np.percentile( + scores, 1.0 + ) # Only 1% below threshold (outside) + else: + scores = self.transform(X).ravel() + if self._scoring_convention == "high_outside": + self.threshold_ = np.percentile( + scores, self.percentile + ) # percentile% below = inside + else: # high_inside + self.threshold_ = np.percentile( + scores, 100 - self.percentile + ) # percentile% above = inside + + return self + + def transform( + self, X: Union[ArrayLike, pd.DataFrame], y: Optional[Any] = None + ) -> Union[NDArray[np.float64], pd.DataFrame]: + """Calculate applicability domain scores. + + Parameters + ---------- + X : array-like or pandas DataFrame + The data to transform. + + Returns + ------- + scores : ndarray or pandas DataFrame + Method-specific scores. Interpretation depends on _scoring_convention: + - 'high_outside': Higher scores indicate samples further from training data + - 'high_inside': Higher scores indicate samples closer to training data + Shape (n_samples, 1). + """ + check_is_fitted(self) + X = check_array(X, **self._check_params) + + # Calculate scores + scores = self._transform(X) + + return scores + + @abstractmethod + def _transform(self, X: NDArray) -> NDArray[np.float64]: + """Implementation of the transform method. 
+ + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Validated input data. + + Returns + ------- + scores : ndarray of shape (n_samples, 1) + Method-specific scores. + """ + raise NotImplementedError("Subclasses should implement _transform") + + def predict( + self, X: Union[ArrayLike, pd.DataFrame] + ) -> Union[NDArray[np.int_], pd.DataFrame]: + """Predict whether samples are within the applicability domain.""" + + check_is_fitted(self) + X = check_array(X, **self._check_params) + + # Calculate predictions + scores = self._transform(X).ravel() + if self._scoring_convention == "high_outside": + predictions = np.where(scores <= self.threshold_, 1, -1) + else: # high_inside + predictions = np.where(scores >= self.threshold_, 1, -1) + + return predictions + + def score_transform(self, X: ArrayLike) -> NDArray[np.float64]: + """Transform raw scores to [0,1] range using sigmoid. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The samples to transform. + + Returns + ------- + scores : ndarray of shape (n_samples, 1) + Transformed scores in [0,1] range. Higher values indicate + samples more likely to be within domain, regardless of + the method's raw score convention. + """ + + check_is_fitted(self) + scores = self.transform( + X + ) # May be pandas dataframe returned if that is set as output transform. 
+ scores = check_array(scores, **self._check_params).ravel() + + # TODO: the sharpness ought to somehow be fitted to the range of the raw_scores + if self._scoring_convention == "high_outside": + # Flip sign for sigmoid so higher output = more likely inside + return (1 / (1 + np.exp(scores - self.threshold_))).reshape(-1, 1) + else: # high_inside + # No sign flip needed + return (1 / (1 + np.exp(self.threshold_ - scores))).reshape(-1, 1) + + def get_feature_names_out(self) -> NDArray[np.str_]: + """Get feature name for output column.""" + + return np.array([f"{self.feature_prefix}"]) From d5d84ad7520b73283763539fa97f2ecb8ed7c236 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Mon, 10 Feb 2025 14:04:02 +0100 Subject: [PATCH 06/24] Made first transition for the kNN implementation. Currently only supports distance based methods, and only bit fingerprints for jaccard/tanimoto distances. --- scikit_mol/applicability/base.py | 93 +++++++++-------- scikit_mol/applicability/knn.py | 165 +++++++++++++++++-------------- tests/__init__.py | 0 tests/applicability/__init__.py | 0 tests/applicability/conftest.py | 49 +++++++++ tests/applicability/test_base.py | 127 ++++++++++++++++++++++++ tests/applicability/test_knn.py | 25 +++++ 7 files changed, 337 insertions(+), 122 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/applicability/__init__.py create mode 100644 tests/applicability/conftest.py create mode 100644 tests/applicability/test_base.py create mode 100644 tests/applicability/test_knn.py diff --git a/scikit_mol/applicability/base.py b/scikit_mol/applicability/base.py index 9e52ff2..cf7d994 100644 --- a/scikit_mol/applicability/base.py +++ b/scikit_mol/applicability/base.py @@ -27,6 +27,24 @@ def __init_subclass__(cls, **kwargs): setattr(cls, method, wrapped_method) +def _safe_flatten(X: Union[ArrayLike, pd.DataFrame]) -> NDArray[np.float64]: + """Safely flatten numpy arrays or pandas DataFrames to 1D array. 
+ + Parameters + ---------- + X : array-like or DataFrame of shape (n_samples, n_features) + Input data to flatten + + Returns + ------- + flattened : ndarray of shape (n_samples,) + Flattened 1D array + """ + if hasattr(X, "to_numpy"): # pandas DataFrame + return X.to_numpy().ravel() + return np.asarray(X).ravel() + + class BaseApplicabilityDomain(BaseEstimator, TransformerMixin, _ADOutputMixin, ABC): """Base class for applicability domain estimators. @@ -103,25 +121,11 @@ def fit(self, X: ArrayLike, y: Optional[Any] = None) -> "BaseApplicabilityDomain raise NotImplementedError("Subclasses should implement fit") def fit_threshold( - self, X: ArrayLike, target_percentile: Optional[float] = None + self, + X: Union[ArrayLike, pd.DataFrame], + target_percentile: Optional[float] = None, ) -> "BaseApplicabilityDomain": - """Update threshold estimation using new data. - - Parameters - ---------- - X : array-like - Data to compute threshold from. - target_percentile : float, optional (default=None) - If provided: Use this percentile and update self.percentile - If None: Use current self.percentile setting - - For methods with statistical thresholds: use statistical method if percentile=None - - For percentile-only methods: use 99.0 if percentile=None - - Returns - ------- - self : BaseApplicabilityDomain - Returns the instance itself. 
- """ + """Update threshold estimation using new data.""" check_is_fitted(self) X = check_array(X, **self._check_params) @@ -131,30 +135,24 @@ def fit_threshold( self.percentile = target_percentile # Use statistical threshold if available and percentile is None + if self.percentile is None and hasattr(self, "_set_statistical_threshold"): + self._set_statistical_threshold(X) + return self + + # Otherwise use percentile-based threshold + scores = _safe_flatten(self.transform(X)) + if self.percentile is None: - if hasattr(self, "_set_statistical_threshold"): - self._set_statistical_threshold(X) - else: - # Use 99th percentile for methods without statistical thresholds - scores = self.transform(X).ravel() - if self._scoring_convention == "high_outside": - self.threshold_ = np.percentile( - scores, 99.0 - ) # Only 1% above threshold (outside) - else: # high_inside - self.threshold_ = np.percentile( - scores, 1.0 - ) # Only 1% below threshold (outside) + # Default percentile for methods without statistical thresholds + if self._scoring_convention == "high_outside": + self.threshold_ = np.percentile(scores, 99.0) + else: # high_inside + self.threshold_ = np.percentile(scores, 1.0) else: - scores = self.transform(X).ravel() if self._scoring_convention == "high_outside": - self.threshold_ = np.percentile( - scores, self.percentile - ) # percentile% below = inside + self.threshold_ = np.percentile(scores, self.percentile) else: # high_inside - self.threshold_ = np.percentile( - scores, 100 - self.percentile - ) # percentile% above = inside + self.threshold_ = np.percentile(scores, 100 - self.percentile) return self @@ -204,40 +202,37 @@ def predict( self, X: Union[ArrayLike, pd.DataFrame] ) -> Union[NDArray[np.int_], pd.DataFrame]: """Predict whether samples are within the applicability domain.""" - check_is_fitted(self) X = check_array(X, **self._check_params) # Calculate predictions - scores = self._transform(X).ravel() + scores = _safe_flatten(self.transform(X)) if 
self._scoring_convention == "high_outside": predictions = np.where(scores <= self.threshold_, 1, -1) else: # high_inside predictions = np.where(scores >= self.threshold_, 1, -1) - return predictions + return predictions.ravel() - def score_transform(self, X: ArrayLike) -> NDArray[np.float64]: + def score_transform( + self, X: Union[ArrayLike, pd.DataFrame] + ) -> Union[NDArray[np.float64], pd.DataFrame]: """Transform raw scores to [0,1] range using sigmoid. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : array-like or DataFrame of shape (n_samples, n_features) The samples to transform. Returns ------- - scores : ndarray of shape (n_samples, 1) + scores : ndarray or DataFrame of shape (n_samples, 1) Transformed scores in [0,1] range. Higher values indicate samples more likely to be within domain, regardless of the method's raw score convention. """ - check_is_fitted(self) - scores = self.transform( - X - ) # May be pandas dataframe returned if that is set as output transform. - scores = check_array(scores, **self._check_params).ravel() + scores = _safe_flatten(self.transform(X)) # TODO: the sharpness ought to somehow be fitted to the range of the raw_scores if self._scoring_convention == "high_outside": diff --git a/scikit_mol/applicability/knn.py b/scikit_mol/applicability/knn.py index ebfd2d5..749980e 100644 --- a/scikit_mol/applicability/knn.py +++ b/scikit_mol/applicability/knn.py @@ -7,49 +7,116 @@ See LICENSE.MIT in this directory for the original MIT license. 
""" +from typing import Callable, ClassVar, Optional, Union + import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin +from numpy.typing import ArrayLike from sklearn.neighbors import NearestNeighbors -from sklearn.utils.validation import check_array, check_is_fitted + +from .base import BaseApplicabilityDomain -class KNNApplicabilityDomain(BaseEstimator, TransformerMixin): +class KNNApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain defined using K-nearest neighbors. + Determines domain membership based on the mean distance to k nearest neighbors + in the training set. Higher distances indicate samples further from the + training distribution. + Parameters ---------- n_neighbors : int, default=5 Number of neighbors to use for distance calculation. - percentile : float, default=99 - Percentile of training set distances to use as threshold. - Samples with distances above this percentile are considered outside - the applicability domain. The fit_threshold method can be used to update - the threshold using new data without refitting the model (e.g. validation data). - metric : str, default='euclidean' - Distance metric to use for nearest neighbor calculation. - Any metric supported by sklearn.neighbors.NearestNeighbors can be used. + percentile : float or None, default=None + Percentile of training set distances to use as threshold (0-100). + If None, uses 99.0 (include 99% of training samples). + distance_metric : str or callable, default='euclidean' + Distance metric to use. Options: + - 'euclidean': Euclidean distance (default) + - 'manhattan': Manhattan distance + - 'cosine': Cosine distance + - 'tanimoto': Tanimoto distance for binary fingerprints (same as 'jaccard') + - 'jaccard': Jaccard distance for binary fingerprints + - callable: Custom distance metric function(X, Y) -> array-like + Any distance metric supported by sklearn.neighbors.NearestNeighbors can also be used. 
+ Note: Only distance metrics are supported (higher values = more distant) currently. n_jobs : int, default=None Number of parallel jobs to run for neighbors search. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. + feature_prefix : str, default='KNN' + Prefix for feature names in output. + + Notes + ----- + For binary fingerprints, the Tanimoto distance is equivalent to the Jaccard distance. + Both 'tanimoto' and 'jaccard' options use scipy's implementation of the Jaccard + distance metric. Attributes ---------- n_features_in_ : int Number of features seen during fit. threshold_ : float - Distance threshold for the applicability domain. + Distance threshold for domain membership. nn_ : NearestNeighbors Fitted nearest neighbors model. + + Examples + -------- + >>> import numpy as np + >>> from scikit_mol.applicability import KNNApplicabilityDomain + >>> + >>> # Generate example data + >>> rng = np.random.RandomState(0) + >>> X_train = rng.normal(0, 1, (100, 5)) + >>> X_test = rng.normal(0, 2, (20, 5)) # More spread out than training + >>> + >>> # Fit AD model + >>> ad = KNNApplicabilityDomain(n_neighbors=5, percentile=95) + >>> ad.fit(X_train) + >>> + >>> # Get raw distance scores (higher = more distant) + >>> distances = ad.transform(X_test) + >>> + >>> # Get domain membership predictions + >>> predictions = ad.predict(X_test) # 1 = inside, -1 = outside + >>> + >>> # Get probability-like scores + >>> scores = ad.score_transform(X_test) # Higher = more likely inside """ - def __init__(self, n_neighbors=5, percentile=95, metric="euclidean", n_jobs=None): + _scoring_convention: ClassVar[str] = ( + "high_outside" # Higher distance = outside domain + ) + + def __init__( + self, + n_neighbors: int = 5, + percentile: Optional[float] = None, + distance_metric: Union[str, Callable] = "euclidean", + n_jobs: Optional[int] = None, + feature_prefix: str = "KNN", + ) -> None: + super().__init__(percentile=percentile, 
feature_prefix=feature_prefix) self.n_neighbors = n_neighbors - self.percentile = percentile - self.metric = metric + self.distance_metric = distance_metric self.n_jobs = n_jobs - def fit(self, X, y=None): + @property + def distance_metric(self) -> Union[Callable, str]: + return self._distance_metric + + @distance_metric.setter + def distance_metric(self, value: Union[str, Callable]) -> None: + if not isinstance(value, (str, Callable)): + raise ValueError("distance_metric must be a string or callable") + if value == "tanimoto": + self._distance_metric = "jaccard" # Use scipy's jaccard metric + else: + self._distance_metric = value + + def fit(self, X: ArrayLike, y=None) -> "KNNApplicabilityDomain": """Fit the KNN applicability domain. Parameters @@ -61,20 +128,19 @@ def fit(self, X, y=None): Returns ------- - self : object + self : KNNApplicabilityDomain Returns the instance itself. """ - if not 0 <= self.percentile <= 100: - raise ValueError("percentile must be between 0 and 100") - - X = check_array(X, accept_sparse=True) + if not isinstance(self.n_neighbors, int) or self.n_neighbors < 1: + raise ValueError("n_neighbors must be a positive integer") + X = self._validate_data(X) self.n_features_in_ = X.shape[1] # Fit nearest neighbors model self.nn_ = NearestNeighbors( n_neighbors=self.n_neighbors + 1, # +1 because point is its own neighbor - metric=self.metric, + metric=self.distance_metric, n_jobs=self.n_jobs, ) self.nn_.fit(X) @@ -84,38 +150,13 @@ def fit(self, X, y=None): return self - def fit_threshold(self, X): - """Update the threshold using new data without refitting the model. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Data to compute threshold from. - - Returns - ------- - self : object - Returns the instance itself. 
- """ - check_is_fitted(self) - X = check_array(X, accept_sparse=True) - - # Calculate distances to k nearest neighbors - distances, _ = self.nn_.kneighbors(X) - mean_distances = distances[:, 1:].mean(axis=1) - - # Set threshold based on distance distribution - self.threshold_ = np.percentile(mean_distances, self.percentile) - - return self - - def transform(self, X): + def _transform(self, X: np.ndarray) -> np.ndarray: """Calculate mean distance to k nearest neighbors in training set. Parameters ---------- - X : array-like of shape (n_samples, n_features) - The data to transform. + X : ndarray of shape (n_samples, n_features) + Validated input data. Returns ------- @@ -123,28 +164,6 @@ def transform(self, X): Mean distance to k nearest neighbors. Higher values indicate samples further from the training set. """ - check_is_fitted(self) - X = check_array(X, accept_sparse=True) - - # Calculate distances to k nearest neighbors distances, _ = self.nn_.kneighbors(X) - mean_distances = distances.mean(axis=1) - + mean_distances = distances[:, 1:].mean(axis=1) # Skip first (self) neighbor return mean_distances.reshape(-1, 1) - - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). 
- """ - scores = self.transform(X).ravel() - return np.where(scores <= self.threshold_, 1, -1) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/applicability/__init__.py b/tests/applicability/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/applicability/conftest.py b/tests/applicability/conftest.py new file mode 100644 index 0000000..f762a8e --- /dev/null +++ b/tests/applicability/conftest.py @@ -0,0 +1,49 @@ +import numpy as np +import pytest +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler + +from scikit_mol.applicability import KNNApplicabilityDomain +from scikit_mol.fingerprints import MorganFingerprintTransformer + +from ..fixtures import mols_list + + +@pytest.fixture( + params=[ + (KNNApplicabilityDomain, dict(n_neighbors=3)), + # Add other AD estimators here as (class, params) tuples + ] +) +def ad_estimator(request): + """Fixture providing fresh AD estimator instances.""" + estimator_class, params = request.param + return estimator_class(**params) + + +@pytest.fixture +def reduced_fingerprints(mols_list): + """Create dimensionality-reduced fingerprints for AD testing.""" + # Generate larger fingerprints first + fps = MorganFingerprintTransformer(fpSize=1024).fit_transform(mols_list) + # Reduce dimensionality while preserving ~90% variance + pca = PCA(n_components=0.9) + return StandardScaler().fit_transform(pca.fit_transform(fps)) + + +@pytest.fixture +def binary_fingerprints(mols_list): + """Binary fingerprints for testing e.g. 
Tanimoto distance.""" + return MorganFingerprintTransformer(fpSize=1024).fit_transform(mols_list) + + +@pytest.fixture +def ad_test_data(): + """Simple 2D data with clear in/out domain regions.""" + rng = np.random.RandomState(42) # Fixed seed for reproducibility + X_train = rng.uniform(0, 1, (20, 2)) + X_test_in = rng.uniform(0.25, 0.75, (5, 2)) + X_test_out = rng.uniform(2, 3, (5, 2)) + X_test = np.vstack([X_test_in, X_test_out]) + y_test = np.array([1] * 5 + [-1] * 5) + return X_train, X_test, y_test diff --git a/tests/applicability/test_base.py b/tests/applicability/test_base.py new file mode 100644 index 0000000..dd0df67 --- /dev/null +++ b/tests/applicability/test_base.py @@ -0,0 +1,127 @@ +"""Common tests for all applicability domain estimators.""" + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.estimator_checks import check_estimator + +# def test_estimator_api(ad_estimator): +# """Test scikit-learn API compatibility.""" +# check_estimator(ad_estimator) + + +def test_basic_functionality(ad_estimator, reduced_fingerprints): + """Test basic fit/transform on reduced fingerprints.""" + ad_estimator.fit(reduced_fingerprints) + scores = ad_estimator.transform(reduced_fingerprints) + assert scores.shape == (len(reduced_fingerprints), 1) + assert np.isfinite(scores).all() + + +def test_predict_functionality(ad_estimator, ad_test_data): + """Test predict method returns expected values.""" + X_train, X_test, expected = ad_test_data + + # Fit and predict + ad_estimator.fit(X_train) + predictions = ad_estimator.predict(X_test) + + # Check output format + assert predictions.shape == (len(X_test),) # Should be 1D + assert set(np.unique(predictions)) <= {-1, 1} # Only -1 and 1 allowed + + # Check predictions make sense (in/out of domain) + accuracy = np.mean(predictions == expected) + assert accuracy >= 0.8 # Allow some misclassification + + +def test_score_transform(ad_estimator, 
ad_test_data): + """Test score_transform returns valid probability-like scores.""" + X_train, X_test, expected = ad_test_data + + # Fit and get scores + ad_estimator.fit(X_train) + scores = ad_estimator.score_transform(X_test) + + # Check output format + assert scores.shape == (len(X_test), 1) + assert np.all((0 <= scores) & (scores <= 1)) # Scores in [0,1] + + # Check scores correlate with domain membership + in_domain = expected == 1 + mean_in = np.mean(scores[in_domain]) + mean_out = np.mean(scores[~in_domain]) + assert mean_in > mean_out # Inside domain should have higher scores + + +def test_threshold_setting(ad_estimator, reduced_fingerprints): + """Test threshold setting and percentile behavior.""" + # Test default threshold + ad_estimator.fit(reduced_fingerprints) + pred_default = ad_estimator.predict(reduced_fingerprints) + + # Test custom percentile + ad_estimator.percentile = 90 + ad_estimator.fit_threshold(reduced_fingerprints) + pred_90 = ad_estimator.predict(reduced_fingerprints) + + # More samples should be outside with stricter threshold + n_inside_default = np.sum(pred_default == 1) + n_inside_90 = np.sum(pred_90 == 1) + assert n_inside_90 <= n_inside_default + + +def test_feature_names(ad_estimator, reduced_fingerprints): + """Test feature names are properly handled.""" + ad_estimator.fit(reduced_fingerprints) + + # Check feature names exist and match prefix + feature_names = ad_estimator.get_feature_names_out() + assert len(feature_names) == 1 + assert feature_names[0].startswith(ad_estimator.feature_prefix) + + +def test_pandas_output(ad_estimator, reduced_fingerprints): + """Test pandas DataFrame output functionality.""" + ad_estimator.set_output(transform="pandas") + ad_estimator.fit(reduced_fingerprints) + + # Test transform output + scores_df = ad_estimator.transform(reduced_fingerprints) + assert hasattr(scores_df, "columns") + assert len(scores_df.columns) == 1 + assert scores_df.columns[0].startswith(ad_estimator.feature_prefix) + + # 
Test predict output + pred_df = ad_estimator.predict(reduced_fingerprints) + assert hasattr(pred_df, "columns") + assert len(pred_df.columns) == 1 + + +def test_input_validation(ad_estimator): + """Test input validation and error handling.""" + # Test fitting with invalid input + with pytest.raises(ValueError): + ad_estimator.fit([[]]) # Empty data + + with pytest.raises(ValueError): + ad_estimator.fit([[1], [2, 3]]) # Inconsistent dimensions + + # Test invalid percentile + with pytest.raises(ValueError): + ad_estimator.percentile = 101 + ad_estimator.fit([[1, 2]]) + + +def test_refit_consistency(ad_estimator, reduced_fingerprints): + """Test consistency when refitting with same data.""" + ad_estimator.fit(reduced_fingerprints) + scores1 = ad_estimator.transform(reduced_fingerprints) + + ad_estimator.fit(reduced_fingerprints) + scores2 = ad_estimator.transform(reduced_fingerprints) + + assert_array_almost_equal(scores1, scores2) + + +# ... other common tests ... diff --git a/tests/applicability/test_knn.py b/tests/applicability/test_knn.py new file mode 100644 index 0000000..3ff5463 --- /dev/null +++ b/tests/applicability/test_knn.py @@ -0,0 +1,25 @@ +"""Tests specific to KNN applicability domain.""" + +import numpy as np +import pytest + +from scikit_mol.applicability import KNNApplicabilityDomain +from scikit_mol.fingerprints import MorganFingerprintTransformer + + +@pytest.fixture +def binary_fingerprints(mols_list): + """Binary fingerprints for testing Tanimoto distance.""" + return MorganFingerprintTransformer(fpSize=1024).fit_transform(mols_list) + + +def test_knn_tanimoto(binary_fingerprints): + """Test KNN with Tanimoto distance on binary fingerprints.""" + ad = KNNApplicabilityDomain(n_neighbors=3, distance_metric="tanimoto") + ad.fit(binary_fingerprints) + scores = ad.transform(binary_fingerprints) + assert scores.shape == (len(binary_fingerprints), 1) + assert np.all((0 <= scores) & (scores <= 1)) # Tanimoto distances are [0,1] + + +# ... 
other KNN-specific tests ... From 51efb25b25440254fb4d4b894f5a0727ac0189bd Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Mon, 10 Feb 2025 14:15:14 +0100 Subject: [PATCH 07/24] Added leverage to our tests. --- scikit_mol/applicability/leverage.py | 112 +++++++++++++++++---------- tests/applicability/conftest.py | 4 +- tests/applicability/test_leverage.py | 63 +++++++++++++++ 3 files changed, 134 insertions(+), 45 deletions(-) create mode 100644 tests/applicability/test_leverage.py diff --git a/scikit_mol/applicability/leverage.py b/scikit_mol/applicability/leverage.py index 3d8f309..fee276c 100644 --- a/scikit_mol/applicability/leverage.py +++ b/scikit_mol/applicability/leverage.py @@ -7,22 +7,30 @@ See LICENSE.MIT in this directory for the original MIT license. """ +from typing import Any, Optional + import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_is_fitted +from numpy.typing import ArrayLike, NDArray +from sklearn.utils.validation import check_array + +from .base import BaseApplicabilityDomain -class LeverageApplicabilityDomain(BaseEstimator, TransformerMixin): +class LeverageApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain defined using the leverage approach. The leverage approach measures how far a sample is from the center of the feature space using the diagonal elements of the hat matrix H = X(X'X)^(-1)X'. + Higher leverage values indicate samples further from the center of the training data. Parameters ---------- threshold_factor : float, default=3 Factor used in calculating the leverage threshold h* = threshold_factor * (p+1)/n where p is the number of features and n is the number of samples. + percentile : float or None, default=None + If not None, overrides the statistical threshold with a percentile-based one. + See BaseApplicabilityDomain for details. 
Attributes ---------- @@ -32,12 +40,61 @@ class LeverageApplicabilityDomain(BaseEstimator, TransformerMixin): Calculated leverage threshold. var_covar_ : ndarray of shape (n_features, n_features) Variance-covariance matrix of the training data. + + Notes + ----- + The statistical threshold h* = 3 * (p+1)/n is a commonly used rule of thumb + in regression diagnostics, where p is the number of features and n is the + number of training samples. + + Input data should be scaled (e.g., using StandardScaler) to ensure all features + contribute equally. For high-dimensional data like fingerprints, dimensionality + reduction (e.g., PCA) is strongly recommended to avoid computational issues with + the variance-covariance matrix inversion. + + Examples + -------- + >>> from sklearn.pipeline import Pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.decomposition import PCA + >>> from scikit_mol.applicability import LeverageApplicabilityDomain + >>> + >>> # Create pipeline with scaling and dimensionality reduction + >>> pipe = Pipeline([ + ... ('scaler', StandardScaler()), + ... ('pca', PCA(n_components=0.95)), # Keep 95% of variance + ... ('ad', LeverageApplicabilityDomain()) + ... 
]) + >>> + >>> # Fit pipeline + >>> X_train = [[0, 1, 2], [1, 2, 3], [2, 3, 4]] # Example data + >>> pipe.fit(X_train) + >>> + >>> # Predict domain membership for new samples + >>> X_test = [[0, 1, 2], [10, 20, 30]] + >>> pipe.predict(X_test) # Returns [1, -1] (in/out of domain) """ - def __init__(self, threshold_factor=3): + _scoring_convention = "high_outside" + _supports_threshold_fitting = True + + def __init__( + self, + threshold_factor: float = 3, + percentile: Optional[float] = None, + feature_prefix: str = "Leverage", + ) -> None: + super().__init__(percentile=percentile, feature_prefix=feature_prefix) self.threshold_factor = threshold_factor - def fit(self, X, y=None): + def _set_statistical_threshold(self, X: NDArray) -> None: + """Set the statistical threshold h* = threshold_factor * (p+1)/n.""" + n_samples = X.shape[0] + self.threshold_ = self.threshold_factor * (self.n_features_in_ + 1) / n_samples + + def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "LeverageApplicabilityDomain": """Fit the leverage applicability domain. Parameters @@ -49,55 +106,24 @@ def fit(self, X, y=None): Returns ------- - self : object + self : LeverageApplicabilityDomain Returns the instance itself. """ - X = check_array(X) + X = check_array(X, **self._check_params) self.n_features_in_ = X.shape[1] - n_samples = X.shape[0] # Calculate variance-covariance matrix self.var_covar_ = np.linalg.inv(X.T.dot(X)) - # Calculate threshold - self.threshold_ = self.threshold_factor * (self.n_features_in_ + 1) / n_samples + # Set initial threshold + self._set_statistical_threshold(X) return self - def transform(self, X): - """Calculate leverage values for X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The data to transform. + def _transform(self, X: NDArray) -> NDArray[np.float64]: + """Calculate leverage values. - Returns - ------- - h : ndarray of shape (n_samples, 1) - The leverage values. 
Higher values indicate samples further from - the center of the training data. + Higher values indicate samples further from the center of the training data. """ - check_is_fitted(self) - X = check_array(X) - - # Calculate leverage values h = np.sum(X.dot(self.var_covar_) * X, axis=1) return h.reshape(-1, 1) - - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). - """ - scores = self.transform(X).ravel() - return np.where(scores < self.threshold_, 1, -1) diff --git a/tests/applicability/conftest.py b/tests/applicability/conftest.py index f762a8e..7047573 100644 --- a/tests/applicability/conftest.py +++ b/tests/applicability/conftest.py @@ -3,7 +3,7 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler -from scikit_mol.applicability import KNNApplicabilityDomain +from scikit_mol.applicability import KNNApplicabilityDomain, LeverageApplicabilityDomain from scikit_mol.fingerprints import MorganFingerprintTransformer from ..fixtures import mols_list @@ -12,7 +12,7 @@ @pytest.fixture( params=[ (KNNApplicabilityDomain, dict(n_neighbors=3)), - # Add other AD estimators here as (class, params) tuples + (LeverageApplicabilityDomain, dict(threshold_factor=3)), ] ) def ad_estimator(request): diff --git a/tests/applicability/test_leverage.py b/tests/applicability/test_leverage.py new file mode 100644 index 0000000..892582a --- /dev/null +++ b/tests/applicability/test_leverage.py @@ -0,0 +1,63 @@ +"""Tests specific to Leverage applicability domain.""" + +import numpy as np +import pytest +from sklearn.decomposition import PCA +from sklearn.exceptions import NotFittedError +from sklearn.pipeline import Pipeline 
+from sklearn.preprocessing import StandardScaler + +from scikit_mol.applicability import LeverageApplicabilityDomain + + +def test_leverage_statistical_threshold(ad_test_data): + """Test the statistical threshold calculation.""" + X_train, _, _ = ad_test_data + ad = LeverageApplicabilityDomain(threshold_factor=3) + ad.fit(X_train) + + # Check threshold matches formula h* = 3 * (p+1)/n + n_samples, n_features = X_train.shape + expected_threshold = 3 * (n_features + 1) / n_samples + assert np.isclose(ad.threshold_, expected_threshold) + + +def test_leverage_pipeline(reduced_fingerprints): + """Test leverage works in pipeline with scaling and PCA.""" + pipe = Pipeline( + [ + ("scaler", StandardScaler()), + ("pca", PCA(n_components=0.95)), + ("ad", LeverageApplicabilityDomain()), + ] + ) + + # Should run without errors + pipe.fit(reduced_fingerprints) + scores = pipe.transform(reduced_fingerprints) + assert scores.shape == (len(reduced_fingerprints), 1) + + +def test_leverage_threshold_factor(): + """Test different threshold factors.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + + ad1 = LeverageApplicabilityDomain(threshold_factor=3) + ad2 = LeverageApplicabilityDomain(threshold_factor=2) + + ad1.fit(X) + ad2.fit(X) + + # Higher threshold factor should result in higher threshold + assert ad1.threshold_ > ad2.threshold_ + + +def test_leverage_var_covar_matrix(ad_test_data): + """Test the variance-covariance matrix calculation.""" + X_train, _, _ = ad_test_data + ad = LeverageApplicabilityDomain() + ad.fit(X_train) + + # Check matrix properties + assert ad.var_covar_.shape == (X_train.shape[1], X_train.shape[1]) + assert np.allclose(ad.var_covar_, ad.var_covar_.T) # Should be symmetric From f75bc800b0e9a7ea6e7ad4efe68d7d1bc6a37a46 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Mon, 10 Feb 2025 14:58:09 +0100 Subject: [PATCH 08/24] Added more AD estimators as children of base_class --- scikit_mol/applicability/bounding_box.py | 96 ++++++----- 
scikit_mol/applicability/convex_hull.py | 64 +++++--- scikit_mol/applicability/hotelling.py | 134 ++++++--------- scikit_mol/applicability/isolation_forest.py | 161 +++++++++++-------- tests/applicability/conftest.py | 20 ++- tests/applicability/test_base.py | 16 +- tests/applicability/test_bounding_box.py | 71 ++++++++ tests/applicability/test_convex_hull.py | 81 ++++++++++ tests/applicability/test_hotelling.py | 87 ++++++++++ tests/applicability/test_isolation_forest.py | 25 +++ 10 files changed, 522 insertions(+), 233 deletions(-) create mode 100644 tests/applicability/test_bounding_box.py create mode 100644 tests/applicability/test_convex_hull.py create mode 100644 tests/applicability/test_hotelling.py create mode 100644 tests/applicability/test_isolation_forest.py diff --git a/scikit_mol/applicability/bounding_box.py b/scikit_mol/applicability/bounding_box.py index 4c946fe..6bd35b7 100644 --- a/scikit_mol/applicability/bounding_box.py +++ b/scikit_mol/applicability/bounding_box.py @@ -7,22 +7,28 @@ See LICENSE.MIT in this directory for the original MIT license. """ +from typing import Any, Optional, Tuple, Union + import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_is_fitted +from numpy.typing import ArrayLike, NDArray + +from .base import BaseApplicabilityDomain -class BoundingBoxApplicabilityDomain(BaseEstimator, TransformerMixin): +class BoundingBoxApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain defined by feature value ranges. Samples falling outside the allowed range for any feature are considered - outside the domain. + outside the domain. The range for each feature is defined by percentiles + of the training set distribution. Parameters ---------- percentile : float or tuple of float, default=(0.1, 99.9) Percentile(s) of the training set distribution used to define the bounding box. If float, uses (percentile, 100-percentile). 
+ feature_prefix : str, default="BoundingBox" + Prefix for feature names in output. Attributes ---------- @@ -32,42 +38,49 @@ class BoundingBoxApplicabilityDomain(BaseEstimator, TransformerMixin): Minimum allowed value for each feature. max_ : ndarray of shape (n_features,) Maximum allowed value for each feature. + threshold_ : float + Current threshold for domain membership (always 0.5). + + Notes + ----- + The bounding box method is simple but effective, especially for chemical + descriptors with clear physical interpretations. For high-dimensional or + correlated features, other methods may be more appropriate. Examples -------- >>> from sklearn.pipeline import make_pipeline >>> from sklearn.preprocessing import StandardScaler - >>> from sklearn.decomposition import PCA >>> from scikit_mol.applicability import BoundingBoxApplicabilityDomain - - Basic usage: + >>> + >>> # Basic usage >>> ad = BoundingBoxApplicabilityDomain(percentile=1) >>> ad.fit(X_train) >>> predictions = ad.predict(X_test) - - With preprocessing: + >>> + >>> # With preprocessing >>> pipe = make_pipeline( ... StandardScaler(), ... BoundingBoxApplicabilityDomain(percentile=1) ... ) >>> pipe.fit(X_train) >>> predictions = pipe.predict(X_test) - - With PCA preprocessing: - >>> pipe = make_pipeline( - ... StandardScaler(), - ... PCA(n_components=0.9), - ... BoundingBoxApplicabilityDomain(percentile=1) - ... 
) - >>> pipe.fit(X_train) - >>> predictions = pipe.predict(X_test) """ - def __init__(self, percentile=(0.1, 99.9)): + _scoring_convention = "high_outside" + _supports_threshold_fitting = False + + def __init__( + self, + percentile: Union[float, Tuple[float, float]] = (0.1, 99.9), + feature_prefix: str = "BoundingBox", + ) -> None: + super().__init__(percentile=None, feature_prefix=feature_prefix) + if isinstance(percentile, (int, float)): if not 0 <= percentile <= 100: raise ValueError("percentile must be between 0 and 100") - self.percentile = (percentile, 100 - percentile) + self.box_percentile = (percentile, 100 - percentile) else: if not all(0 <= p <= 100 for p in percentile): raise ValueError("percentiles must be between 0 and 100") @@ -75,9 +88,11 @@ def __init__(self, percentile=(0.1, 99.9)): raise ValueError("percentile must be a float or tuple of 2 floats") if percentile[0] >= percentile[1]: raise ValueError("first percentile must be less than second") - self.percentile = percentile + self.box_percentile = percentile - def fit(self, X, y=None): + def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "BoundingBoxApplicabilityDomain": """Fit the bounding box applicability domain. Parameters @@ -89,24 +104,27 @@ def fit(self, X, y=None): Returns ------- - self : object + self : BoundingBoxApplicabilityDomain Returns the instance itself. """ - X = check_array(X) + X = self._validate_data(X) self.n_features_in_ = X.shape[1] # Calculate bounds - self.min_ = np.percentile(X, self.percentile[0], axis=0) - self.max_ = np.percentile(X, self.percentile[1], axis=0) + self.min_ = np.percentile(X, self.box_percentile[0], axis=0) + self.max_ = np.percentile(X, self.box_percentile[1], axis=0) + + # Fixed threshold since we count violations + self.threshold_ = 0.5 return self - def transform(self, X): - """Calculate the number of features outside their bounds for each sample. 
+ def _transform(self, X: NDArray) -> NDArray[np.float64]: + """Calculate the number of features outside their bounds. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns @@ -115,25 +133,5 @@ def transform(self, X): Number of features outside their bounds for each sample. Zero indicates all features within bounds. """ - check_is_fitted(self) - X = check_array(X) - violations = np.sum((X < self.min_) | (X > self.max_), axis=1) return violations.reshape(-1, 1) - - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). - """ - violations = self.transform(X).ravel() - return np.where(violations == 0, 1, -1) diff --git a/scikit_mol/applicability/convex_hull.py b/scikit_mol/applicability/convex_hull.py index 4f46eb9..0ec3eaa 100644 --- a/scikit_mol/applicability/convex_hull.py +++ b/scikit_mol/applicability/convex_hull.py @@ -7,21 +7,38 @@ See LICENSE.MIT in this directory for the original MIT license. """ +from typing import Any, Optional + import numpy as np +from numpy.typing import ArrayLike, NDArray from scipy import optimize -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_is_fitted + +from .base import BaseApplicabilityDomain -class ConvexHullApplicabilityDomain(BaseEstimator, TransformerMixin): +class ConvexHullApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain defined as the convex hull of the training data. 
The convex hull approach determines if a point belongs to the convex hull of the training set by checking if it can be represented as a convex combination of training points. - The method is based on the `highs` solver from the `scipy.optimize` module, but is still - slow at inference time. + Parameters + ---------- + percentile : float or None, default=None + Not used, present for API consistency. + feature_prefix : str, default="ConvexHull" + Prefix for feature names in output. + + Notes + ----- + The method is based on the `highs` solver from `scipy.optimize`. Note that this + method can be computationally expensive for high-dimensional data or large + training sets, as it requires solving a linear programming problem for each + test point. + + For high-dimensional data (e.g., fingerprints), consider using dimensionality + reduction before applying this method. Attributes ---------- @@ -29,9 +46,22 @@ class ConvexHullApplicabilityDomain(BaseEstimator, TransformerMixin): Number of features seen during fit. points_ : ndarray of shape (n_features + 1, n_samples) Transformed training points used for convex hull calculations. + threshold_ : float + Fixed at 0.5 since output is binary (inside/outside hull). """ - def fit(self, X, y=None): + _scoring_convention = "high_outside" + _supports_threshold_fitting = False + + def __init__( + self, percentile: Optional[float] = None, feature_prefix: str = "ConvexHull" + ) -> None: + super().__init__(percentile=None, feature_prefix=feature_prefix) + self.threshold_ = 0.5 # Fixed threshold since output is binary + + def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "ConvexHullApplicabilityDomain": """Fit the convex hull applicability domain. Parameters @@ -43,10 +73,10 @@ def fit(self, X, y=None): Returns ------- - self : object + self : ConvexHullApplicabilityDomain Returns the instance itself. 
""" - X = check_array(X) + X = self._validate_data(X) self.n_features_in_ = X.shape[1] # Add ones column and transpose for convex hull calculations @@ -54,15 +84,12 @@ def fit(self, X, y=None): return self - def transform(self, X): + def _transform(self, X: NDArray) -> NDArray[np.float64]: """Calculate distance from convex hull for each sample. - A distance of 0 indicates the sample lies within the convex hull. - Positive values indicate distance outside the hull. - Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns @@ -71,17 +98,10 @@ def transform(self, X): Distance from convex hull. Zero for points inside the hull, positive for points outside. """ - check_is_fitted(self) - X = check_array(X) - - # Calculate distances - if X.ndim == 1: - X = X.reshape(1, -1) - distances = [] for sample in X: # Append 1 to sample vector - sample_ext = np.r_[sample, 1].astype(np.float16) + sample_ext = np.r_[sample, 1].astype(np.float32) # Try to solve the linear programming problem result = optimize.linprog( @@ -109,5 +129,5 @@ def predict(self, X): Returns 1 for samples inside the domain and -1 for samples outside (following scikit-learn's convention for outlier detection). """ - scores = self.transform(X).ravel() + scores = self._transform(X).ravel() return np.where(scores == 0, 1, -1) diff --git a/scikit_mol/applicability/hotelling.py b/scikit_mol/applicability/hotelling.py index 1494d48..00aafb2 100644 --- a/scikit_mol/applicability/hotelling.py +++ b/scikit_mol/applicability/hotelling.py @@ -7,30 +7,38 @@ See LICENSE.MIT in this directory for the original MIT license. 
""" +from typing import Any, Optional + import numpy as np +from numpy.typing import ArrayLike, NDArray from scipy.stats import f as f_dist -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils import check_array + +from .base import BaseApplicabilityDomain -class HotellingT2ApplicabilityDomain(BaseEstimator, TransformerMixin): +class HotellingT2ApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain based on Hotelling's T² statistic. Uses Hotelling's T² statistic to define an elliptical confidence region around the training data. The threshold can be set using either the F-distribution (statistical approach) or adjusted using a validation set. - Lower volume protrusion scores indicate samples closer to the training - data center. By default, the threshold is set using the F-distribution - with a significance level of 0.05 (95% confidence). When using fit_threshold, - a target_percentile of 95 means that 95% of the validation samples with - the lowest protrusion scores will be considered inside the domain. - Parameters ---------- significance : float, default=0.05 Significance level for F-distribution threshold. - Only used if fit_threshold is not called. + percentile : float or None, default=None + If not None, overrides significance-based threshold. + Must be between 0 and 100. + feature_prefix : str, default="HotellingT2" + Prefix for feature names in output. + + Notes + ----- + Lower volume protrusion scores indicate samples closer to the training + data center. By default, the threshold is set using the F-distribution + with a significance level of 0.05 (95% confidence). Attributes ---------- @@ -41,52 +49,29 @@ class HotellingT2ApplicabilityDomain(BaseEstimator, TransformerMixin): threshold_ : float Current threshold for volume protrusions. 
- Examples - -------- - >>> from scikit_mol.applicability import HotellingT2ApplicabilityDomain - >>> ad = HotellingT2ApplicabilityDomain() - >>> # Using F-distribution threshold (default) - >>> ad.fit(X_train) - >>> predictions = ad.predict(X_test) - >>> - >>> # Adjusting threshold using validation set - >>> ad.fit_threshold(X_val, target_percentile=95) - >>> predictions = ad.predict(X_test) - References ---------- .. [1] Hotelling, H. (1931). The generalization of Student's ratio. The Annals of Mathematical Statistics, 2(3), 360-378. """ - def __init__(self, significance=0.05): + _scoring_convention = "high_outside" + _supports_threshold_fitting = True + + def __init__( + self, + significance: float = 0.05, + percentile: Optional[float] = None, + feature_prefix: str = "HotellingT2", + ) -> None: if not 0 < significance < 1: raise ValueError("significance must be between 0 and 1") + super().__init__(percentile=percentile, feature_prefix=feature_prefix) self.significance = significance - def fit(self, X, y=None): - """Fit the Hotelling T² applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training data. - y : Ignored - Not used, present for API consistency. - - Returns - ------- - self : object - Returns the instance itself. - """ - X = check_array(X) - self.n_features_in_ = X.shape[1] + def _set_statistical_threshold(self, X: NDArray) -> None: + """Set threshold using F-distribution.""" n_samples = X.shape[0] - - # Determine the Hotelling T² ellipse - self.t2_ = np.sqrt((1 / n_samples) * (X**2).sum(axis=0)) - - # Set initial threshold using F-distribution f_stat = ( (n_samples - 1) / n_samples @@ -99,43 +84,43 @@ def fit(self, X, y=None): ) self.threshold_ = f_stat - return self - - def fit_threshold(self, X, target_percentile=95): - """Update the threshold using new data without refitting the model. 
+ def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "HotellingT2ApplicabilityDomain": + """Fit the Hotelling T² applicability domain. Parameters ---------- X : array-like of shape (n_samples, n_features) - Data to compute threshold from. - target_percentile : float, default=95 - Target percentile of samples to include within domain. + Training data. + y : Ignored + Not used, present for API consistency. Returns ------- - self : object + self : HotellingT2ApplicabilityDomain Returns the instance itself. """ - check_is_fitted(self) - X = check_array(X) - - if not 0 <= target_percentile <= 100: - raise ValueError("target_percentile must be between 0 and 100") + X = check_array(X, **self._check_params) + self.n_features_in_ = X.shape[1] - # Calculate volume protrusions for validation set - scores = self.transform(X).ravel() + # Determine the Hotelling T² ellipse + self.t2_ = np.sqrt((1 / X.shape[0]) * (X**2).sum(axis=0)) - # Set threshold to achieve desired percentile (lower scores = inside domain) - self.threshold_ = np.percentile(scores, 100 - target_percentile) + # Set initial threshold + if self.percentile is not None: + self.fit_threshold(X) + else: + self._set_statistical_threshold(X) return self - def transform(self, X): + def _transform(self, X: NDArray) -> NDArray[np.float64]: """Calculate volume protrusion scores for samples. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns @@ -144,26 +129,5 @@ def transform(self, X): The volume protrusion scores. Higher values indicate samples further from the training data center. """ - check_is_fitted(self) - X = check_array(X) - - # Calculate volume protrusions protrusions = (X**2 / self.t2_**2).sum(axis=1) return protrusions.reshape(-1, 1) - - def predict(self, X): - """Predict whether samples are within the applicability domain. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). - """ - scores = self.transform(X).ravel() - return np.where(scores <= self.threshold_, 1, -1) diff --git a/scikit_mol/applicability/isolation_forest.py b/scikit_mol/applicability/isolation_forest.py index 3ef23d3..ab1a3fa 100644 --- a/scikit_mol/applicability/isolation_forest.py +++ b/scikit_mol/applicability/isolation_forest.py @@ -7,13 +7,17 @@ See LICENSE.MIT in this directory for the original MIT license. """ +from typing import Any, Optional + import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin +from numpy.typing import ArrayLike, NDArray from sklearn.ensemble import IsolationForest -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_array + +from .base import BaseApplicabilityDomain -class IsolationForestApplicabilityDomain(BaseEstimator, TransformerMixin): +class IsolationForestApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain based on Isolation Forest. Uses Isolation Forest to identify outliers based on the isolation depth @@ -25,8 +29,13 @@ class IsolationForestApplicabilityDomain(BaseEstimator, TransformerMixin): Number of trees in the forest. contamination : float, default=0.01 Expected proportion of outliers in the training data. - random_state : int or RandomState, default=None + random_state : Optional[int], default=None Controls the randomness of the forest. + percentile : float or None, default=None + Percentile of training set scores to use as threshold (0-100). + If None, uses contamination-based threshold from IsolationForest. + feature_prefix : str, default="IsolationForest" + Prefix for feature names in output. 
Attributes ---------- @@ -34,13 +43,13 @@ class IsolationForestApplicabilityDomain(BaseEstimator, TransformerMixin): Number of features seen during fit. iforest_ : IsolationForest Fitted isolation forest model. + threshold_ : float + Current threshold for domain membership. - Examples - -------- - >>> from scikit_mol.applicability import IsolationForestApplicabilityDomain - >>> ad = IsolationForestApplicabilityDomain(contamination=0.1) - >>> ad.fit(X_train) - >>> predictions = ad.predict(X_test) + Notes + ----- + The scoring convention is 'high_inside' because higher scores from + IsolationForest indicate samples more similar to the training data. References ---------- @@ -48,15 +57,27 @@ class IsolationForestApplicabilityDomain(BaseEstimator, TransformerMixin): In 2008 Eighth IEEE International Conference on Data Mining (pp. 413-422). """ - def __init__(self, n_estimators=100, contamination=0.01, random_state=None): + _scoring_convention = "high_inside" + _supports_threshold_fitting = True + + def __init__( + self, + n_estimators: int = 100, + contamination: float = 0.01, + random_state: Optional[int] = None, + percentile: Optional[float] = None, + feature_prefix: str = "IsolationForest", + ) -> None: if not 0 < contamination < 1: raise ValueError("contamination must be between 0 and 1") - + super().__init__(percentile=percentile, feature_prefix=feature_prefix) self.n_estimators = n_estimators self.contamination = contamination self.random_state = random_state - def fit(self, X, y=None): + def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "IsolationForestApplicabilityDomain": """Fit the isolation forest applicability domain. Parameters @@ -68,10 +89,10 @@ def fit(self, X, y=None): Returns ------- - self : object + self : IsolationForestApplicabilityDomain Returns the instance itself. 
""" - X = check_array(X) + X = check_array(X, **self._check_params) self.n_features_in_ = X.shape[1] self.iforest_ = IsolationForest( @@ -81,74 +102,76 @@ def fit(self, X, y=None): ) self.iforest_.fit(X) - self.fit_threshold(X) + # Set initial threshold + if self.percentile is not None: + self.fit_threshold(X) + else: + # Use IsolationForest's default threshold + self.threshold_ = self.iforest_.offset_ return self - def transform(self, X): + def _transform(self, X: NDArray) -> NDArray[np.float64]: """Calculate anomaly scores for samples. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns ------- scores : ndarray of shape (n_samples, 1) The anomaly scores of the samples. - The lower the score, the more abnormal the sample. + Higher scores indicate samples more similar to training data. """ - check_is_fitted(self) - X = check_array(X) - scores = self.iforest_.score_samples(X) return scores.reshape(-1, 1) - def fit_threshold(self, X, target_percentile=95): - """Update the threshold using new data without refitting the model. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Data to compute threshold from. - target_percentile : float, default=95 - Target percentile of samples to include within domain. - - Returns - ------- - self : object - Returns the instance itself. - """ - check_is_fitted(self) - X = check_array(X) - - if not 0 <= target_percentile <= 100: - raise ValueError("target_percentile must be between 0 and 100") - - # Get decision function scores - scores = self.iforest_.score_samples(X) - - # Set threshold to achieve desired percentile - self.threshold_ = np.percentile(scores, 100 - target_percentile) - - return self - - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. 
- - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). - """ - scores = self.transform(X).ravel() - if hasattr(self, "threshold_"): - return np.where(scores > self.threshold_, 1, -1) - return self.iforest_.predict(X) + # def fit_threshold(self, X, target_percentile=95): + # """Update the threshold using new data without refitting the model. + + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # Data to compute threshold from. + # target_percentile : float, default=95 + # Target percentile of samples to include within domain. + + # Returns + # ------- + # self : object + # Returns the instance itself. + # """ + # check_is_fitted(self) + # X = check_array(X) + + # if not 0 <= target_percentile <= 100: + # raise ValueError("target_percentile must be between 0 and 100") + + # # Get decision function scores + # scores = self.iforest_.score_samples(X) + + # # Set threshold to achieve desired percentile + # self.threshold_ = np.percentile(scores, 100 - target_percentile) + + # return self + + # def predict(self, X): + # """Predict whether samples are within the applicability domain. + + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # The samples to predict. + + # Returns + # ------- + # y_pred : ndarray of shape (n_samples,) + # Returns 1 for samples inside the domain and -1 for samples outside + # (following scikit-learn's convention for outlier detection). 
+ # """ + # scores = self._transform(X).ravel() + # if hasattr(self, "threshold_"): + # return np.where(scores > self.threshold_, 1, -1) + # return self.iforest_.predict(X) diff --git a/tests/applicability/conftest.py b/tests/applicability/conftest.py index 7047573..f822b1f 100644 --- a/tests/applicability/conftest.py +++ b/tests/applicability/conftest.py @@ -3,7 +3,14 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler -from scikit_mol.applicability import KNNApplicabilityDomain, LeverageApplicabilityDomain +from scikit_mol.applicability import ( + BoundingBoxApplicabilityDomain, + ConvexHullApplicabilityDomain, + HotellingT2ApplicabilityDomain, + IsolationForestApplicabilityDomain, + KNNApplicabilityDomain, + LeverageApplicabilityDomain, +) from scikit_mol.fingerprints import MorganFingerprintTransformer from ..fixtures import mols_list @@ -13,6 +20,17 @@ params=[ (KNNApplicabilityDomain, dict(n_neighbors=3)), (LeverageApplicabilityDomain, dict(threshold_factor=3)), + (BoundingBoxApplicabilityDomain, dict(percentile=(1, 99))), + (ConvexHullApplicabilityDomain, dict()), # No special parameters needed + (HotellingT2ApplicabilityDomain, dict(significance=0.05)), + ( + IsolationForestApplicabilityDomain, + dict( + n_estimators=100, + contamination=0.1, + random_state=42, # Add fixed random state + ), + ), ] ) def ad_estimator(request): diff --git a/tests/applicability/test_base.py b/tests/applicability/test_base.py index dd0df67..a7d7a58 100644 --- a/tests/applicability/test_base.py +++ b/tests/applicability/test_base.py @@ -54,8 +54,12 @@ def test_score_transform(ad_estimator, ad_test_data): assert mean_in > mean_out # Inside domain should have higher scores +@pytest.mark.threshold_fitting def test_threshold_setting(ad_estimator, reduced_fingerprints): """Test threshold setting and percentile behavior.""" + if not ad_estimator._supports_threshold_fitting: + pytest.skip("Estimator does not support threshold fitting") + # Test 
default threshold ad_estimator.fit(reduced_fingerprints) pred_default = ad_estimator.predict(reduced_fingerprints) @@ -107,10 +111,11 @@ def test_input_validation(ad_estimator): with pytest.raises(ValueError): ad_estimator.fit([[1], [2, 3]]) # Inconsistent dimensions - # Test invalid percentile - with pytest.raises(ValueError): - ad_estimator.percentile = 101 - ad_estimator.fit([[1, 2]]) + # Test invalid percentile only if threshold fitting is supported + if ad_estimator._supports_threshold_fitting: + with pytest.raises(ValueError): + ad_estimator.percentile = 101 + ad_estimator.fit([[1, 2]]) def test_refit_consistency(ad_estimator, reduced_fingerprints): @@ -122,6 +127,3 @@ def test_refit_consistency(ad_estimator, reduced_fingerprints): scores2 = ad_estimator.transform(reduced_fingerprints) assert_array_almost_equal(scores1, scores2) - - -# ... other common tests ... diff --git a/tests/applicability/test_bounding_box.py b/tests/applicability/test_bounding_box.py new file mode 100644 index 0000000..451ad43 --- /dev/null +++ b/tests/applicability/test_bounding_box.py @@ -0,0 +1,71 @@ +"""Tests specific to Bounding Box applicability domain.""" + +import numpy as np +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from scikit_mol.applicability import BoundingBoxApplicabilityDomain + + +def test_bounding_box_bounds(ad_test_data): + """Test the bounds calculation.""" + X_train, _, _ = ad_test_data + ad = BoundingBoxApplicabilityDomain(percentile=(1, 99)) + ad.fit(X_train) + + # Check bounds match numpy percentile + expected_min = np.percentile(X_train, 1, axis=0) + expected_max = np.percentile(X_train, 99, axis=0) + + assert np.allclose(ad.min_, expected_min) + assert np.allclose(ad.max_, expected_max) + + +def test_bounding_box_violations(): + """Test violation counting.""" + X_train = np.array([[1, 1], [2, 2], [3, 3]]) + X_test = np.array( + [ + [2, 2], # Inside 
bounds (0 violations) + [0, 2], # One violation + [0, 4], # Two violations + ] + ) + + ad = BoundingBoxApplicabilityDomain(percentile=(0, 100)) + ad.fit(X_train) + + scores = ad.transform(X_test) + assert scores[0, 0] == 0 # Inside bounds + assert scores[1, 0] == 1 # One violation + assert scores[2, 0] == 2 # Two violations + + +def test_bounding_box_percentile_validation(): + """Test percentile parameter validation.""" + # Invalid single percentile + with pytest.raises(ValueError): + BoundingBoxApplicabilityDomain(percentile=101) + + # Invalid tuple length + with pytest.raises(ValueError): + BoundingBoxApplicabilityDomain(percentile=(1, 2, 3)) + + # Invalid order + with pytest.raises(ValueError): + BoundingBoxApplicabilityDomain(percentile=(99, 1)) + + +def test_bounding_box_pipeline(): + """Test bounding box works in pipeline with scaling.""" + X = np.random.randn(10, 5) + pipe = Pipeline( + [("scaler", StandardScaler()), ("ad", BoundingBoxApplicabilityDomain())] + ) + + # Should run without errors + pipe.fit(X) + scores = pipe.transform(X) + assert scores.shape == (len(X), 1) diff --git a/tests/applicability/test_convex_hull.py b/tests/applicability/test_convex_hull.py new file mode 100644 index 0000000..53f430c --- /dev/null +++ b/tests/applicability/test_convex_hull.py @@ -0,0 +1,81 @@ +"""Tests specific to Convex Hull applicability domain.""" + +import numpy as np +import pytest +from sklearn.decomposition import PCA +from sklearn.exceptions import NotFittedError +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from scikit_mol.applicability import ConvexHullApplicabilityDomain + + +def test_convex_hull_simple(): + """Test with simple 2D data where result is obvious.""" + # Create a triangle of points + X_train = np.array([[0, 0], [1, 0], [0, 1]]) + X_test = np.array( + [ + [0.5, 0.25], # Inside triangle + [2, 2], # Outside triangle + ] + ) + + ad = ConvexHullApplicabilityDomain() + ad.fit(X_train) + + scores = 
ad.transform(X_test) + assert scores[0, 0] == 0.0 # Inside point + assert scores[1, 0] == 1.0 # Outside point + + +def test_convex_hull_pipeline(): + """Test convex hull works in pipeline with dimensionality reduction.""" + pipe = Pipeline( + [ + ("scaler", StandardScaler()), + ("pca", PCA(n_components=2)), # Reduce to 2D for speed + ("ad", ConvexHullApplicabilityDomain()), + ] + ) + + # Generate random high-dimensional data + X = np.random.randn(10, 5) + + # Should run without errors + pipe.fit(X) + scores = pipe.transform(X) + assert scores.shape == (len(X), 1) + assert np.all((scores == 0) | (scores == 1)) # Binary output + + +def test_convex_hull_numerical_stability(): + """Test numerical stability with nearly colinear points.""" + X_train = np.array( + [ + [0, 0], + [1, 0], + [2, 1e-10], # Nearly colinear + ] + ) + X_test = np.array([[0.5, 0]]) + + ad = ConvexHullApplicabilityDomain() + ad.fit(X_train) + + # Should not raise and give consistent results + scores = ad.transform(X_test) + assert np.all(np.isfinite(scores)) + + +def test_convex_hull_single_point(): + """Test behavior with single point (degenerate hull).""" + X_train = np.array([[1, 1]]) + X_test = np.array([[1, 1], [2, 2]]) + + ad = ConvexHullApplicabilityDomain() + ad.fit(X_train) + + scores = ad.transform(X_test) + assert scores[0, 0] == 0.0 # Same point + assert scores[1, 0] == 1.0 # Different point diff --git a/tests/applicability/test_hotelling.py b/tests/applicability/test_hotelling.py new file mode 100644 index 0000000..c3045fd --- /dev/null +++ b/tests/applicability/test_hotelling.py @@ -0,0 +1,87 @@ +"""Tests specific to Hotelling T² applicability domain.""" + +import numpy as np +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from scikit_mol.applicability import HotellingT2ApplicabilityDomain + + +def test_hotelling_threshold(): + """Test F-distribution threshold calculation.""" + X 
= np.random.randn(100, 3) # 100 samples, 3 features + + ad = HotellingT2ApplicabilityDomain(significance=0.05) + ad.fit(X) + + # Threshold should be positive + assert ad.threshold_ > 0 + + # More stringent significance should give higher threshold + ad_strict = HotellingT2ApplicabilityDomain(significance=0.01) + ad_strict.fit(X) + assert ad_strict.threshold_ > ad.threshold_ + + +def test_hotelling_scores(): + """Test score calculation with known data.""" + # Create data with known center and spread + X_train = np.array([[0, 0], [1, 0], [-1, 0], [0, 1], [0, -1]]) + + X_test = np.array( + [ + [0, 0], # Center point + [2, 0], # Further out + [10, 10], # Far out + ] + ) + + ad = HotellingT2ApplicabilityDomain() + ad.fit(X_train) + + scores = ad.transform(X_test) + + # Scores should increase with distance from center + assert scores[0, 0] < scores[1, 0] < scores[2, 0] + + +def test_hotelling_significance_validation(): + """Test significance parameter validation.""" + with pytest.raises(ValueError): + HotellingT2ApplicabilityDomain(significance=0) + + with pytest.raises(ValueError): + HotellingT2ApplicabilityDomain(significance=1) + + with pytest.raises(ValueError): + HotellingT2ApplicabilityDomain(significance=-0.5) + + +def test_hotelling_pipeline(): + """Test Hotelling works in pipeline with scaling.""" + pipe = Pipeline( + [("scaler", StandardScaler()), ("ad", HotellingT2ApplicabilityDomain())] + ) + + X = np.random.randn(10, 5) + + # Should run without errors + pipe.fit(X) + scores = pipe.transform(X) + assert scores.shape == (len(X), 1) + assert np.all(scores >= 0) # Scores should be non-negative + + +def test_hotelling_threshold_fitting(): + """Test threshold fitting with percentile.""" + X = np.random.randn(100, 3) + + ad = HotellingT2ApplicabilityDomain(percentile=90) + ad.fit(X) + + # Get scores and check threshold matches 90th percentile + scores = ad.transform(X) + expected_threshold = np.percentile(scores, 90) + assert np.isclose(ad.threshold_, 
expected_threshold) diff --git a/tests/applicability/test_isolation_forest.py b/tests/applicability/test_isolation_forest.py new file mode 100644 index 0000000..f546b52 --- /dev/null +++ b/tests/applicability/test_isolation_forest.py @@ -0,0 +1,25 @@ +"""Tests specific to Isolation Forest applicability domain.""" + +import numpy as np + +from scikit_mol.applicability import IsolationForestApplicabilityDomain + + +def test_refit_consistency(): + """Test consistency when refitting with same data.""" + X = np.random.RandomState(42).normal(0, 1, (100, 2)) + + # Use fixed random state + ad = IsolationForestApplicabilityDomain( + n_estimators=100, contamination=0.1, random_state=42 + ) + + # First fit + ad.fit(X) + scores1 = ad.transform(X) + + # Second fit + ad.fit(X) + scores2 = ad.transform(X) + + assert np.allclose(scores1, scores2) From 9cafe9f3704476740b8641c9a949ebdebe170c51 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Mon, 10 Feb 2025 16:06:36 +0100 Subject: [PATCH 09/24] Moved rest of AD estimators. All test runs. 
--- scikit_mol/applicability/base.py | 9 +- scikit_mol/applicability/bounding_box.py | 6 +- scikit_mol/applicability/convex_hull.py | 38 ++-- scikit_mol/applicability/hotelling.py | 6 +- scikit_mol/applicability/isolation_forest.py | 8 +- scikit_mol/applicability/kernel_density.py | 125 +++++++------ scikit_mol/applicability/knn.py | 9 +- scikit_mol/applicability/leverage.py | 4 +- scikit_mol/applicability/local_outlier.py | 92 +++++---- scikit_mol/applicability/mahalanobis.py | 186 +++++++++---------- scikit_mol/applicability/standardization.py | 156 ++++++---------- scikit_mol/applicability/topkat.py | 91 +++++---- tests/applicability/conftest.py | 18 ++ tests/applicability/test_base.py | 6 +- tests/applicability/test_kernel_density.py | 44 +++++ tests/applicability/test_local_outlier.py | 64 +++++++ tests/applicability/test_mahalanobis.py | 66 +++++++ tests/applicability/test_standardization.py | 76 ++++++++ tests/applicability/test_topkat.py | 55 ++++++ tests/conftest.py | 19 +- tests/test_metrics.py | 91 +++++++++ 21 files changed, 790 insertions(+), 379 deletions(-) create mode 100644 tests/applicability/test_kernel_density.py create mode 100644 tests/applicability/test_local_outlier.py create mode 100644 tests/applicability/test_mahalanobis.py create mode 100644 tests/applicability/test_standardization.py create mode 100644 tests/applicability/test_topkat.py create mode 100644 tests/test_metrics.py diff --git a/scikit_mol/applicability/base.py b/scikit_mol/applicability/base.py index cf7d994..c8cbb62 100644 --- a/scikit_mol/applicability/base.py +++ b/scikit_mol/applicability/base.py @@ -55,6 +55,8 @@ class BaseApplicabilityDomain(BaseEstimator, TransformerMixin, _ADOutputMixin, A If None: - For methods with statistical thresholds: use statistical method - For percentile-only methods: use 99.0 (include 99% of training samples) + feature_name : str, default="AD_estimator" + Name for the output feature column. 
Notes ----- @@ -78,7 +80,7 @@ class BaseApplicabilityDomain(BaseEstimator, TransformerMixin, _ADOutputMixin, A _scoring_convention: ClassVar[str] # Must be set by subclasses def __init__( - self, percentile: Optional[float] = None, feature_prefix: str = "AD_estimator" + self, percentile: Optional[float] = None, feature_name: str = "AD_estimator" ) -> None: if not hasattr(self, "_scoring_convention"): raise TypeError( @@ -93,7 +95,7 @@ def __init__( if percentile is not None and not 0 <= percentile <= 100: raise ValueError("percentile must be between 0 and 100") self.percentile = percentile - self.feature_prefix = feature_prefix + self.feature_name = feature_name self._check_params = { "estimator": self, "accept_sparse": False, @@ -244,5 +246,4 @@ def score_transform( def get_feature_names_out(self) -> NDArray[np.str_]: """Get feature name for output column.""" - - return np.array([f"{self.feature_prefix}"]) + return np.array([f"{self.feature_name}"]) diff --git a/scikit_mol/applicability/bounding_box.py b/scikit_mol/applicability/bounding_box.py index 6bd35b7..9d8ec4a 100644 --- a/scikit_mol/applicability/bounding_box.py +++ b/scikit_mol/applicability/bounding_box.py @@ -27,7 +27,7 @@ class BoundingBoxApplicabilityDomain(BaseApplicabilityDomain): percentile : float or tuple of float, default=(0.1, 99.9) Percentile(s) of the training set distribution used to define the bounding box. If float, uses (percentile, 100-percentile). - feature_prefix : str, default="BoundingBox" + feature_name : str, default="BoundingBox" Prefix for feature names in output. 
Attributes @@ -73,9 +73,9 @@ class BoundingBoxApplicabilityDomain(BaseApplicabilityDomain): def __init__( self, percentile: Union[float, Tuple[float, float]] = (0.1, 99.9), - feature_prefix: str = "BoundingBox", + feature_name: str = "BoundingBox", ) -> None: - super().__init__(percentile=None, feature_prefix=feature_prefix) + super().__init__(percentile=None, feature_name=feature_name) if isinstance(percentile, (int, float)): if not 0 <= percentile <= 100: diff --git a/scikit_mol/applicability/convex_hull.py b/scikit_mol/applicability/convex_hull.py index 0ec3eaa..9b7a486 100644 --- a/scikit_mol/applicability/convex_hull.py +++ b/scikit_mol/applicability/convex_hull.py @@ -27,7 +27,7 @@ class ConvexHullApplicabilityDomain(BaseApplicabilityDomain): ---------- percentile : float or None, default=None Not used, present for API consistency. - feature_prefix : str, default="ConvexHull" + feature_name : str, default="ConvexHull" Prefix for feature names in output. Notes @@ -54,9 +54,9 @@ class ConvexHullApplicabilityDomain(BaseApplicabilityDomain): _supports_threshold_fitting = False def __init__( - self, percentile: Optional[float] = None, feature_prefix: str = "ConvexHull" + self, percentile: Optional[float] = None, feature_name: str = "ConvexHull" ) -> None: - super().__init__(percentile=None, feature_prefix=feature_prefix) + super().__init__(percentile=None, feature_name=feature_name) self.threshold_ = 0.5 # Fixed threshold since output is binary def fit( @@ -115,19 +115,19 @@ def _transform(self, X: NDArray) -> NDArray[np.float64]: return np.array(distances).reshape(-1, 1) - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). 
- """ - scores = self._transform(X).ravel() - return np.where(scores == 0, 1, -1) + # def predict(self, X): + # """Predict whether samples are within the applicability domain. + + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # The samples to predict. + + # Returns + # ------- + # y_pred : ndarray of shape (n_samples,) + # Returns 1 for samples inside the domain and -1 for samples outside + # (following scikit-learn's convention for outlier detection). + # """ + # scores = self._transform(X).ravel() + # return np.where(scores == 0, 1, -1) diff --git a/scikit_mol/applicability/hotelling.py b/scikit_mol/applicability/hotelling.py index 00aafb2..2f9b6b8 100644 --- a/scikit_mol/applicability/hotelling.py +++ b/scikit_mol/applicability/hotelling.py @@ -31,7 +31,7 @@ class HotellingT2ApplicabilityDomain(BaseApplicabilityDomain): percentile : float or None, default=None If not None, overrides significance-based threshold. Must be between 0 and 100. - feature_prefix : str, default="HotellingT2" + feature_name : str, default="HotellingT2" Prefix for feature names in output. 
Notes @@ -62,11 +62,11 @@ def __init__( self, significance: float = 0.05, percentile: Optional[float] = None, - feature_prefix: str = "HotellingT2", + feature_name: str = "HotellingT2", ) -> None: if not 0 < significance < 1: raise ValueError("significance must be between 0 and 1") - super().__init__(percentile=percentile, feature_prefix=feature_prefix) + super().__init__(percentile=percentile, feature_name=feature_name) self.significance = significance def _set_statistical_threshold(self, X: NDArray) -> None: diff --git a/scikit_mol/applicability/isolation_forest.py b/scikit_mol/applicability/isolation_forest.py index ab1a3fa..ee29659 100644 --- a/scikit_mol/applicability/isolation_forest.py +++ b/scikit_mol/applicability/isolation_forest.py @@ -34,8 +34,8 @@ class IsolationForestApplicabilityDomain(BaseApplicabilityDomain): percentile : float or None, default=None Percentile of training set scores to use as threshold (0-100). If None, uses contamination-based threshold from IsolationForest. - feature_prefix : str, default="IsolationForest" - Prefix for feature names in output. + feature_name : str, default="IsolationForest" + Name for feature names in output. 
Attributes ---------- @@ -66,11 +66,11 @@ def __init__( contamination: float = 0.01, random_state: Optional[int] = None, percentile: Optional[float] = None, - feature_prefix: str = "IsolationForest", + feature_name: str = "IsolationForest", ) -> None: if not 0 < contamination < 1: raise ValueError("contamination must be between 0 and 1") - super().__init__(percentile=percentile, feature_prefix=feature_prefix) + super().__init__(percentile=percentile, feature_name=feature_name) self.n_estimators = n_estimators self.contamination = contamination self.random_state = random_state diff --git a/scikit_mol/applicability/kernel_density.py b/scikit_mol/applicability/kernel_density.py index 40125f1..3f7281d 100644 --- a/scikit_mol/applicability/kernel_density.py +++ b/scikit_mol/applicability/kernel_density.py @@ -7,13 +7,17 @@ See LICENSE.MIT in this directory for the original MIT license. """ +from typing import Any, Optional + import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin +from numpy.typing import ArrayLike, NDArray from sklearn.neighbors import KernelDensity -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_array + +from .base import BaseApplicabilityDomain -class KernelDensityApplicabilityDomain(BaseEstimator, TransformerMixin): +class KernelDensityApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain based on kernel density estimation. Uses kernel density estimation to model the distribution of the training data. @@ -27,9 +31,11 @@ class KernelDensityApplicabilityDomain(BaseEstimator, TransformerMixin): kernel : str, default='gaussian' The kernel to use. Options: ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']. - percentile : float, default=1.0 - The percentile of training set densities to use as threshold. - Must be between 0 and 100. 
+ percentile : float or None, default=None + The percentile of training set densities to use as threshold (0-100). + If None, uses 1.0 (exclude bottom 1% of training samples). + feature_name : str, default="KernelDensity" + Name for the output feature column. Attributes ---------- @@ -40,6 +46,11 @@ class KernelDensityApplicabilityDomain(BaseEstimator, TransformerMixin): threshold_ : float Density threshold for domain membership. + Notes + ----- + The scoring convention is 'high_inside' because higher density scores + indicate samples more similar to the training data. + Examples -------- >>> from scikit_mol.applicability import KernelDensityApplicabilityDomain @@ -48,30 +59,37 @@ class KernelDensityApplicabilityDomain(BaseEstimator, TransformerMixin): >>> predictions = ad.predict(X_test) """ - def __init__(self, bandwidth=1.0, kernel="gaussian", percentile=1.0): - if not 0 <= percentile <= 100: - raise ValueError("percentile must be between 0 and 100") + _scoring_convention = "high_inside" + def __init__( + self, + bandwidth: float = 1.0, + kernel: str = "gaussian", + percentile: Optional[float] = None, + feature_name: str = "KernelDensity", + ) -> None: + super().__init__(percentile=percentile or 1.0, feature_name=feature_name) self.bandwidth = bandwidth self.kernel = kernel - self.percentile = percentile - def fit(self, X, y=None): + def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "KernelDensityApplicabilityDomain": """Fit the kernel density applicability domain. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data. - y : Ignored + y : Any, optional (default=None) Not used, present for API consistency. Returns ------- - self : object + self : KernelDensityApplicabilityDomain Returns the instance itself. 
""" - X = check_array(X) + X = check_array(X, **self._check_params) self.n_features_in_ = X.shape[1] # Fit KDE @@ -83,12 +101,12 @@ def fit(self, X, y=None): return self - def transform(self, X): + def _transform(self, X: NDArray) -> NDArray[np.float64]: """Calculate log density scores for samples. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns @@ -97,47 +115,40 @@ def transform(self, X): The log density scores of the samples. Higher scores indicate samples more similar to the training data. """ - check_is_fitted(self) - X = check_array(X) - scores = self.kde_.score_samples(X) return scores.reshape(-1, 1) - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). - """ - scores = self.transform(X).ravel() - return np.where(scores >= self.threshold_, 1, -1) - - def fit_threshold(self, X): - """Update the threshold using new data without refitting the model. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Data to compute threshold from. - - Returns - ------- - self : object - Returns the instance itself. - """ - check_is_fitted(self) - X = check_array(X) - - # Calculate density threshold from provided data - densities = self.kde_.score_samples(X) - self.threshold_ = np.percentile(densities, self.percentile) - - return self + # def predict(self, X): + # """Predict whether samples are within the applicability domain. + + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # The samples to predict. 
+ + # Returns + # ------- + # y_pred : ndarray of shape (n_samples,) + # Returns 1 for samples inside the domain and -1 for samples outside + # (following scikit-learn's convention for outlier detection). + # """ + # scores = self._transform(X).ravel() + # return np.where(scores >= self.threshold_, 1, -1) + + # def fit_threshold(self, X): + # """Update the threshold using new data without refitting the model. + + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # Data to compute threshold from. + + # Returns + # ------- + # self : object + # Returns the instance itself. + # """ + # densities = self._transform(X).ravel() + # self.threshold_ = np.percentile(densities, self.percentile) + + # return self diff --git a/scikit_mol/applicability/knn.py b/scikit_mol/applicability/knn.py index 749980e..c9be686 100644 --- a/scikit_mol/applicability/knn.py +++ b/scikit_mol/applicability/knn.py @@ -12,6 +12,7 @@ import numpy as np from numpy.typing import ArrayLike from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_array from .base import BaseApplicabilityDomain @@ -44,7 +45,7 @@ class KNNApplicabilityDomain(BaseApplicabilityDomain): Number of parallel jobs to run for neighbors search. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. - feature_prefix : str, default='KNN' + feature_name : str, default='KNN' Prefix for feature names in output. 
Notes @@ -96,9 +97,9 @@ def __init__( percentile: Optional[float] = None, distance_metric: Union[str, Callable] = "euclidean", n_jobs: Optional[int] = None, - feature_prefix: str = "KNN", + feature_name: str = "KNN", ) -> None: - super().__init__(percentile=percentile, feature_prefix=feature_prefix) + super().__init__(percentile=percentile, feature_name=feature_name) self.n_neighbors = n_neighbors self.distance_metric = distance_metric self.n_jobs = n_jobs @@ -134,7 +135,7 @@ def fit(self, X: ArrayLike, y=None) -> "KNNApplicabilityDomain": if not isinstance(self.n_neighbors, int) or self.n_neighbors < 1: raise ValueError("n_neighbors must be a positive integer") - X = self._validate_data(X) + X = check_array(X, **self._check_params) self.n_features_in_ = X.shape[1] # Fit nearest neighbors model diff --git a/scikit_mol/applicability/leverage.py b/scikit_mol/applicability/leverage.py index fee276c..d5a9e01 100644 --- a/scikit_mol/applicability/leverage.py +++ b/scikit_mol/applicability/leverage.py @@ -82,9 +82,9 @@ def __init__( self, threshold_factor: float = 3, percentile: Optional[float] = None, - feature_prefix: str = "Leverage", + feature_name: str = "Leverage", ) -> None: - super().__init__(percentile=percentile, feature_prefix=feature_prefix) + super().__init__(percentile=percentile, feature_name=feature_name) self.threshold_factor = threshold_factor def _set_statistical_threshold(self, X: NDArray) -> None: diff --git a/scikit_mol/applicability/local_outlier.py b/scikit_mol/applicability/local_outlier.py index 3fd181f..9366723 100644 --- a/scikit_mol/applicability/local_outlier.py +++ b/scikit_mol/applicability/local_outlier.py @@ -7,12 +7,17 @@ See LICENSE.MIT in this directory for the original MIT license. 
""" -from sklearn.base import BaseEstimator, TransformerMixin +from typing import Any, Optional + +import numpy as np +from numpy.typing import ArrayLike, NDArray from sklearn.neighbors import LocalOutlierFactor -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_array + +from .base import BaseApplicabilityDomain -class LocalOutlierFactorApplicabilityDomain(BaseEstimator, TransformerMixin): +class LocalOutlierFactorApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain based on Local Outlier Factor (LOF). LOF measures the local deviation of density of a sample with respect to its @@ -27,6 +32,11 @@ class LocalOutlierFactorApplicabilityDomain(BaseEstimator, TransformerMixin): Expected proportion of outliers in the data set. metric : str, default='euclidean' Metric to use for distance computation. + percentile : float or None, default=None + Percentile of training set scores to use as threshold (0-100). + If None, uses contamination-based threshold from LOF. + feature_name : str, default="LOF" + Name for the output feature column. Attributes ---------- @@ -34,13 +44,13 @@ class LocalOutlierFactorApplicabilityDomain(BaseEstimator, TransformerMixin): Number of features seen during fit. lof_ : LocalOutlierFactor Fitted LOF estimator. + threshold_ : float + Current threshold for domain membership. - Examples - -------- - >>> from scikit_mol.applicability import LocalOutlierFactorApplicabilityDomain - >>> ad = LocalOutlierFactorApplicabilityDomain() - >>> ad.fit(X_train) - >>> predictions = ad.predict(X_test) + Notes + ----- + The scoring convention is 'high_outside' because higher LOF scores + indicate samples that are more likely to be outliers. References ---------- @@ -48,27 +58,39 @@ class LocalOutlierFactorApplicabilityDomain(BaseEstimator, TransformerMixin): In: Proc. 2000 ACM SIGMOD Int. Conf. Manag. Data, ACM, pp. 93-104. 
""" - def __init__(self, n_neighbors=20, contamination=0.1, metric="euclidean"): + _scoring_convention = "high_outside" + + def __init__( + self, + n_neighbors: int = 20, + contamination: float = 0.1, + metric: str = "euclidean", + percentile: Optional[float] = None, + feature_name: str = "LOF", + ) -> None: + super().__init__(percentile=percentile, feature_name=feature_name) self.n_neighbors = n_neighbors self.contamination = contamination self.metric = metric - def fit(self, X, y=None): + def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "LocalOutlierFactorApplicabilityDomain": """Fit the LOF applicability domain. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data. - y : Ignored + y : Any, optional (default=None) Not used, present for API consistency. Returns ------- - self : object + self : LocalOutlierFactorApplicabilityDomain Returns the instance itself. """ - X = check_array(X) + X = check_array(X, **self._check_params) self.n_features_in_ = X.shape[1] self.lof_ = LocalOutlierFactor( @@ -79,14 +101,17 @@ def fit(self, X, y=None): ) self.lof_.fit(X) + # Set initial threshold based on training data + self.fit_threshold(X) + return self - def transform(self, X): + def _transform(self, X: NDArray) -> NDArray[np.float64]: """Calculate LOF scores for samples. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns @@ -95,25 +120,22 @@ def transform(self, X): The LOF scores of the samples. Higher scores indicate samples that are more likely to be outliers. """ - check_is_fitted(self) - X = check_array(X) - - # Get negative LOF scores (higher means more likely to be inlier) + # Get negative LOF scores (higher means more likely to be outlier) scores = -self.lof_.score_samples(X) return scores.reshape(-1, 1) - def predict(self, X): - """Predict whether samples are within the applicability domain. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). - """ - return self.lof_.predict(X) + # def predict(self, X): + # """Predict whether samples are within the applicability domain. + + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # The samples to predict. + + # Returns + # ------- + # y_pred : ndarray of shape (n_samples,) + # Returns 1 for samples inside the domain and -1 for samples outside + # (following scikit-learn's convention for outlier detection). + # """ + # return self.lof_.predict(X) diff --git a/scikit_mol/applicability/mahalanobis.py b/scikit_mol/applicability/mahalanobis.py index 4be40de..61cb493 100644 --- a/scikit_mol/applicability/mahalanobis.py +++ b/scikit_mol/applicability/mahalanobis.py @@ -2,13 +2,17 @@ Mahalanobis distance applicability domain. """ +from typing import Any, Optional + import numpy as np -from scipy import stats -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_is_fitted +from numpy.typing import ArrayLike, NDArray +from scipy import linalg, stats +from sklearn.utils.validation import check_array + +from .base import BaseApplicabilityDomain -class MahalanobisApplicabilityDomain(BaseEstimator, TransformerMixin): +class MahalanobisApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain based on Mahalanobis distance. Uses Mahalanobis distance to measure how many standard deviations a sample @@ -18,9 +22,11 @@ class MahalanobisApplicabilityDomain(BaseEstimator, TransformerMixin): Parameters ---------- - percentile : float, default=95.0 - Percentile for the confidence region (0-100). - Default 95.0 corresponds to ~2 standard deviations. 
+ percentile : float or None, default=None + Percentile of training set scores to use as threshold (0-100). + If None, uses 95.0 (exclude top 5% of training samples). + feature_name : str, default="Mahalanobis" + Name for the output feature column. Attributes ---------- @@ -30,129 +36,107 @@ class MahalanobisApplicabilityDomain(BaseEstimator, TransformerMixin): Mean of training data. covariance_ : ndarray of shape (n_features, n_features) Covariance matrix of training data. - inv_covariance_ : ndarray of shape (n_features, n_features) - Inverse covariance matrix. threshold_ : float - Current threshold for Mahalanobis distances. - - Examples - -------- - >>> from scikit_mol.applicability import MahalanobisApplicabilityDomain - >>> ad = MahalanobisApplicabilityDomain(percentile=95) - >>> ad.fit(X_train) - >>> # Using chi-square threshold (default) - >>> predictions = ad.predict(X_test) - >>> - >>> # Adjusting threshold using validation set - >>> ad.fit_threshold(X_val, target_percentile=95) - >>> predictions = ad.predict(X_test) - - References - ---------- - .. [1] De Maesschalck, R., Jouan-Rimbaud, D., & Massart, D. L. (2000). - The Mahalanobis distance. Chemometrics and intelligent laboratory - systems, 50(1), 1-18. + Current threshold for domain membership. + + Notes + ----- + The scoring convention is 'high_outside' because higher Mahalanobis + distances indicate samples further from the training data mean. """ - def __init__(self, percentile=95.0): - if not 0 <= percentile <= 100: - raise ValueError("percentile must be between 0 and 100") - self.percentile = percentile + _scoring_convention = "high_outside" - def fit(self, X, y=None): - """Fit the Mahalanobis distance applicability domain. 
+ def __init__( + self, + percentile: Optional[float] = None, + feature_name: str = "Mahalanobis", + ) -> None: + super().__init__(percentile=percentile or 95.0, feature_name=feature_name) - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training data. - y : Ignored - Not used, present for API consistency. + def _set_statistical_threshold(self, X: NDArray) -> None: + """Set threshold based on chi-square distribution. - Returns - ------- - self : object - Returns the instance itself. + For multivariate normal data, squared Mahalanobis distances follow + a chi-square distribution with degrees of freedom equal to the + number of features. """ - X = check_array(X) - self.n_features_in_ = X.shape[1] - - # Compute mean and covariance - self.mean_ = np.mean(X, axis=0) - self.covariance_ = np.cov(X, rowvar=False) - self.inv_covariance_ = np.linalg.inv(self.covariance_) - - # Set initial threshold using chi-square distribution - self.threshold_ = stats.chi2.ppf(self.percentile / 100, df=self.n_features_in_) - - return self + df = self.n_features_in_ + self.threshold_ = np.sqrt(stats.chi2.ppf(0.95, df)) - def fit_threshold(self, X, target_percentile=95): - """Update the threshold using new data without refitting the model. + def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "MahalanobisApplicabilityDomain": + """Fit the Mahalanobis distance applicability domain. Parameters ---------- X : array-like of shape (n_samples, n_features) - Data to compute threshold from. - target_percentile : float, default=95 - Target percentile of samples to include within domain. + Training data. + y : Any, optional (default=None) + Not used, present for API consistency. Returns ------- - self : object + self : MahalanobisApplicabilityDomain Returns the instance itself. + + Raises + ------ + ValueError + If X has fewer samples than features, making covariance estimation unstable. 
""" - check_is_fitted(self) - X = check_array(X) + X = check_array(X, **self._check_params) + n_samples, n_features = X.shape + self.n_features_in_ = n_features - if not 0 <= target_percentile <= 100: - raise ValueError("target_percentile must be between 0 and 100") + if n_samples <= n_features: + raise ValueError( + f"n_samples ({n_samples}) must be greater than n_features ({n_features}) " + "for stable covariance estimation." + ) - # Calculate distances for validation set - scores = self.transform(X).ravel() + # Calculate mean and covariance + self.mean_ = np.mean(X, axis=0) + self.covariance_ = np.cov(X, rowvar=False, ddof=1) - # Set threshold to achieve desired percentile (lower distances = inside domain) - self.threshold_ = np.percentile(scores, 100 - target_percentile) + # Add small regularization to ensure positive definiteness + min_eig = np.min(linalg.eigvalsh(self.covariance_)) + if min_eig < 1e-6: + self.covariance_ += (abs(min_eig) + 1e-6) * np.eye(n_features) + + # Set initial threshold based on training data + self.fit_threshold(X) return self - def transform(self, X): - """Calculate Mahalanobis distances for samples. + def _transform(self, X: NDArray) -> NDArray[np.float64]: + """Calculate Mahalanobis distances. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns ------- distances : ndarray of shape (n_samples, 1) - The Mahalanobis distances. Higher values indicate samples - further from the training data center. - """ - check_is_fitted(self) - X = check_array(X) - - # Center the data - X_centered = X - self.mean_ - - # Calculate Mahalanobis distances - distances = np.sum(X_centered @ self.inv_covariance_ * X_centered, axis=1) - return distances.reshape(-1, 1) - - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. 
- - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). + The Mahalanobis distances of the samples. Higher distances indicate + samples further from the training data mean. """ - scores = self.transform(X).ravel() - return np.where(scores <= self.threshold_, 1, -1) + # Calculate Mahalanobis distances using stable computation + diff = X - self.mean_ + try: + # Try Cholesky decomposition first (more stable) + L = linalg.cholesky(self.covariance_, lower=True) + mahal_dist = np.sqrt( + np.sum(linalg.solve_triangular(L, diff.T, lower=True) ** 2, axis=0) + ) + except linalg.LinAlgError: + # Fallback to standard computation if Cholesky fails + inv_covariance = linalg.pinv( + self.covariance_ + ) # Use pseudo-inverse for stability + mahal_dist = np.sqrt(np.sum(diff @ inv_covariance * diff, axis=1)) + + return mahal_dist.reshape(-1, 1) diff --git a/scikit_mol/applicability/standardization.py b/scikit_mol/applicability/standardization.py index c1e1414..babeeba 100644 --- a/scikit_mol/applicability/standardization.py +++ b/scikit_mol/applicability/standardization.py @@ -7,26 +7,32 @@ See LICENSE.MIT in this directory for the original MIT license. """ +from typing import Any, Optional + import numpy as np +from numpy.typing import ArrayLike, NDArray from scipy import stats -from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import StandardScaler -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_array + +from .base import BaseApplicabilityDomain -class StandardizationApplicabilityDomain(BaseEstimator, TransformerMixin): +class StandardizationApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain based on standardized feature values. 
Samples are considered within the domain if their standardized features - have a mean + z * std <= threshold, or if their maximum standardized - value <= threshold, where z corresponds to the specified percentile - assuming a normal distribution. + fall within a certain number of standard deviations from the mean. + The maximum absolute standardized value across all features is used + as the score. Parameters ---------- - percentile : float, default=95.0 - Percentile for the confidence interval (0-100). - Default 95.0 corresponds to ~2 standard deviations. + percentile : float or None, default=None + Percentile of training set scores to use as threshold (0-100). + If None, uses 95.0 (exclude top 5% of training samples). + feature_name : str, default="Standardization" + Name for the output feature column. Attributes ---------- @@ -35,131 +41,75 @@ class StandardizationApplicabilityDomain(BaseEstimator, TransformerMixin): scaler_ : StandardScaler Fitted standard scaler. threshold_ : float - Current threshold for standardized values. - - Examples - -------- - >>> from scikit_mol.applicability import StandardizationApplicabilityDomain - >>> ad = StandardizationApplicabilityDomain(percentile=95) - >>> ad.fit(X_train) - >>> # Optionally adjust threshold using validation set - >>> ad.fit_threshold(X_val, target_percentile=95) - >>> predictions = ad.predict(X_test) - - References - ---------- - .. [1] Roy, K., Kar, S., & Ambure, P. (2015). On a simple approach for - determining applicability domain of QSAR models. Chemometrics and - Intelligent Laboratory Systems, 145, 22-29. + Current threshold for domain membership. + + Notes + ----- + The scoring convention is 'high_outside' because higher standardized + values indicate samples further from the training data mean. 
""" - def __init__(self, percentile=95.0): - if not 0 <= percentile <= 100: - raise ValueError("percentile must be between 0 and 100") - self.percentile = percentile + _scoring_convention = "high_outside" + + def __init__( + self, + percentile: Optional[float] = None, + feature_name: str = "Standardization", + ) -> None: + super().__init__(percentile=percentile or 95.0, feature_name=feature_name) - def fit(self, X, y=None): + def _set_statistical_threshold(self, X: NDArray) -> None: + """Set threshold based on normal distribution. + + For normally distributed data, ~95% of values fall within + 2 standard deviations of the mean. + """ + self.threshold_ = stats.norm.ppf(0.975) # 2 standard deviations + + def fit( + self, X: ArrayLike, y: Optional[Any] = None + ) -> "StandardizationApplicabilityDomain": """Fit the standardization applicability domain. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data. - y : Ignored + y : Any, optional (default=None) Not used, present for API consistency. Returns ------- - self : object + self : StandardizationApplicabilityDomain Returns the instance itself. """ - X = check_array(X) + X = check_array(X, **self._check_params) self.n_features_in_ = X.shape[1] + # Fit standard scaler self.scaler_ = StandardScaler() self.scaler_.fit(X) - # Convert percentile to z-score for initial threshold - self.threshold_ = stats.norm.ppf(self.percentile / 100) - - return self - - def fit_threshold(self, X, target_percentile=95): - """Update the threshold using new data without refitting the model. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Data to compute threshold from. - target_percentile : float, default=95 - Target percentile of samples to include within domain. - - Returns - ------- - self : object - Returns the instance itself. 
- """ - check_is_fitted(self) - X = check_array(X) - - if not 0 <= target_percentile <= 100: - raise ValueError("target_percentile must be between 0 and 100") - - # Calculate scores for the provided data - scores = self.transform(X).ravel() - - # Set threshold to achieve desired percentile - self.threshold_ = np.percentile(scores, target_percentile) + # Set initial threshold based on training data + self.fit_threshold(X) return self - def transform(self, X): - """Calculate standardized feature statistics for samples. + def _transform(self, X: NDArray) -> NDArray[np.float64]: + """Calculate maximum absolute standardized values. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns ------- scores : ndarray of shape (n_samples, 1) - The maximum of: - 1. Maximum absolute standardized value - 2. Mean + z * std of standardized values - where z corresponds to the specified percentile. - Higher values indicate samples further from the training data. + The maximum absolute standardized values. Higher values indicate + samples further from the training data mean. """ - check_is_fitted(self) - X = check_array(X) - - # Standardize features + # Calculate standardized values and take max absolute value per sample X_std = self.scaler_.transform(X) - - # Calculate statistics - max_vals = np.max(np.abs(X_std), axis=1) - means = np.mean(X_std, axis=1) - stds = np.std(X_std, axis=1) - z_score = -stats.norm.ppf(self.percentile / 100) # negative for lower tail - mean_std = means + z_score * stds - - # Return maximum of the two criteria - scores = np.maximum(max_vals, mean_std) + scores = np.max(np.abs(X_std), axis=1) return scores.reshape(-1, 1) - - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. 
- - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). - """ - scores = self.transform(X).ravel() - return np.where(scores <= self.threshold_, 1, -1) diff --git a/scikit_mol/applicability/topkat.py b/scikit_mol/applicability/topkat.py index 1392c65..4849fc3 100644 --- a/scikit_mol/applicability/topkat.py +++ b/scikit_mol/applicability/topkat.py @@ -8,17 +8,28 @@ """ +from typing import Any, Optional + import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array, check_is_fitted +from numpy.typing import ArrayLike, NDArray +from sklearn.utils.validation import check_array + +from .base import BaseApplicabilityDomain -class TopkatApplicabilityDomain(BaseEstimator, TransformerMixin): +class TopkatApplicabilityDomain(BaseApplicabilityDomain): """Applicability domain defined using TOPKAT's Optimal Prediction Space (OPS). The method transforms the input space (P-space) to a normalized space (S-space), then projects it to the Optimal Prediction Space using eigendecomposition. + Parameters + ---------- + percentile : float or None, default=None + Not used, present for API consistency. + feature_name : str, default="TOPKAT" + Name for the output feature column. + Attributes ---------- n_features_in_ : int @@ -31,13 +42,13 @@ class TopkatApplicabilityDomain(BaseEstimator, TransformerMixin): Eigenvalues of the S-space transformation. eigen_vec_ : ndarray of shape (n_features + 1, n_features + 1) Eigenvectors of the S-space transformation. + threshold_ : float + Fixed threshold based on dimensionality. 
- Examples - -------- - >>> from scikit_mol.applicability import TopkatApplicabilityDomain - >>> ad = TopkatApplicabilityDomain() - >>> ad.fit(X_train) - >>> predictions = ad.predict(X_test) + Notes + ----- + The scoring convention is 'high_outside' because higher OPS distances + indicate samples further from the training data. References ---------- @@ -45,22 +56,32 @@ class TopkatApplicabilityDomain(BaseEstimator, TransformerMixin): predictions (US Patent No. 6-036-349) USPTO. """ - def fit(self, X, y=None): + _scoring_convention = "high_outside" + _supports_threshold_fitting = False + + def __init__( + self, + percentile: Optional[float] = None, + feature_name: str = "TOPKAT", + ) -> None: + super().__init__(percentile=None, feature_name=feature_name) + + def fit(self, X: ArrayLike, y: Optional[Any] = None) -> "TopkatApplicabilityDomain": """Fit the TOPKAT applicability domain. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data. - y : Ignored + y : Any, optional (default=None) Not used, present for API consistency. Returns ------- - self : object + self : TopkatApplicabilityDomain Returns the instance itself. """ - X = check_array(X) + X = check_array(X, **self._check_params) self.n_features_in_ = X.shape[1] n_samples = X.shape[0] @@ -84,14 +105,17 @@ def fit(self, X, y=None): self.eigen_val_ = np.real(self.eigen_val_) self.eigen_vec_ = np.real(self.eigen_vec_) + # Set fixed threshold based on dimensionality + self.threshold_ = 5 * (self.n_features_in_ + 1) / (2 * self.n_features_in_) + return self - def transform(self, X): + def _transform(self, X: NDArray) -> NDArray[np.float64]: """Calculate OPS distance scores for samples. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : ndarray of shape (n_samples, n_features) The data to transform. Returns @@ -100,9 +124,6 @@ def transform(self, X): OPS distance scores. Higher values indicate samples further from the training data. 
""" - check_is_fitted(self) - X = check_array(X) - # Transform to S-space denom = np.where( (self.X_max_ - self.X_min_) != 0, (self.X_max_ - self.X_min_), 1 @@ -129,20 +150,20 @@ def transform(self, X): return distances.reshape(-1, 1) - def predict(self, X): - """Predict whether samples are within the applicability domain. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The samples to predict. - - Returns - ------- - y_pred : ndarray of shape (n_samples,) - Returns 1 for samples inside the domain and -1 for samples outside - (following scikit-learn's convention for outlier detection). - """ - scores = self.transform(X).ravel() - threshold = 5 * (self.n_features_in_ + 1) / (2 * self.n_features_in_) - return np.where(scores < threshold, 1, -1) + # def predict(self, X): + # """Predict whether samples are within the applicability domain. + + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # The samples to predict. + + # Returns + # ------- + # y_pred : ndarray of shape (n_samples,) + # Returns 1 for samples inside the domain and -1 for samples outside + # (following scikit-learn's convention for outlier detection). 
+ # """ + # scores = self._transform(X).ravel() + # threshold = self.threshold_ + # return np.where(scores < threshold, 1, -1) diff --git a/tests/applicability/conftest.py b/tests/applicability/conftest.py index f822b1f..764e724 100644 --- a/tests/applicability/conftest.py +++ b/tests/applicability/conftest.py @@ -8,8 +8,13 @@ ConvexHullApplicabilityDomain, HotellingT2ApplicabilityDomain, IsolationForestApplicabilityDomain, + KernelDensityApplicabilityDomain, KNNApplicabilityDomain, LeverageApplicabilityDomain, + LocalOutlierFactorApplicabilityDomain, + MahalanobisApplicabilityDomain, + StandardizationApplicabilityDomain, + TopkatApplicabilityDomain, ) from scikit_mol.fingerprints import MorganFingerprintTransformer @@ -31,6 +36,19 @@ random_state=42, # Add fixed random state ), ), + ( + KernelDensityApplicabilityDomain, + dict(bandwidth=1.0, kernel="gaussian"), + ), + ( + LocalOutlierFactorApplicabilityDomain, + dict( + n_neighbors=3, contamination=0.1 + ), # Reduced from 20 to 3 for small test datasets + ), + (MahalanobisApplicabilityDomain, dict()), # No special parameters needed + (StandardizationApplicabilityDomain, dict()), # No special parameters needed + (TopkatApplicabilityDomain, dict()), # No special parameters needed ] ) def ad_estimator(request): diff --git a/tests/applicability/test_base.py b/tests/applicability/test_base.py index a7d7a58..56eba50 100644 --- a/tests/applicability/test_base.py +++ b/tests/applicability/test_base.py @@ -79,10 +79,10 @@ def test_feature_names(ad_estimator, reduced_fingerprints): """Test feature names are properly handled.""" ad_estimator.fit(reduced_fingerprints) - # Check feature names exist and match prefix + # Check feature names exist and match name feature_names = ad_estimator.get_feature_names_out() assert len(feature_names) == 1 - assert feature_names[0].startswith(ad_estimator.feature_prefix) + assert feature_names[0] == ad_estimator.feature_name def test_pandas_output(ad_estimator, reduced_fingerprints): @@ 
-94,7 +94,7 @@ def test_pandas_output(ad_estimator, reduced_fingerprints): scores_df = ad_estimator.transform(reduced_fingerprints) assert hasattr(scores_df, "columns") assert len(scores_df.columns) == 1 - assert scores_df.columns[0].startswith(ad_estimator.feature_prefix) + assert scores_df.columns[0] == ad_estimator.feature_name # Test predict output pred_df = ad_estimator.predict(reduced_fingerprints) diff --git a/tests/applicability/test_kernel_density.py b/tests/applicability/test_kernel_density.py new file mode 100644 index 0000000..2c178c4 --- /dev/null +++ b/tests/applicability/test_kernel_density.py @@ -0,0 +1,44 @@ +"""Tests for KernelDensityApplicabilityDomain.""" + +import pytest + +from scikit_mol.applicability import KernelDensityApplicabilityDomain + + +@pytest.fixture +def ad_estimator(): + """Fixture providing a KernelDensityApplicabilityDomain instance.""" + return KernelDensityApplicabilityDomain() + + +def test_kernel_parameter(): + """Test different kernel parameters.""" + kernels = ["gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine"] + # Create data with clear density gradient + X = [[0, 0], [0.1, 0.1], [0.2, 0.2], [2, 2]] + + for kernel in kernels: + ad = KernelDensityApplicabilityDomain(kernel=kernel) + ad.fit(X) + scores = ad.transform(X) + assert scores.shape == (4, 1) + # First point should have higher density than last point + assert scores[0, 0] > scores[-1, 0], f"Failed for kernel {kernel}" + + +def test_bandwidth_effect(): + """Test effect of bandwidth parameter on scores.""" + X = [[0, 0], [1, 1], [2, 2]] + test_point = [[10, 10]] # Far from training data + + # Larger bandwidth should give higher scores to outliers + ad_small = KernelDensityApplicabilityDomain(bandwidth=0.1) + ad_large = KernelDensityApplicabilityDomain(bandwidth=10.0) + + ad_small.fit(X) + ad_large.fit(X) + + score_small = ad_small.transform(test_point) + score_large = ad_large.transform(test_point) + + assert score_large[0, 0] > score_small[0, 
0] diff --git a/tests/applicability/test_local_outlier.py b/tests/applicability/test_local_outlier.py new file mode 100644 index 0000000..c0b129f --- /dev/null +++ b/tests/applicability/test_local_outlier.py @@ -0,0 +1,64 @@ +"""Tests for LocalOutlierFactorApplicabilityDomain.""" + +import numpy as np +import pytest + +from scikit_mol.applicability import LocalOutlierFactorApplicabilityDomain + + +@pytest.fixture +def ad_estimator(): + """Fixture providing a LocalOutlierFactorApplicabilityDomain instance.""" + return LocalOutlierFactorApplicabilityDomain() + + +def test_n_neighbors_effect(): + """Test effect of n_neighbors parameter on scores.""" + # Create data with clear outlier + X = np.vstack([np.random.randn(50, 2), [[10, 10]]]) + outlier = np.array([[10, 10]]) + + # Compare different n_neighbors settings + ad_small = LocalOutlierFactorApplicabilityDomain(n_neighbors=5) + ad_large = LocalOutlierFactorApplicabilityDomain(n_neighbors=20) + + ad_small.fit(X) + ad_large.fit(X) + + score_small = ad_small.transform(outlier) + score_large = ad_large.transform(outlier) + + # Scores should be different but both should identify the point as an outlier + assert score_small != score_large + assert ad_small.predict(outlier) == -1 + assert ad_large.predict(outlier) == -1 + + +def test_metric_parameter(): + """Test different metric parameters.""" + metrics = ["euclidean", "manhattan", "cosine"] + X = np.random.randn(10, 2) + + for metric in metrics: + ad = LocalOutlierFactorApplicabilityDomain(metric=metric) + ad.fit(X) + scores = ad.transform(X) + assert scores.shape == (10, 1) + + +def test_contamination_effect(): + """Test effect of contamination parameter on predictions.""" + X = np.random.randn(100, 2) + + # Compare different contamination levels + ad_low = LocalOutlierFactorApplicabilityDomain(contamination=0.1) + ad_high = LocalOutlierFactorApplicabilityDomain(contamination=0.2) + + ad_low.fit(X) + ad_high.fit(X) + + pred_low = ad_low.predict(X) + pred_high = 
ad_high.predict(X) + + # Higher contamination should result in more outliers + assert np.sum(pred_high == -1) > np.sum(pred_low == -1) diff --git a/tests/applicability/test_mahalanobis.py b/tests/applicability/test_mahalanobis.py new file mode 100644 index 0000000..5d0c31c --- /dev/null +++ b/tests/applicability/test_mahalanobis.py @@ -0,0 +1,66 @@ +"""Tests for MahalanobisApplicabilityDomain.""" + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal + +from scikit_mol.applicability import MahalanobisApplicabilityDomain + + +@pytest.fixture +def ad_estimator(): + """Fixture providing a MahalanobisApplicabilityDomain instance.""" + return MahalanobisApplicabilityDomain() + + +def test_statistical_threshold(): + """Test chi-square based statistical threshold.""" + # Create multivariate normal data + n_samples = 1000 + n_features = 3 + mean = np.zeros(n_features) + cov = np.eye(n_features) + X = np.random.multivariate_normal(mean, cov, n_samples) + + # Fit with statistical threshold + ad = MahalanobisApplicabilityDomain(percentile=None) + ad.fit(X) + + # For standard normal data, ~95% should be within threshold + predictions = ad.predict(X) + inside_ratio = np.mean(predictions == 1) + assert 0.93 <= inside_ratio <= 0.97 # Allow some variation + + +def test_mean_covariance(): + """Test mean and covariance computation.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + ad = MahalanobisApplicabilityDomain() + ad.fit(X) + + # Check mean computation + expected_mean = np.array([3, 4]) + assert_array_almost_equal(ad.mean_, expected_mean) + + # Check covariance computation + expected_cov = np.array([[4, 4], [4, 4]]) + assert_array_almost_equal(ad.covariance_, expected_cov) + + +def test_distance_properties(): + """Test properties of Mahalanobis distances.""" + # Create data with clear outlier + X = np.vstack([np.random.randn(50, 2), [[10, 10]]]) + outlier = np.array([[10, 10]]) + + ad = MahalanobisApplicabilityDomain() + ad.fit(X) + + # Distance 
to mean should be zero + mean_dist = ad.transform(ad.mean_.reshape(1, -1)) + assert_array_almost_equal(mean_dist, [[0]], decimal=10) + + # Outlier should have large distance and be predicted outside + outlier_dist = ad.transform(outlier) + assert outlier_dist[0, 0] > ad.threshold_ + assert ad.predict(outlier) == -1 diff --git a/tests/applicability/test_standardization.py b/tests/applicability/test_standardization.py new file mode 100644 index 0000000..fb9fb2a --- /dev/null +++ b/tests/applicability/test_standardization.py @@ -0,0 +1,76 @@ +"""Tests for StandardizationApplicabilityDomain.""" + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal + +from scikit_mol.applicability import StandardizationApplicabilityDomain + + +@pytest.fixture +def ad_estimator(): + """Fixture providing a StandardizationApplicabilityDomain instance.""" + return StandardizationApplicabilityDomain() + + +def test_statistical_threshold(): + """Test normal distribution based statistical threshold.""" + # Create standard normal data + n_samples = 1000 + n_features = 3 + X = np.random.randn(n_samples, n_features) + + # Fit with statistical threshold + ad = StandardizationApplicabilityDomain(percentile=None) + ad.fit(X) + + # For standard normal data, ~95% should be within threshold + predictions = ad.predict(X) + inside_ratio = np.mean(predictions == 1) + assert 0.93 <= inside_ratio <= 0.97 # Allow some variation + + +def test_standardization(): + """Test standardization of features.""" + X = np.array([[1, 2], [3, 4], [5, 6]]) + ad = StandardizationApplicabilityDomain() + ad.fit(X) + + # Transform data + X_std = ad.scaler_.transform(X) + + # Check standardization properties + assert_array_almost_equal(np.mean(X_std, axis=0), [0, 0]) + assert_array_almost_equal(np.std(X_std, axis=0), [1, 1]) + + +def test_max_absolute_score(): + """Test that scores are maximum absolute standardized values.""" + # Create data with known standardized values + X = np.array([[0, 
0], [1, 2], [3, -4]]) + ad = StandardizationApplicabilityDomain() + ad.fit(X) + + # Create test point with one extreme standardized value + X_test = np.array([[0, 10]]) # Second feature will be very large when standardized + scores = ad.transform(X_test) + + # Score should be the maximum absolute standardized value + X_std = ad.scaler_.transform(X_test) + expected_score = np.max(np.abs(X_std)) + assert_array_almost_equal(scores, [[expected_score]]) + + +def test_outlier_detection(): + """Test outlier detection on simple dataset.""" + # Create data with clear outlier + X = np.vstack([np.random.randn(50, 2), [[10, 10]]]) + outlier = np.array([[10, 10]]) + + ad = StandardizationApplicabilityDomain() + ad.fit(X) + + # Outlier should have high score and be predicted outside + outlier_score = ad.transform(outlier) + assert outlier_score[0, 0] > ad.threshold_ + assert ad.predict(outlier) == -1 diff --git a/tests/applicability/test_topkat.py b/tests/applicability/test_topkat.py new file mode 100644 index 0000000..4d3e792 --- /dev/null +++ b/tests/applicability/test_topkat.py @@ -0,0 +1,55 @@ +"""Tests for TopkatApplicabilityDomain.""" + +import numpy as np +import pytest +from numpy.testing import assert_array_almost_equal + +from scikit_mol.applicability import TopkatApplicabilityDomain + + +@pytest.fixture +def ad_estimator(): + """Fixture providing a TopkatApplicabilityDomain instance.""" + return TopkatApplicabilityDomain() + + +def test_ops_transformation(): + """Test OPS transformation and distance calculation.""" + # Create simple test data + X_train = np.array([[0, 0], [1, 1], [2, 2]]) + X_test = np.array([[0.5, 0.5], [10, 10]]) + + # Fit AD model + ad = TopkatApplicabilityDomain() + ad.fit(X_train) + + # Check distances + distances = ad.transform(X_test) + assert distances.shape == (2, 1) + assert distances[0] < distances[1] # Interpolated point should have lower distance + + +def test_fixed_threshold(): + """Test that threshold is based on dimensionality.""" + X 
= np.random.randn(10, 3) + ad = TopkatApplicabilityDomain() + ad.fit(X) + + # Check threshold formula + expected_threshold = 5 * (3 + 1) / (2 * 3) # n_features = 3 + assert_array_almost_equal(ad.threshold_, expected_threshold) + + +def test_eigendecomposition(): + """Test eigendecomposition properties.""" + X = np.random.randn(10, 2) + ad = TopkatApplicabilityDomain() + ad.fit(X) + + # Check eigenvalue/vector shapes + assert ad.eigen_val_.shape == (3,) # n_features + 1 + assert ad.eigen_vec_.shape == (3, 3) # (n_features + 1, n_features + 1) + + # Check eigenvalues are real and sorted + assert np.all(np.isreal(ad.eigen_val_)) + assert np.all(np.diff(ad.eigen_val_) >= 0) # Sorted in ascending order diff --git a/tests/conftest.py b/tests/conftest.py index 2359cd1..ee916ec 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,6 @@ import hashlib import shutil -from pathlib import Path -from pathlib import PurePath +from pathlib import Path, PurePath from urllib.parse import urlsplit from urllib.request import urlopen @@ -10,6 +9,14 @@ import sklearn +# Register custom marks +def pytest_configure(config): + config.addinivalue_line( + "markers", + "threshold_fitting: mark tests that verify threshold fitting functionality", + ) + + TEST_DATA_URL = "https://ndownloader.figshare.com/files/25747817" TEST_DATA_MD5 = "1ec89bde544c3c4bc400d5b75315921e" @@ -32,9 +39,7 @@ def data_pth(tmp_path_factory) -> Path: if not data_fn.is_file(): # download svs from openslide test images - with urlopen(TEST_DATA_URL) as response, open( - data_fn, "wb" - ) as out_file: + with urlopen(TEST_DATA_URL) as response, open(data_fn, "wb") as out_file: shutil.copyfileobj(response, out_file) if md5(data_fn) != TEST_DATA_MD5: # pragma: no cover @@ -43,13 +48,15 @@ def data_pth(tmp_path_factory) -> Path: yield data_fn.absolute() + @pytest.fixture() def data(data_pth) -> pd.DataFrame: yield pd.read_csv(data_pth) + @pytest.fixture(scope="module") def pandas_output(): """Set sklearn to output 
pandas dataframes""" sklearn.set_config(transform_output="pandas") yield - sklearn.set_config(transform_output="default") \ No newline at end of file + sklearn.set_config(transform_output="default") diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 0000000..81b8773 --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,91 @@ +import numpy as np +import pytest + +from scikit_mol.metrics import tanimoto_distance + +from .applicability.conftest import binary_fingerprints + + +@pytest.fixture +def simple_fingerprints(): + """Create simple binary fingerprints for testing.""" + return np.array( + [ + [1, 1, 0, 0], # fp0: 2 bits set + [1, 0, 1, 0], # fp1: 2 bits set, 1 in common with fp1 + [0, 0, 1, 1], # fp2: 2 bits set, 1 in common with fp2, none with fp1 + [1, 1, 1, 1], # fp3: all bits set + [0, 0, 0, 0], # fp4: no bits set + ], + dtype=bool, + ) + + +def test_tanimoto_distance_basic(simple_fingerprints): + """Test basic properties of Tanimoto distance.""" + distances = tanimoto_distance(simple_fingerprints[0], simple_fingerprints[1]) + + # Check distance range [0,1] + assert 0 <= distances <= 1 + + # Check specific distances + # fp0 vs fp1: 1 bit in common, 3 in union -> distance = 2/3 + assert np.isclose(distances, 2 / 3) + # fp0 vs fp2: no bits in common, 4 in union -> distance = 1 + assert np.isclose( + tanimoto_distance(simple_fingerprints[0], simple_fingerprints[2]), 1.0 + ) + # fp0 vs fp3: 2 bits in common, 4 in union -> distance = 0.5 + assert np.isclose( + tanimoto_distance(simple_fingerprints[0], simple_fingerprints[3]), 0.5 + ) + # fp0 vs fp4: no bits in common, 2 in union -> distance = 1 + assert np.isclose( + tanimoto_distance(simple_fingerprints[0], simple_fingerprints[4]), 1.0 + ) + + +def test_tanimoto_distance_edge_cases(simple_fingerprints): + """Test edge cases for Tanimoto distance.""" + empty = simple_fingerprints[4] # Empty fingerprint + full = simple_fingerprints[3] # Full fingerprint + + # Two empty fingerprints 
(fp4) + dist = tanimoto_distance(empty, empty) + # No bits in common, 0 in union -> distance = 0/0 = 0 in our implementation. + assert np.isclose(dist, 0.0) + + # Empty vs full fingerprint (fp3) + dist = tanimoto_distance(empty, full) + assert np.isclose(dist, 1.0) # No overlap -> maximum distance + + +# TODO, can rdkit speed things up? But not working with np.arrays +# def test_tanimoto_implementations_equivalent(simple_fingerprints): +# """Test that both implementations give equivalent results.""" +# X = simple_fingerprints[:2] +# Y = simple_fingerprints[2:4] + +# dist1 = tanimoto_distance(X, Y) +# dist2 = tanimoto_distance_rdkit(X, Y) + +# assert np.allclose(dist1, dist2) + + +# def test_tanimoto_distance_rdkit_basic(binary_fingerprints): +# """Test basic properties of RDKit-based Tanimoto distance.""" +# # Get a subset of fingerprints for testing +# X = binary_fingerprints[:3] +# Y = binary_fingerprints[3:6] + +# distances = tanimoto_distance_rdkit(X, Y) + +# # Check output shape +# assert distances.shape == (3, 3) + +# # Check distance range [0,1] +# assert np.all((0 <= distances) & (distances <= 1)) + +# # Check self-distance is 0 for identical fingerprints +# self_distances = tanimoto_distance_rdkit(X, X) +# assert np.allclose(np.diag(self_distances), 0) From c2a9dc4a02969d1fdbfe60e2026c73c6e8971b3e Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Wed, 12 Feb 2025 09:58:28 +0100 Subject: [PATCH 10/24] WIP on the adapter. Not fully there yet as some complications in the parallel helper functions. 
--- notebooks/12_applicability_domain.py | 298 +++++++++++++++++++++++++ scikit_mol/adapters.py | 322 ++++++++++++++------------- tests/test_adapters.py | 145 ++++++++++++ 3 files changed, 607 insertions(+), 158 deletions(-) create mode 100644 notebooks/12_applicability_domain.py create mode 100644 tests/test_adapters.py diff --git a/notebooks/12_applicability_domain.py b/notebooks/12_applicability_domain.py new file mode 100644 index 0000000..7baa492 --- /dev/null +++ b/notebooks/12_applicability_domain.py @@ -0,0 +1,298 @@ +# --- +# jupyter: +# jupytext: +# cell_metadata_filter: -all +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.6 +# --- + +# %% +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Applicability Domain Estimation +# +# This notebook demonstrates how to use scikit-mol's applicability domain estimators to assess whether new compounds are within the domain of applicability of a trained model. +# +# We'll explore two different approaches: +# 1. Using Morgan binary fingerprints with a k-Nearest Neighbors based applicability domain +# 2. 
Using count-based Morgan fingerprints with dimensionality reduction and a leverage-based applicability domain +# +# First, let's import the necessary libraries and load our dataset: + +# %% +import numpy as np +import pandas as pd +from rdkit import Chem +from rdkit.Chem import PandasTools +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA + +from scikit_mol.conversions import SmilesToMolTransformer +from scikit_mol.fingerprints import MorganFingerprintTransformer +from scikit_mol.applicability import KNNApplicabilityDomain, LeverageApplicabilityDomain + +# %% [markdown] +# ## Load and Prepare Data + +# %% +# Load the dataset +csv_file = "../tests/data/SLC6A4_active_excapedb_subset.csv" +data = pd.read_csv(csv_file) + +# Add RDKit mol objects +PandasTools.AddMoleculeColumnToFrame(data, smilesCol="SMILES") +print(f"{data.ROMol.isna().sum()} out of {len(data)} SMILES failed in conversion") + +# Split into train/val/test +X = data.ROMol +y = data.pXC50 + +X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42) +X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42) + +# %% [markdown] +# ## Example 1: k-NN Applicability Domain with Binary Morgan Fingerprints +# +# In this example, we'll use binary Morgan fingerprints and a k-NN based applicability domain with Tanimoto distance. +# This is particularly suitable for binary fingerprints as the Tanimoto coefficient is a natural similarity measure for them. 
+ +# %% +# Create pipeline for binary fingerprints +binary_fp_pipe = Pipeline([ + ('fp', MorganFingerprintTransformer(fpSize=2048, radius=2)), + ('rf', RandomForestRegressor(n_estimators=100, random_state=42)) +]) + +# Train the model +binary_fp_pipe.fit(X_train, y_train) + +# Get predictions and errors +y_pred_test = binary_fp_pipe.predict(X_test) +abs_errors = np.abs(y_test - y_pred_test) + +# Create and fit k-NN AD estimator +knn_ad = KNNApplicabilityDomain(n_neighbors=3, distance_metric='tanimoto') +knn_ad.fit(binary_fp_pipe.named_steps['fp'].transform(X_train)) + +# Fit threshold using validation set +knn_ad.fit_threshold(binary_fp_pipe.named_steps['fp'].transform(X_val)) + +# Get AD scores for test set +knn_scores = knn_ad.transform(binary_fp_pipe.named_steps['fp'].transform(X_test)) + +# %% [markdown] +# Let's visualize the relationship between prediction errors and AD scores: + +# %% +plt.figure(figsize=(10, 6)) +plt.scatter(knn_scores, abs_errors, alpha=0.5) +plt.axvline(x=knn_ad.threshold_, color='r', linestyle='--', label='AD Threshold') +plt.xlabel('k-NN AD Score') +plt.ylabel('Absolute Prediction Error') +plt.title('Prediction Errors vs k-NN AD Scores') +plt.legend() +plt.show() + +# Calculate error statistics +in_domain = knn_ad.predict(binary_fp_pipe.named_steps['fp'].transform(X_test)) +errors_in = abs_errors[in_domain == 1] +errors_out = abs_errors[in_domain == -1] + +print(f"95th percentile of errors inside domain: {np.percentile(errors_in, 95):.2f}") +print(f"95th percentile of errors outside domain: {np.percentile(errors_out, 95):.2f}") +print(f"Fraction of samples outside domain: {(in_domain == -1).mean():.2f}") + +# %% [markdown] +# ## Example 2: Leverage-based AD with Count-based Morgan Fingerprints +# +# In this example, we'll use count-based Morgan fingerprints, reduce their dimensionality with PCA, +# and apply a leverage-based applicability domain estimator. 
+ +# %% +# Create pipeline for count-based fingerprints with PCA +count_fp_pipe = Pipeline([ + ('fp', MorganFingerprintTransformer(fpSize=2048, radius=2, useCounts=True)), + ('pca', PCA(n_components=0.9)), # Keep 90% of variance + ('scaler', StandardScaler()), + ('rf', RandomForestRegressor(n_estimators=100, random_state=42)) +]) + +# Train the model +count_fp_pipe.fit(X_train, y_train) + +# Get predictions and errors +y_pred_test = count_fp_pipe.predict(X_test) +abs_errors = np.abs(y_test - y_pred_test) + +# Create and fit leverage AD estimator +leverage_ad = LeverageApplicabilityDomain() +X_train_transformed = count_fp_pipe.named_steps['scaler'].transform( + count_fp_pipe.named_steps['pca'].transform( + count_fp_pipe.named_steps['fp'].transform(X_train) + ) +) +leverage_ad.fit(X_train_transformed) + +# Fit threshold using validation set +X_val_transformed = count_fp_pipe.named_steps['scaler'].transform( + count_fp_pipe.named_steps['pca'].transform( + count_fp_pipe.named_steps['fp'].transform(X_val) + ) +) +leverage_ad.fit_threshold(X_val_transformed) + +# Get AD scores for test set +X_test_transformed = count_fp_pipe.named_steps['scaler'].transform( + count_fp_pipe.named_steps['pca'].transform( + count_fp_pipe.named_steps['fp'].transform(X_test) + ) +) +leverage_scores = leverage_ad.transform(X_test_transformed) + +# %% [markdown] +# Visualize the relationship between prediction errors and leverage scores: + +# %% +plt.figure(figsize=(10, 6)) +plt.scatter(leverage_scores, abs_errors, alpha=0.5) +plt.axvline(x=leverage_ad.threshold_, color='r', linestyle='--', label='AD Threshold') +plt.xlabel('Leverage AD Score') +plt.ylabel('Absolute Prediction Error') +plt.title('Prediction Errors vs Leverage Scores') +plt.legend() +plt.show() + +# Calculate error statistics +in_domain = leverage_ad.predict(X_test_transformed) +errors_in = abs_errors[in_domain == 1] +errors_out = abs_errors[in_domain == -1] + +print(f"95th percentile of errors inside domain: 
{np.percentile(errors_in, 95):.2f}") +print(f"95th percentile of errors outside domain: {np.percentile(errors_out, 95):.2f}") +print(f"Fraction of samples outside domain: {(in_domain == -1).mean():.2f}") + +# %% [markdown] +# ## Testing Famous Drugs +# +# Let's test some well-known drugs to see if they fall within our model's applicability domain: + +# %% +# Define famous drugs +famous_drugs = { + 'Aspirin': 'CC(=O)OC1=CC=CC=C1C(=O)O', + 'Viagra': 'CCc1nn(C)c2c(=O)[nH]c(nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4', + 'Heroin': 'CN1CC[C@]23[C@H]4Oc5c(O)ccc(CC1[C@H]2C=C[C@@H]4O3)c5', +} + +# Function to process a drug through both AD pipelines +def check_drug_applicability(smiles, name): + mol = Chem.MolFromSmiles(smiles) + + # k-NN AD + fp_binary = binary_fp_pipe.named_steps['fp'].transform([mol]) + knn_score = knn_ad.transform(fp_binary)[0][0] + knn_status = "Inside" if knn_ad.predict(fp_binary)[0] == 1 else "Outside" + + # Leverage AD + fp_count = count_fp_pipe.named_steps['fp'].transform([mol]) + fp_pca = count_fp_pipe.named_steps['pca'].transform(fp_count) + fp_scaled = count_fp_pipe.named_steps['scaler'].transform(fp_pca) + leverage_score = leverage_ad.transform(fp_scaled)[0][0] + leverage_status = "Inside" if leverage_ad.predict(fp_scaled)[0] == 1 else "Outside" + + return { + 'knn_score': knn_score, + 'knn_status': knn_status, + 'leverage_score': leverage_score, + 'leverage_status': leverage_status + } + +# Process each drug +results = [] +for name, smiles in famous_drugs.items(): + result = check_drug_applicability(smiles, name) + results.append({ + 'Drug': name, + 'k-NN Score': result['knn_score'], + 'k-NN Status': result['knn_status'], + 'Leverage Score': result['leverage_score'], + 'Leverage Status': result['leverage_status'] + }) + +# Display results +pd.DataFrame(results).set_index('Drug') + +# %% [markdown] +# Let's visualize where these drugs fall in our AD plots: + +# %% +# Plot for k-NN AD +plt.figure(figsize=(12, 5)) +plt.subplot(1, 2, 1) 
+plt.scatter(knn_scores, abs_errors, alpha=0.2, label='Test compounds') +plt.axvline(x=knn_ad.threshold_, color='r', linestyle='--', label='AD Threshold') + +for result in results: + plt.axvline(x=result['k-NN Score'], color='g', alpha=0.5, + label=f"{result['Drug']}") + +plt.xlabel('k-NN AD Score') +plt.ylabel('Absolute Prediction Error') +plt.title('k-NN AD Scores') +plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + +# Plot for Leverage AD +plt.subplot(1, 2, 2) +plt.scatter(leverage_scores, abs_errors, alpha=0.2, label='Test compounds') +plt.axvline(x=leverage_ad.threshold_, color='r', linestyle='--', label='AD Threshold') + +for result in results: + plt.axvline(x=result['Leverage Score'], color='g', alpha=0.5, + label=f"{result['Drug']}") + +plt.xlabel('Leverage AD Score') +plt.ylabel('Absolute Prediction Error') +plt.title('Leverage AD Scores') +plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + +plt.tight_layout() +plt.show() + +# %% [markdown] +# ## Conclusions +# +# This notebook demonstrated two different approaches to applicability domain estimation: +# +# 1. The k-NN based approach with binary fingerprints and Tanimoto distance provides a chemical similarity-based assessment +# of whether new compounds are similar enough to the training set. +# +# 2. The leverage-based approach with count-based fingerprints and dimensionality reduction focuses on the statistical +# novelty of compounds in the reduced feature space. +# +# The famous drugs we tested showed varying degrees of being within the applicability domain, which makes sense given +# that our training set is focused on SLC6A4 actives, while these drugs have different primary targets. +# +# The error analysis shows that compounds outside the applicability domain tend to have higher prediction errors, +# validating the usefulness of these approaches for identifying potentially unreliable predictions. 
diff --git a/scikit_mol/adapters.py b/scikit_mol/adapters.py index e5451f9..4bedd40 100644 --- a/scikit_mol/adapters.py +++ b/scikit_mol/adapters.py @@ -1,175 +1,181 @@ -import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin -from sklearn.utils._pprint import _EstimatorPrettyPrinter -from sklearn.utils._param_validation import validate_params -from sklearn.utils.metaestimators import available_if -from sklearn.utils.validation import check_is_fitted, check_array -from sklearn.utils._estimator_html_repr import _VisualBlock -from sklearn.utils._set_output import _safe_set_output -from scipy.stats import chi2 - - -class EstimatorUnion(BaseEstimator, TransformerMixin): - def __init__(self, estimators): - self.estimators = estimators - - def fit(self, X, y=None): - self.fitted_estimators_ = [] - for name, estimator in self.estimators: - if hasattr(estimator, "fit"): - fitted_estimator = estimator.fit(X, y) - self.fitted_estimators_.append((name, fitted_estimator)) - else: - self.fitted_estimators_.append((name, estimator)) - return self - - def transform(self, X): - check_is_fitted(self) - results = [] - for name, estimator in self.fitted_estimators_: - if hasattr(estimator, "predict"): - results.append(estimator.predict(X)) - elif hasattr(estimator, "transform"): - results.append(estimator.transform(X)) - return np.column_stack(results) - - def predict(self, X): - return self.transform(X) +from typing import Dict, List, Optional, Tuple - @available_if(lambda self: hasattr(self, "fitted_estimators_")) - def get_feature_names_out(self, input_features=None): - feature_names = [] - for name, estimator in self.fitted_estimators_: - if hasattr(estimator, "get_feature_names_out"): - feature_names.extend(estimator.get_feature_names_out()) - else: - feature_names.append(name) - return np.array(feature_names) +import numpy as np +from numpy.typing import NDArray +from sklearn.base import BaseEstimator +from sklearn.pipeline import 
FeatureUnion + + +class EstimatorUnion(FeatureUnion): + """A more flexible version of FeatureUnion that supports various estimator types. + + This class extends scikit-learn's FeatureUnion to support estimators with different + method interfaces (predict, transform, etc.) and allows explicit method selection. + It maintains all functionality of FeatureUnion while adding flexible method resolution. + + Parameters + ---------- + estimators : list of (str, estimator) tuples + List of (name, estimator) tuples, where estimator is any scikit-learn + compatible estimator with at least one of the methods specified in + method_resolution_order. + method_resolution_order : tuple of str, default=("predict", "transform") + Ordered tuple of method names to try when getting output from estimators. + Methods are tried in order until a valid one is found. + selected_methods : dict or None, default=None + Optional mapping of estimator names to specific methods to use. Takes + precedence over method_resolution_order. + n_jobs : int or None, default=None + Number of jobs to run in parallel. None means 1. + transformer_weights : dict or None, default=None + Multiplicative weights for features per transformer. Keys are transformer + names, values are weights. + verbose : bool, default=False + If True, the time elapsed while fitting each transformer will be printed. + + Attributes + ---------- + transformers_ : list + List of fitted transformers. + + Notes + ----- + This class inherits from FeatureUnion and maintains all its functionality including + parallel processing, transformer weights, and metadata routing. The key extension is + the ability to handle estimators with different method interfaces through configurable + method resolution. + + See Also + -------- + sklearn.pipeline.FeatureUnion : The parent class providing base functionality. + """ - def set_output(self, *, transform=None): - """Set output container for all estimators. 
+ def __init__( + self, + estimator_list: List[Tuple[str, BaseEstimator]], + *, + method_resolution_order: Tuple[str, ...] = ("predict", "transform"), + selected_methods: Optional[Dict[str, str]] = None, + n_jobs: Optional[int] = None, + transformer_weights: Optional[Dict[str, float]] = None, + verbose: bool = False, + ) -> None: + # Store all parameters as properties + self.estimator_list = estimator_list + self.method_resolution_order = method_resolution_order + self.selected_methods = selected_methods or {} + self.n_jobs = n_jobs + self.transformer_weights = transformer_weights + self.verbose = verbose + + @property + def estimator_list(self) -> List[Tuple[str, BaseEstimator]]: + """Get estimators (alias for transformer_list).""" + return self.transformer_list + + @estimator_list.setter + def estimator_list(self, estimator_list: List[Tuple[str, BaseEstimator]]) -> None: + """Set estimators (and internal transformer_list property).""" + self.transformer_list = estimator_list + + def _get_method_name(self, estimator_tuple: Tuple[str, BaseEstimator]) -> str: + """Get the appropriate method name for the estimator, raising clear errors if not found. Parameters ---------- - transform : {"default", "pandas"}, default=None - Configure output of `transform` and `fit_transform`. + estimator_tuple : tuple of (str, estimator) + Tuple containing the estimator name and instance. Returns ------- - self : estimator instance - Estimator instance. 
- """ - for _, estimator in self.estimators: - _safe_set_output(estimator, transform=transform) - return super().set_output(transform=transform) - - def __repr__(self): - class_name = self.__class__.__name__ - estimator_reprs = [] - for name, estimator in self.estimators: - estimator_repr = f"{name}={estimator.__repr__()}" - estimator_reprs.append(estimator_repr) - estimators_str = ",\n".join(estimator_reprs) - return f"{class_name}([\n{estimators_str}\n])" - - def _sk_visual_block_(self): - names, transformers = zip(*self.estimators) - return _VisualBlock("parallel", transformers, names=names) - - -class SigmoidThresholdTransformer(BaseEstimator, TransformerMixin): - def __init__(self, threshold, steepness=1, feature_name="Sigmoid_", prefix=True): - self.threshold = threshold - self.steepness = steepness - self.feature_name = feature_name - self.prefix = prefix - - def fit(self, X, y=None): - return self - - def transform(self, X): - return 1 / (1 + np.exp(self.steepness * (X - self.threshold))) - - def predict(self, X): - return self.transform(X) + str + Name of the method to use for this estimator. - @available_if(lambda self: hasattr(self, "fitted_estimators_")) - def get_feature_names_out(self, input_features=None): - check_is_fitted(self) - - if input_features is None: - if ( - hasattr(self, "feature_names_in_") - and self.feature_names_in_ is not None - ): - input_features = self.feature_names_in_ - else: - input_features = [f"x{i}" for i in range(self.n_features_in_)] - - if self.feature_name: - if self.prefix: - return np.array( - [f"{self.feature_name}{feature}" for feature in input_features] + Raises + ------ + ValueError + If no valid method is found for the estimator. 
+ """ + name, estimator = estimator_tuple + + # Check explicit method if specified + if name in self.selected_methods: + method = self.selected_methods[name] + if not hasattr(estimator, method): + raise ValueError( + f"Estimator '{name}' ({type(estimator).__name__}) does not have " + f"explicitly selected method '{method}'. Consider changing selected_methods " + f"or using only method_resolution_order to specify valid methods." ) - else: - if len(input_features) > 1: - return np.array( - [f"{self.feature_name}{i}" for i in range(len(input_features))] - ) - else: - return np.array([self.feature_name]) - else: - return np.array(input_features) - - -class NullEstimator(BaseEstimator, TransformerMixin, OneToOneFeatureMixin): - def __init__( - self, - accept_sparse=False, - ): - self.accept_sparse = accept_sparse - - def fit(self, X, y=None): - # Check and store the input - self.X_ = check_array( - X, accept_sparse=self.accept_sparse, force_all_finite="allow-nan" - ) - self.n_features_in_ = self.X_.shape[1] - self.feature_names_in_ = getattr(X, "columns", None) - return self - - def transform(self, X): - check_is_fitted(self) - X = check_array( - X, accept_sparse=self.accept_sparse, force_all_finite="allow-nan" + return method + + # Try methods in resolution order + for method in self.method_resolution_order: + if hasattr(estimator, method): + return method + + raise ValueError( + f"Estimator '{name}' ({type(estimator).__name__}) does not have any of " + f"the methods: {', '.join(self.method_resolution_order)}. Consider using " + f"method_resolution_order or selected_methods to specify valid methods." ) - # Check that the input is of the same shape as the one passed during fit. 
- if X.shape[1] != self.n_features_in_: - raise ValueError( - f"Shape of input is different from what was seen in `fit`" - f" Expected {self.n_features_in_} features, got {X.shape[1]}" - ) - return X + def _get_estimator_output( + self, estimator_tuple: Tuple[str, BaseEstimator], X: NDArray + ) -> NDArray: + """Get output from estimator using appropriate method.""" + name, estimator = estimator_tuple + method = self._get_method_name(estimator_tuple) + output = getattr(estimator, method)(X) - def predict(self, X): - return self.transform(X) + # Ensure 2D output + if output.ndim == 1: + output = output.reshape(-1, 1) + return output + + def transform(self, X: NDArray) -> NDArray: + """Transform X using the selected method for each estimator. - @available_if(lambda self: hasattr(self, "fitted_estimators_")) - def get_feature_names_out(self, input_features=None): - check_is_fitted(self) + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data to be transformed. + + Returns + ------- + ndarray of shape (n_samples, sum_n_output_features) + Horizontally stacked results of all estimators. + sum_n_output_features is the sum of n_output_features for each + estimator. + """ + Xs = self._parallel_func(X, self._get_estimator_output) + if not Xs: + # All transformers are None + return np.zeros((X.shape[0], 0)) + + if self.transformer_weights is not None: + Xs = [ + (Xs[name] * self.transformer_weights[name]) + if name in self.transformer_weights + else Xs[name] + for name in self._iter() + ] + + return np.hstack(Xs) - # Do I need to heck that the size of input_features is correct? - # if len(input_features) != self.n_features_out_: - # raise ValueError(f"Expected {self.n_features_in_} features, got {len(input_features)}") + def predict(self, X: NDArray) -> NDArray: + """Predict using all estimators. 
- if input_features: - return input_features - else: - return np.array([f"x{i}" for i in range(self.n_features_in_)]) + Alias for transform to maintain predictor interface. - def _more_tags(self): - return { - "allow_nan": True, - "X_types": ["2darray"] + (["sparse"] if self.accept_sparse else []), - } + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Input data to be predicted. + + Returns + ------- + ndarray of shape (n_samples, sum_n_output_features) + Horizontally stacked predictions of all estimators. + """ + return self.transform(X) diff --git a/tests/test_adapters.py b/tests/test_adapters.py new file mode 100644 index 0000000..340c986 --- /dev/null +++ b/tests/test_adapters.py @@ -0,0 +1,145 @@ +"""Tests for EstimatorUnion adapter.""" + +import numpy as np +import pandas as pd +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.preprocessing import StandardScaler + +from scikit_mol.adapters import EstimatorUnion +from scikit_mol.applicability import ( + MahalanobisApplicabilityDomain, + StandardizationApplicabilityDomain, +) + +# Use existing fixtures +from .fixtures import ( + atompair_transformer, + mols_list, + morgan_transformer, + skip_pandas_output_test, +) + + +def test_estimator_union_basic(morgan_transformer, atompair_transformer): + """Test basic functionality of EstimatorUnion.""" + + union = EstimatorUnion( + [ + ("fp1", morgan_transformer), + ( + "fp2", + atompair_transformer, + ), # Different radius for different features + ] + ) + + # Test unfitted raises exception + with pytest.raises(NotFittedError): + union.transform(mols_list) + + # Test fit and transform + union.fit(mols_list) + features = union.transform(mols_list) + + # Check output shape + n_fp = morgan_transformer().fpSize + assert features.shape == (len(mols_list), n_fp * 2) + + +def test_estimator_union_with_ad(morgan_transformer): + """Test EstimatorUnion with applicability domain estimator.""" + union = EstimatorUnion( + [ + 
("fp", morgan_transformer), + ("ad", MahalanobisApplicabilityDomain()), + ], + method_resolution_order=("transform", "transform_score"), + ) + + union.fit(mols_list) + features = union.transform(mols_list) + + # Check output shape (fingerprints + 1 AD score) + n_fp = morgan_transformer().fpSize + assert features.shape == (len(mols_list), n_fp + 1) + + +def test_estimator_union_mixed_methods(morgan_transformer): + """Test EstimatorUnion with different methods specified.""" + union = EstimatorUnion( + [ + ("scale", StandardScaler(), "transform"), + ("ad", StandardizationApplicabilityDomain(), "transform_score"), + ("fp", morgan_transformer, "transform"), + ] + ) + + # Create some numeric data for StandardScaler + X = morgan_transformer.fit_transform(mols_list) + union.fit(X) + features = union.transform(X) + + # Check output shape + assert features.shape[0] == len(X) + assert features.shape[1] == X.shape[1] * 2 + 1 # scaled + fp + 1 AD score + + +@skip_pandas_output_test +def test_estimator_union_pandas_output(pandas_output, morgan_transformer): + """Test pandas DataFrame output from EstimatorUnion.""" + union = EstimatorUnion( + [ + ("fp", morgan_transformer), + ("ad", MahalanobisApplicabilityDomain(), "transform_score"), + ] + ) + + union.fit(mols_list) + features = union.transform(mols_list) + + # Check output type and structure + assert isinstance(features, pd.DataFrame) + assert len(features) == len(mols_list) + + # Check column names + fp_cols = [f"fp_{i}" for i in range(morgan_transformer.fpSize)] + expected_cols = fp_cols + ["Mahalanobis"] + assert features.columns.tolist() == expected_cols + + +def test_estimator_union_invalid_method(morgan_transformer): + """Test EstimatorUnion with invalid method specification.""" + with pytest.raises(ValueError): + EstimatorUnion([("fp", morgan_transformer, "invalid_method")]) + + +def test_estimator_union_get_feature_names_out(morgan_transformer): + """Test feature names output from EstimatorUnion.""" + union = 
EstimatorUnion( + [ + ("fp", morgan_transformer), + ("ad", MahalanobisApplicabilityDomain(), "transform_score"), + ] + ) + + union.fit(mols_list) + feature_names = union.get_feature_names_out() + + # Check number and format of feature names + n_fp = morgan_transformer().fpSize + assert len(feature_names) == n_fp + 1 + assert all(name.startswith("fp_") for name in feature_names[:-1]) + assert feature_names[-1] == "Mahalanobis" + + +def test_estimator_union_partial_fit(morgan_transformer): + """Test EstimatorUnion with some estimators already fitted.""" + fp = morgan_transformer.fit(mols_list) + ad = MahalanobisApplicabilityDomain() + + union = EstimatorUnion([("fp", fp), ("ad", ad, "transform_score")]) + + # Should work since fp is already fitted + features = union.fit_transform(mols_list) + assert features.shape == (len(mols_list), fp.fpSize + 1) From f2ab158d22a07c19b7c86e50abfd13843f6e0b41 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Wed, 19 Feb 2025 20:00:23 +0100 Subject: [PATCH 11/24] work in progress on adapters --- scikit_mol/adapters.py | 262 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 244 insertions(+), 18 deletions(-) diff --git a/scikit_mol/adapters.py b/scikit_mol/adapters.py index 4bedd40..9830472 100644 --- a/scikit_mol/adapters.py +++ b/scikit_mol/adapters.py @@ -2,8 +2,17 @@ import numpy as np from numpy.typing import NDArray -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.pipeline import FeatureUnion +from sklearn.utils import Bunch +from sklearn.utils.metadata_routing import ( + _raise_for_params, + _routing_enabled, + process_routing, +) +from sklearn.utils.parallel import Parallel as skParallel +from sklearn.utils.parallel import delayed +from sklearn.utils.validation import check_is_fitted class EstimatorUnion(FeatureUnion): @@ -133,35 +142,108 @@ def _get_estimator_output( output = output.reshape(-1, 1) return output - def transform(self, X: NDArray) -> 
NDArray: - """Transform X using the selected method for each estimator. + # def transform_old(self, X: NDArray) -> NDArray: + # """Transform X using the selected method for each estimator. + + # Parameters + # ---------- + # X : array-like of shape (n_samples, n_features) + # Input data to be transformed. + + # Returns + # ------- + # ndarray of shape (n_samples, sum_n_output_features) + # Horizontally stacked results of all estimators. + # sum_n_output_features is the sum of n_output_features for each + # estimator. + # """ + # Xs = self._parallel_func(X, self._get_estimator_output) + # if not Xs: + # # All transformers are None + # return np.zeros((X.shape[0], 0)) + + # if self.transformer_weights is not None: + # Xs = [ + # (Xs[name] * self.transformer_weights[name]) + # if name in self.transformer_weights + # else Xs[name] + # for name in self._iter() + # ] + + # return np.hstack(Xs) + def _validate_transformers(self): + names, transformers = zip(*self.transformer_list) + + # validate names + self._validate_names(names) + + # validate estimators + for t in transformers: + if t in ("drop", "passthrough"): + continue + # TODO, make a check that the methods in the method_resolution_order /method mappting are present + # if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr( + # t, "transform" + # ): + # raise TypeError( + # "All estimators should implement fit and " + # "transform. '%s' (type %s) doesn't" % (t, type(t)) + # ) + + def transform(self, X, **params): + """Transform X separately by each transformer, concatenate results. Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : iterable or array-like, depending on transformers Input data to be transformed. + **params : dict, default=None + + Parameters routed to the `transform` method of the sub-transformers via the + metadata routing API. See :ref:`Metadata Routing User Guide + ` for more details. + + .. 
versionadded:: 1.5 + Returns ------- - ndarray of shape (n_samples, sum_n_output_features) - Horizontally stacked results of all estimators. - sum_n_output_features is the sum of n_output_features for each - estimator. + X_t : array-like or sparse matrix of shape (n_samples, sum_n_components) + The `hstack` of results of transformers. `sum_n_components` is the + sum of `n_components` (output dimension) over transformers. """ - Xs = self._parallel_func(X, self._get_estimator_output) + _raise_for_params(params, self, "transform") + + if _routing_enabled(): + routed_params = process_routing(self, "transform", **params) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + routed_params = Bunch() + for name, _ in self.transformer_list: + routed_params[name] = Bunch(transform={}) + + # Build delayed jobs with custom methods + delayed_jobs = [] + for name, trans, weight in self._iter(): + method_name = self._get_method_name((name, trans)) + delayed_jobs.append( + delayed(_transform_one)( + trans, + X, + None, + weight, + params=routed_params[name], + method=method_name, + ) + ) + + Xs = skParallel(n_jobs=self.n_jobs)(delayed_jobs) + if not Xs: # All transformers are None return np.zeros((X.shape[0], 0)) - if self.transformer_weights is not None: - Xs = [ - (Xs[name] * self.transformer_weights[name]) - if name in self.transformer_weights - else Xs[name] - for name in self._iter() - ] - - return np.hstack(Xs) + return self._hstack(Xs) def predict(self, X: NDArray) -> NDArray: """Predict using all estimators. @@ -179,3 +261,147 @@ def predict(self, X: NDArray) -> NDArray: Horizontally stacked predictions of all estimators. """ return self.transform(X) + + +def _transform_one(transformer, X, y, weight, params=None, method="transform"): + """Call transform and apply weight to output. + + Parameters + ---------- + transformer : estimator + Estimator to be used for transformation. 
+ + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data to be transformed. + + y : ndarray of shape (n_samples,) + Ignored. + + weight : float + Weight to be applied to the output of the transformation. + + method : str + Method to use for transformation (e.g. "transform", "predict", "predict_proba"). + + params : dict + Parameters to be passed to the transformer's ``transform`` method. + + This should be of the form ``process_routing()["step_name"]``. + """ + res = getattr(transformer, method)(X, **params.transform) + # Ensure 2D output + if res.ndim == 1: + res = res.reshape(-1, 1) + # if we have a weight for this transformer, multiply output + if weight is None: + return res + return res * weight + + +# def _fit_transform_one( +# transformer, X, y, weight, message_clsname="", message=None, params=None, method="transform" +# ): +# """ +# Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned +# with the fitted transformer. If ``weight`` is not ``None``, the result will +# be multiplied by ``weight``. + +# ``params`` needs to be of the form ``process_routing()["step_name"]``. +# """ +# params = params or {} +# with _print_elapsed_time(message_clsname, message): +# if hasattr(transformer, "fit_transform"): +# res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) +# else: +# res = transformer.fit(X, y, **params.get("fit", {})).transform( +# X, **params.get("transform", {}) +# ) + +# if weight is None: +# return res, transformer +# return res * weight, transformer + + +class PredictToTransformAdapter(TransformerMixin, BaseEstimator): + """Adapter that exposes an estimator's predict method as transform. + + Parameters + ---------- + estimator : BaseEstimator + Estimator with a predict method. + method : str, default="predict" + The method to use for transformation (e.g., "predict", "predict_proba"). 
+ """ + + def __init__(self, estimator: BaseEstimator, method: str = "predict"): + self.estimator = estimator + self.method = method + + def fit(self, X, y=None): + self.estimator.fit(X, y) + return self + + def transform(self, X): + check_is_fitted(self) + return getattr(self.estimator, self.method)(X) + + def get_feature_names_out(self, input_features=None): + """Delegate feature names to wrapped estimator if available.""" + if hasattr(self.estimator, "get_feature_names_out"): + return self.estimator.get_feature_names_out(input_features) + return None + + def __sklearn_is_fitted__(self): + """Delegate fit check to wrapped estimator.""" + try: + check_is_fitted(self.estimator) + return True + except ValueError: + return False + + def _repr_html_(self): + """HTML representation for notebooks.""" + if hasattr(self.estimator, "_repr_html_"): + return f"
PredictToTransformAdapter using method '{self.method}' on:
{self.estimator._repr_html_()}
" + return f"
PredictToTransformAdapter(method='{self.method}', estimator={self.estimator})
" + + +class TransformToPredictAdapter(BaseEstimator): + """Adapter that exposes an estimator's transform method as predict. + + Parameters + ---------- + transformer : BaseEstimator + Estimator with a transform method. + """ + + def __init__(self, transformer: BaseEstimator): + self.transformer = transformer + + def fit(self, X, y=None): + self.transformer.fit(X, y) + return self + + def predict(self, X): + check_is_fitted(self) + return self.transformer.transform(X) + + def get_feature_names_out(self, input_features=None): + """Delegate feature names to wrapped transformer if available.""" + if hasattr(self.transformer, "get_feature_names_out"): + return self.transformer.get_feature_names_out(input_features) + return None + + def __sklearn_is_fitted__(self): + """Delegate fit check to wrapped transformer.""" + try: + check_is_fitted(self.transformer) + return True + except ValueError: + return False + + def _repr_html_(self): + """HTML representation for notebooks.""" + if hasattr(self.transformer, "_repr_html_"): + return f"
TransformToPredictAdapter on:
{self.transformer._repr_html_()}
" + return f"
TransformToPredictAdapter(transformer={self.transformer})
" From 42739365cb13be4e80314710101cd8caf798da94 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 22 Feb 2025 10:40:23 +0100 Subject: [PATCH 12/24] It seems to be getting there with the EstimatorUnion. Got feature_names to work. --- scikit_mol/adapters.py | 40 +++++++++++++++++++++++++++++++- scikit_mol/applicability/base.py | 11 +++++++-- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/scikit_mol/adapters.py b/scikit_mol/adapters.py index 9830472..6ad54d8 100644 --- a/scikit_mol/adapters.py +++ b/scikit_mol/adapters.py @@ -16,7 +16,7 @@ class EstimatorUnion(FeatureUnion): - """A more flexible version of FeatureUnion that supports various estimator types. + """EXPERIMENTAL: more flexible version of FeatureUnion that supports various estimator types. This class extends scikit-learn's FeatureUnion to support estimators with different method interfaces (predict, transform, etc.) and allows explicit method selection. @@ -41,6 +41,8 @@ class EstimatorUnion(FeatureUnion): names, values are weights. verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed. + verbose_feature_names_out : bool, default=True + If True, the feature names out will be verbose. Attributes ---------- @@ -68,6 +70,7 @@ def __init__( n_jobs: Optional[int] = None, transformer_weights: Optional[Dict[str, float]] = None, verbose: bool = False, + verbose_feature_names_out: bool = True, ) -> None: # Store all parameters as properties self.estimator_list = estimator_list @@ -76,6 +79,7 @@ def __init__( self.n_jobs = n_jobs self.transformer_weights = transformer_weights self.verbose = verbose + self.verbose_feature_names_out = verbose_feature_names_out @property def estimator_list(self) -> List[Tuple[str, BaseEstimator]]: @@ -262,6 +266,40 @@ def predict(self, X: NDArray) -> NDArray: """ return self.transform(X) + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. 
+ + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + # List of tuples (name, feature_names_out) + transformer_with_feature_names_out = [] + for name, trans, _ in self._iter(): + if hasattr(trans, "predict") and not hasattr( + trans, "get_feature_names_out" + ): + # Assume predictors only return 1D output and thus we use their name as feature_name + feature_names_out = np.array([name]) + elif not hasattr(trans, "get_feature_names_out"): + raise AttributeError( + "Transformer %s (type %s) does not provide get_feature_names_out." + % (str(name), type(trans).__name__) + ) + else: + feature_names_out = trans.get_feature_names_out(input_features) + transformer_with_feature_names_out.append((name, feature_names_out)) + + return self._add_prefix_for_feature_names_out( + transformer_with_feature_names_out + ) + def _transform_one(transformer, X, y, weight, params=None, method="transform"): """Call transform and apply weight to output. diff --git a/scikit_mol/applicability/base.py b/scikit_mol/applicability/base.py index c8cbb62..813a8b1 100644 --- a/scikit_mol/applicability/base.py +++ b/scikit_mol/applicability/base.py @@ -203,7 +203,13 @@ def _transform(self, X: NDArray) -> NDArray[np.float64]: def predict( self, X: Union[ArrayLike, pd.DataFrame] ) -> Union[NDArray[np.int_], pd.DataFrame]: - """Predict whether samples are within the applicability domain.""" + """Predict whether samples are within the applicability domain. + + Returns + ------- + predictions : ndarray of shape (n_samples,) + Returns 1 for inside and -1 for outside. 
+ """ check_is_fitted(self) X = check_array(X, **self._check_params) @@ -244,6 +250,7 @@ def score_transform( # No sign flip needed return (1 / (1 + np.exp(self.threshold_ - scores))).reshape(-1, 1) - def get_feature_names_out(self) -> NDArray[np.str_]: + def get_feature_names_out(self, input_features=None) -> NDArray[np.str_]: """Get feature name for output column.""" + # TODO: what is the mechanism around input_features? return np.array([f"{self.feature_name}"]) From 159432f93e9abe47106ccd846b4e90b84984d6ad Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 22 Feb 2025 13:41:22 +0100 Subject: [PATCH 13/24] predicttotransformwrapper seems to be working now. --- scikit_mol/adapters.py | 84 ++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/scikit_mol/adapters.py b/scikit_mol/adapters.py index 6ad54d8..ff9c8b6 100644 --- a/scikit_mol/adapters.py +++ b/scikit_mol/adapters.py @@ -5,6 +5,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.pipeline import FeatureUnion from sklearn.utils import Bunch +from sklearn.utils._estimator_html_repr import _VisualBlock from sklearn.utils.metadata_routing import ( _raise_for_params, _routing_enabled, @@ -361,47 +362,66 @@ def _transform_one(transformer, X, y, weight, params=None, method="transform"): class PredictToTransformAdapter(TransformerMixin, BaseEstimator): - """Adapter that exposes an estimator's predict method as transform. - - Parameters - ---------- - estimator : BaseEstimator - Estimator with a predict method. - method : str, default="predict" - The method to use for transformation (e.g., "predict", "predict_proba"). 
- """ + """Adapter that exposes an estimator's predict method as transform.""" def __init__(self, estimator: BaseEstimator, method: str = "predict"): self.estimator = estimator self.method = method - def fit(self, X, y=None): - self.estimator.fit(X, y) - return self - def transform(self, X): check_is_fitted(self) - return getattr(self.estimator, self.method)(X) - - def get_feature_names_out(self, input_features=None): - """Delegate feature names to wrapped estimator if available.""" - if hasattr(self.estimator, "get_feature_names_out"): - return self.estimator.get_feature_names_out(input_features) - return None + prediction = getattr(self.estimator, self.method)(X) + if prediction.ndim == 1: + prediction = prediction.reshape(-1, 1) + return prediction + + def __getattr__(self, name): + """Delegate any unknown attributes/methods to wrapped estimator.""" + if hasattr(self.estimator, name): + attr = getattr(self.estimator, name) + # If it's a property, get its value + if isinstance(attr, property): + return attr.__get__(self.estimator) + return attr + raise AttributeError( + f"Neither {self.__class__.__name__} nor {self.estimator.__class__.__name__} " + f"has attribute '{name}'" + ) - def __sklearn_is_fitted__(self): - """Delegate fit check to wrapped estimator.""" - try: - check_is_fitted(self.estimator) - return True - except ValueError: - return False + def __dir__(self): + """List all attributes including those from wrapped estimator.""" + return list(set(super().__dir__() + dir(self.estimator))) - def _repr_html_(self): - """HTML representation for notebooks.""" - if hasattr(self.estimator, "_repr_html_"): - return f"
PredictToTransformAdapter using method '{self.method}' on:
{self.estimator._repr_html_()}
" - return f"
PredictToTransformAdapter(method='{self.method}', estimator={self.estimator})
" + @property + def __dict__(self): + """Include estimator's properties in the instance dict.""" + # Get our own dict + d = super().__dict__.copy() + + # Add estimator instance attributes and properties + estimator_dict = vars(self.estimator) + for name, value in estimator_dict.items(): + if not name.startswith("_"): # Skip private attributes + d[name] = value + + return d + + def _sk_visual_block_(self): + """Generate information about how to display the adapter.""" + return _VisualBlock( + "parallel", + [self.estimator], + names=None, + # [ + # f"{self.estimator.__class__.__name__}", + # ], + name_details=None, + # [ + # f"{self.method} from {self.estimator}", + # ], + name_caption=None, + dash_wrapped=False, + ) class TransformToPredictAdapter(BaseEstimator): From 927900822ce9bd9962551da394b88a441c2a140d Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 22 Feb 2025 14:06:50 +0100 Subject: [PATCH 14/24] Also got the fit_transform to work. Seemingly getting there. --- scikit_mol/adapters.py | 197 +++++++++++++++++++++++++---------------- 1 file changed, 121 insertions(+), 76 deletions(-) diff --git a/scikit_mol/adapters.py b/scikit_mol/adapters.py index ff9c8b6..39931c8 100644 --- a/scikit_mol/adapters.py +++ b/scikit_mol/adapters.py @@ -6,6 +6,7 @@ from sklearn.pipeline import FeatureUnion from sklearn.utils import Bunch from sklearn.utils._estimator_html_repr import _VisualBlock +from sklearn.utils._user_interface import _print_elapsed_time from sklearn.utils.metadata_routing import ( _raise_for_params, _routing_enabled, @@ -250,6 +251,61 @@ def transform(self, X, **params): return self._hstack(Xs) + def fit_transform(self, X, y=None, **params): + """Fit all transformers, transform the data and concatenate results. + + Parameters + ---------- + X : iterable or array-like, depending on transformers + Input data to be transformed. + + y : array-like of shape (n_samples, n_outputs), default=None + Targets for supervised learning. 
+ + **params : dict, default=None + - If `enable_metadata_routing=False` (default): + Parameters directly passed to the `fit` methods of the + sub-transformers. + + - If `enable_metadata_routing=True`: + Parameters safely routed to the `fit` methods of the + sub-transformers. See :ref:`Metadata Routing User Guide + ` for more details. + + .. versionchanged:: 1.5 + `**params` can now be routed via metadata routing API. + + Returns + ------- + X_t : array-like or sparse matrix of \ + shape (n_samples, sum_n_components) + The `hstack` of results of transformers. `sum_n_components` is the + sum of `n_components` (output dimension) over transformers. + """ + if _routing_enabled(): + routed_params = process_routing(self, "fit_transform", **params) + else: + # TODO(SLEP6): remove when metadata routing cannot be disabled. + routed_params = Bunch() + for name, obj in self.transformer_list: + if hasattr(obj, "fit_transform"): + routed_params[name] = Bunch(fit_transform={}) + routed_params[name].fit_transform = params + else: + routed_params[name] = Bunch(fit={}) + routed_params[name] = Bunch(transform={}) + routed_params[name].fit = params + + results = self._parallel_func(X, y, _fit_transform_one, routed_params) + if not results: + # All transformers are None + return np.zeros((X.shape[0], 0)) + + Xs, transformers = zip(*results) + self._update_transformer_list(transformers) + + return self._hstack(Xs) + def predict(self, X: NDArray) -> NDArray: """Predict using all estimators. @@ -337,49 +393,57 @@ def _transform_one(transformer, X, y, weight, params=None, method="transform"): return res * weight -# def _fit_transform_one( -# transformer, X, y, weight, message_clsname="", message=None, params=None, method="transform" -# ): -# """ -# Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned -# with the fitted transformer. If ``weight`` is not ``None``, the result will -# be multiplied by ``weight``. 
+# Ouch, this seem to be a problem with the EstimatorUnion class. +def _fit_transform_one( + transformer, + X, + y, + weight, + message_clsname="", + message=None, + params=None, + method="transform", +): + """ + Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned + with the fitted transformer. If ``weight`` is not ``None``, the result will + be multiplied by ``weight``. -# ``params`` needs to be of the form ``process_routing()["step_name"]``. -# """ -# params = params or {} -# with _print_elapsed_time(message_clsname, message): -# if hasattr(transformer, "fit_transform"): -# res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) -# else: -# res = transformer.fit(X, y, **params.get("fit", {})).transform( -# X, **params.get("transform", {}) -# ) + ``params`` needs to be of the form ``process_routing()["step_name"]``. + """ + params = params or {} + with _print_elapsed_time(message_clsname, message): + if hasattr(transformer, "fit_transform"): + res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) + elif hasattr(transformer, "transform"): + res = transformer.fit(X, y, **params.get("fit", {})).transform( + X, **params.get("transform", {}) + ) + elif hasattr(transformer, "predict"): + transformer.fit(X, y, **params.get("fit", {})) + res = transformer.predict(X, **params.get("predict", {})) + if res.ndim == 1: + res = res.reshape(-1, 1) + else: + raise ValueError( + f"Transformer {transformer} does not have a fit_transform, fit or predict method." 
+ ) -# if weight is None: -# return res, transformer -# return res * weight, transformer + if weight is None: + return res, transformer + return res * weight, transformer -class PredictToTransformAdapter(TransformerMixin, BaseEstimator): - """Adapter that exposes an estimator's predict method as transform.""" +class _BaseAdapter(BaseEstimator): + """EXPERIMENTAL: Base class for adapters that wrap estimators and modify their interface.""" - def __init__(self, estimator: BaseEstimator, method: str = "predict"): + def __init__(self, estimator: BaseEstimator): self.estimator = estimator - self.method = method - - def transform(self, X): - check_is_fitted(self) - prediction = getattr(self.estimator, self.method)(X) - if prediction.ndim == 1: - prediction = prediction.reshape(-1, 1) - return prediction def __getattr__(self, name): """Delegate any unknown attributes/methods to wrapped estimator.""" if hasattr(self.estimator, name): attr = getattr(self.estimator, name) - # If it's a property, get its value if isinstance(attr, property): return attr.__get__(self.estimator) return attr @@ -395,15 +459,11 @@ def __dir__(self): @property def __dict__(self): """Include estimator's properties in the instance dict.""" - # Get our own dict d = super().__dict__.copy() - - # Add estimator instance attributes and properties estimator_dict = vars(self.estimator) for name, value in estimator_dict.items(): - if not name.startswith("_"): # Skip private attributes + if not name.startswith("_"): d[name] = value - return d def _sk_visual_block_(self): @@ -412,54 +472,39 @@ def _sk_visual_block_(self): "parallel", [self.estimator], names=None, - # [ - # f"{self.estimator.__class__.__name__}", - # ], name_details=None, - # [ - # f"{self.method} from {self.estimator}", - # ], name_caption=None, dash_wrapped=False, ) -class TransformToPredictAdapter(BaseEstimator): - """Adapter that exposes an estimator's transform method as predict. 
+class PredictToTransformAdapter(_BaseAdapter, TransformerMixin): + """EXPERIMENTAL: Adapter that exposes an estimator's predict method as transform.""" - Parameters - ---------- - transformer : BaseEstimator - Estimator with a transform method. - """ + def __init__(self, estimator: BaseEstimator, method: str = "predict"): + super().__init__(estimator) + self.method = method - def __init__(self, transformer: BaseEstimator): - self.transformer = transformer + def transform(self, X): + check_is_fitted(self) + prediction = getattr(self.estimator, self.method)(X) + if prediction.ndim == 1: + prediction = prediction.reshape(-1, 1) + return prediction - def fit(self, X, y=None): - self.transformer.fit(X, y) - return self + +class TransformToPredictAdapter(_BaseAdapter, TransformerMixin): + """EXPERIMENTAL: Adapter that exposes an estimator's transform method as predict. + + 2D column vector output is flattened to 1D.""" + + def __init__(self, estimator: BaseEstimator, method: str = "transform"): + super().__init__(estimator) + self.method = method def predict(self, X): check_is_fitted(self) - return self.transformer.transform(X) - - def get_feature_names_out(self, input_features=None): - """Delegate feature names to wrapped transformer if available.""" - if hasattr(self.transformer, "get_feature_names_out"): - return self.transformer.get_feature_names_out(input_features) - return None - - def __sklearn_is_fitted__(self): - """Delegate fit check to wrapped transformer.""" - try: - check_is_fitted(self.transformer) - return True - except ValueError: - return False - - def _repr_html_(self): - """HTML representation for notebooks.""" - if hasattr(self.transformer, "_repr_html_"): - return f"
TransformToPredictAdapter on:
{self.transformer._repr_html_()}
" - return f"
TransformToPredictAdapter(transformer={self.transformer})
" + prediction = self.estimator.transform(X) + if prediction.shape[1] == 1: + prediction = prediction.flatten() + return prediction From d19322f6f57091fee3e25470b52ff9639cb038ea Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 8 Mar 2025 09:43:48 +0100 Subject: [PATCH 15/24] Experimental adapters. WIP --- scikit_mol/adapters.py | 95 +++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/scikit_mol/adapters.py b/scikit_mol/adapters.py index 39931c8..5e3b2f0 100644 --- a/scikit_mol/adapters.py +++ b/scikit_mol/adapters.py @@ -18,7 +18,7 @@ class EstimatorUnion(FeatureUnion): - """EXPERIMENTAL: more flexible version of FeatureUnion that supports various estimator types. + """EXPERIMENTAL: A more flexible version of FeatureUnion that supports various estimator types. This class extends scikit-learn's FeatureUnion to support estimators with different method interfaces (predict, transform, etc.) and allows explicit method selection. @@ -148,35 +148,6 @@ def _get_estimator_output( output = output.reshape(-1, 1) return output - # def transform_old(self, X: NDArray) -> NDArray: - # """Transform X using the selected method for each estimator. - - # Parameters - # ---------- - # X : array-like of shape (n_samples, n_features) - # Input data to be transformed. - - # Returns - # ------- - # ndarray of shape (n_samples, sum_n_output_features) - # Horizontally stacked results of all estimators. - # sum_n_output_features is the sum of n_output_features for each - # estimator. 
- # """ - # Xs = self._parallel_func(X, self._get_estimator_output) - # if not Xs: - # # All transformers are None - # return np.zeros((X.shape[0], 0)) - - # if self.transformer_weights is not None: - # Xs = [ - # (Xs[name] * self.transformer_weights[name]) - # if name in self.transformer_weights - # else Xs[name] - # for name in self._iter() - # ] - - # return np.hstack(Xs) def _validate_transformers(self): names, transformers = zip(*self.transformer_list) @@ -233,7 +204,9 @@ def transform(self, X, **params): for name, trans, weight in self._iter(): method_name = self._get_method_name((name, trans)) delayed_jobs.append( - delayed(_transform_one)( + delayed( + _transform_one + )( # Seems like the only reason we modify this method from base class is to handle it with a custom function for parallel processing trans, X, None, @@ -393,7 +366,6 @@ def _transform_one(transformer, X, y, weight, params=None, method="transform"): return res * weight -# Ouch, this seem to be a problem with the EstimatorUnion class. def _fit_transform_one( transformer, X, @@ -419,6 +391,7 @@ def _fit_transform_one( res = transformer.fit(X, y, **params.get("fit", {})).transform( X, **params.get("transform", {}) ) + # Custom handling of methods that has predict but no fit_transform or transform elif hasattr(transformer, "predict"): transformer.fit(X, y, **params.get("fit", {})) res = transformer.predict(X, **params.get("predict", {})) @@ -437,8 +410,22 @@ def _fit_transform_one( class _BaseAdapter(BaseEstimator): """EXPERIMENTAL: Base class for adapters that wrap estimators and modify their interface.""" - def __init__(self, estimator: BaseEstimator): + def __init__( + self, estimator: BaseEstimator, _feature_names_out: Optional[List[str]] = None + ): + """Initialize the adapter with an estimator.""" self.estimator = estimator + self._feature_names_out = _feature_names_out + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. 
+ + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features.""" + + return ["tester"] def __getattr__(self, name): """Delegate any unknown attributes/methods to wrapped estimator.""" @@ -467,6 +454,7 @@ def __dict__(self): return d def _sk_visual_block_(self): + # TODO: this looks strange when putting the wrapped estimator into a pipeline """Generate information about how to display the adapter.""" return _VisualBlock( "parallel", @@ -482,11 +470,36 @@ class PredictToTransformAdapter(_BaseAdapter, TransformerMixin): """EXPERIMENTAL: Adapter that exposes an estimator's predict method as transform.""" def __init__(self, estimator: BaseEstimator, method: str = "predict"): + """Initialize the adapter with an estimator and a method to use. + + Parameters + ---------- + estimator : BaseEstimator + The estimator to wrap. + method : str, default="predict" + The method to use for transformation. + """ super().__init__(estimator) self.method = method def transform(self, X): + """Transform X using the wrapped estimator's specified method. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data to transform. check_is_fitted(self) + + Example + -------- + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn_mol.adapters import PredictToTransformAdapter + >>> estimator = LogisticRegression() + >>> adapter = PredictToTransformAdapter(estimator, method="predict"") + >>> adapter.fit(X, y) + >>> adapter.transform(X) + """ prediction = getattr(self.estimator, self.method)(X) if prediction.ndim == 1: prediction = prediction.reshape(-1, 1) @@ -503,6 +516,22 @@ def __init__(self, estimator: BaseEstimator, method: str = "transform"): self.method = method def predict(self, X): + """Predict using the wrapped estimator's specified method. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The input data to predict. 
+ + Example + -------- + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn_mol.adapters import TransformToPredictAdapter + >>> estimator = StandardScaler() + >>> adapter = TransformToPredictAdapter(estimator, method="transform") + >>> adapter.fit(X, y) + >>> adapter.predict(X) + """ check_is_fitted(self) prediction = self.estimator.transform(X) if prediction.shape[1] == 1: From 8dfa31258ff65ecb91c1ee7829b2141ae965c808 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 8 Mar 2025 10:01:31 +0100 Subject: [PATCH 16/24] Cleaning up. --- docs/notebooks/12_applicability_domain.ipynb | 1660 ++++++++++++++++++ scikit_mol/adapters.py | 539 ------ scikit_mol/applicability_old.py | 276 --- 3 files changed, 1660 insertions(+), 815 deletions(-) create mode 100644 docs/notebooks/12_applicability_domain.ipynb delete mode 100644 scikit_mol/adapters.py delete mode 100644 scikit_mol/applicability_old.py diff --git a/docs/notebooks/12_applicability_domain.ipynb b/docs/notebooks/12_applicability_domain.ipynb new file mode 100644 index 0000000..fcac2a4 --- /dev/null +++ b/docs/notebooks/12_applicability_domain.ipynb @@ -0,0 +1,1660 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ee549099", + "metadata": {}, + "source": [ + "# Applicability Domain Estimation\n", + "\n", + "This notebook demonstrates how to use scikit-mol's applicability domain estimators to assess whether new compounds are within the domain of applicability of a trained model.\n", + "\n", + "We'll explore two different approaches:\n", + "1. Using Morgan binary fingerprints with a k-Nearest Neighbors based applicability domain\n", + "2. 
Using count-based Morgan fingerprints with dimensionality reduction and a leverage-based applicability domain\n", + "\n", + "First, let's import the necessary libraries and load our dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "40500fae", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from rdkit import Chem\n", + "from rdkit.Chem import Draw\n", + "from rdkit.Chem import PandasTools\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.decomposition import PCA\n", + "import pathlib\n", + "\n", + "\n", + "from scikit_mol.fingerprints import MorganFingerprintTransformer\n", + "from scikit_mol.applicability import KNNApplicabilityDomain, LeverageApplicabilityDomain" + ] + }, + { + "cell_type": "markdown", + "id": "e5d1277e", + "metadata": {}, + "source": [ + "## Load and Prepare Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "79d3b853", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 out of 7228 SMILES failed in conversion\n" + ] + } + ], + "source": [ + "full_set = True\n", + "\n", + "if full_set:\n", + " csv_file = \"SLC6A4_active_excape_export.csv\"\n", + " if not pathlib.Path(csv_file).exists():\n", + " import urllib.request\n", + "\n", + " url = \"https://ndownloader.figshare.com/files/25747817\"\n", + " urllib.request.urlretrieve(url, csv_file)\n", + "else:\n", + " csv_file = \"../tests/data/SLC6A4_active_excapedb_subset.csv\"\n", + "\n", + "data = pd.read_csv(csv_file)\n", + "\n", + "#Could also build a pipeline to convert the smiles to mols using SmilesToMolTransformer\n", + "PandasTools.AddMoleculeColumnToFrame(data, smilesCol=\"SMILES\")\n", + 
"print(f\"{data.ROMol.isna().sum()} out of {len(data)} SMILES failed in conversion\")\n", + "\n", + "# Split into train/val/test\n", + "X = data.ROMol\n", + "y = data.pXC50\n", + "\n", + "X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)" + ] + }, + { + "cell_type": "markdown", + "id": "e2896ad5", + "metadata": {}, + "source": [ + "## Example 1: k-NN Applicability Domain with Binary Morgan Fingerprints\n", + "\n", + "In this example, we'll use binary Morgan fingerprints and a k-NN based applicability domain with Tanimoto distance.\n", + "This is particularly suitable for binary fingerprints as the Tanimoto coefficient is a natural similarity measure for them." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9c89148b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n" + ] + }, + { + "data": { + "text/html": [ + "
Pipeline(steps=[('fp', MorganFingerprintTransformer()),\n",
+       "                ('rf', RandomForestRegressor(n_jobs=-1, random_state=61453))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('fp', MorganFingerprintTransformer()),\n", + " ('rf', RandomForestRegressor(n_jobs=-1, random_state=61453))])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create pipeline for binary fingerprints\n", + "binary_fp_pipe = Pipeline([\n", + " ('fp', MorganFingerprintTransformer(fpSize=2048, radius=2)),\n", + " ('rf', RandomForestRegressor(n_estimators=100, random_state=0xf00d, n_jobs=-1))\n", + "])\n", + "\n", + "# Train the model\n", + "binary_fp_pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ee7b2f64", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n" + ] + }, + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Predicted pXC50 vs Absolute Error')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Get predictions and errors\n", + "y_pred_test = binary_fp_pipe.predict(X_test)\n", + "abs_errors = np.abs(y_test - y_pred_test)\n", + "\n", + "\n", + "fig = plt.figure(figsize=(3,3))\n", + "\n", + "plt.scatter(y_test, abs_errors, alpha=0.5)\n", + "plt.xlabel('pXC50')\n", + "plt.ylabel('Predicted Absolute Error')\n", + "plt.title('Predicted pXC50 vs Absolute Error')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9d2860b4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. 
Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n" + ] + } + ], + "source": [ + "\n", + "# Create and fit k-NN AD estimator. 
Distance metrics follow the scikit-learn API, and the custom distance metric tanimoto popular in cheminformatics is available in scikit-mol.\n", + "knn_ad = KNNApplicabilityDomain(n_neighbors=3, distance_metric='tanimoto', n_jobs=-1)\n", + "knn_ad.fit(binary_fp_pipe.named_steps['fp'].transform(X_train))\n", + "\n", + "# Fit threshold using validation set\n", + "knn_ad.fit_threshold(binary_fp_pipe.named_steps['fp'].transform(X_val), target_percentile=95)\n", + "\n", + "# Get AD scores for test set\n", + "knn_scores = knn_ad.transform(binary_fp_pipe.named_steps['fp'].transform(X_test))" + ] + }, + { + "cell_type": "markdown", + "id": "22848529", + "metadata": {}, + "source": [ + "Let's visualize the relationship between prediction errors and AD scores, and calculate some statistics on compound errors within and outside the domain." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e8e2bb86", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95th percentile of errors inside domain: 1.45\n", + "95th percentile of errors outside domain: 1.85\n", + "Fraction of samples outside domain: 0.04\n" + ] + } + ], + "source": [ + "plt.figure(figsize=(4, 3))\n", + "plt.scatter(knn_scores, abs_errors, alpha=0.5)\n", + "plt.axvline(x=knn_ad.threshold_, color='r', linestyle='--', label='AD Threshold')\n", + "plt.xlabel('k-NN AD Score')\n", + "plt.ylabel('Absolute Prediction Error')\n", + "plt.title('Prediction Errors vs k-NN AD Scores')\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "# Calculate error statistics\n", + "in_domain = knn_ad.predict(binary_fp_pipe.named_steps['fp'].transform(X_test))\n", + "errors_in = abs_errors[in_domain == 1]\n", + "errors_out = abs_errors[in_domain == -1]\n", + "\n", + "print(f\"95th percentile of errors inside domain: {np.percentile(errors_in, 95):.2f}\")\n", + "print(f\"95th percentile 
of errors outside domain: {np.percentile(errors_out, 95):.2f}\")\n", + "print(f\"Fraction of samples outside domain: {(in_domain == -1).mean():.2f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "10e69073", + "metadata": {}, + "source": [ + "There's some difference in the error distribution inside and outside the domain threshold, but maybe not as clear-cut as we could have wished for. The fraction of test-set samples outside the domain is close to the 5% that corresponds to the threshold estimated from the 95% fractile of the validation set." + ] + }, + { + "cell_type": "markdown", + "id": "09bdc3b2", + "metadata": {}, + "source": [ + "## Example 2: Leverage-based AD with Count-based Morgan Fingerprints\n", + "\n", + "In this example, we'll use count-based Morgan fingerprints, reduce their dimensionality with PCA,\n", + "and apply a leverage-based applicability domain estimator." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fe4a6819", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
Pipeline(steps=[('fp', MorganFingerprintTransformer(useCounts=True)),\n",
+       "                ('pca', PCA(n_components=0.9)), ('scaler', StandardScaler()),\n",
+       "                ('leverage', LeverageApplicabilityDomain())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('fp', MorganFingerprintTransformer(useCounts=True)),\n", + " ('pca', PCA(n_components=0.9)), ('scaler', StandardScaler()),\n", + " ('leverage', LeverageApplicabilityDomain())])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create pipeline for count-based fingerprints AD estimation with PCA, scaling and leverage\n", + "count_fp_pipe = Pipeline([\n", + " ('fp', MorganFingerprintTransformer(fpSize=2048, radius=2, useCounts=True)),\n", + " ('pca', PCA(n_components=0.9)), # Keep 90% of variance\n", + " ('scaler', StandardScaler()),\n", + " ('leverage', LeverageApplicabilityDomain())\n", + "])\n", + "\n", + "# Train the model\n", + "count_fp_pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "57d73a11", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'Series.swapaxes' is deprecated and will be removed in a future version. 
Please use 'Series.transpose' instead.\n", + " return bound(*args, **kwds)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "\n", + "X_val_transformed = count_fp_pipe[:-1].transform(X_val) #Slice the pipeline to get all the steps before the final AD estimator\n", + "count_fp_pipe.named_steps['leverage'].fit_threshold(X_val_transformed, target_percentile=95)\n", + "\n", + "\n", + "# Get AD scores for test set\n", + "X_test_transformed = count_fp_pipe[:-1].transform(X_test) #Slice the pipeline to get all the steps before the final AD estimator \n", + "leverage_raw_scores = count_fp_pipe.named_steps['leverage'].transform(X_test_transformed)" + ] + }, + { + "cell_type": "markdown", + "id": "fd5c6718", + "metadata": {}, + "source": [ + "As before, let's visualize the relationship between prediction errors and leverage scores and look at the error fractiles." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "41434c9d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "95th percentile of errors inside domain: 1.50\n", + "95th percentile of errors outside domain: 1.23\n", + "Fraction of samples outside domain: 0.05\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(4, 3))\n", + "plt.scatter(leverage_raw_scores, abs_errors, alpha=0.5)\n", + "plt.axvline(x=count_fp_pipe.named_steps['leverage'].threshold_, color='r', linestyle='--', label='AD Threshold')\n", + "plt.xlabel('Leverage AD Score')\n", + "plt.ylabel('Absolute Prediction Error')\n", + "plt.title('Prediction Errors vs Leverage Scores')\n", + "plt.legend()\n", + "\n", + "\n", + "# Calculate error statistics\n", + "in_domain = count_fp_pipe.named_steps['leverage'].predict(X_test_transformed)\n", + "errors_in = abs_errors[in_domain == 1]\n", + "errors_out = abs_errors[in_domain == -1]\n", + "\n", + "print(f\"95th percentile of errors inside domain: {np.percentile(errors_in, 95):.2f}\")\n", + "print(f\"95th percentile of errors outside domain: {np.percentile(errors_out, 95):.2f}\")\n", + "print(f\"Fraction of samples outside domain: {(in_domain == -1).mean():.2f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "86f8a09e", + "metadata": {}, + "source": [ + "Disappointingly, the error seems larger within the domain than outside the domain." 
+ ] + }, + { + "cell_type": "markdown", + "id": "e22b19f0", + "metadata": {}, + "source": [ + "## Testing Famous Drugs\n", + "\n", + "Let's test some well-known drugs to see if they fall within our model's applicability domain:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1d33100d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define famous drugs\n", + "famous_drugs = {\n", + " 'Aspirin': 'CC(=O)OC1=CC=CC=C1C(=O)O',\n", + " 'Viagra': 'CCc1nn(C)c2c(=O)[nH]c(nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4',\n", + " 'Heroin': 'CN1CC[C@]23[C@H]4Oc5c(O)ccc(CC1[C@H]2C=C[C@@H]4O3)c5',\n", + "}\n", + "\n", + "\n", + "Draw.MolsToGridImage([Chem.MolFromSmiles(drug) for drug in famous_drugs.values()], molsPerRow=3,\n", + " subImgSize=(250,250), legends=[f\"{name}\" for name, smiles in famous_drugs.items()])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "904ed0d0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be 
removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, 
DataConversionWarning)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/metrics/pairwise.py:2466: DataConversionWarning: Data was converted to boolean for metric jaccard\n", + " warnings.warn(msg, DataConversionWarning)\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " 
warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n", + "/home/esben/envs/vscode/lib/python3.10/site-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Predicted pIC50k-NN Scorek-NN StatusLeverage ScoreLeverage Status
Drug
Aspirin5.900.719194Outside0.020058Inside
Viagra9.050.786921Outside0.050743Inside
Heroin6.450.812649Outside0.021588Inside
\n", + "
" + ], + "text/plain": [ + " Predicted pIC50 k-NN Score k-NN Status Leverage Score \\\n", + "Drug \n", + "Aspirin 5.90 0.719194 Outside 0.020058 \n", + "Viagra 9.05 0.786921 Outside 0.050743 \n", + "Heroin 6.45 0.812649 Outside 0.021588 \n", + "\n", + " Leverage Status \n", + "Drug \n", + "Aspirin Inside \n", + "Viagra Inside \n", + "Heroin Inside " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "leverage_ad = count_fp_pipe.named_steps['leverage']\n", + "\n", + "# Function to process a drug through both AD pipelines\n", + "def check_drug_applicability(smiles, name):\n", + " mol = Chem.MolFromSmiles(smiles)\n", + " \n", + " # k-NN AD\n", + " fp_binary = binary_fp_pipe.named_steps['fp'].transform([mol])\n", + " knn_score = knn_ad.transform(fp_binary)[0][0]\n", + " knn_status = \"Inside\" if knn_ad.predict(fp_binary)[0] == 1 else \"Outside\"\n", + " \n", + " # Leverage AD\n", + " fp_count = count_fp_pipe.named_steps['fp'].transform([mol])\n", + " fp_pca = count_fp_pipe.named_steps['pca'].transform(fp_count)\n", + " fp_scaled = count_fp_pipe.named_steps['scaler'].transform(fp_pca)\n", + " leverage_score = leverage_ad.transform(fp_scaled)[0][0]\n", + " leverage_status = \"Inside\" if leverage_ad.predict(fp_scaled)[0] == 1 else \"Outside\"\n", + " \n", + " # Get prediction\n", + " pred_pIC50 = binary_fp_pipe.predict([mol])[0]\n", + " \n", + " return {\n", + " 'knn_score': knn_score,\n", + " 'knn_status': knn_status,\n", + " 'leverage_score': leverage_score,\n", + " 'leverage_status': leverage_status,\n", + " 'pred_pIC50': pred_pIC50\n", + " }\n", + "\n", + "# Process each drug\n", + "results = []\n", + "for name, smiles in famous_drugs.items():\n", + " result = check_drug_applicability(smiles, name)\n", + " results.append({\n", + " 'Drug': name,\n", + " 'Predicted pIC50': f\"{result['pred_pIC50']:.2f}\",\n", + " 'k-NN Score': result['knn_score'],\n", + " 'k-NN Status': result['knn_status'],\n", + " 
'Leverage Score': result['leverage_score'],\n", + " 'Leverage Status': result['leverage_status']\n", + " })\n", + "\n", + "# Display results\n", + "pd.DataFrame(results).set_index('Drug')" + ] + }, + { + "cell_type": "markdown", + "id": "a5241345", + "metadata": {}, + "source": [ + "Let's visualize where these drugs fall in our AD plots:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3aaf4485", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot for k-NN AD\n", + "plt.figure(figsize=(12, 5))\n", + "plt.subplot(1, 2, 1)\n", + "plt.scatter(knn_scores, abs_errors, alpha=0.2, label='Test compounds')\n", + "plt.axvline(x=knn_ad.threshold_, color='r', linestyle='--', label='AD Threshold')\n", + "\n", + "for result in results:\n", + " plt.axvline(x=result['k-NN Score'], color='g', alpha=0.5,\n", + " label=f\"{result['Drug']}\")\n", + "\n", + "plt.xlabel('k-NN AD Score')\n", + "plt.ylabel('Absolute Prediction Error')\n", + "plt.title('k-NN AD Scores')\n", + "#plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "\n", + "# Plot for Leverage AD\n", + "plt.subplot(1, 2, 2)\n", + "plt.scatter(leverage_raw_scores, abs_errors, alpha=0.2, label='Test compounds')\n", + "plt.axvline(x=leverage_ad.threshold_, color='r', linestyle='--', label='AD Threshold')\n", + "\n", + "for result in results:\n", + " plt.axvline(x=result['Leverage Score'], color='g', alpha=0.5,\n", + " label=f\"{result['Drug']}\")\n", + "\n", + "plt.xlabel('Leverage AD Score')\n", + "plt.ylabel('Absolute Prediction Error')\n", + "plt.title('Leverage AD Scores')\n", + "plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "3c018e68", + "metadata": {}, + "source": [ + "## Conclusions on testing the AD estimators\n", + "\n", + "This notebook demonstrated two different approaches to applicability domain estimation:\n", + "\n", + "1. The k-NN based approach with binary fingerprints and Tanimoto distance provides a chemical similarity-based assessment\n", + "of whether new compounds are similar enough to the training set.\n", + "\n", + "2. 
The leverage-based approach with count-based fingerprints and dimensionality reduction focuses on the statistical\n", + "novelty of compounds in the reduced feature space.\n", + "\n", + "Heroin and Aspirin were predicted to have a low affinity, whereas Viagra was predicted to have a pXC50 of ~9, corresponding to nanomolar affinity. As the regression model has only been trained on actives, it will have a tendency to always predict things as active, which is hard to believe for compounds so dissimilar to the training set and given our prior knowledge about their primary targets.\n", + "\n", + "The famous drugs we tested showed marked differences between the two AD estimation techniques. \n", + "\n", + "The kNN-based method using Tanimoto distance showed all test drugs to be distant from the training set and thus outside the applicability domain, whereas the leverage method gave the \"green light\" for all of them. As the drugs have different primary targets than the SLC6A4 serotonin transporter, it seems like the kNN-based method in this instance (dataset, featurization, ML-model) is a better way to estimate the AD for given novel compounds. 
This is consistent with our analysis of the 95th percentile of the absolute errors for the two methods: for kNN the 95th-percentile error was higher outside the domain than inside, whereas for the leverage-based method it was lower outside the domain.\n", + "\n" + ] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "-all", + "formats": "ipynb,py:percent", + "main_language": "python" + }, + "kernelspec": { + "display_name": "vscode", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scikit_mol/adapters.py b/scikit_mol/adapters.py deleted file mode 100644 index 5e3b2f0..0000000 --- a/scikit_mol/adapters.py +++ /dev/null @@ -1,539 +0,0 @@ -from typing import Dict, List, Optional, Tuple - -import numpy as np -from numpy.typing import NDArray -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.pipeline import FeatureUnion -from sklearn.utils import Bunch -from sklearn.utils._estimator_html_repr import _VisualBlock -from sklearn.utils._user_interface import _print_elapsed_time -from sklearn.utils.metadata_routing import ( - _raise_for_params, - _routing_enabled, - process_routing, -) -from sklearn.utils.parallel import Parallel as skParallel -from sklearn.utils.parallel import delayed -from sklearn.utils.validation import check_is_fitted - - -class EstimatorUnion(FeatureUnion): - """EXPERIMENTAL: A more flexible version of FeatureUnion that supports various estimator types. - - This class extends scikit-learn's FeatureUnion to support estimators with different - method interfaces (predict, transform, etc.) and allows explicit method selection. - It maintains all functionality of FeatureUnion while adding flexible method resolution. 
- - Parameters - ---------- - estimators : list of (str, estimator) tuples - List of (name, estimator) tuples, where estimator is any scikit-learn - compatible estimator with at least one of the methods specified in - method_resolution_order. - method_resolution_order : tuple of str, default=("predict", "transform") - Ordered tuple of method names to try when getting output from estimators. - Methods are tried in order until a valid one is found. - selected_methods : dict or None, default=None - Optional mapping of estimator names to specific methods to use. Takes - precedence over method_resolution_order. - n_jobs : int or None, default=None - Number of jobs to run in parallel. None means 1. - transformer_weights : dict or None, default=None - Multiplicative weights for features per transformer. Keys are transformer - names, values are weights. - verbose : bool, default=False - If True, the time elapsed while fitting each transformer will be printed. - verbose_feature_names_out : bool, default=True - If True, the feature names out will be verbose. - - Attributes - ---------- - transformers_ : list - List of fitted transformers. - - Notes - ----- - This class inherits from FeatureUnion and maintains all its functionality including - parallel processing, transformer weights, and metadata routing. The key extension is - the ability to handle estimators with different method interfaces through configurable - method resolution. - - See Also - -------- - sklearn.pipeline.FeatureUnion : The parent class providing base functionality. - """ - - def __init__( - self, - estimator_list: List[Tuple[str, BaseEstimator]], - *, - method_resolution_order: Tuple[str, ...] 
= ("predict", "transform"), - selected_methods: Optional[Dict[str, str]] = None, - n_jobs: Optional[int] = None, - transformer_weights: Optional[Dict[str, float]] = None, - verbose: bool = False, - verbose_feature_names_out: bool = True, - ) -> None: - # Store all parameters as properties - self.estimator_list = estimator_list - self.method_resolution_order = method_resolution_order - self.selected_methods = selected_methods or {} - self.n_jobs = n_jobs - self.transformer_weights = transformer_weights - self.verbose = verbose - self.verbose_feature_names_out = verbose_feature_names_out - - @property - def estimator_list(self) -> List[Tuple[str, BaseEstimator]]: - """Get estimators (alias for transformer_list).""" - return self.transformer_list - - @estimator_list.setter - def estimator_list(self, estimator_list: List[Tuple[str, BaseEstimator]]) -> None: - """Set estimators (and internal transformer_list property).""" - self.transformer_list = estimator_list - - def _get_method_name(self, estimator_tuple: Tuple[str, BaseEstimator]) -> str: - """Get the appropriate method name for the estimator, raising clear errors if not found. - - Parameters - ---------- - estimator_tuple : tuple of (str, estimator) - Tuple containing the estimator name and instance. - - Returns - ------- - str - Name of the method to use for this estimator. - - Raises - ------ - ValueError - If no valid method is found for the estimator. - """ - name, estimator = estimator_tuple - - # Check explicit method if specified - if name in self.selected_methods: - method = self.selected_methods[name] - if not hasattr(estimator, method): - raise ValueError( - f"Estimator '{name}' ({type(estimator).__name__}) does not have " - f"explicitly selected method '{method}'. Consider changing selected_methods " - f"or using only method_resolution_order to specify valid methods." 
- ) - return method - - # Try methods in resolution order - for method in self.method_resolution_order: - if hasattr(estimator, method): - return method - - raise ValueError( - f"Estimator '{name}' ({type(estimator).__name__}) does not have any of " - f"the methods: {', '.join(self.method_resolution_order)}. Consider using " - f"method_resolution_order or selected_methods to specify valid methods." - ) - - def _get_estimator_output( - self, estimator_tuple: Tuple[str, BaseEstimator], X: NDArray - ) -> NDArray: - """Get output from estimator using appropriate method.""" - name, estimator = estimator_tuple - method = self._get_method_name(estimator_tuple) - output = getattr(estimator, method)(X) - - # Ensure 2D output - if output.ndim == 1: - output = output.reshape(-1, 1) - return output - - def _validate_transformers(self): - names, transformers = zip(*self.transformer_list) - - # validate names - self._validate_names(names) - - # validate estimators - for t in transformers: - if t in ("drop", "passthrough"): - continue - # TODO, make a check that the methods in the method_resolution_order /method mappting are present - # if not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not hasattr( - # t, "transform" - # ): - # raise TypeError( - # "All estimators should implement fit and " - # "transform. '%s' (type %s) doesn't" % (t, type(t)) - # ) - - def transform(self, X, **params): - """Transform X separately by each transformer, concatenate results. - - Parameters - ---------- - X : iterable or array-like, depending on transformers - Input data to be transformed. - - **params : dict, default=None - - Parameters routed to the `transform` method of the sub-transformers via the - metadata routing API. See :ref:`Metadata Routing User Guide - ` for more details. - - .. versionadded:: 1.5 - - Returns - ------- - X_t : array-like or sparse matrix of shape (n_samples, sum_n_components) - The `hstack` of results of transformers. 
`sum_n_components` is the - sum of `n_components` (output dimension) over transformers. - """ - _raise_for_params(params, self, "transform") - - if _routing_enabled(): - routed_params = process_routing(self, "transform", **params) - else: - # TODO(SLEP6): remove when metadata routing cannot be disabled. - routed_params = Bunch() - for name, _ in self.transformer_list: - routed_params[name] = Bunch(transform={}) - - # Build delayed jobs with custom methods - delayed_jobs = [] - for name, trans, weight in self._iter(): - method_name = self._get_method_name((name, trans)) - delayed_jobs.append( - delayed( - _transform_one - )( # Seems like the only reason we modify this method from base class is to handle it with a custom function for parallel processing - trans, - X, - None, - weight, - params=routed_params[name], - method=method_name, - ) - ) - - Xs = skParallel(n_jobs=self.n_jobs)(delayed_jobs) - - if not Xs: - # All transformers are None - return np.zeros((X.shape[0], 0)) - - return self._hstack(Xs) - - def fit_transform(self, X, y=None, **params): - """Fit all transformers, transform the data and concatenate results. - - Parameters - ---------- - X : iterable or array-like, depending on transformers - Input data to be transformed. - - y : array-like of shape (n_samples, n_outputs), default=None - Targets for supervised learning. - - **params : dict, default=None - - If `enable_metadata_routing=False` (default): - Parameters directly passed to the `fit` methods of the - sub-transformers. - - - If `enable_metadata_routing=True`: - Parameters safely routed to the `fit` methods of the - sub-transformers. See :ref:`Metadata Routing User Guide - ` for more details. - - .. versionchanged:: 1.5 - `**params` can now be routed via metadata routing API. - - Returns - ------- - X_t : array-like or sparse matrix of \ - shape (n_samples, sum_n_components) - The `hstack` of results of transformers. 
`sum_n_components` is the - sum of `n_components` (output dimension) over transformers. - """ - if _routing_enabled(): - routed_params = process_routing(self, "fit_transform", **params) - else: - # TODO(SLEP6): remove when metadata routing cannot be disabled. - routed_params = Bunch() - for name, obj in self.transformer_list: - if hasattr(obj, "fit_transform"): - routed_params[name] = Bunch(fit_transform={}) - routed_params[name].fit_transform = params - else: - routed_params[name] = Bunch(fit={}) - routed_params[name] = Bunch(transform={}) - routed_params[name].fit = params - - results = self._parallel_func(X, y, _fit_transform_one, routed_params) - if not results: - # All transformers are None - return np.zeros((X.shape[0], 0)) - - Xs, transformers = zip(*results) - self._update_transformer_list(transformers) - - return self._hstack(Xs) - - def predict(self, X: NDArray) -> NDArray: - """Predict using all estimators. - - Alias for transform to maintain predictor interface. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Input data to be predicted. - - Returns - ------- - ndarray of shape (n_samples, sum_n_output_features) - Horizontally stacked predictions of all estimators. - """ - return self.transform(X) - - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. - - Parameters - ---------- - input_features : array-like of str or None, default=None - Input features. - - Returns - ------- - feature_names_out : ndarray of str objects - Transformed feature names. 
- """ - # List of tuples (name, feature_names_out) - transformer_with_feature_names_out = [] - for name, trans, _ in self._iter(): - if hasattr(trans, "predict") and not hasattr( - trans, "get_feature_names_out" - ): - # Assume predictors only return 1D output and thus we use their name as feature_name - feature_names_out = np.array([name]) - elif not hasattr(trans, "get_feature_names_out"): - raise AttributeError( - "Transformer %s (type %s) does not provide get_feature_names_out." - % (str(name), type(trans).__name__) - ) - else: - feature_names_out = trans.get_feature_names_out(input_features) - transformer_with_feature_names_out.append((name, feature_names_out)) - - return self._add_prefix_for_feature_names_out( - transformer_with_feature_names_out - ) - - -def _transform_one(transformer, X, y, weight, params=None, method="transform"): - """Call transform and apply weight to output. - - Parameters - ---------- - transformer : estimator - Estimator to be used for transformation. - - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Input data to be transformed. - - y : ndarray of shape (n_samples,) - Ignored. - - weight : float - Weight to be applied to the output of the transformation. - - method : str - Method to use for transformation (e.g. "transform", "predict", "predict_proba"). - - params : dict - Parameters to be passed to the transformer's ``transform`` method. - - This should be of the form ``process_routing()["step_name"]``. - """ - res = getattr(transformer, method)(X, **params.transform) - # Ensure 2D output - if res.ndim == 1: - res = res.reshape(-1, 1) - # if we have a weight for this transformer, multiply output - if weight is None: - return res - return res * weight - - -def _fit_transform_one( - transformer, - X, - y, - weight, - message_clsname="", - message=None, - params=None, - method="transform", -): - """ - Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned - with the fitted transformer. 
If ``weight`` is not ``None``, the result will - be multiplied by ``weight``. - - ``params`` needs to be of the form ``process_routing()["step_name"]``. - """ - params = params or {} - with _print_elapsed_time(message_clsname, message): - if hasattr(transformer, "fit_transform"): - res = transformer.fit_transform(X, y, **params.get("fit_transform", {})) - elif hasattr(transformer, "transform"): - res = transformer.fit(X, y, **params.get("fit", {})).transform( - X, **params.get("transform", {}) - ) - # Custom handling of methods that has predict but no fit_transform or transform - elif hasattr(transformer, "predict"): - transformer.fit(X, y, **params.get("fit", {})) - res = transformer.predict(X, **params.get("predict", {})) - if res.ndim == 1: - res = res.reshape(-1, 1) - else: - raise ValueError( - f"Transformer {transformer} does not have a fit_transform, fit or predict method." - ) - - if weight is None: - return res, transformer - return res * weight, transformer - - -class _BaseAdapter(BaseEstimator): - """EXPERIMENTAL: Base class for adapters that wrap estimators and modify their interface.""" - - def __init__( - self, estimator: BaseEstimator, _feature_names_out: Optional[List[str]] = None - ): - """Initialize the adapter with an estimator.""" - self.estimator = estimator - self._feature_names_out = _feature_names_out - - def get_feature_names_out(self, input_features=None): - """Get output feature names for transformation. 
- - Parameters - ---------- - input_features : array-like of str or None, default=None - Input features.""" - - return ["tester"] - - def __getattr__(self, name): - """Delegate any unknown attributes/methods to wrapped estimator.""" - if hasattr(self.estimator, name): - attr = getattr(self.estimator, name) - if isinstance(attr, property): - return attr.__get__(self.estimator) - return attr - raise AttributeError( - f"Neither {self.__class__.__name__} nor {self.estimator.__class__.__name__} " - f"has attribute '{name}'" - ) - - def __dir__(self): - """List all attributes including those from wrapped estimator.""" - return list(set(super().__dir__() + dir(self.estimator))) - - @property - def __dict__(self): - """Include estimator's properties in the instance dict.""" - d = super().__dict__.copy() - estimator_dict = vars(self.estimator) - for name, value in estimator_dict.items(): - if not name.startswith("_"): - d[name] = value - return d - - def _sk_visual_block_(self): - # TODO: this looks strange when putting the wrapped estimator into a pipeline - """Generate information about how to display the adapter.""" - return _VisualBlock( - "parallel", - [self.estimator], - names=None, - name_details=None, - name_caption=None, - dash_wrapped=False, - ) - - -class PredictToTransformAdapter(_BaseAdapter, TransformerMixin): - """EXPERIMENTAL: Adapter that exposes an estimator's predict method as transform.""" - - def __init__(self, estimator: BaseEstimator, method: str = "predict"): - """Initialize the adapter with an estimator and a method to use. - - Parameters - ---------- - estimator : BaseEstimator - The estimator to wrap. - method : str, default="predict" - The method to use for transformation. - """ - super().__init__(estimator) - self.method = method - - def transform(self, X): - """Transform X using the wrapped estimator's specified method. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input data to transform. 
- check_is_fitted(self) - - Example - -------- - >>> from sklearn.linear_model import LogisticRegression - >>> from sklearn_mol.adapters import PredictToTransformAdapter - >>> estimator = LogisticRegression() - >>> adapter = PredictToTransformAdapter(estimator, method="predict"") - >>> adapter.fit(X, y) - >>> adapter.transform(X) - """ - prediction = getattr(self.estimator, self.method)(X) - if prediction.ndim == 1: - prediction = prediction.reshape(-1, 1) - return prediction - - -class TransformToPredictAdapter(_BaseAdapter, TransformerMixin): - """EXPERIMENTAL: Adapter that exposes an estimator's transform method as predict. - - 2D column vector output is flattened to 1D.""" - - def __init__(self, estimator: BaseEstimator, method: str = "transform"): - super().__init__(estimator) - self.method = method - - def predict(self, X): - """Predict using the wrapped estimator's specified method. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input data to predict. 
- - Example - -------- - >>> from sklearn.preprocessing import StandardScaler - >>> from sklearn_mol.adapters import TransformToPredictAdapter - >>> estimator = StandardScaler() - >>> adapter = TransformToPredictAdapter(estimator, method="transform") - >>> adapter.fit(X, y) - >>> adapter.predict(X) - """ - check_is_fitted(self) - prediction = self.estimator.transform(X) - if prediction.shape[1] == 1: - prediction = prediction.flatten() - return prediction diff --git a/scikit_mol/applicability_old.py b/scikit_mol/applicability_old.py deleted file mode 100644 index e7d8ecf..0000000 --- a/scikit_mol/applicability_old.py +++ /dev/null @@ -1,276 +0,0 @@ -import numpy as np -from scipy import linalg, stats -from scipy.sparse import csr_matrix -from sklearn.base import BaseEstimator, TransformerMixin, check_array, check_is_fitted -from sklearn.neighbors import NearestNeighbors - - -class NearestNeighborsDistance(BaseEstimator, TransformerMixin): - def __init__(self, n_neighbors=1): - self.n_neighbors = n_neighbors - self.feature_name = "nn_distance" - - def fit(self, X, y=None): - self.X_sparse = csr_matrix(X) - self.nn = NearestNeighbors(n_neighbors=self.n_neighbors, metric="cosine") - self.nn.fit(self.X_sparse) - return self - - def transform(self, X): - X_sparse = csr_matrix(X) - distances, _ = self.nn.kneighbors(X_sparse) - avg_distances = np.mean(distances, axis=1) - return avg_distances.reshape(-1, 1) # Return 2D array for consistency - - def predict(self, X): - return self.transform(X) - - def get_feature_names_out(self, input_features=None): - return np.array([self.feature_name]) - - -class LeverageDistanceSlow(BaseEstimator, TransformerMixin): - """Calculate leverage-based distances for applicability domain assessment. - - The leverage approach measures how far a sample is from the center of the - X variable space. It's based on the hat matrix H = X(X'X)^(-1)X'. 
- - Parameters - ---------- - threshold_factor : float, default=3 - Factor used in calculating the leverage threshold h* = threshold_factor * (p+1)/n - where p is the number of features and n is the number of samples. - - Attributes - ---------- - n_features_in_ : int - Number of features seen during fit. - X_fit_ : ndarray - Training data used in fit. - leverage_threshold_ : float - Calculated leverage threshold (h*). - """ - - def __init__(self, threshold_factor=3): - self.threshold_factor = threshold_factor - - def fit(self, X, y=None): - """Fit the model using X as training data. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Training data. - y : Ignored - Not used, present here for API consistency by convention. - - Returns - ------- - self : object - Returns the instance itself. - """ - X = check_array(X, accept_sparse=False) - self.n_features_in_ = X.shape[1] - self.X_fit_ = X - - # Calculate leverage threshold h* - n_samples = X.shape[0] - self.leverage_threshold_ = ( - self.threshold_factor * (self.n_features_in_ + 1) / n_samples - ) - - # Store (X'X)^(-1) for later use - self.xtx_inv_ = np.linalg.inv(X.T @ X) - - return self - - def transform(self, X): - """Calculate leverage-based distances for X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The data to calculate leverage distances for. - - Returns - ------- - h : ndarray of shape (n_samples, 1) - The leverage values for each sample. - """ - check_is_fitted(self) - X = check_array(X, accept_sparse=False) - - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"X has {X.shape[1]} features, but LeverageDistance " - f"was fitted with {self.n_features_in_} features." 
- ) - - # Calculate leverage values h = diag(X(X'X)^(-1)X') - # Slighlty different implementation (from another package) - # hat_matrix = X @ self.xtx_inv_ @ X.T - # leverages = np.diag(hat_matrix) - - h = np.sum(X @ self.xtx_inv_ * X, axis=1) - - return h.reshape(-1, 1) - - def predict(self, X): - """Alias for transform, following scikit-learn conventions.""" - return self.transform(X) - - def get_feature_names_out(self, input_features=None): - """Get output feature names. - - Parameters - ---------- - input_features : None - Ignored as the transformer generates new feature names. - - Returns - ------- - feature_names_out : ndarray of str objects - Leverage distance feature name. - """ - check_is_fitted(self) - return np.array(["leverage_distance"]) - - -# Faster but gives some _very_ large distances for some compounds! -class LeverageDistance(BaseEstimator, TransformerMixin): - """Calculate leverage-based distances for applicability domain assessment. - - Parameters - ---------- - threshold_factor : float, default=3 - Factor used in calculating the leverage threshold h* = threshold_factor * (p+1)/n - """ - - def __init__(self, threshold_factor=3): - self.threshold_factor = threshold_factor - - def fit(self, X, y=None): - X = check_array(X, accept_sparse=False) - self.n_features_in_ = X.shape[1] - n_samples = X.shape[0] - - # Calculate leverage threshold h* - self.leverage_threshold_ = ( - self.threshold_factor * (self.n_features_in_ + 1) / n_samples - ) - - # Use more efficient matrix operations - # Calculate (X'X)^(-1) using SVD which is more stable - U, s, Vh = linalg.svd(X, full_matrices=False) - - # Store components for faster transform - self.s_inv_ = 1 / s - self.U_ = U - self.Vh_ = Vh - - return self - - def transform(self, X): - check_is_fitted(self) - X = check_array(X, accept_sparse=False) - - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"X has {X.shape[1]} features, but LeverageDistance " - f"was fitted with {self.n_features_in_} 
features." - ) - - # Efficient leverage calculation using stored SVD components - # This avoids explicit matrix inversion - Z = X @ self.Vh_.T * self.s_inv_ - h = np.sum(Z * Z, axis=1) - - return h.reshape(-1, 1) - - def predict(self, X): - return self.transform(X) - - def get_feature_names_out(self, input_features=None): - check_is_fitted(self) - return np.array(["leverage_distance"]) - - -class MahalanobisDistance(BaseEstimator, TransformerMixin): - """Calculate Mahalanobis distances for applicability domain assessment. - - Parameters - ---------- - threshold_quantile : float, default=0.975 - Quantile of chi-square distribution to use as threshold. - threshold_strategy : str, default='chi2' - Strategy to compute threshold. Options: - - 'chi2': Use chi-square distribution (theoretical) - - 'empirical': Use empirical distribution from training data - - None: Don't compute threshold (useful for CV) - """ - - def __init__(self, threshold_quantile=0.975, threshold_strategy="chi2"): - self.threshold_quantile = threshold_quantile - self.threshold_strategy = threshold_strategy - - def fit(self, X, y=None): - X = check_array(X) - self.n_features_in_ = X.shape[1] - - # Compute mean and covariance - self.mean_ = np.mean(X, axis=0) - self.covariance_ = np.cov(X, rowvar=False) - self.inv_covariance_ = np.linalg.inv(self.covariance_) - - # Calculate distances for training set - train_distances = self._mahalanobis(X) - self.train_distances_ = train_distances - - # Set threshold based on strategy - if self.threshold_strategy == "chi2": - self.threshold_ = stats.chi2.ppf( - self.threshold_quantile, df=self.n_features_in_ - ) - elif self.threshold_strategy == "empirical": - self.threshold_ = np.quantile(train_distances, self.threshold_quantile) - elif self.threshold_strategy is None: - self.threshold_ = None - else: - raise ValueError(f"Unknown threshold_strategy: {self.threshold_strategy}") - - return self - - def _mahalanobis(self, X): - """Calculate Mahalanobis distances.""" - 
X_centered = X - self.mean_ - return np.sqrt(np.sum(X_centered @ self.inv_covariance_ * X_centered, axis=1)) - - def transform(self, X): - check_is_fitted(self) - X = check_array(X) - - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"X has {X.shape[1]} features, but {self.__class__.__name__} " - f"was fitted with {self.n_features_in_} features." - ) - - distances = self._mahalanobis(X) - return distances.reshape(-1, 1) - - def set_threshold(self, threshold): - """Set threshold manually, e.g., from cross-validation.""" - self.threshold_ = threshold - return self - - def get_feature_names_out(self, input_features=None): - check_is_fitted(self) - return np.array(["mahalanobis_distance"]) - - def _more_tags(self): - return { - "requires_fit": True, - "X_types": ["2darray"], - "poor_score": False, - "allow_nan": False, - } From 756e6c9636fa252b0c221bff0bbbde31635b0160 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 8 Mar 2025 11:08:56 +0100 Subject: [PATCH 17/24] Further fixes in tests and some estimators --- scikit_mol/applicability/kernel_density.py | 4 +- scikit_mol/applicability/local_outlier.py | 4 + tests/applicability/test_local_outlier.py | 8 +- tests/test_adapters.py | 145 --------------------- tests/test_desctransformer.py | 19 +-- tests/test_fptransformers.py | 21 +-- tests/test_fptransformersgenerator.py | 19 +-- tests/test_metrics.py | 91 ------------- tests/test_parameter_types.py | 13 +- tests/test_safeinferencemode.py | 13 +- tests/test_sanitizer.py | 22 +++- tests/test_smilestomol.py | 13 +- tests/test_transformers.py | 30 ++--- 13 files changed, 93 insertions(+), 309 deletions(-) delete mode 100644 tests/test_adapters.py delete mode 100644 tests/test_metrics.py diff --git a/scikit_mol/applicability/kernel_density.py b/scikit_mol/applicability/kernel_density.py index 3f7281d..c32090c 100644 --- a/scikit_mol/applicability/kernel_density.py +++ b/scikit_mol/applicability/kernel_density.py @@ -33,7 +33,7 @@ class 
KernelDensityApplicabilityDomain(BaseApplicabilityDomain): 'exponential', 'linear', 'cosine']. percentile : float or None, default=None The percentile of training set densities to use as threshold (0-100). - If None, uses 1.0 (exclude bottom 1% of training samples). + If None, uses 99.0 (exclude bottom 1% of training samples). feature_name : str, default="KernelDensity" Name for the output feature column. @@ -68,7 +68,7 @@ def __init__( percentile: Optional[float] = None, feature_name: str = "KernelDensity", ) -> None: - super().__init__(percentile=percentile or 1.0, feature_name=feature_name) + super().__init__(percentile=percentile or 99.0, feature_name=feature_name) self.bandwidth = bandwidth self.kernel = kernel diff --git a/scikit_mol/applicability/local_outlier.py b/scikit_mol/applicability/local_outlier.py index 9366723..5dfb055 100644 --- a/scikit_mol/applicability/local_outlier.py +++ b/scikit_mol/applicability/local_outlier.py @@ -124,6 +124,10 @@ def _transform(self, X: NDArray) -> NDArray[np.float64]: scores = -self.lof_.score_samples(X) return scores.reshape(-1, 1) + def _set_statistical_threshold(self, X): + """Set the statistical threshold for the LOF scores.""" + self.threshold_ = -self.lof_.offset_ + # def predict(self, X): # """Predict whether samples are within the applicability domain. 
diff --git a/tests/applicability/test_local_outlier.py b/tests/applicability/test_local_outlier.py index c0b129f..8a5f6cb 100644 --- a/tests/applicability/test_local_outlier.py +++ b/tests/applicability/test_local_outlier.py @@ -19,8 +19,8 @@ def test_n_neighbors_effect(): outlier = np.array([[10, 10]]) # Compare different n_neighbors settings - ad_small = LocalOutlierFactorApplicabilityDomain(n_neighbors=5) - ad_large = LocalOutlierFactorApplicabilityDomain(n_neighbors=20) + ad_small = LocalOutlierFactorApplicabilityDomain(n_neighbors=2) + ad_large = LocalOutlierFactorApplicabilityDomain(n_neighbors=5) ad_small.fit(X) ad_large.fit(X) @@ -51,8 +51,8 @@ def test_contamination_effect(): X = np.random.randn(100, 2) # Compare different contamination levels - ad_low = LocalOutlierFactorApplicabilityDomain(contamination=0.1) - ad_high = LocalOutlierFactorApplicabilityDomain(contamination=0.2) + ad_low = LocalOutlierFactorApplicabilityDomain(contamination=0.05) + ad_high = LocalOutlierFactorApplicabilityDomain(contamination=0.25) ad_low.fit(X) ad_high.fit(X) diff --git a/tests/test_adapters.py b/tests/test_adapters.py deleted file mode 100644 index 340c986..0000000 --- a/tests/test_adapters.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Tests for EstimatorUnion adapter.""" - -import numpy as np -import pandas as pd -import pytest -from sklearn.exceptions import NotFittedError -from sklearn.preprocessing import StandardScaler - -from scikit_mol.adapters import EstimatorUnion -from scikit_mol.applicability import ( - MahalanobisApplicabilityDomain, - StandardizationApplicabilityDomain, -) - -# Use existing fixtures -from .fixtures import ( - atompair_transformer, - mols_list, - morgan_transformer, - skip_pandas_output_test, -) - - -def test_estimator_union_basic(morgan_transformer, atompair_transformer): - """Test basic functionality of EstimatorUnion.""" - - union = EstimatorUnion( - [ - ("fp1", morgan_transformer), - ( - "fp2", - atompair_transformer, - ), # Different radius for 
different features - ] - ) - - # Test unfitted raises exception - with pytest.raises(NotFittedError): - union.transform(mols_list) - - # Test fit and transform - union.fit(mols_list) - features = union.transform(mols_list) - - # Check output shape - n_fp = morgan_transformer().fpSize - assert features.shape == (len(mols_list), n_fp * 2) - - -def test_estimator_union_with_ad(morgan_transformer): - """Test EstimatorUnion with applicability domain estimator.""" - union = EstimatorUnion( - [ - ("fp", morgan_transformer), - ("ad", MahalanobisApplicabilityDomain()), - ], - method_resolution_order=("transform", "transform_score"), - ) - - union.fit(mols_list) - features = union.transform(mols_list) - - # Check output shape (fingerprints + 1 AD score) - n_fp = morgan_transformer().fpSize - assert features.shape == (len(mols_list), n_fp + 1) - - -def test_estimator_union_mixed_methods(morgan_transformer): - """Test EstimatorUnion with different methods specified.""" - union = EstimatorUnion( - [ - ("scale", StandardScaler(), "transform"), - ("ad", StandardizationApplicabilityDomain(), "transform_score"), - ("fp", morgan_transformer, "transform"), - ] - ) - - # Create some numeric data for StandardScaler - X = morgan_transformer.fit_transform(mols_list) - union.fit(X) - features = union.transform(X) - - # Check output shape - assert features.shape[0] == len(X) - assert features.shape[1] == X.shape[1] * 2 + 1 # scaled + fp + 1 AD score - - -@skip_pandas_output_test -def test_estimator_union_pandas_output(pandas_output, morgan_transformer): - """Test pandas DataFrame output from EstimatorUnion.""" - union = EstimatorUnion( - [ - ("fp", morgan_transformer), - ("ad", MahalanobisApplicabilityDomain(), "transform_score"), - ] - ) - - union.fit(mols_list) - features = union.transform(mols_list) - - # Check output type and structure - assert isinstance(features, pd.DataFrame) - assert len(features) == len(mols_list) - - # Check column names - fp_cols = [f"fp_{i}" for i in 
range(morgan_transformer.fpSize)] - expected_cols = fp_cols + ["Mahalanobis"] - assert features.columns.tolist() == expected_cols - - -def test_estimator_union_invalid_method(morgan_transformer): - """Test EstimatorUnion with invalid method specification.""" - with pytest.raises(ValueError): - EstimatorUnion([("fp", morgan_transformer, "invalid_method")]) - - -def test_estimator_union_get_feature_names_out(morgan_transformer): - """Test feature names output from EstimatorUnion.""" - union = EstimatorUnion( - [ - ("fp", morgan_transformer), - ("ad", MahalanobisApplicabilityDomain(), "transform_score"), - ] - ) - - union.fit(mols_list) - feature_names = union.get_feature_names_out() - - # Check number and format of feature names - n_fp = morgan_transformer().fpSize - assert len(feature_names) == n_fp + 1 - assert all(name.startswith("fp_") for name in feature_names[:-1]) - assert feature_names[-1] == "Mahalanobis" - - -def test_estimator_union_partial_fit(morgan_transformer): - """Test EstimatorUnion with some estimators already fitted.""" - fp = morgan_transformer.fit(mols_list) - ad = MahalanobisApplicabilityDomain() - - union = EstimatorUnion([("fp", fp), ("ad", ad, "transform_score")]) - - # Should work since fp is already fitted - features = union.fit_transform(mols_list) - assert features.shape == (len(mols_list), fp.fpSize + 1) diff --git a/tests/test_desctransformer.py b/tests/test_desctransformer.py index ed6c1a9..4f2dae9 100644 --- a/tests/test_desctransformer.py +++ b/tests/test_desctransformer.py @@ -6,15 +6,6 @@ import pandas as pd import pytest import sklearn -from fixtures import ( - mols_container, - mols_list, - mols_with_invalid_container, - skip_pandas_output_test, - smiles_container, - smiles_list, - smiles_list_with_invalid, -) from packaging.version import Version from rdkit.Chem import Descriptors from sklearn import clone @@ -24,6 +15,16 @@ from scikit_mol.core import SKLEARN_VERSION_PANDAS_OUT from scikit_mol.descriptors import 
MolecularDescriptorTransformer +from .fixtures import ( + mols_container, + mols_list, + mols_with_invalid_container, + skip_pandas_output_test, + smiles_container, + smiles_list, + smiles_list_with_invalid, +) + @pytest.fixture def default_descriptor_transformer(): diff --git a/tests/test_fptransformers.py b/tests/test_fptransformers.py index 23b734d..25bd64d 100644 --- a/tests/test_fptransformers.py +++ b/tests/test_fptransformers.py @@ -4,7 +4,17 @@ import numpy as np import pandas as pd import pytest -from fixtures import ( +from rdkit import Chem +from sklearn import clone + +from scikit_mol.fingerprints import ( + AvalonFingerprintTransformer, + MACCSKeysFingerprintTransformer, + MHFingerprintTransformer, + SECFingerprintTransformer, +) + +from .fixtures import ( chiral_mols_list, chiral_smiles_list, fingerprint, @@ -15,15 +25,6 @@ smiles_list, smiles_list_with_invalid, ) -from rdkit import Chem -from sklearn import clone - -from scikit_mol.fingerprints import ( - AvalonFingerprintTransformer, - MACCSKeysFingerprintTransformer, - MHFingerprintTransformer, - SECFingerprintTransformer, -) @pytest.fixture diff --git a/tests/test_fptransformersgenerator.py b/tests/test_fptransformersgenerator.py index 905ce27..ef94342 100644 --- a/tests/test_fptransformersgenerator.py +++ b/tests/test_fptransformersgenerator.py @@ -3,15 +3,6 @@ import numpy as np import pytest -from fixtures import ( - chiral_mols_list, - chiral_smiles_list, - fingerprint, - mols_container, - mols_list, - smiles_container, - smiles_list, -) from sklearn import clone from scikit_mol.fingerprints import ( @@ -21,6 +12,16 @@ TopologicalTorsionFingerprintTransformer, ) +from .fixtures import ( + chiral_mols_list, + chiral_smiles_list, + fingerprint, + mols_container, + mols_list, + smiles_container, + smiles_list, +) + test_transformers = [ AtomPairFingerprintTransformer, MorganFingerprintTransformer, diff --git a/tests/test_metrics.py b/tests/test_metrics.py deleted file mode 100644 index 
81b8773..0000000 --- a/tests/test_metrics.py +++ /dev/null @@ -1,91 +0,0 @@ -import numpy as np -import pytest - -from scikit_mol.metrics import tanimoto_distance - -from .applicability.conftest import binary_fingerprints - - -@pytest.fixture -def simple_fingerprints(): - """Create simple binary fingerprints for testing.""" - return np.array( - [ - [1, 1, 0, 0], # fp0: 2 bits set - [1, 0, 1, 0], # fp1: 2 bits set, 1 in common with fp1 - [0, 0, 1, 1], # fp2: 2 bits set, 1 in common with fp2, none with fp1 - [1, 1, 1, 1], # fp3: all bits set - [0, 0, 0, 0], # fp4: no bits set - ], - dtype=bool, - ) - - -def test_tanimoto_distance_basic(simple_fingerprints): - """Test basic properties of Tanimoto distance.""" - distances = tanimoto_distance(simple_fingerprints[0], simple_fingerprints[1]) - - # Check distance range [0,1] - assert 0 <= distances <= 1 - - # Check specific distances - # fp0 vs fp1: 1 bit in common, 3 in union -> distance = 2/3 - assert np.isclose(distances, 2 / 3) - # fp0 vs fp2: no bits in common, 4 in union -> distance = 1 - assert np.isclose( - tanimoto_distance(simple_fingerprints[0], simple_fingerprints[2]), 1.0 - ) - # fp0 vs fp3: 2 bits in common, 4 in union -> distance = 0.5 - assert np.isclose( - tanimoto_distance(simple_fingerprints[0], simple_fingerprints[3]), 0.5 - ) - # fp0 vs fp4: no bits in common, 2 in union -> distance = 1 - assert np.isclose( - tanimoto_distance(simple_fingerprints[0], simple_fingerprints[4]), 1.0 - ) - - -def test_tanimoto_distance_edge_cases(simple_fingerprints): - """Test edge cases for Tanimoto distance.""" - empty = simple_fingerprints[4] # Empty fingerprint - full = simple_fingerprints[3] # Full fingerprint - - # Two empty fingerprints (fp4) - dist = tanimoto_distance(empty, empty) - # No bits in common, 0 in union -> distance = 0/0 = 0 in our implementation. 
- assert np.isclose(dist, 0.0) - - # Empty vs full fingerprint (fp3) - dist = tanimoto_distance(empty, full) - assert np.isclose(dist, 1.0) # No overlap -> maximum distance - - -# TODO, can rdkit speed things up? But not working with np.arrays -# def test_tanimoto_implementations_equivalent(simple_fingerprints): -# """Test that both implementations give equivalent results.""" -# X = simple_fingerprints[:2] -# Y = simple_fingerprints[2:4] - -# dist1 = tanimoto_distance(X, Y) -# dist2 = tanimoto_distance_rdkit(X, Y) - -# assert np.allclose(dist1, dist2) - - -# def test_tanimoto_distance_rdkit_basic(binary_fingerprints): -# """Test basic properties of RDKit-based Tanimoto distance.""" -# # Get a subset of fingerprints for testing -# X = binary_fingerprints[:3] -# Y = binary_fingerprints[3:6] - -# distances = tanimoto_distance_rdkit(X, Y) - -# # Check output shape -# assert distances.shape == (3, 3) - -# # Check distance range [0,1] -# assert np.all((0 <= distances) & (distances <= 1)) - -# # Check self-distance is 0 for identical fingerprints -# self_distances = tanimoto_distance_rdkit(X, X) -# assert np.allclose(np.diag(self_distances), 0) diff --git a/tests/test_parameter_types.py b/tests/test_parameter_types.py index 4b73959..15e4855 100644 --- a/tests/test_parameter_types.py +++ b/tests/test_parameter_types.py @@ -1,15 +1,16 @@ -import pytest import numpy as np +import pytest from rdkit import Chem -from fixtures import ( + +from .fixtures import ( + atompair_transformer, mols_list, - smiles_list, morgan_transformer, - atompair_transformer, - topologicaltorsion_transformer, rdkit_transformer, + smiles_list, + topologicaltorsion_transformer, ) -from test_fptransformers import ( +from .test_fptransformers import ( avalon_transformer, ) diff --git a/tests/test_safeinferencemode.py b/tests/test_safeinferencemode.py index 60f8d1f..615694d 100644 --- a/tests/test_safeinferencemode.py +++ b/tests/test_safeinferencemode.py @@ -1,12 +1,6 @@ import numpy as np import pandas 
as pd import pytest -from fixtures import ( - SLC6A4_subset, - invalid_smiles_list, - skip_pandas_output_test, - smiles_list, -) from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline @@ -15,6 +9,13 @@ from scikit_mol.safeinference import SafeInferenceWrapper from scikit_mol.utilities import set_safe_inference_mode +from .fixtures import ( + SLC6A4_subset, + invalid_smiles_list, + skip_pandas_output_test, + smiles_list, +) + def equal_val(value, expected_value): try: diff --git a/tests/test_sanitizer.py b/tests/test_sanitizer.py index 9dd09cc..05c334c 100644 --- a/tests/test_sanitizer.py +++ b/tests/test_sanitizer.py @@ -1,10 +1,12 @@ -import pytest import numpy as np import pandas as pd +import pytest from rdkit import Chem -from fixtures import smiles_list, smiles_list_with_invalid + from scikit_mol.utilities import CheckSmilesSanitazion +from .fixtures import smiles_list, smiles_list_with_invalid + @pytest.fixture def sanitizer(): @@ -23,7 +25,9 @@ def test_checksmilessanitation(smiles_list, smiles_list_with_invalid, sanitizer) assert errors[0] == sanitizer.errors.SMILES[0] -def test_checksmilessanitation_x_and_y(smiles_list, smiles_list_with_invalid, sanitizer): +def test_checksmilessanitation_x_and_y( + smiles_list, smiles_list_with_invalid, sanitizer +): smiles_list_sanitized, y_sanitized, errors, y_errors = sanitizer.sanitize( smiles_list_with_invalid, list(range(len(smiles_list_with_invalid))) ) @@ -36,14 +40,18 @@ def test_checksmilessanitation_x_and_y(smiles_list, smiles_list_with_invalid, sa def test_checksmilessanitation_np(smiles_list, smiles_list_with_invalid, sanitizer): - smiles_list_sanitized, errors = sanitizer.sanitize(np.array(smiles_list_with_invalid)) + smiles_list_sanitized, errors = sanitizer.sanitize( + np.array(smiles_list_with_invalid) + ) assert len(smiles_list_with_invalid) > len(smiles_list_sanitized) assert all([a == b for a, b in zip(smiles_list, smiles_list_sanitized)]) assert errors[0] == 
sanitizer.errors.SMILES[0] def test_checksmilessanitation_numpy(smiles_list, smiles_list_with_invalid, sanitizer): - smiles_list_sanitized, errors = sanitizer.sanitize(pd.Series(smiles_list_with_invalid)) + smiles_list_sanitized, errors = sanitizer.sanitize( + pd.Series(smiles_list_with_invalid) + ) assert len(smiles_list_with_invalid) > len(smiles_list_sanitized) assert all([a == b for a, b in zip(smiles_list, smiles_list_sanitized)]) assert errors[0] == sanitizer.errors.SMILES[0] @@ -52,7 +60,9 @@ def test_checksmilessanitation_numpy(smiles_list, smiles_list_with_invalid, sani def test_checksmilessanitation_return_mol( smiles_list, smiles_list_with_invalid, return_mol_sanitizer ): - smiles_list_sanitized, errors = return_mol_sanitizer.sanitize(smiles_list_with_invalid) + smiles_list_sanitized, errors = return_mol_sanitizer.sanitize( + smiles_list_with_invalid + ) assert len(smiles_list_with_invalid) > len(smiles_list_sanitized) assert all( [ diff --git a/tests/test_smilestomol.py b/tests/test_smilestomol.py index 2bb5f0f..0f53fa6 100644 --- a/tests/test_smilestomol.py +++ b/tests/test_smilestomol.py @@ -2,12 +2,6 @@ import pandas as pd import pytest import sklearn -from fixtures import ( - skip_pandas_output_test, - smiles_container, - smiles_list, - smiles_list_with_invalid, -) from packaging.version import Version from rdkit import Chem from sklearn import clone @@ -19,6 +13,13 @@ InvalidMol, ) +from .fixtures import ( + skip_pandas_output_test, + smiles_container, + smiles_list, + smiles_list_with_invalid, +) + @pytest.fixture def smilestomol_transformer(): diff --git a/tests/test_transformers.py b/tests/test_transformers.py index a47b8bf..633a3f0 100644 --- a/tests/test_transformers.py +++ b/tests/test_transformers.py @@ -6,36 +6,36 @@ # pytest tests/test_transformers.py --> tests/test_transformers.py::test_transformer PASSED -import pytest +import numpy as np import pandas as pd -from packaging.version import Version +import pytest import sklearn -import 
numpy as np -from sklearn.pipeline import Pipeline +from packaging.version import Version from sklearn.ensemble import RandomForestRegressor +from sklearn.pipeline import Pipeline + from scikit_mol.conversions import SmilesToMolTransformer from scikit_mol.core import SKLEARN_VERSION_PANDAS_OUT +from scikit_mol.descriptors import MolecularDescriptorTransformer from scikit_mol.fingerprints import ( - MACCSKeysFingerprintTransformer, - RDKitFingerprintTransformer, AtomPairFingerprintTransformer, - TopologicalTorsionFingerprintTransformer, + AvalonFingerprintTransformer, + MACCSKeysFingerprintTransformer, + MHFingerprintTransformer, MorganFingerprintTransformer, + RDKitFingerprintTransformer, SECFingerprintTransformer, - MHFingerprintTransformer, - AvalonFingerprintTransformer, + TopologicalTorsionFingerprintTransformer, ) from scikit_mol.fingerprints.baseclasses import BaseFpsTransformer -from scikit_mol.descriptors import MolecularDescriptorTransformer - -from fixtures import ( +from .fixtures import ( SLC6A4_subset, SLC6A4_subset_with_cddd, - skip_pandas_output_test, - mols_container, - featurizer, combined_transformer, + featurizer, + mols_container, + skip_pandas_output_test, ) From 64bc1179bc7900649a8bb11ef46c5c93e274c535 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 8 Mar 2025 11:17:19 +0100 Subject: [PATCH 18/24] Setting numpy random seed automatically for consistent testing behaviour. 
--- tests/conftest.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index ee916ec..c1ca602 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ from urllib.parse import urlsplit from urllib.request import urlopen +import numpy as np import pandas as pd import pytest import sklearn @@ -60,3 +61,10 @@ def pandas_output(): sklearn.set_config(transform_output="pandas") yield sklearn.set_config(transform_output="default") + + +# Fixed Numpy random seed in all tests automatically +@pytest.fixture(autouse=True) +def setup_random(): + """Set fixed random seed before each test.""" + np.random.seed(0xDEADFACE) From e8a4c7b45478c65a2bb5f349637b5ac4d570ada4 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sat, 8 Mar 2025 11:52:31 +0100 Subject: [PATCH 19/24] Adding a link for the new notebook --- README.md | 1 + docs/index.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 1cc0006..0707d61 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ Example notebooks and API documentation are now hosted on [https://scikit-mol.re - [Testing different fingerprints as part of the hyperparameter optimization](https://scikit-mol.readthedocs.io/en/latest/notebooks/09_Combinatorial_Method_Usage_with_FingerPrint_Transformers/) - [Using pandas output for easy feature importance analysis and combine pre-exisitng values with new computations](https://scikit-mol.readthedocs.io/en/latest/notebooks/10_pipeline_pandas_output/) - [Working with pipelines and estimators in safe inference mode for handling prediction on batches with invalid smiles or molecules](https://scikit-mol.readthedocs.io/en/latest/notebooks/11_safe_inference/) +- [Estimating applicability domain using feature based estimators](https://scikit-mol.readthedocs.io/en/latest/notebooks/13_applicability_domain/) We also put a software note on ChemRxiv. 
[https://doi.org/10.26434/chemrxiv-2023-fzqwd](https://doi.org/10.26434/chemrxiv-2023-fzqwd) diff --git a/docs/index.md b/docs/index.md index 352cbf8..6c1e065 100644 --- a/docs/index.md +++ b/docs/index.md @@ -65,6 +65,7 @@ Example notebooks and API documentation are now hosted on [https://scikit-mol.re - [Testing different fingerprints as part of the hyperparameter optimization](https://scikit-mol.readthedocs.io/en/latest/notebooks/09_Combinatorial_Method_Usage_with_FingerPrint_Transformers/) - [Using pandas output for easy feature importance analysis and combine pre-exisitng values with new computations](https://scikit-mol.readthedocs.io/en/latest/notebooks/10_pipeline_pandas_output/) - [Working with pipelines and estimators in safe inference mode for handling prediction on batches with invalid smiles or molecules](https://scikit-mol.readthedocs.io/en/latest/notebooks/11_safe_inference/) +- [Estimating applicability domain using feature based estimators](https://scikit-mol.readthedocs.io/en/latest/notebooks/13_applicability_domain/) We also put a software note on ChemRxiv. 
[https://doi.org/10.26434/chemrxiv-2023-fzqwd](https://doi.org/10.26434/chemrxiv-2023-fzqwd) From 07a25d85cc39bed36b9ea3ea882b86d53bed4759 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sun, 6 Apr 2025 09:03:17 +0200 Subject: [PATCH 20/24] Fixing MD to proper links --- mkdocs.yml | 1 + scikit_mol/applicability/README.md | 4 ++-- scikit_mol/applicability/bounding_box.py | 2 +- scikit_mol/applicability/convex_hull.py | 2 +- scikit_mol/applicability/hotelling.py | 2 +- scikit_mol/applicability/isolation_forest.py | 2 +- scikit_mol/applicability/kernel_density.py | 2 +- scikit_mol/applicability/knn.py | 2 +- scikit_mol/applicability/leverage.py | 2 +- scikit_mol/applicability/local_outlier.py | 2 +- scikit_mol/applicability/standardization.py | 2 +- scikit_mol/applicability/topkat.py | 2 +- 12 files changed, 13 insertions(+), 12 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 5c44fa5..3ac50dd 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -64,5 +64,6 @@ nav: - Using pandas output for easy feature importance analysis and combine pre-existing values with new computations: notebooks/10_pipeline_pandas_output.ipynb - Working with pipelines and estimators in safe inference mode: notebooks/11_safe_inference.ipynb - Creating custom fingerptint transformers: notebooks/12_custom_fingerprint_transformer.ipynb + - Estimating applicability domain using feature based estimators: notebooks/13_applicability_domain.ipynb - Contributing: contributing.md \ No newline at end of file diff --git a/scikit_mol/applicability/README.md b/scikit_mol/applicability/README.md index 3fd2bfb..80e1786 100644 --- a/scikit_mol/applicability/README.md +++ b/scikit_mol/applicability/README.md @@ -4,13 +4,13 @@ This module contains applicability domain estimators for chemical modeling. ## License Information -Files in this module are licensed under LGPL as part of scikit-mol, with some files containing code adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD). 
+Files in this module are licensed under LGPL as part of scikit-mol, with some files containing code adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD). - Files containing the following header are adapted from MLChemAD (originally MIT licensed): ```python """ - This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) + This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. diff --git a/scikit_mol/applicability/bounding_box.py b/scikit_mol/applicability/bounding_box.py index 9d8ec4a..8de2d7f 100644 --- a/scikit_mol/applicability/bounding_box.py +++ b/scikit_mol/applicability/bounding_box.py @@ -1,7 +1,7 @@ """ Bounding box applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. diff --git a/scikit_mol/applicability/convex_hull.py b/scikit_mol/applicability/convex_hull.py index 9b7a486..df68d93 100644 --- a/scikit_mol/applicability/convex_hull.py +++ b/scikit_mol/applicability/convex_hull.py @@ -1,7 +1,7 @@ """ Convex hull applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. 
diff --git a/scikit_mol/applicability/hotelling.py b/scikit_mol/applicability/hotelling.py index 2f9b6b8..8d1d8a0 100644 --- a/scikit_mol/applicability/hotelling.py +++ b/scikit_mol/applicability/hotelling.py @@ -1,7 +1,7 @@ """ Hotelling T² applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. diff --git a/scikit_mol/applicability/isolation_forest.py b/scikit_mol/applicability/isolation_forest.py index ee29659..5a827cc 100644 --- a/scikit_mol/applicability/isolation_forest.py +++ b/scikit_mol/applicability/isolation_forest.py @@ -1,7 +1,7 @@ """ Isolation Forest applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. diff --git a/scikit_mol/applicability/kernel_density.py b/scikit_mol/applicability/kernel_density.py index c32090c..88d24ac 100644 --- a/scikit_mol/applicability/kernel_density.py +++ b/scikit_mol/applicability/kernel_density.py @@ -1,7 +1,7 @@ """ Kernel Density applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. 
diff --git a/scikit_mol/applicability/knn.py b/scikit_mol/applicability/knn.py index c9be686..4cee9c8 100644 --- a/scikit_mol/applicability/knn.py +++ b/scikit_mol/applicability/knn.py @@ -1,7 +1,7 @@ """ K-Nearest Neighbors applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. diff --git a/scikit_mol/applicability/leverage.py b/scikit_mol/applicability/leverage.py index d5a9e01..65c4150 100644 --- a/scikit_mol/applicability/leverage.py +++ b/scikit_mol/applicability/leverage.py @@ -1,7 +1,7 @@ """ Leverage-based applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) as described in the README.md file. +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. diff --git a/scikit_mol/applicability/local_outlier.py b/scikit_mol/applicability/local_outlier.py index 5dfb055..f4ad937 100644 --- a/scikit_mol/applicability/local_outlier.py +++ b/scikit_mol/applicability/local_outlier.py @@ -1,7 +1,7 @@ """ Local Outlier Factor applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. 
diff --git a/scikit_mol/applicability/standardization.py b/scikit_mol/applicability/standardization.py index babeeba..cd9cd69 100644 --- a/scikit_mol/applicability/standardization.py +++ b/scikit_mol/applicability/standardization.py @@ -1,7 +1,7 @@ """ Standardization approach applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. diff --git a/scikit_mol/applicability/topkat.py b/scikit_mol/applicability/topkat.py index 4849fc3..1725884 100644 --- a/scikit_mol/applicability/topkat.py +++ b/scikit_mol/applicability/topkat.py @@ -1,7 +1,7 @@ """ TOPKAT's Optimal Prediction Space (OPS) applicability domain. -This module was adapted from MLChemAD (https://github.com/OlivierBeq/MLChemAD) +This module was adapted from [MLChemAD](https://github.com/OlivierBeq/MLChemAD) Original work Copyright (c) 2023 Olivier J. M. Béquignon (MIT License) Modifications Copyright (c) 2025 scikit-mol contributors (LGPL License) See LICENSE.MIT in this directory for the original MIT license. 
From 0b804a47fce4e07f85edd5986c98123608a22f31 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sun, 6 Apr 2025 09:05:50 +0200 Subject: [PATCH 21/24] Adding to docs --- docs/api/scikit_mol.applicability.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/api/scikit_mol.applicability.md diff --git a/docs/api/scikit_mol.applicability.md b/docs/api/scikit_mol.applicability.md new file mode 100644 index 0000000..69d1c82 --- /dev/null +++ b/docs/api/scikit_mol.applicability.md @@ -0,0 +1,5 @@ +# `scikit-mol.applicability` + +::: scikit_mol.applicability + + From 6c95acc005eb21b5a53916319a132ac72c9044b8 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sun, 6 Apr 2025 09:14:02 +0200 Subject: [PATCH 22/24] Adding baseapplicability to include into docs, and deleing outcommented code --- scikit_mol/applicability/__init__.py | 1 + scikit_mol/applicability/kernel_density.py | 35 ---------------------- 2 files changed, 1 insertion(+), 35 deletions(-) diff --git a/scikit_mol/applicability/__init__.py b/scikit_mol/applicability/__init__.py index 003c3be..55f9265 100644 --- a/scikit_mol/applicability/__init__.py +++ b/scikit_mol/applicability/__init__.py @@ -1,3 +1,4 @@ +from .base import BaseApplicabilityDomain # noqa: F401 from .bounding_box import BoundingBoxApplicabilityDomain from .convex_hull import ConvexHullApplicabilityDomain from .hotelling import HotellingT2ApplicabilityDomain diff --git a/scikit_mol/applicability/kernel_density.py b/scikit_mol/applicability/kernel_density.py index 88d24ac..c3a2eca 100644 --- a/scikit_mol/applicability/kernel_density.py +++ b/scikit_mol/applicability/kernel_density.py @@ -117,38 +117,3 @@ def _transform(self, X: NDArray) -> NDArray[np.float64]: """ scores = self.kde_.score_samples(X) return scores.reshape(-1, 1) - - # def predict(self, X): - # """Predict whether samples are within the applicability domain. 
- - # Parameters - # ---------- - # X : array-like of shape (n_samples, n_features) - # The samples to predict. - - # Returns - # ------- - # y_pred : ndarray of shape (n_samples,) - # Returns 1 for samples inside the domain and -1 for samples outside - # (following scikit-learn's convention for outlier detection). - # """ - # scores = self._transform(X).ravel() - # return np.where(scores >= self.threshold_, 1, -1) - - # def fit_threshold(self, X): - # """Update the threshold using new data without refitting the model. - - # Parameters - # ---------- - # X : array-like of shape (n_samples, n_features) - # Data to compute threshold from. - - # Returns - # ------- - # self : object - # Returns the instance itself. - # """ - # densities = self._transform(X).ravel() - # self.threshold_ = np.percentile(densities, self.percentile) - - # return self From 14758c182367eccde1a7973c3552fee597f85405 Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sun, 6 Apr 2025 09:54:07 +0200 Subject: [PATCH 23/24] Fixes to mkdocs references --- docs/api/scikit_mol.applicability.md | 1 + mkdocs.yml | 1 + scikit_mol/applicability/base.py | 8 ++++---- scikit_mol/applicability/convex_hull.py | 17 ----------------- scikit_mol/applicability/knn.py | 8 +++++--- scikit_mol/descriptors.py | 2 +- 6 files changed, 12 insertions(+), 25 deletions(-) diff --git a/docs/api/scikit_mol.applicability.md b/docs/api/scikit_mol.applicability.md index 69d1c82..07da383 100644 --- a/docs/api/scikit_mol.applicability.md +++ b/docs/api/scikit_mol.applicability.md @@ -1,5 +1,6 @@ # `scikit-mol.applicability` +::: scikit_mol.applicability.base ::: scikit_mol.applicability diff --git a/mkdocs.yml b/mkdocs.yml index 3ac50dd..7807873 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,6 +41,7 @@ plugins: nav: - Overview: index.md - API: + - scikit-mol.applicability: api/scikit_mol.applicability.md - scikit-mol.core: api/scikit_mol.core.md - scikit-mol.conversion: api/scikit_mol.conversions.md - 
scikit-mol.descriptors: api/scikit_mol.descriptors.md diff --git a/scikit_mol/applicability/base.py b/scikit_mol/applicability/base.py index 813a8b1..8fb0c2b 100644 --- a/scikit_mol/applicability/base.py +++ b/scikit_mol/applicability/base.py @@ -60,12 +60,12 @@ class BaseApplicabilityDomain(BaseEstimator, TransformerMixin, _ADOutputMixin, A Notes ----- - Subclasses must define _scoring_convention as either: + Subclasses must define `_scoring_convention` as either: - 'high_outside': Higher scores indicate samples outside domain (e.g., distances) - 'high_inside': Higher scores indicate samples inside domain (e.g., likelihoods) - The raw scores from transform() should maintain their natural interpretation, - while predict() will handle the conversion to ensure consistent output + The raw scores from `.transform()` should maintain their natural interpretation, + while `.predict()` will handle the conversion to ensure consistent output (1 = inside domain, -1 = outside domain). Attributes @@ -171,7 +171,7 @@ def transform( Returns ------- scores : ndarray or pandas DataFrame - Method-specific scores. Interpretation depends on _scoring_convention: + Method-specific scores. Interpretation depends on `_scoring_convention`: - 'high_outside': Higher scores indicate samples further from training data - 'high_inside': Higher scores indicate samples closer to training data Shape (n_samples, 1). diff --git a/scikit_mol/applicability/convex_hull.py b/scikit_mol/applicability/convex_hull.py index df68d93..7e2a1b3 100644 --- a/scikit_mol/applicability/convex_hull.py +++ b/scikit_mol/applicability/convex_hull.py @@ -114,20 +114,3 @@ def _transform(self, X: NDArray) -> NDArray[np.float64]: distances.append(0.0 if result.success else 1.0) return np.array(distances).reshape(-1, 1) - - # def predict(self, X): - # """Predict whether samples are within the applicability domain. - - # Parameters - # ---------- - # X : array-like of shape (n_samples, n_features) - # The samples to predict. 
- - # Returns - # ------- - # y_pred : ndarray of shape (n_samples,) - # Returns 1 for samples inside the domain and -1 for samples outside - # (following scikit-learn's convention for outlier detection). - # """ - # scores = self._transform(X).ravel() - # return np.where(scores == 0, 1, -1) diff --git a/scikit_mol/applicability/knn.py b/scikit_mol/applicability/knn.py index 4cee9c8..e897756 100644 --- a/scikit_mol/applicability/knn.py +++ b/scikit_mol/applicability/knn.py @@ -7,7 +7,7 @@ See LICENSE.MIT in this directory for the original MIT license. """ -from typing import Callable, ClassVar, Optional, Union +from typing import Callable, ClassVar, Literal, Optional, Union import numpy as np from numpy.typing import ArrayLike @@ -32,7 +32,7 @@ class KNNApplicabilityDomain(BaseApplicabilityDomain): Percentile of training set distances to use as threshold (0-100). If None, uses 99.0 (include 99% of training samples). distance_metric : str or callable, default='euclidean' - Distance metric to use. Options: + Distance metric to use. 
As examples: - 'euclidean': Euclidean distance (default) - 'manhattan': Manhattan distance - 'cosine': Cosine distance @@ -95,7 +95,9 @@ def __init__( self, n_neighbors: int = 5, percentile: Optional[float] = None, - distance_metric: Union[str, Callable] = "euclidean", + distance_metric: Union[ + Literal["euclidean", "manhattan", "cosine", "tanimoto", "jaccard"], Callable + ] = "euclidean", n_jobs: Optional[int] = None, feature_name: str = "KNN", ) -> None: diff --git a/scikit_mol/descriptors.py b/scikit_mol/descriptors.py index ee1d496..c115377 100644 --- a/scikit_mol/descriptors.py +++ b/scikit_mol/descriptors.py @@ -133,7 +133,7 @@ def transform(self, x: List[Mol], y=None) -> Union[np.ndarray, np.ma.MaskedArray """Transform a list of molecules into an array of descriptor values Parameters ---------- - X : (List, np.array, pd.Series) + x : (List, np.array, pd.Series) A list of RDKit molecules y : NoneType, optional Target values for scikit-learn compatibility, not used, by default None From b7c19d0dbdb6f5d0a524c4369fb15044351b8d7d Mon Sep 17 00:00:00 2001 From: Esben Jannik Bjerrum Date: Sun, 6 Apr 2025 10:17:59 +0200 Subject: [PATCH 24/24] Further updates to mkdocs configuration and some cleanup --- README.md | 2 +- docs/api/scikit_mol.applicability.md | 1 - scikit_mol/applicability/__init__.py | 3 ++- tests/applicability/test_base.py | 4 ---- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index fdee10e..afa0ee4 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ There are more information about how to contribute to the project in [CONTRIBUTI Probably still, please check issues at GitHub and report there -## Contributors: +## Contributors Scikit-Mol has been developed as a community effort with contributions from people from many different companies, consortia, foundations and academic institutions. 
diff --git a/docs/api/scikit_mol.applicability.md b/docs/api/scikit_mol.applicability.md index 07da383..69d1c82 100644 --- a/docs/api/scikit_mol.applicability.md +++ b/docs/api/scikit_mol.applicability.md @@ -1,6 +1,5 @@ # `scikit-mol.applicability` -::: scikit_mol.applicability.base ::: scikit_mol.applicability diff --git a/scikit_mol/applicability/__init__.py b/scikit_mol/applicability/__init__.py index 55f9265..b1a1d3e 100644 --- a/scikit_mol/applicability/__init__.py +++ b/scikit_mol/applicability/__init__.py @@ -1,4 +1,4 @@ -from .base import BaseApplicabilityDomain # noqa: F401 +from .base import BaseApplicabilityDomain from .bounding_box import BoundingBoxApplicabilityDomain from .convex_hull import ConvexHullApplicabilityDomain from .hotelling import HotellingT2ApplicabilityDomain @@ -12,6 +12,7 @@ from .topkat import TopkatApplicabilityDomain __all__ = [ + "BaseApplicabilityDomain", "BoundingBoxApplicabilityDomain", "ConvexHullApplicabilityDomain", "HotellingT2ApplicabilityDomain", diff --git a/tests/applicability/test_base.py b/tests/applicability/test_base.py index 56eba50..24b6ba4 100644 --- a/tests/applicability/test_base.py +++ b/tests/applicability/test_base.py @@ -5,10 +5,6 @@ from numpy.testing import assert_array_almost_equal, assert_array_equal from sklearn.utils.estimator_checks import check_estimator -# def test_estimator_api(ad_estimator): -# """Test scikit-learn API compatibility.""" -# check_estimator(ad_estimator) - def test_basic_functionality(ad_estimator, reduced_fingerprints): """Test basic fit/transform on reduced fingerprints."""