Source code for finchnmr.model

"""
Tools to build models.

Authors: Nathan A. Mahynski
"""
import copy
import tqdm

import numpy as np
import sklearn.linear_model as sklm

from . import library
from . import substance
from . import analysis

from sklearn.base import BaseEstimator, RegressorMixin
from itertools import product
from typing import Union, Any, ClassVar
from numpy.typing import NDArray


[docs]def optimize_models( targets: list["substance.Substance"], nmr_library: "library.Library", nmr_model: "_Model", param_grid: dict[str, list], model_kw: Union[dict[str, Any], None] = None, ) -> tuple[list["_Model"], list["analysis.Analysis"]]: """ Optimize a model to fit each wild spectra in a list. All combinations of parameters in `param_grid` are tested and the best performer is retained. Parameters ---------- targets : list[Substance] Unknown/wild HSQC NMR spectrum to fit with the `nmr_library`. nmr_library : Library Library of HSQC NMR spectra to use for fitting `targets`. nmr_model : _Model Uninstantiated model class to fit the spectra with. param_grid : dict(str, list) Dictionary of parameter grid to search over; this follows the same convention as `sklearn.model_selection.GridSearchCV`. model_kw : dict(str, Any), optional(default=None) Default keyword arguments to your model. If `None` then the `nmr_model` defaults are used. Returns ------- optimized_models : list(_Model) List of optimized models fit to each target HSQC NMR spectrum. analyses : list(Analysis) List of analysis objects to help visualize and understand each fitted model. Example ------- >>> target = finchnmr.substance.Substance(...) # Load target(s) >>> nmr_library = finchnmr.library.Library(...) # Create library >>> optimized_models, analyses = finchnmr.model.optimize_models( ... targets=[target], ... nmr_library=nmr_library, ... nmr_model=finchnmr.model.LASSO, ... param_grid={'alpha': np.logspace(-5, 1, 100)}, ... ) >>> analyses[0].plot_top_spectra(k=5) """ optimized_models = [] analyses = [] def build_fitted_model_(model_kw, param_set, nmr_library, target): """Create and train the model.""" if model_kw is None: estimator = nmr_model() # Use model default parameters else: estimator = nmr_model(**model_kw) # Set basic parameters manually estimator.set_params( **param_set ) # Set specific parameters (alpha, etc.) _ = estimator.fit(nmr_library, target) return estimator def unroll_(param_grid): """Create every possible combination of parameters in the grid.""" param_sets = [] for values in product(*param_grid.values()): combination = dict(zip(param_grid.keys(), values)) param_sets.append(combination) return param_sets param_sets = unroll_(param_grid) for i, target in tqdm.tqdm( enumerate(targets), desc="Iterating through targets" ): scores = [] for param_set in tqdm.tqdm( param_sets, desc="Iterating through parameter sets" ): try: estimator_ = build_fitted_model_( model_kw, param_set, nmr_library, target ) except Exception as e: pass # Do not score this model else: scores.append(estimator_.score()) if len(scores) == 0: raise Exception(f"Unable to fit any models for target index {i}") # Fit final estimator with the "best" parameters estimator = build_fitted_model_( model_kw, param_sets[np.argmax(scores)], nmr_library, target ) optimized_models += [estimator] analyses += [analysis.Analysis(model=estimator)] return optimized_models, analyses
class _Model(RegressorMixin, BaseEstimator): """Model base class wrapper for linear models.""" model: ClassVar[Any] model_: ClassVar[Any] _nmr_library: ClassVar["library.Library"] _score: ClassVar[float] _scale_y: ClassVar[NDArray[np.floating]] is_fitted_: ClassVar[bool] def __init__(self) -> None: """ Instantiate the model. Note that the sklearn API requires all estimators (subclasses of this) to specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs). """ setattr(self, "is_fitted_", False) setattr(self, "model_", None) def set_params(self, **parameters: Any) -> "_Model": """Set parameters; for consistency with scikit-learn's estimator API.""" for parameter, value in parameters.items(): setattr(self, parameter, value) return self def target(self): """Return the target this model is meant to reproduce.""" if self.is_fitted_: return copy.deepcopy(self._nmr_library._fit_to) else: raise Exception("Model has not been fit yet.") def fit( self, nmr_library: "library.Library", target: "substance.Substance" ) -> "_Model": """ Fit the model. The library is "fit"/aligned to the target first, then the linear model is fit. Parameters ---------- nmr_library : Library Library of HSQC NMR spectra to use for fitting `unknown`. target : substance.Substance Unknown/wild HSQC NMR spectrum to fit with the `nmr_library`. Returns ------- self : _Model Fitted model. """ if self.model_ is None: raise Exception("model has not been set yet.") else: setattr(self, "model", self.model_(**self.get_model_params())) setattr(self, "_nmr_library", copy.deepcopy(nmr_library)) # Align library with target - this also saves target internally self._nmr_library.fit(target) # Transform library to normalize X, _ = self.transform(self._nmr_library.X) # Tansform target in a similar way y, scale_y = self.transform(target.flatten().reshape(-1, 1)) setattr(self, "_scale_y", scale_y) # y = target.flatten().reshape(-1, 1) # setattr(self, "_scale_y", 1.0) # Fit the model _ = self.model.fit(X, y) # Store the score of this fit setattr(self, "_score", self.model.score(X, y)) setattr(self, "is_fitted_", True) return self @staticmethod def transform(X): X_t = np.abs( X ) # Convert library intensities to absolute values, [0, inf) scale = np.max(X_t, axis=0) # Scale library to [0, 1] return X_t / scale, scale def predict(self) -> NDArray[np.floating]: """ Predict the (flattened) target HSQC spectra. Returns ------- spectrum : ndarray(float, ndim=1) Predicted (flattened) spectrum fit to the given `nmr_library`. """ if not self.is_fitted_: raise Exception("Model has not been fit yet.") y_pred = self.model.predict(self.transform(self._nmr_library.X)[0]) # When predicting / reconstructing, scale the model output back to # "real" units. This is still absolute value / intensity space but # has the proper magnitude for comparison with the target spectrum. return y_pred * self._scale_y def score(self) -> float: """ Score the model's performance (fit). Returns ------- score : float Coefficient of determination of the model that uses `nmr_library` to predict `target`. """ if not self.is_fitted_: raise Exception("Model has not been fit yet.") return self._score def reconstruct(self) -> "substance.Substance": """Reconstruct a 2D HSQC NMR spectrum using the fitted model.""" if not self.is_fitted_: raise Exception("Model has not been fit yet.") reconstructed = self.target() reconstructed._set_data(reconstructed.unflatten(self.predict())) return reconstructed def importances(self) -> NDArray[np.floating]: """Return the importances of each feature in the model.""" raise NotImplementedError def get_model_params(self) -> dict[str, Any]: """Get the parameters needed to instantiate the model.""" raise NotImplementedError
[docs]class LASSO(_Model): """LASSO model from sklearn.""" alpha: ClassVar[float] precompute: ClassVar[bool] copy_X: ClassVar[bool] max_iter: ClassVar[int] tol: ClassVar[float] warm_start: ClassVar[bool] random_state: ClassVar[Union[int, None]] selection: ClassVar[str] fit_intercept: ClassVar[bool] positive: ClassVar[bool] def __init__( self, alpha: float = 1.0, precompute: bool = False, copy_X: bool = True, max_iter: int = 10000, tol: float = 0.0001, warm_start: bool = False, random_state: Union[int, None] = None, selection: str = "cyclic", ) -> None: """ Instantiate the class. Inputs are identical to `sklearn.linear_model.Lasso` except for `fit_intercept` and `positive` which are forced to be `False` and `True`, respectively. Also, `max_iter` is increased from 1,000 to 10,000 by default. See https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html """ super().__init__() self.set_params( **{ "alpha": alpha, "fit_intercept": False, # Always assume no offset "precompute": precompute, "copy_X": copy_X, "max_iter": max_iter, "tol": tol, "warm_start": warm_start, "positive": True, # Force coefficients to be positive "random_state": random_state, "selection": selection, } ) setattr(self, "model_", sklm.Lasso)
[docs] def get_model_params(self) -> dict[str, Any]: """Return the parameters for an sklearn.linear_model.Lasso model.""" return { "alpha": self.alpha, "fit_intercept": self.fit_intercept, "precompute": self.precompute, "copy_X": self.copy_X, "max_iter": self.max_iter, "tol": self.tol, "warm_start": self.warm_start, "positive": self.positive, "random_state": self.random_state, "selection": self.selection, }
[docs] def importances(self) -> NDArray[np.floating]: """Return the Lasso model coefficients as importances.""" return self.model.coef_