# Source code for watex.models.premodels

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>
#   Created on Tue May 17 11:30:51 2022

from importlib import resources
import warnings 
from .._docstring import refglossary
from .._watexlog import watexlog 
from .._typing import (  
    Optional, 
    ArrayLike, 
    NDArray, 
    )
from ..decorators import refAppender 
from ..exceptions import (
    EstimatorError, 
    NotFittedError 
    )
from ..utils.funcutils import (
    repr_callable_obj,
    smart_format,
    smart_strobj_recognition, 
    )
from ..utils.validator import ( 
    check_X_y, 
    check_array
    )
from ..utils.mlutils import (
    controlExistingEstimator , 
    fetchModel 
    )
from ._metapredictors import ( 
        _pMODELS 
    )

# Public API of this module: the `p` container object and the `pModels` class.
__all__=["p", "pModels"]

def cloneObj(cls, attributes):
    """ Build an instance of `cls` without running ``__init__`` and
    load `attributes` into its instance dictionary.

    Parameters
    ----------
    cls: type
        Class to instantiate through ``cls.__new__``.
    attributes: mapping or iterable of (name, value) pairs
        Content merged into the ``__dict__`` of the new instance.

    Returns
    -------
    object
        The freshly created instance carrying `attributes`.
    """
    instance = cls.__new__(cls)
    # write straight into the instance dictionary; no __setattr__ hook runs
    vars(instance).update(attributes)
    return instance

@refAppender(refglossary.__doc__)
class pModels:
    """ Pretrained Models class.

    The pretrained model class is composed of estimators already trained in
    a case study region in West-Africa `Bagoue region`_. Refer to
    `Kouadio et al`_, 2022 for further details. It is a set of
    ``support vector machines``, ``decision tree``, ``k-nearest neighbors``,
    ``extreme gradient boosting machines``, benchmark ``voting classifier``,
    and ``bagging classifier``.
    Each pretrained model is considered as a class object and its attributes
    compose the training parameters from cross-validation results.

    Parameters
    -----------
    model: str
        Name of the pretrained model. Note that the pretrained SVMs is
        composed of 04 kernels such as the ``rbf`` for radial basis function,
        the ``poly`` for polynomial, ``sig`` for sigmoid and ``lin`` for
        linear. Default is ``rbf``. Each kernel is a model attribute of the
        SVM class. For instance to retrieve the pretrained model with
        kernel = 'poly', we must use after fitting :class:`.pModels` class::

            >>> pModels(model='svm', kernel='poly').fit().SVM.poly.best_estimator_
            ... SVC(C=128.0, coef0=7, degree=5, gamma=0.00048828125,
                    kernel='poly', tol=0.01)
            >>> # or
            >>> pModels(model='svm', kernel='poly').fit().estimator_
            ... SVC(C=128.0, coef0=7, degree=5, gamma=0.00048828125,
                    kernel='poly', tol=0.01)

    kernel: str
        kernel refers to SVM machines kernels. It can be ``rbf`` for radial
        basis function, the ``poly`` for polynomial, ``sig`` for sigmoid and
        ``lin`` for linear. No need to provide since it can be retrieved as
        an attribute of the SVM model like::

            >>> pModels(model='svm').fit().SVM.rbf # is an object instance
            >>> # to retrieve the rbf values use attribute `best_estimator_`
            >>> pModels(model='svm').fit().SVM.rbf.best_estimator_
            ... SVC(C=2.0, coef0=0, degree=1, gamma=0.125)

    target: str
        Two types of classification are predicted. The binary classification
        ``bin`` and the multiclass classification ``multi``. Default is
        ``bin``. When turning target to ``multi``, be aware that only the
        SVMs are trained for multiclass prediction. Furthermore, the `bin`
        consists in predicting the flow rate (FR) with label {0} and {1}
        where {0} means the :math:`FR <=1 m^3/hr` and {1} for
        :math:`FR> 1m^3/hr`. About `multi`, four classes are predicted
        such as:

        .. math::

            FR0 & = & FR = 0

            FR1 & = & 0 < FR <=1 m^3/hr

            FR2 & = & 1< FR <=3 m^3/hr

            FR3 & = & FR> 3 m^3/hr

    oob_score: bool,
        Out-of-bag. Setting `oob_score` to ``true``, you will retrieve some
        pretrained model with ``oob_score`` set to true when training. The
        pretrained models with fine-tuned model with `oob_score` set to true
        are 'RandomForest' and 'Extratrees'.

    objective: str, default='fr'
        Is the prediction aim goal, the reason for storing the pretrained
        models. The default `objective` is 'fr' i.e. for flow rate
        prediction. Other objectives will be added as new engineering
        problems are solved and published.

    Examples
    ----------
    >>> from watex.models.premodels import pModels
    >>> # fetch the pretrained Adaboost model
    >>> p= pModels (model ='ada')
    >>> p.fit()
    >>> p.AdaBoost.best_estimator_
    ... AdaBoostClassifier(estimator=LogisticRegression(),
                           learning_rate=0.09, n_estimators=500)
    >>> p.model = 'vot'
    >>> p.fit()
    >>> p.Voting.best_estimator_
    ... VotingClassifier(estimators=[('lr', LogisticRegression()),
    ...                              ('knn',
    ...                               KNeighborsClassifier(metric='manhattan',
    ...                                                    n_neighbors=9)),
    ...                              ('dt',
    ...                               DecisionTreeClassifier(criterion='entropy',
    ...                                                      max_depth=7)),
    ...                              ('pSVM',
    ...                               SVC(C=2.0, coef0=0, degree=1, gamma=0.125))])
    >>> p2 = pModels(model='extree', oob_score= True ).fit()
    >>> p2.ExtraTrees.best_estimator_
    ... ExtraTreesClassifier(bootstrap=True, criterion='entropy', max_depth=18,
                             max_features='auto', n_estimators=300,
                             oob_score=True)
    """
    # (model, name) pairs as returned by `controlExistingEstimator` for every
    # estimator that ships with pretrained data.
    pdefaults_ = [
        controlExistingEstimator(e)
        for e in ['xgboost', 'svm', 'dtc', 'stc', 'bag', 'logit',
                  'vtc', 'rfc', 'ada', 'extree', 'knn']
    ]

    def __init__(
        self,
        model: str = 'svm',
        target: str = 'bin',
        kernel: Optional[str] = None,
        oob_score: bool = False,
        objective: str = 'fr',
    ):
        self._logging = watexlog.get_watex_logger(self.__class__.__name__)
        self.model = model
        self.target = target
        self.objective = objective
        self.oob_score = oob_score
        self.kernel = kernel

    def fit(
        self,
        X: NDArray = None,
        y: ArrayLike = None,
        **fit_params
    ):
        """ Fit X and y with the pretrained models.

        Note that to retrieve only the pretrained model, don't pass anything
        in `fit` method. For instance to fetch the best SVM estimator with
        `kernel = 'sigmoid'`, one just needs to fit :class:`.pModels` class
        as follow::

            >>> pModels(model='svm', kernel='sigmoid').fit().estimator_
            ... SVC(C=512.0, coef0=0, degree=1, gamma=0.001953125,
                    kernel='sigmoid', tol=1.0)

        If `model='svm'` and none `kernel` is passed, the ``rbf`` is used
        instead as default.

        Parameters
        ----------
        X: Ndarray of shape ( M x N), :math:`M=m-samples x N=n-features`
            training set; Denotes data that is observed at training and
            prediction time, used as independent variables in learning.
            The notation is uppercase to denote that it is ordinarily a
            matrix. When a matrix, each sample may be represented by a
            feature vector, or a vector of precomputed (dis)similarity with
            each training sample. :code:`X` may also not be a matrix, and
            may require a feature extractor or a pairwise metric to turn it
            into one before learning a model.
        y: array-like of shape (M, ) :math:`M=m-samples`
            train target; Denotes data that may be observed at training time
            as the dependent variable in learning, but which is unavailable
            at prediction time, and is usually the target of prediction.
        fit_params: dict
            Extra keyword arguments forwarded to the ``fit`` method of the
            pretrained estimator when `X` is given.

        Returns
        --------
        :class:`pModels` instance
            Returns ``self`` for easy method chaining.
        """
        # always (re)load the pretrained estimator and its parameters first
        self._fit(X, y)
        # refit the pretrained estimator only when data are supplied
        if X is not None:
            X, y = check_X_y(
                X,
                y,
                accept_sparse=True,
                to_frame=True,
                estimator=self.name_,
            )
            self.estimator_.fit(X, y, **fit_params)

        return self

    def _fit(self, X: NDArray = None, y: ArrayLike = None):
        """ Fetch the pretrained model data and populate its corresponding
        attributes.

        Sets ``name_``, an attribute named after ``name_`` holding the
        pretrained data object, ``estimator_`` and ``params_``. The `model`
        and `objective` attributes are replaced by their normalized values.

        Parameters
        ----------
        X: NoneType
            X does nothing, it is used for API consistency.
        y: NoneType
            y does nothing, it is used for API consistency.

        Returns
        -------
        :class:`pModels` instance
            Returns ``self``.

        Raises
        ------
        TypeError
            If `model` is None.
        AssertionError
            If `objective` is not 'fr' or `target` is not 'bin' or 'multi'.
        EstimatorError
            If the model has no pretrained data, or if ``oob_score=True``
            is requested for a model that does not provide it.

        Examples
        --------
        >>> from watex.models.premodels import pModels
        >>> # fetch the pretrained Adaboosting
        >>> p= pModels (model ='ada')
        >>> p.fit()
        >>> p.AdaBoost.best_estimator_
        ... AdaBoostClassifier(estimator=LogisticRegression(),
                               learning_rate=0.09, n_estimators=500)
        """
        if self.model is None:
            raise TypeError("NoneType can't be a model.")

        self.objective = str(self.objective).lower()
        # Explicit raises instead of `assert` statements so that the checks
        # are still enforced when Python runs with -O; the exception type
        # and the messages are unchanged.
        if self.objective != 'fr':
            raise AssertionError(
                f"Pretrained objective is for flow rate prediction 'fr' passed to"
                f" parameter 'objective'; not {self.objective}"
            )
        if self.target not in ("bin", "multi"):
            raise AssertionError(
                "Two types of learning targets are expected: the multiclass"
                f"'multi' and binary 'bin'. Got {self.target!r}"
            )

        self.model, self.name_ = controlExistingEstimator(
            self.model, raise_err=True)
        # change the name of SVC
        if self.name_ == 'SupportVectorClassifier':
            self.name_ = 'SVM'
        else:
            self.name_ = self.name_.replace('Classifier', '')
        if self.name_ == 'ExtremeGradientBoosting':
            self.name_ = 'XGB'

        if self.model not in [d[0] for d in self.pdefaults_]:
            pl = [str(d[0]) + ' -> ' + str(d[1]) for d in self.pdefaults_]
            raise EstimatorError(
                f"Unsupport model : {self.model}."
                f" Expects {smart_format(pl, 'or')}")

        # Prefer the data loaded at import time in the module-level `_pDATA`;
        # fall back to the built-in `_pMODELS` when it is missing or None.
        try:
            data_ = _pDATA
        except NameError:
            data_ = None
        if data_ is None:
            data_ = _pMODELS

        if self.oob_score:
            # NOTE(review): 'svc' is accepted here although the message below
            # only mentions RandomForest and ExtraTrees; as written this is
            # the only path that selects the 'SVM_' entry -- confirm intent.
            if self.model in ('svc', 'extree', 'rdf'):
                self.name_ += '_'
            else:
                raise EstimatorError(
                    "Pretrained model for 'oob_score=True' is only available"
                    " for RandomForest <'rdf'> and Extratrees <'extree'>',"
                    f" not {self.model!r}"
                )

        obj = type(self.name_, (), {})
        entry = data_.get(self.name_)
        # stored entries are either objects (use their __dict__) or plain
        # mappings (used as they are)
        obj = cloneObj(obj, attributes=getattr(entry, '__dict__', entry))

        # the trailing underscore only selects the data entry; it is dropped
        # from the public attribute name, except for non-SVM names when
        # target is 'multi'
        if self.target == 'multi':
            if self.name_ == 'SVM_':
                self.name_ = 'SVM'
        else:
            if '_' in self.name_:
                self.name_ = self.name_.replace('_', '')

        self.__setattr__(self.name_, obj)

        store = getattr(self, self.name_)
        try:
            self.estimator_ = store.best_estimator_
            self.params_ = store.best_params_
        except AttributeError:
            # no direct best_* attributes: the results are stored per kernel
            # collect some data for quick access
            if self.kernel is None:
                m = ("Kernel is None. Default kernel 'rbf' is used instead.")
                self._logging.info(m)
                warnings.warn(m)
                self.estimator_ = store.rbf.best_estimator_
                self.params_ = store.rbf.best_params_
            else:
                kernel_store = getattr(store, self.kernel)
                self.estimator_ = kernel_store.best_estimator_
                self.params_ = kernel_store.best_params_

        return self

    def predict(self, X: NDArray):
        """ Predict object from the pretrained model

        Parameters
        ----------
        X: Ndarray of shape ( M x N), :math:`M=m-samples x N=n-features`
            training set; Denotes data that is observed at training and
            prediction time, used as independent variables in learning.
            The notation is uppercase to denote that it is ordinarily a
            matrix. When a matrix, each sample may be represented by a
            feature vector, or a vector of precomputed (dis)similarity with
            each training sample. :code:`X` may also not be a matrix, and
            may require a feature extractor or a pairwise metric to turn it
            into one before learning a model.

        Returns
        --------
        y_pred: Array-like, shape (M, )
            the predicted target values from `X`.

        Raises
        ------
        NotFittedError
            If `fit` has not been called yet.
        """
        # property access: raises NotFittedError when the model is not fitted
        self.inspect

        X = check_array(
            X,
            accept_sparse=True,
            estimator=self.name_,
            input_name="X",
            to_frame=True,
        )
        return self.estimator_.predict(X)

    @property
    def inspect(self):
        """ Inspect object whether is fitted or not"""
        msg = ("{obj.__class__.__name__} instance is not fitted yet."
               " Call 'fit' with appropriate arguments before using"
               " this method"
               )
        # Look in the instance dictionary rather than using `hasattr`:
        # `hasattr` would go through `__getattr__`, which itself raises
        # NotFittedError for names ending with '_', so the message above
        # could never be reported.
        if 'estimator_' not in self.__dict__:
            raise NotFittedError(msg.format(obj=self))
        return 1

    def __repr__(self):
        """ Pretty format for programmer guidance following the API... """
        return repr_callable_obj(self)

    def __getattr__(self, name):
        """ Report a missing attribute with a helpful message.

        Only called when normal attribute lookup fails. Names ending with a
        single underscore are treated as fitted attributes and raise
        NotFittedError; any other name raises AttributeError, with a
        suggestion when `smart_strobj_recognition` finds a candidate among
        the instance attributes.
        """
        if not name.endswith('__') and name.endswith('_'):
            raise NotFittedError(
                f"{self.__class__.__name__!r} instance is not fitted yet."
                " Call 'fit' method with appropriate arguments before"
                f" retreiving the attribute {name!r} value."
            )
        rv = smart_strobj_recognition(name, self.__dict__, deep=True)
        appender = "" if rv is None else f'. Do you mean {rv!r}'

        raise AttributeError(
            f'{self.__class__.__name__!r} object has no attribute {name!r}'
            f'{appender}{"" if rv is None else "?"}'
        )

class _objectview(object):
    """ View object of a superclass created from each subclass of dict
    elements.

    Is a container of dict element resulting from model instance element.
    Thus, each element can be retrieved as its own attribute. For instance::

        >>> from watex.models.premodels import p
        >>> p.SVM.poly.best_estimator_
        ... SVC(C=128.0, coef0=7, degree=5, gamma=0.00048828125,
                kernel='poly', tol=0.01)
        >>> p.XGB.best_estimator_
        ... XGBClassifier(base_score=None, booster='gbtree',
                          colsample_bylevel=None, colsample_bynode=None,
                          colsample_bytree=None, ...
                          tree_method=None, validate_parameters=None,
                          verbosity=None)
        >>> p.RandomForest.best_estimator_
        ... RandomForestClassifier(criterion='entropy', max_depth=16,
                                   n_estimators=350)
        >>> p.keys
        ... ('SVM', 'SVM_', 'LogisticRegression', 'KNeighbors',
             'DecisionTree', 'Voting', 'RandomForest', 'RandomForest_',
             'ExtraTrees', 'ExtraTrees_', 'Bagging', 'AdaBoost', 'XGB',
             'Stacking' )

    Parameters
    ----------
    kwds: dict
        Mapping whose items become the attributes of the instance.
    """
    def __init__(self, kwds):
        for key, value in kwds.items():
            setattr(self, key, value)
        # snapshot of the attribute names, taken before 'keys' itself is set
        setattr(self, 'keys', tuple(self.__dict__))


p = _objectview(_pMODELS)
p.__doc__ = """\
p Object is a superclass that contains all the pretrained models.

Each pretrained model composes its own class object with dict element
as attributes. Each pretrained model can be fetched as an attribute. For
instance::

    >>> from watex.models.premodels import p
    >>> # get the pretrained models using the key attributes
    >>> p.keys
    ... ('SVM', 'SVM_', 'LogisticRegression', 'KNeighbors', 'DecisionTree',
         'Voting', 'RandomForest', 'RandomForest_', 'ExtraTrees',
         'ExtraTrees_', 'Bagging', 'AdaBoost', 'XGB', 'Stacking' )
    >>> # fetch the pretrained LogisticRegression best parameters
    >>> p.LogisticRegression.best_params_
    ... {'penalty': 'l2', 'dual': False, 'tol': 0.0001, 'C': 1.0,
         'fit_intercept': True, 'intercept_scaling': 1,
         'class_weight': None, 'random_state': None, 'solver': 'lbfgs',
         'max_iter': 100, 'multi_class': 'auto', 'verbose': 0,
         'warm_start': False, 'n_jobs': None, 'l1_ratio': None }
    >>> # fetch the pretrained RandomForest with out-of-bag equal to True
    >>> p.RandomForest_.best_estimator_
    ... RandomForestClassifier(max_depth=15, oob_score=True)

Note
------
To fetch the pretrained model with parameter (out-of-bag ), need to use
the '_' at the end of the model name like 'ExtraTrees_'. However the
pretrained model of Support Vector Machines with underscore means the
fine tuned multiclassification targets not 'out-of-bag' parameters.

"""
#-- Fetch the pretrained model data
# XXX pickling models should be removed next release
# NOTE(review): 'p.models.pkl' is presumably unpickled by `fetchModel`;
# only the file packaged with the library should ever be loaded here.
#
# The resource lookup sits inside the guarded region too, so that a missing
# or unreadable packaged file leaves `_pDATA` set to None instead of
# breaking the import of this module.
try:
    with resources.path('watex.etc', 'p.models.pkl') as f:
        data_file = str(f)
    _pDATA, = fetchModel(data_file, default=False)
except Exception:
    # set to None if something goes wrong
    _pDATA = None