Source code for watex.cases.modeling

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>

from __future__ import ( 
    print_function,
    division 
)

import warnings 
import numpy as np 
import pandas as pd 

from .._typing import ( 
    T, 
    Generic,
    Iterable , 
    Dict ,
    Callable,
    Optional,
    Union 
  )
from ..exlib.sklearn  import ( 
    make_pipeline, 
    validation_curve, 
    RandomizedSearchCV, 
    GridSearchCV, 
    learning_curve, 
    permutation_importance, 
    confusion_matrix
    )
from .processing import Processing 
from ..utils.mlutils import ( 
    formatModelScore 
    )
from ..decorators import ( 
    pfi, 
    visualize_valearn_curve, 
    predplot, 
    )
from .._watexlog import watexlog 
from ..exceptions import ( 
    EstimatorError, 
    ArgumentError
    )

# import  watex.exceptions as Wex 
# import  watex.decorators as deco
_logger =watexlog().get_watex_logger(__name__)

__all__=["BaseModel"] 

[docs] class BaseModel: """ Base model class. The most interesting and challenging part of modeling is the `tuning hyperparameters` after designing a composite estimator. Getting the best params is a better way to reorginize the created pipeline `{transformers +estimators}` so to have a great capability of data generalization. Arguments ---------- *dataf_fn*: str Path to analysis data file. *df*: pd.Core.DataFrame Dataframe of features for analysis . Must be contains of main parameters including the target name of pd.Core.series of columns of `df`. Holds on others optionals infos in ``kwargs`` arguments: ================= ============ ======================================= Attributes Type Description ================= ============ ======================================= auto bool Trigger the composite estimator. If ``True`` a SVC-composite estimator `preprocessor` is given. *default* is False. pipelines dict Collect your own pipeline for model preprocessor trigging. it should be find automatically. estimators Callable A given estimator. If ``None``, `SVM` is auto-selected as default estimator. model_score float/dict Model test score. Observe your test model score using your compose estimator for enhancement or your own pipelines. model_prediction array_like Observe your test model prediction for as well as the compose estimator enhancement. processor Callable Compose piplenes and estimators for default model scorage. ================= ============ ======================================= Examples -------- >>> from watex.bases.modeling import BaseModel >>> from sklearn.preprocessing import RobustScaler, PolynomialFeatures >>> from sklearn.feature_selection import SelectKBest, f_classif >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.compose import make_column_selector >>> estimator2= RandomForestClassifier() >>> modelObj = BaseModel( ... data_fn ='data/geo_fdata/BagoueDataset2.xlsx', ... pipelines = { ... 'num_column_selector_': make_column_selector( ... dtype_include=np.number), ... 'cat_column_selector_': make_column_selector( ... dtype_exclude=np.number), ... 'features_engineering_':PolynomialFeatures( ... 2, include_bias=False), ... 'selectors_': SelectKBest(f_classif, k=2), ... 'encodages_': RobustScaler() ... }, ... estimator = RandomForestClassifier() ... ) """ def __init__(self, data_fn =None, df=None , **kwargs)->None: self._logging = watexlog().get_watex_logger(self.__class__.__name__) self._data_fn = data_fn self._df =df self.pipelines = kwargs.pop('pipelines', None) self.estimator =kwargs.pop('estimator', None) self.auto= kwargs.pop('auto', False) self.random_state =kwargs.pop('random_state', 7) self.savefig = kwargs.pop('savefig', None) self.Processing = Processing() self.lc_kws = kwargs.pop('lc_kws', { 'train_sizes':np.linspace(0.2, 1, 10), 'cv':4, 'scoring':'accuracy'}) self.vc_kws = kwargs.pop('vc_kws', {'param_name':'C', 'param_range':np.arange(1, 200, 10), 'cv':4}) self.figsize =kwargs.pop('fig_size', (12, 8)) self.fimp_kws=kwargs.pop('fimp_kws', {"width": 0.3, "color":'navy', "edgecolor" : 'blue', "linewidth" : 2, "ecolor" : 'magenta', "capsize" :5, 'figsize':self.figsize}) self.X_train = None self.y_train = None self.X_test = None self.y_test = None self.X= None self.train_score =None self.val_score =None self._processor =None self._composite_model=None self._model_score =None self.best_params_ =None self.best_score_= None self._model_pred =None self.y_pred= None self.confusion_matrix=None for key in list(kwargs.keys()): setattr(self, key, kwargs[key]) if (self._data_fn or self._df) is not None: self._read_modelingObj() @property def model_(self): """ Get a set of `processor` and `eestimator` composed of the composite model """ if (self.processor and self.estimator) is not None : self._composite_model = make_pipeline(self.processor, self.estimator) return self._composite_model @model_.setter def model_(self, pipeline_and_estimator): """ Set a composite estimator usinng a tuple of pipeline plus estimator""" if len(pipeline_and_estimator) <=1 : warnings.warn( 'A Composite model creation need ' 'at least the `pipeline` and `estimator`.') self._logging.debug( 'Need at least a `pipeline` and `estimators`') if self.processor is None : self.processor, self.estimator = pipeline_and_estimator # self._composite_model = make_pipeline(self.processor, # self.estimator) @property def processor (self): """ Get te `processor` after supplying the `pipelines` """ return self._processor @processor.setter def processor (self, pipeline): """ Build your processor with your pipelines. If pipeline is not given, the default preprocessor will be considered instead.""" import sklearn if pipeline is None : if self.Processing.preprocessor is None : if (self._data_fn or self._df) is not None : self.Processing=Processing(data_fn = self._data_fn, df=self._df ,auto=True, random_state=self.random_state) self._processor = self.Processing.preprocessor elif pipeline is not None : if isinstance(pipeline, sklearn.compose._column_transformer.ColumnTransformer): self._processor = pipeline else : self.Processing.preprocessor = pipeline self._processor = self.Processing.preprocessor if self.estimator is None : self.estimator = self.Processing._select_estimator_ @property def model_score(self): """ Estimate your composite model prediction """ if self.model_ is not None: self.model_.fit(self.X_train, self.y_train) self._model_score = self.model_.score(self.X_test, self.y_test) try : formatModelScore(self._model_score, self.Processing._estimator_name) except: self._logging.debug( f'Error finding the {self.Processing._estimator_name}') warnings.warn( f'Error finding the {self.Processing._estimator_name}') return self._model_score def _read_modelingObj (self, data_fn:Optional[T]=None, df=None, pipelines: Generic[T]=None, estimator:Callable[...,T] =None)->None: """ Modeling object implicity inherits from ``Processing`` usefull attributes. Read the `Processing` class and from that super class populate the usefull attributes. :param data_fn: Full path to features data files. Refer to `../data` directory to have a look how data are look like. To get this list of features. Call `Features` class to automatic generate this datafile. :param df: `pd.core.frame.DataFrame` """ self._logging.info('Reading and populating modeling <%s> object' ' attributes.'%self.__class__.__name__) if data_fn is not None : self._data_fn = data_fn if df is not None: self._df = df if pipelines is not None : self.pipelines =pipelines if estimator is not None : self.estimator = estimator if self._data_fn is not None or self._df is not None : self.Processing= Processing(data_fn = self._data_fn, df =self._df, estimator = self.estimator, pipelines = self.pipelines, auto= self.auto, random_state=self.random_state) self.X_train = self.Processing.X_train self.y_test = self.Processing.y_test self.X_test = self.Processing.X_test self.y_train = self.Processing.y_train self.X= self.Processing.X if self.estimator is not None: try: self.estimator.__class__.__name__ except : self.Processing.estimator = self.estimator self.estimator = self.Processing._select_estimator_ if self.pipelines is not None : self.Processing.preprocessor= self.pipelines # self.pipelines = self.Processing.preprocessor if self.auto: self.processor = self.pipelines
[docs] @visualize_valearn_curve(reason ='learn_curve',turn='off', plot_style='line', train_kws={'color':'blue', 'linewidth':2, 'marker':'o','linestyle':'dashed', 'label':'Training set'}, val_kws ={'color':'r', 'linewidth':3,'marker':'H', 'linestyle':'-', 'label':'Validation set'}, xlabel={'xlabel':'Training set '}, ylabel={'ylabel':'performance on the validation set '}) def get_learning_curve (self, estimator:Callable[..., T]=None, X_train=None, y_train=None, learning_curve_kws:Generic[T]=None, **kws )-> Iterable[T]: """ Compute the train score and validation curve to visualize your learning curve. :param estimator: The creating model. If ``None`` :param X_train: pd.core.frame.DataFrame of selected trainset :param x_test: pd.DataFrame of selected Data for testset :param y_train: array_like of selected data for evaluation set. :param y_test: array_like of selected data for model test :param val_kws: `validation_curve` keywords arguments. if none the *default* should be:: val_curve_kws = {"param_name":'C', "param_range": np.arange(1,210,10), "cv":4} :returns: - `train_score`: float|dict of trainset score. - `val_score` : float/dict of valisation score. - `switch`: Turn ``on`` or ``off`` the learning curve of validation curve. -`trigDec`: Trigger the decorator. - `N`: number of param range for plotting. :Example: >>> from watex.bases.modeling import BaseModel >>> processObj = BaseModel( data_fn = 'data/geo_fdata/BagoueDataset2.xlsx') >>> processObj.get_learning_curve ( switch_plot='on', preprocessor=True) """ def compute_validation_curve(model, X_train, y_train, **param_kws): """ Compute learning curve and plot errors with training set size""" train_score , val_score = validation_curve(model, X_train, y_train, **param_kws ) return train_score , train_score valPlot =kws.pop('val_plot', False) learning_curve_kws = kws.pop('lc_kws', None) trigDec = kws.pop('switch_plot', 'off') trig_preprocessor = kws.pop('preprocessor', False) val_kws = kws.pop('val_kws', None) train_kws = kws.pop('train_kws', None) if learning_curve_kws is not None: self.lc_kws =learning_curve_kws if val_kws is not None : self.vc_kws = val_kws if estimator is not None : self.estimator = estimator elif estimator is None : if trig_preprocessor: if not self.Processing._auto: self.Processing._auto=True self.Processing.auto = self.Processing._auto self.estimator = self.Processing.estimator else : self._logging.info( 'Estimator is not provide! Trigger the `preprocessor`' ' by setting to ``True`` to visualize the default pipelines ' ' implementations.') warnings.warn( 'Estimator is not given! Set `preprocessor` to ``True``' ' to get the default estimator curve.') raise EstimatorError( 'Estimator not found! Please provide your estimator model ' ' or trigger the default composite estimator by enabling ' '`preprocessor` to ``True``.') if X_train is not None : self.X_train = X_train if y_train is not None: self.y_train = y_train if valPlot: N = self.vc_kws['param_range'] self.train_score, self.val_score = compute_validation_curve( model= self.estimator, X_train=self.X_train, y_train= self.y_train,**self.vc_kws) try : pname = self.vc_kws['param_name'] except : pname ='' else : N, self.train_score, self.val_score = learning_curve( self.estimator, X=self.X_train, y=self.y_train, **self.lc_kws) pname ='' return (N, self.train_score, self.val_score , trigDec, pname, val_kws, train_kws)
[docs] def tuning_hyperparameters (self, estimator: Callable[...,T]=None, hyper_params:Generic[T]=None, cv:T=4, grid_kws:Generic[T]=None, **kws): """ Tuning hyperparametres from existing estimator to evaluate performance. Boosting the model using the model `best_param` :param estimator: Callable estimator or model to boost :param hyper_params: dict of hyperparameters of the `estimator` :param cv: Cross validation cutting off. the *default* is 4 :param grid_kws:dict of other gridSearch parameters :Example: >>> from watex.modeling.basics import SLModeling >>> from sklearn.preprocessing import RobustScaler,PolynomialFeatures >>> from sklearn.feature_selection import SelectKBest, f_classif >>> from sklearn.svm import SVC >>> from sklearn.compose import make_column_selector >>> my_own_pipelines= { 'num_column_selector_': make_column_selector( dtype_include=np.number), 'cat_column_selector_': make_column_selector( dtype_exclude=np.number), 'features_engineering_':PolynomialFeatures( 3, include_bias=False), 'selectors_': SelectKBest(f_classif, k=3), 'encodages_': RobustScaler() } >>> my_estimator = SVC(C=1, gamma=1e-4, random_state=7) >>> modelObj = SLModeling(data_fn ='data/geo_fdata/BagoueDataset2.xlsx', pipelines =my_own_pipelines , estimator = my_estimator) >>> hyperparams ={ 'columntransformer__pipeline-1__polynomialfeatures__degree': np.arange(2,10), 'columntransformer__pipeline-1__selectkbest__k': np.arange(2,7), 'svc__C': [1, 10, 100], 'svc__gamma':[1e-1, 1e-2, 1e-3]} >>> my_compose_estimator_ = modelObj.model_ >>> modelObj.tuning_hyperparameters( estimator= my_compose_estimator_ , hyper_params= hyperparams, search='rand') >>> modelObj.best_params_ >>> modelObj.best_score_ """ with_gridS= kws.pop('Search','GridSearchCV' ) X_train =kws.pop('X_train', None) y_train =kws.pop('y_train', None) if grid_kws is None : grid_kws={} if X_train is not None : self.X_train = X_train if y_train is not None: self.y_train = y_train if with_gridS is None : self._logging.debug( ' `Search` is set to ``None``. ``GgridSearchCV`` is used as' 'as default tuning hyperparameters.') warnings.warn( ' `Search` is set to ``None``. ``GgridSearchCV`` is used as' 'as default tuning hyperparameters.') with_gridS = 'gridsearchcv' if 'grid' in with_gridS.lower(): with_gridS = 'gridsearchcv' elif 'rand' in with_gridS.lower() : with_gridS = 'randomizedsearchcv' if with_gridS == 'gridsearchcv': model_grid = GridSearchCV(estimator, hyper_params, cv =cv, **grid_kws ) elif with_gridS == 'randomizedsearchcv': model_grid = RandomizedSearchCV(estimator, hyper_params, cv=cv, **grid_kws) model_grid.fit(self.X_train, self.y_train) self._model_pred = model_grid.predict(self.X_test) self.best_score_= model_grid.best_score_ self.best_params_= model_grid.best_params_ return self.best_score_ , self.best_params_
[docs] @predplot(turn='off', fig_size =(10, 5), ObsLine =('on','ypred')) def get_model_prediction(self, estimator:Callable[..., T]=None, X_test:Optional[T]=None , y_test:Optional[T]=None, **kws) -> Iterable[T]: """ Get the model prediction and quick plot using the surche decorator. The decorator holds many keyword arguments to customize plot. Refer to :class:`watex.utils.decorator.predPlot`. :param estimator: The creating model. If ``None`` :param x_test: pd.DataFrame of selected Data for testset :param y_test: array_like of selected data for model test :param kws: Additional keywords arguments which refer to the `data_fn` `df` and `pipelines` parameters. :param switch: Turn `on` or `off` the decorator. :Example: >>> from watex.modeling.sl import Modeling >>> modelObj = Modeling( data_fn ='data/geo_fdata/BagoueDataset2.xlsx', pipelines ={ 'num_column_selector_': make_column_selector( dtype_include=np.number), 'cat_column_selector_': make_column_selector( dtype_exclude=np.number), 'features_engineering_':PolynomialFeatures(2, include_bias=False), 'selectors_': SelectKBest(f_classif, k=2), 'encodages_': RobustScaler() }, estimator = SVC(C=1, gamma=0.1)) >>> modelObj.get_model_prediction(estimator =testim, switch ='on') """ data_fn: Optional[T]= kws.pop('data_fn', None) df:Optional[T] = kws.pop('df', None) pipelines:Callable[..., Generic[T]]=kws.pop('pipelines', None) switch:Union [bool, str]= kws.pop('switch', 'off') if estimator is not None : self.estimator =estimator if pipelines is not None: self.pipelines =pipelines if X_test is not None: self.X_test= X_test if y_test is not None: self.y_test =y_test if (self._data_fn and self._df) is None : if data_fn is not None : self._data_fn =data_fn if df is not None: self._df = df if (self._data_fn or self._df ) is not None: self._read_modelingObj(data_fn=self._data_fn, df=self._df , pipelines = self.pipelines, estimator= self.estimator) else: raise ArgumentError( "Could not find any data for reading!") self.y_pred= self.estimator.predict(self.X_test) self.confusion_matrix = confusion_matrix(self.y_test, self.y_pred) # create y_pred dataframe df_ypred = pd.DataFrame(self.y_pred, index=self.y_test.index) return self.y_test, df_ypred , switch
@property def feature_importances_(self): """ Get the bar plot of features importances. If the estimator has not `feature_importances_` attributes, it will raise an error.""" import matplotlib.pyplot as plt try : estim_name = self.estimator.__class__.__name__ except : warnings.warn( 'Error occurs when trying to find the estimator name.') else : self.estimator.fit(self.X_train, self.y_train) self.estimator.score(self.X_test, self.y_test) try : pd.DataFrame(self.estimator.feature_importances_ *100, index=self.X_train.columns).plot.bar(**self.fimp_kws) except AttributeError as e: print(e.args) plt.close() except: self._logging.info( f"{estim_name} object has no attribute " "feature_importances_'" ) warnings.warn('Could not plot the `feature_importances_`.' f' The `{estim_name}` estimator has no' ' attributes`feature_importances_') plt.close() else: plt.xlabel('Name of features') plt.ylabel('Importance of feature in %') plt.show()
[docs] @pfi(reason ='pfi', turn='off', fig_size= (10,3),savefig=None, barh_kws= {'color':'blue','edgecolor':'k', 'linewidth':2}, box_kws= {'vert':False}, dendro_kws={'leaf_rotation':90}, fig_title= 'PFI diagram') def permutation_feature_importance(self, estimator:Callable[..., T]=None, X_train:Optional[T] =None , y_train:Optional[T]=None, pfi_kws:Dict[str, T]=None, **kws): """ Evaluation of features importance with tree estimators before shuffle and after shuffling trees. Permutation feature importance is a model inspection technique that can be used for any fitted estimator when the data is tabular. This is especially useful for non-linear or opaque estimators. Refer to :ref:`this link <https://scikit-learn.org/stable/modules/permutation_importance.html>`_ for more details. :param estimator: The estimator to evaluate the importance of features. The default is ``RandomForestClassifier``. :param X_train: pd.core.frame.DataFrame of selected trainset. :param y_train: array_like of selected data for evaluation set. :param n_estimators: Number of estimator composed the tree. The *default* is 100 :param n_repeats: Number of tree shuffling. The *default* is 10. :param pfi_kws: `permution_importance` callable additional keywords arguments. :param pfi_stype: Type of plot. Can be : - ``pfi`` for permutation feature importance before and after shuffling trees -``dendro`` for dendrogram plot . The *default* is `pfi`. :param switch: Turn ``on`` or ``off`` the decorator. :Example: >>> from watex.bases.modeling import BaseModel >>> from sklearn.ensemble import AdaBoostClassifier >>> modelObj = BaseModel() >>> modelObj.permutation_feature_importance( ... estimator = AdaBoostClassifier(random_state=7), ... data_fn ='data/geo_fdata/BagoueDataset2.xlsx', ... switch ='on', pfi_style='pfi') """ savefig:Optional[T] =kws.pop('savefig', None) random_state:Optional[T] = kws.pop('random_state', None) n_estimators: int = kws.pop('n_estimators', 100) n_repeats: int = kws.pop('n_repeats', 10) X_train:Optional[T] =kws.pop('X_train', None) y_train:Optional[T] =kws.pop('y_train', None) X:Optional[T]=kws.pop('X', None) data_fn: Optional[T]= kws.pop('data_fn', None) df:Optional[T] = kws.pop('df', None) pfi_type= kws.pop('pfi_style','pfi') switch: Union[str, bool]= kws.pop('switch', 'off') n_jobs: Optional[T]=kws.pop('n_jobs', -1) if pfi_kws is None : pfi_kws={} if savefig is not None : self.savefig = savefig if X_train is not None : self.X_train = X_train if y_train is not None: self.y_train = y_train if X is not None : self.X = X if random_state is not None : self.random_state = random_state if estimator is None : from sklearn.ensemble import RandomForestClassifier self.estimator = RandomForestClassifier( n_estimators=n_estimators, random_state= self.random_state) # self.estimator = clf if (self._data_fn and self._df) is None : if data_fn is not None : self._data_fn =data_fn if df is not None: self._df = df if (self._data_fn or self._df ) is None: self._logging.error( 'No data found ! Could not read modeling object.') warnings.warn( 'No data found to read. Could not read the modeling object.') raise ArgumentError( "Could not find any data to read!") elif (self._data_fn or self._df ) is not None: self._read_modelingObj(data_fn=self._data_fn, df=self._df, estimator = self.estimator) if estimator is not None : self.estimator = estimator self.estimator.fit(self.X_train, self.y_train) try : print("Accuracy on test data:") formatModelScore(self.estimator.score( self.X_test, self.y_test), self.estimator.__class__.__name__) except AttributeError as e: print(e.args) except: pass result = permutation_importance(self.estimator, self.X_train, self.y_train, n_repeats=n_repeats, random_state=self.random_state, n_jobs=n_jobs, **pfi_kws) perm_sorted_idx = result.importances_mean.argsort() try: # check whether the estimator has attribute `feature_importances_` tree_importance_sorted_idx = np.argsort( self.estimator.feature_importances_) except Exception: raise AttributeError( f' `{self.estimator}` estimator nas no attribute' ' `feature_importances_`') else: tree_indices = np.arange( 0, len(self.estimator.feature_importances_)) + 0.5 return self.X, result, tree_indices,\ self.estimator, tree_importance_sorted_idx,\ self.X_train.columns, perm_sorted_idx, pfi_type, switch, savefig