Source code for watex.models.validation

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>
#   Created on Sat Sep 25 10:10:31 2022

from __future__ import annotations 

import inspect
import warnings  
from pprint import pprint 
import numpy as np 

from .._docstring import ( 
    DocstringComponents, 
    _core_docs 
    ) 
from .._watexlog import watexlog
from ..exlib.sklearn import (
     mean_squared_error,
     cross_val_score,
     GridSearchCV , 
     RandomizedSearchCV, 
     LogisticRegression, 
     Pipeline,
)
from .._typing import (
    List,
    Tuple,
    F, 
    ArrayLike, 
    NDArray, 
    Dict,
    Any, 
    DataFrame, 
    Series,
    
    )
from ..exceptions import ( 
    EstimatorError, 
    NotFittedError, 
    ) 
from ..utils.funcutils import ( 
    _assert_all_types, 
    get_params, 
    savejob, 
    listing_items_format, 
    pretty_printer, 

    )
from ..utils.box import Boxspace 
from ..utils.validator import ( 
    check_X_y, check_array, 
    check_consistent_length, 
    get_estimator_name
    )

_logger = watexlog().get_watex_logger(__name__)

__all__=[
    "BaseEvaluation", 
    "GridSearch", 
    "GridSearchMultiple", 
    "get_best_kPCA_params", 
    "get_scorers", 
    "getGlobalScores", 
    "getSplitBestScores", 
    "displayCVTables", 
    "displayFineTunedResults", 
    "displayModelMaxDetails", 
    "naive_evaluation"
    ]

_param_docs = DocstringComponents.from_nested_components(
    core=_core_docs["params"], 
    )

class GridSearch:
    # NOTE: the user-facing class docstring is attached right after the class
    # body through ``GridSearch.__doc__`` so that shared parameter
    # descriptions can be interpolated into it.

    # Fixed attribute layout; every attribute set in ``__init__`` or ``fit``
    # must be listed here.
    __slots__ = (
        '_base_estimator',
        'grid_params',
        'scoring',
        'cv',
        '_kind',
        'grid_kws',
        'best_params_',
        'cv_results_',
        'feature_importances_',
        'best_estimator_',
        'verbose',
    )

    def __init__(
        self,
        base_estimator: F,
        grid_params: Dict[str, Any],
        cv: int = 4,
        kind: str = 'GridSearchCV',
        scoring: str = 'nmse',
        verbose: int = 0,
        **grid_kws
    ):
        self._base_estimator = base_estimator
        self.grid_params = grid_params
        self.scoring = scoring
        self.cv = cv
        self.best_params_ = None
        self.cv_results_ = None
        self.feature_importances_ = None
        self.grid_kws = grid_kws
        # ``kind`` is stored raw here and normalized by the ``kind`` setter
        # at the beginning of ``fit``.
        self._kind = kind
        self.verbose = verbose

    @property
    def base_estimator(self):
        """ Return the base estimator class"""
        return self._base_estimator

    @base_estimator.setter
    def base_estimator(self, base_est):
        """ Set the base estimator; it must expose a ``fit`` method. """
        if not hasattr(base_est, 'fit'):
            raise EstimatorError(
                f"Wrong estimator {get_estimator_name(base_est)!r}. Each"
                " estimator must have a fit method. Refer to scikit-learn"
                " https://scikit-learn.org/stable/modules/classes.html API"
                " reference to build your own estimator.")
        self._base_estimator = base_est

    @property
    def kind(self):
        """ Kind of searched. `RandomizedSearchCV` or `GridSearchCV`."""
        return self._kind

    @kind.setter
    def kind(self, ksearch):
        """ `kind` attribute checker.

        Any case-insensitive substring of ``'gridsearchcv1'`` selects
        ``'GridSearchCV'`` (tested first) and any substring of
        ``'randomizedsearchcv2'`` selects ``'RandomizedSearchCV'``, which is
        what makes the shortcuts ``1`` and ``2`` work.

        Raises
        ------
        ValueError
            If `ksearch` matches neither name.
        """
        key = str(ksearch).lower()
        if 'gridsearchcv1'.find(key) >= 0:
            ksearch = 'GridSearchCV'
        elif 'randomizedsearchcv2'.find(key) >= 0:
            ksearch = 'RandomizedSearchCV'
        else:
            # f-string so the offending value is actually reported.
            raise ValueError(
                f" Unkown the kind of parameter search {ksearch!r}."
                " Supports only 'GridSearchCV' and 'RandomizedSearchCV'.")
        self._kind = ksearch

    def fit(self, X, y):
        """ Fit method using base Estimator and populate gridSearch
        attributes.

        Parameters
        ----------
        X: Ndarray ( M x N) matrix where ``M=m-samples``, & ``N=n-features``)
            Training set; Denotes data that is observed at training and
            prediction time, used as independent variables in learning.
            When a matrix, each sample may be represented by a feature
            vector, or a vector of precomputed (dis)similarity with each
            training sample. :code:`X` may also not be a matrix, and may
            require a feature extractor or a pairwise metric to turn it
            into one before learning a model.
        y: array-like, shape (M, ) ``M=m-samples``,
            train target; Denotes data that may be observed at training
            time as the dependent variable in learning, but which is
            unavailable at prediction time, and is usually the target of
            prediction.

        Returns
        ----------
        ``self``: `GridSearch`
            Returns :class:`~.GridSearch` with ``best_params_``,
            ``best_estimator_``, ``cv_results_`` and
            ``feature_importances_`` populated (``None`` when the fitted
            search object does not expose them).
        """
        if callable(self.base_estimator):
            # An estimator class (or factory) was supplied: build it with
            # its default arguments.
            self.base_estimator = self.base_estimator()
            parameters = get_params(self.base_estimator.__init__)

            if self.verbose > 0:
                msg = ("Estimator {!r} is cloned with default arguments{!r}"
                       " for cross validation search.".format(
                           get_estimator_name(self.base_estimator),
                           parameters)
                       )
                warnings.warn(msg)

        # Run the raw value through the setter to normalize/validate it.
        self.kind = self._kind
        search_cls = {
            'GridSearchCV': GridSearchCV,
            'RandomizedSearchCV': RandomizedSearchCV,
        }[self.kind]

        if self.scoring in ('nmse', None):
            self.scoring = 'neg_mean_squared_error'
        # assert scoring values
        get_scorers(scorer=self.scoring, check_scorer=True, error='raise')

        search = search_cls(
            self.base_estimator,
            self.grid_params,
            scoring=self.scoring,
            cv=self.cv,
            **self.grid_kws
        )
        search.fit(X, y)

        for attr in ('best_params_', 'best_estimator_', 'cv_results_'):
            setattr(self, attr, getattr(search, attr, None))

        # set feature_importances if exists
        self.feature_importances_ = getattr(
            getattr(search, 'best_estimator_', None),
            'feature_importances_', None)

        return self
# Class docstring of ``GridSearch``; built with ``str.format`` so the shared
# parameter descriptions from ``_param_docs`` can be interpolated.
GridSearch.__doc__ = """\
Fine-tune hyperparameters using grid search methods.

Search Grid will be able to fiddle with the hyperparameters until it finds
the best combination for the model predictions.

Parameters
------------
base_estimator: Callable,
    estimator for trainset and label evaluating; something like a
    class that implements a fit method. Refer to
    https://scikit-learn.org/stable/modules/classes.html

grid_params: list of dict,
    list of hyperparameters params to be fine-tuned. For instance::

        param_grid=[dict(
            kpca__gamma=np.linspace(0.03, 0.05, 10),
            kpca__kernel=["rbf", "sigmoid"]
            )]

{params.core.cv}
    The default is ``4``.

kind: str, default='GridSearchCV' or '1'
    Kind of grid parameter searches. Can be ``1`` for ``GridSearchCV`` or
    ``2`` for ``RandomizedSearchCV``.

{params.core.scoring}

verbose: int, default=0
    If greater than ``0``, a warning is emitted when `base_estimator` is
    given as a callable and instantiated with its default arguments.

grid_kws: dict,
    Additional keyword arguments passed to the search class selected by
    `kind`.

Examples
-----------
>>> from pprint import pprint
>>> from watex.datasets import fetch_data
>>> from watex.models.validation import GridSearch
>>> from watex.exlib.sklearn import RandomForestClassifier
>>> X_prepared, y_prepared =fetch_data ('bagoue prepared')
>>> grid_params = [ dict(
...        n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
...        dict(bootstrap=[False], n_estimators=[3, 10],
...             max_features=[2, 3, 4])
...        ]
>>> forest_clf = RandomForestClassifier()
>>> grid_search = GridSearch(forest_clf, grid_params)
>>> grid_search.fit(X= X_prepared,y = y_prepared,)
>>> pprint(grid_search.best_params_ )
{{'max_features': 8, 'n_estimators': 30}}
>>> pprint(grid_search.cv_results_)
""".format(
    params=_param_docs,
)
class GridSearchMultiple:
    # NOTE: the user-facing class docstring is attached right after the class
    # body through ``GridSearchMultiple.__doc__``.

    def __init__(
        self,
        estimators: F,
        scoring: str,
        grid_params: Dict[str, Any],
        *,
        kind: str = 'GridSearchCV',
        cv: int = 7,
        random_state: int = 42,
        savejob: bool = False,
        filename: str = None,
        verbose: int = 0,
        **grid_kws,
    ):
        self.estimators = estimators
        self.scoring = scoring
        self.grid_params = grid_params
        self.kind = kind
        self.cv = cv
        # Kept for introspection; it is not forwarded to the searches.
        self.random_state = random_state
        self.savejob = savejob
        self.filename = filename
        self.verbose = verbose
        self.grid_kws = grid_kws

    def fit(self, X: NDArray, y: ArrayLike):
        """ Fit methods, evaluate each estimator and store models results.

        Each estimator is fine-tuned with :class:`GridSearch` using the
        grid found at the same position in ``grid_params``; the resulting
        best model is then cross-validated with :func:`naive_evaluation`.

        Parameters
        -----------
        X: Ndarray of shape (M, N)
            Training set, passed unchanged to every search.
        y: array-like of shape (M, )
            Training target.

        Returns
        --------
        self: `GridSearchMultiple`
            The instance itself with ``best_estimators_``, ``data_`` and
            ``models`` populated.

        Raises
        ------
        ValueError
            If ``estimators`` and ``grid_params`` have different lengths.
        """
        err_msg = (" Each estimator must have its corresponding grid params,"
                   " i.e estimators and grid params must have the same length."
                   " Please provide the appropriate arguments.")
        try:
            check_consistent_length(self.estimators, self.grid_params)
        except ValueError as err:
            raise ValueError(str(err) + f". {err_msg}") from err

        self.best_estimators_ = []
        self.data_ = {}
        models_ = {}
        msg = ''

        # Default output name: estimator names joined with a double
        # underscore.
        self.filename = self.filename or '__'.join(
            [get_estimator_name(b) for b in self.estimators])

        for estm, estm_grid in zip(self.estimators, self.grid_params):
            estm_name = get_estimator_name(estm)
            msg = f'{estm_name} is evaluated with {self.kind}.'
            searchObj = GridSearch(
                base_estimator=estm,
                grid_params=estm_grid,
                cv=self.cv,
                kind=self.kind,
                scoring=self.scoring,
                **self.grid_kws
            )
            searchObj.fit(X, y)
            best_model_clf = searchObj.best_estimator_

            if self.verbose > 7:
                msg += (
                    f"\nEnd {self.kind} search. Set estimator {estm_name!r}"
                    " best parameters, cv_results and other importances"
                    " attributes\n'"
                )
            self.data_[estm_name] = {
                'best_model_': searchObj.best_estimator_,
                'best_params_': searchObj.best_params_,
                'cv_results_': searchObj.cv_results_,
                'grid_params': estm_grid,
                'scoring': self.scoring,
                "grid_kws": self.grid_kws
            }
            models_[estm_name] = searchObj

            msg += (f"Cross-evaluatation the {estm_name} best model."
                    f" with KFold ={self.cv}"
                    )
            bestim_best_scores, _ = naive_evaluation(
                best_model_clf,
                X,
                y,
                cv=self.cv,
                scoring=self.scoring,
                display='on' if self.verbose > 7 else 'off'
            )
            # store the best scores
            self.data_[estm_name]['best_scores'] = bestim_best_scores

            self.best_estimators_.append(
                (estm,
                 searchObj.best_estimator_,
                 searchObj.best_params_,
                 bestim_best_scores)
            )

        # save models into a Box: one attribute per estimator name plus the
        # ``keys_`` / ``values_`` / ``models_`` convenience views.
        d = {
            **models_,
            **dict(
                keys_=list(models_.keys()),
                values_=list(models_.values()),
                models_=models_,
            )
        }
        self.models = Boxspace(**d)

        if self.verbose:
            msg += ('\nPretty print estimators results using'
                    f' scoring ={self.scoring!r}')
            pretty_printer(clfs=self.best_estimators_, scoring=self.scoring,
                           clf_scores=None)

        if self.savejob:
            msg += ('\nSerialize the dict of fine-tuned '
                    f'parameters to `{self.filename}`.')
            savejob(job=self.data_, savefile=self.filename)
            _logger.info(f'Dumping models `{self.filename}`!')

        if self.verbose:
            # Print the progress message once, then the reload hint if the
            # job was saved.
            pprint(msg)
            if self.savejob:
                bg = ("Job is successfully saved. Try to fetch your job from "
                      f"{self.filename!r} using")
                lst = ["{}.load('{}') or ".format('joblib', self.filename),
                       "{}.load('{}')".format('pickle', self.filename)]
                listing_items_format(lst, bg)

        return self
# Class docstring of ``GridSearchMultiple``; built with ``str.format`` so the
# shared parameter descriptions from ``_param_docs`` can be interpolated.
GridSearchMultiple.__doc__ = """\
Search and find multiples best parameters from differents estimators.

Parameters
----------
estimators: list of callable obj
    list of estimator objects to fine-tune their hyperparameters
    For instance::

        random_state=42
        # build estimators
        logreg_clf = LogisticRegression(random_state =random_state)
        linear_svc_clf = LinearSVC(random_state =random_state)
        sgd_clf = SGDClassifier(random_state = random_state)
        svc_clf = SVC(random_state =random_state)

        estimators =(svc_clf,linear_svc_clf, logreg_clf, sgd_clf )

grid_params: list
    list of parameters Grids, one entry per estimator and in the same
    order as `estimators`. For instance::

        grid_params= ([
            dict(C=[1e-2, 1e-1, 1, 10, 100], gamma=[5, 2, 1, 1e-1, 1e-2, 1e-3],
                 kernel=['rbf']),
            dict(kernel=['poly'],degree=[1, 3,5, 7], coef0=[1, 2, 3],
                 C=[1e-2, 1e-1, 1, 10, 100])],
            [dict(C=[1e-2, 1e-1, 1, 10, 100], loss=['hinge'])],
            [dict()],
            [dict()]
            )

{params.core.cv}

{params.core.scoring}

kind: str, default='GridSearchCV' or '1'
    Kind of grid parameter searches. Can be ``1`` for ``GridSearchCV`` or
    ``2`` for ``RandomizedSearchCV``.

{params.core.random_state}

savejob: bool, default=False
    Save your model parameters to external file using 'joblib' or Python
    persistent 'pickle' module. Default sorted to 'joblib' format.

filename: str, optional
    Name of the file used when `savejob` is ``True``. If not given, the
    estimator names joined with ``'__'`` are used.

{params.core.verbose}

grid_kws: dict,
    Argument passed to `grid_method` additional keywords.

Examples
--------
>>> from watex.datasets import fetch_data
>>> from watex.models import GridSearchMultiple , displayFineTunedResults
>>> from watex.exlib import LinearSVC, SGDClassifier, SVC, LogisticRegression
>>> X, y = fetch_data ('bagoue prepared')
>>> X
... <344x18 sparse matrix of type '<class 'numpy.float64'>'
...    with 2752 stored elements in Compressed Sparse Row format>
>>> # As example, we can build 04 estimators and provide their
>>> # grid parameters range for fine-tuning as ::
>>> random_state=42
>>> logreg_clf = LogisticRegression(random_state =random_state)
>>> linear_svc_clf = LinearSVC(random_state =random_state)
>>> sgd_clf = SGDClassifier(random_state = random_state)
>>> svc_clf = SVC(random_state =random_state)
>>> estimators =(svc_clf,linear_svc_clf, logreg_clf, sgd_clf )
>>> grid_params= ([dict(C=[1e-2, 1e-1, 1, 10, 100],
...                     gamma=[5, 2, 1, 1e-1, 1e-2, 1e-3],kernel=['rbf']),
...                dict(kernel=['poly'],degree=[1, 3,5, 7], coef0=[1, 2, 3],
...                     C= [1e-2, 1e-1, 1, 10, 100])],
...               [dict(C=[1e-2, 1e-1, 1, 10, 100], loss=['hinge'])],
...               [dict()], # we just no provided parameter for demo
...               [dict()]
...               )
>>> #Now we can call :class:`watex.models.GridSearchMultiple` for
>>> # training and self-validating as:
>>> gobj = GridSearchMultiple(estimators = estimators,
...                           grid_params = grid_params ,
...                           cv =4,
...                           scoring ='accuracy',
...                           verbose =1,   #> 7 put more verbose
...                           savejob=False ,  # set true to save job in binary disk file.
...                           kind='GridSearchCV').fit(X, y)
>>> # Once the parameters are fined tuned, we can display the fined tuning
>>> # results using displayFineTunedResults`` function
>>> displayFineTunedResults (gobj.models.values_)
MODEL NAME = SVC
BEST PARAM = {{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}}
BEST ESTIMATOR = SVC(C=100, gamma=0.01, random_state=42)

MODEL NAME = LinearSVC
BEST PARAM = {{'C': 100, 'loss': 'hinge'}}
BEST ESTIMATOR = LinearSVC(C=100, loss='hinge', random_state=42)

MODEL NAME = LogisticRegression
BEST PARAM = {{}}
BEST ESTIMATOR = LogisticRegression(random_state=42)

MODEL NAME = SGDClassifier
BEST PARAM = {{}}
BEST ESTIMATOR = SGDClassifier(random_state=42)

Notes
--------
Call :func:`~.get_scorers` to get all the metrics used to evaluate model
errors (``sklearn.metrics.SCORERS.keys()`` or
``sklearn.metrics.get_scorer_names()`` depending on the installed
scikit-learn). `scoring` can be any of these metrics.
Furthermore if `scoring` is set to ``None`` or ``'nmse'``,
``'neg_mean_squared_error'`` is used.

""".format(
    params=_param_docs,
)
class BaseEvaluation:
    # NOTE: the user-facing class docstring is attached right after the class
    # body through ``BaseEvaluation.__doc__``.

    def __init__(
        self,
        estimator: F,
        cv: int = 4,
        pipeline: List[F] = None,
        prefit: bool = False,
        scoring: str = 'nmse',
        random_state: int = 42,
        verbose: int = 0,
    ):
        self._logging = watexlog().get_watex_logger(self.__class__.__name__)

        self.estimator = estimator
        self.cv = cv
        self.pipeline = pipeline
        self.prefit = prefit
        self.scoring = scoring
        self.random_state = random_state
        self.verbose = verbose

    def _check_callable_estimator(self, base_est):
        """ Check wether the estimator is callable or not.

        If callable use the default parameter for initialization.

        Raises
        ------
        EstimatorError
            If `base_est` has no ``fit`` attribute.
        """
        if not hasattr(base_est, 'fit'):
            raise EstimatorError(
                f"Wrong estimator {get_estimator_name(base_est)!r}. Each"
                " estimator must have a fit method. Refer to scikit-learn"
                " https://scikit-learn.org/stable/modules/classes.html API"
                " reference to build your own estimator.")

        self.estimator = base_est
        if callable(base_est):
            self.estimator = base_est()  # use default initialization

        return self.estimator

    def fit(self, X, y, sample_weight=.75):
        """ Quick methods used to evaluate eastimator, display the
        error results as well as the sample model_predictions.

        Parameters
        -----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Training set; Denotes data that is observed at training and
            prediction time, used as independent variables in learning.
            When a matrix, each sample may be represented by a feature
            vector, or a vector of precomputed (dis)similarity with each
            training sample. :code:`X` may also not be a matrix, and may
            require a feature extractor or a pairwise metric to turn it
            into one before learning a model.
        y: array-like, shape (M, ) ``M=m-samples``,
            train target; Denotes data that may be observed at training
            time as the dependent variable in learning, but which is
            unavailable at prediction time, and is usually the target of
            prediction.
        sample_weight: float, default = .75
            The ratio of `X` and `y` kept for the evaluation; the first
            ``int(sample_weight * len(X))`` rows are used. Must be in the
            range ``(0, 1]``. If ``None``, half of the data is used.

        Returns
        ---------
        `self` : :class:`~.BaseEvaluation`
            :class:`~.BaseEvaluation` object. Unless ``prefit`` is true,
            ``mse_``, ``rmse_`` and ``cv_scores_`` are set.

        Raises
        ------
        ValueError
            If `sample_weight` is outside ``(0, 1]``.
        """
        # pass when pipeline is supplied.
        # we expect data be transform into numeric dtype
        dtype = object if self.pipeline is not None else "numeric"
        X, y = check_X_y(
            X, y,
            to_frame=True,
            dtype=dtype,
            estimator=get_estimator_name(self.estimator),
        )

        self.estimator = self._check_callable_estimator(self.estimator)

        self._logging.info(
            'Quick estimation using the %r estimator with config %r arguments %s.'
            % (repr(self.estimator), self.__class__.__name__,
               inspect.getfullargspec(self.__init__)))

        if sample_weight is None:
            # documented fallback: sample half of the data
            sample_weight = .5
        sample_weight = float(
            _assert_all_types(sample_weight, int, float,
                              objname="Sample weight"))
        if sample_weight <= 0 or sample_weight > 1:
            raise ValueError("Sample weight must be range between 0 and 1,"
                             f" got {sample_weight}")

        # sampling train data.
        # use 75% by default of among data
        n = int(sample_weight * len(X))
        if hasattr(X, 'columns'):
            X = X.iloc[:n]
        else:
            X = X[:n, :]
        y = y[:n]

        if self.pipeline is not None:
            X = self.pipeline.fit_transform(X)

        if not self.prefit:
            # for consistency
            if self.scoring is None:
                warnings.warn("'neg_mean_squared_error' scoring is used when"
                              " scoring parameter is ``None``.")
            if self.scoring in (None, 'nmse'):
                self.scoring = 'neg_mean_squared_error'

            self.mse_, self.rmse_, self.cv_scores_ = self._fit(
                X,
                y,
                self.estimator,
                cv_scores=True,
                scoring=self.scoring
            )

        return self

    @staticmethod
    def _display_scores(scores):
        """ Print the scores with their mean, square root and standard
        deviation. """
        labels = ("scores:", "Means:", "RMSE scores:", "Standard Deviation:")
        values = (scores, scores.mean(), np.sqrt(scores), scores.std())
        for label, value in zip(labels, values):
            # ``print`` on purpose: the second positional argument of
            # ``pprint`` is the output stream, not a value to display.
            print(label, value)

    def _fit(self,
             X,
             y,
             estimator,
             cv_scores=True,
             scoring='neg_mean_squared_error'
             ):
        """ Fit data once verified and compute the ``rmse`` scores.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
            training data for fitting
        y: arraylike of shape (n_samples, )
            target for training
        estimator: callable or scikit-learn estimator
            Callable or something that has a fit methods. Can build your
            own estimator following the API reference via
            https://scikit-learn.org/stable/modules/classes.html
        cv_scores: bool, default=True
            compute the cross validations scores
        scoring: str, default='neg_mean_squared_error'
            metric for scores evaluation.
            Type of scoring for cross validation. Please refer to
            :func:`sklearn.model_selection.cross_val_score` for further
            details.

        Returns
        ----------
        (mse, rmse, scores): Tuple
            - mse: Mean Squared Error on the training data, ``None`` when
              `scoring` is ``'accuracy'``.
            - rmse: Root Mean Squared Error; when `cv_scores` is true it is
              the square root of the (sign-corrected) cross-validation
              scores.
            - scores: Cross validation scores as returned by
              ``cross_val_score`` or ``None`` if `cv_scores` is false.
        """
        mse = rmse = scores = None

        self._logging.info(
            "Fit data with a supplied pipeline or using purely estimator")
        estimator.fit(X, y)

        y_pred = estimator.predict(X)

        if scoring != 'accuracy':  # if regression task
            mse = mean_squared_error(y, y_pred)
            rmse = np.sqrt(mse)

        if cv_scores:
            scores = cross_val_score(
                estimator, X, y, cv=self.cv, scoring=scoring
            )
            # NOTE(review): for scorers other than
            # 'neg_mean_squared_error' the square root of the raw scores is
            # kept for backward compatibility; it is not an RMSE.
            if scoring == 'neg_mean_squared_error':
                rmse = np.sqrt(-scores)
            else:
                rmse = np.sqrt(scores)

        if self.verbose and scores is not None:
            # Flip the sign for display only, so the returned scores do not
            # depend on the verbosity level.
            shown = -scores if scoring == 'neg_mean_squared_error' else scores
            self._display_scores(shown)

        return mse, rmse, scores

    def predict(self, X):
        """ Quick prediction and get the scores.

        Parameters
        -----------
        X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
            Test set; Denotes data that is observed at testing and
            prediction time, used as independent variables in learning.
            When a matrix, each sample may be represented by a feature
            vector, or a vector of precomputed (dis)similarity with each
            training sample. :code:`X` may also not be a matrix, and may
            require a feature extractor or a pairwise metric to turn it
            into one before learning a model.

        Returns
        -------
        y: array-like, shape (M, ) ``M=m-samples``,
            test predicted target.

        Raises
        ------
        NotFittedError
            If ``fit`` has not populated ``cv_scores_`` yet.
        """
        self.inspect  # raises NotFittedError when not fitted

        dtype = object if self.pipeline is not None else "numeric"

        X = check_array(
            X,
            accept_sparse=False,
            input_name='X',
            dtype=dtype,
            estimator=get_estimator_name(self.estimator),
        )

        if self.pipeline is not None:
            # Reuse the transformation learnt in ``fit``; re-fitting the
            # pipeline on the data to predict would change the features.
            X = self.pipeline.transform(X)

        return self.estimator.predict(X)

    @property
    def inspect(self):
        """ Inspect object whether is fitted or not"""
        msg = ("{obj.__class__.__name__} instance is not fitted yet."
               " Call 'fit' with appropriate arguments before using"
               " this method"
               )

        if not hasattr(self, 'cv_scores_'):
            raise NotFittedError(msg.format(obj=self))
        return 1
# Class docstring of ``BaseEvaluation``; built with ``str.format`` so the
# shared parameter descriptions from ``_param_docs`` can be interpolated.
BaseEvaluation.__doc__ = """\
Evaluation of dataset using a base estimator.

Quick evaluation of the data after preparing and pipeline constructions.

Parameters
-----------
estimator: Callable,
    estimator for trainset and label evaluating; something like a
    class that implements a fit methods. Refer to
    https://scikit-learn.org/stable/modules/classes.html

{params.core.cv}
    The default is ``4``.

{params.core.scoring}

pipeline: Callable or :class:`~sklearn.pipeline.Pipeline` object
    If `pipeline` is given , `X` is transformed accordingly, Otherwise
    evaluation is made using purely the base estimator with the given `X`.
    Refer to https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline
    for further details.

prefit: bool, default=False,
    If ``False``, `fit` trains the estimator and computes the cross
    validation scores. If ``True``, `fit` skips both the training and the
    scores computation.

{params.core.random_state}

verbose: int, default=0
    If non-zero, the cross-validation scores are displayed after fitting.

Examples
--------
>>> import watex as wx
>>> from watex.datasets import load_bagoue
>>> from watex.models import BaseEvaluation
>>> X, y = load_bagoue (as_frame =True )
>>> # categorizing the labels
>>> yc = wx.smart_label_classifier (y , values = [1, 3, 10 ],
...                                 # labels =['FR0', 'FR1', 'FR2', 'FR4']
...                                 )
>>> # drop the subjective columns ['num', 'name']
>>> X = X.drop (columns = ['num', 'name'])
>>> # X = wx.cleaner (X , columns = 'num name', mode='drop')
>>> X.columns
Index(['shape', 'type', 'geol', 'east', 'north', 'power', 'magnitude', 'sfi',
       'ohmS', 'lwi'],
      dtype='object')
>>> X = wx.naive_imputer ( X, mode ='bi-impute') # impute data
>>> # create a pipeline for X
>>> pipe = wx.make_naive_pipe (X)
>>> Xtrain, Xtest, ytrain, ytest = wx.sklearn.train_test_split(X, yc)
>>> b = BaseEvaluation (estimator= wx.sklearn.RandomForestClassifier,
...                     scoring = 'accuracy', pipeline = pipe)
>>> b.fit(Xtrain, ytrain ) # accepts only array
>>> b.cv_scores_
array([0.75409836, 0.72131148, 0.73333333, 0.78333333])
>>> ypred = b.predict(Xtest)
>>> wx.sklearn.accuracy_score (ytest, ypred)
0.7592592592592593

""".format(
    params=_param_docs,
)
def get_best_kPCA_params(
    X: NDArray | DataFrame,
    n_components: float | int = 2,
    *,
    y: ArrayLike | Series = None,
    param_grid: Dict[str, Any] = None,
    clf: F = None,
    cv: int = 7,
    **grid_kws
) -> Dict[str, Any]:
    # NOTE: the user-facing docstring is attached right after the function
    # through ``get_best_kPCA_params.__doc__``.
    #
    # Summary: run a :class:`GridSearch` over `clf` (by default a
    # KernelPCA -> LogisticRegression pipeline) and return the best
    # parameters found.

    # Imported lazily, inside the function body.
    from ..analysis.dimensionality import (
        get_component_with_most_variance, KernelPCA)

    if n_components is None:
        n_components = get_component_with_most_variance(X)

    if clf is None:
        clf = Pipeline([
            ('kpca', KernelPCA(n_components=n_components)),
            ('log_reg', LogisticRegression())
        ])
        if param_grid is None:
            # No grid supplied for the default pipeline: fall back to the
            # grid given as example in the documentation rather than
            # forwarding ``None`` to the search, which cannot succeed.
            param_grid = [dict(
                kpca__gamma=np.linspace(0.03, 0.05, 10),
                kpca__kernel=["rbf", "sigmoid"]
            )]

    search = GridSearch(
        base_estimator=clf,
        grid_params=param_grid,
        cv=cv,
        **grid_kws
    )
    search.fit(X, y)

    return search.best_params_
# Docstring of ``get_best_kPCA_params``; built with ``str.format`` so the
# shared parameter descriptions from ``_param_docs`` can be interpolated.
get_best_kPCA_params.__doc__ = """\
Select the Kernel and hyperparameters using GridSearchCV that lead
to the best performance.

As kPCA is an unsupervised learning algorithm, there is no obvious
performance measure to help selecting the best kernel and hyperparameters
values. However dimensionality reduction is often a preparation step for a
supervised task (e.g. classification). So we can use grid search to select
the kernel and hyperparameters that lead the best performance on that task.
By default implementation we create two steps pipeline. First reducing
dimensionality to two dimension using kPCA, then applying the
`LogisticRegression` for classification. After, use Grid searchCV to find
the best ``kernel`` and ``gamma`` value for kPCA in order to get the best
classification accuracy at the end of the pipeline.

Parameters
----------
{params.core.X}
{params.core.y}

n_components: int,
    Number of dimension to preserve. If `n_components` is ranged between
    0. to 1., it indicated the number of variance ratio to preserve.

param_grid: list
    list of parameters grids. For instance::

        param_grid=[dict(
            kpca__gamma=np.linspace(0.03, 0.05, 10),
            kpca__kernel=["rbf", "sigmoid"]
            )]

{params.core.clf}
    It can also be a base estimator or a composite estimator with pipeline.
    For instance::

        clf =Pipeline([
            ('kpca', KernelPCA(n_components=2)),
            ('log_reg', LogisticRegression())
            ])

{params.core.cv}

grid_kws: dict,
    Additional keywords arguments passed to Grid parameters from
    :class:`~watex.models.validation.GridSearch`

Examples
---------
>>> import numpy as np
>>> from watex.models.validation import get_best_kPCA_params
>>> from watex.analysis.dimensionality import KernelPCA
>>> from watex.exlib.sklearn import Pipeline, LogisticRegression
>>> from watex.datasets import fetch_data
>>> X, y=fetch_data('Bagoue analysis data')
>>> param_grid=[dict(
...     kpca__gamma=np.linspace(0.03, 0.05, 10),
...     kpca__kernel=["rbf", "sigmoid"]
...     )]
>>> clf =Pipeline([
...     ('kpca', KernelPCA(n_components=2)),
...     ('log_reg', LogisticRegression())
...     ])
>>> kpca_best_params =get_best_kPCA_params(
...     X,y=y,scoring = 'accuracy',
...     n_components= 2, clf=clf,
...     param_grid=param_grid)
>>> kpca_best_params
... {{'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}}

""".format(
    params=_param_docs,
)
def getGlobalScores(
    cvres: Dict[str, ArrayLike]
) -> Tuple[float]:
    """ Average the per-candidate test scores of a cross-validation run.

    Parameters
    ------------
    cvres: dict of (str, Array-like)
        cross validation results after training the models of number
        of parameters equals to N. Only the ``'mean_test_score'`` and
        ``'std_test_score'`` entries are read; both must expose a
        ``mean()`` method (e.g. Numpy arrays).

    Returns
    ---------
    ('mean_test_scores', 'std_test_scores'): tuple
        mean of the ``'mean_test_score'`` values and mean of the
        ``'std_test_score'`` values.
    """
    mean_scores = cvres.get('mean_test_score')
    std_scores = cvres.get('std_test_score')
    return mean_scores.mean(), std_scores.mean()
def getSplitBestScores(cvres: Dict[str, ArrayLike],
                       split: int = 0) -> Dict[str, float]:
    """ Get the best score at a given split from cross-validation results

    Parameters
    -----------
    cvres: dict of (str, Array-like)
        cross validation results after training the models of number
        of parameters equals to N. The entries ``'params'``,
        ``'mean_test_score'``, ``'std_test_score'`` and
        ``'split<split>_test_score'`` are read.
    split: int, default=0
        Index of the split to inspect; it selects the
        ``'split<split>_test_score'`` entry of `cvres`.

    Returns
    -------
    bests: Dict,
        Dictionnary holding, for the candidate with the highest score on
        that split: its parameters (``'param'``), its mean and std test
        scores (``'accuracy_score'``, ``'std_score'``), plus the maximum
        (``'CV<split>_score'``) and the mean (``'CV<split>_mean_score'``)
        of the split scores.
    """
    fold_scores = cvres[f'split{split}_test_score']
    # position of the best candidate on this split
    best_ix = fold_scores.argmax()

    return {
        'param': cvres['params'][best_ix],
        'accuracy_score': cvres['mean_test_score'][best_ix],
        'std_score': cvres['std_test_score'][best_ix],
        f"CV{split}_score": fold_scores.max(),
        f"CV{split}_mean_score": fold_scores.mean(),
    }
def displayModelMaxDetails(cvres: Dict[str, ArrayLike], cv: int = 4):
    """ Display the max details of each stored model from cross-validation.

    Parameters
    -----------
    cvres: dict of (str, Array-like)
        cross validation results after training the models of number
        of parameters equals to N. The `str` fits the each parameter stored
        during the cross-validation while the value is stored in Numpy array.
    cv: int, default=4
        The number of KFold used during the fine-tuning of the models
        parameters; splits ``0`` to ``cv - 1`` are displayed.
    """
    for split_ix in range(cv):
        print(f'split = {split_ix}:')
        print(getSplitBestScores(cvres, split=split_ix))

    # summary over all candidates
    mean_sc, std_sc = getGlobalScores(cvres)
    print("Global split scores:")
    print('mean=', mean_sc, 'std=', std_sc)
def displayFineTunedResults(cvmodels: list[F]):
    """ Display fine-tuning results.

    Parameters
    -----------
    cvmodels: list
        list of fine-tuned models; each must expose ``best_estimator_``
        and ``best_params_``.
    """
    # Collect everything first so a model missing an attribute fails
    # before anything is printed.
    best_models = [m.best_estimator_ for m in cvmodels]
    best_params = [m.best_params_ for m in cvmodels]

    for best_model, params in zip(best_models, best_params):
        print("MODEL NAME =", get_estimator_name(best_model))
        print('BEST PARAM =', params)
        print('BEST ESTIMATOR =', best_model)
        print()
def displayCVTables(cvres: Dict[str, ArrayLike], cvmodels: list[F]):
    """ Display the cross-validation results from all models at each
    k-fold.

    Parameters
    -----------
    cvres: list of dict of (str, Array-like)
        cross validation results of each model, in the same order as
        `cvmodels`. Each item is a dict whose `str` fits the each parameter
        stored during the cross-validation while the value is stored in
        Numpy array.
    cvmodels: list
        list of fine-tuned models.

    Examples
    ---------
    >>> from watex.datasets import fetch_data
    >>> from watex.models import GridSearchMultiple, displayCVTables
    >>> X, y = fetch_data ('bagoue prepared')
    >>> gobj =GridSearchMultiple(estimators = estimators,
    ...                          grid_params = grid_params ,
    ...                          cv =4, scoring ='accuracy',
    ...                          verbose =1, savejob=False ,
    ...                          kind='GridSearchCV')
    >>> gobj.fit(X, y)
    >>> displayCVTables (cvmodels=[gobj.models.SVC] ,
    ...                  cvres= [gobj.models.SVC.cv_results_ ])
    ...
    """
    modelnames = (get_estimator_name(model.best_estimator_)
                  for model in cvmodels)
    for name, mdetail, model in zip(modelnames, cvres, cvmodels):
        print(name, ':')
        # Number of folds actually present in the results, so that searches
        # run with a ``cv`` other than 4 are displayed completely.
        n_splits = sum(
            1 for key in mdetail
            if key.startswith('split') and key.endswith('_test_score')
        )
        if n_splits:
            displayModelMaxDetails(cvres=mdetail, cv=n_splits)
        else:
            displayModelMaxDetails(cvres=mdetail)

        print('BestParams: ', model.best_params_)
        try:
            # ``best_score_`` is optional on the model objects.
            print("Best scores:", model.best_score_)
        except AttributeError:
            pass
        finally:
            print()
def get_scorers(*, scorer: str = None, check_scorer: bool = False,
                error: str = 'ignore') -> Tuple[str] | bool:
    """ Fetch the list of available metrics from scikit-learn or verify
    whether the scorer exist in that list of metrics.
    This is prior necessary before the model evaluation.

    :param scorer: str,
        Must be an metrics for model evaluation. Refer to :mod:`sklearn.metrics`
    :param check_scorer: bool, default=False
        Returns bool if ``True`` whether the scorer exists in the list of
        the metrics for the model evaluation. Note that `scorer` can not be
        ``None`` if `check_scorer` is set to ``True``.
    :param error: str, ['raise', 'ignore']
        raise a `ValueError` if `scorer` not found in the list of metrics
        and `check_scorer` is ``True``.

    :returns:
        scorers: bool, tuple
            ``True`` if scorer is in the list of metrics provided that
            `scorer` is not ``None`` and `check_scorer` is ``True``,
            otherwise the tuple of scikit-learn metric names.
    :raises ValueError:
        if `check_scorer` is ``True`` while `scorer` is ``None``, or if the
        scorer is unknown and `error` is ``'raise'``.
    """
    # Validate the arguments first: no need to import scikit-learn to
    # reject an inconsistent call.
    if check_scorer and scorer is None:
        raise ValueError("Can't check the scorer while the scorer is None."
                         " Provide the name of the scorer or get the list of"
                         " scorer by setting 'check_scorer' to 'False'")

    from sklearn import metrics

    # Prefer ``get_scorer_names`` and fall back to the ``SCORERS`` mapping
    # when the installed scikit-learn does not provide it.
    try:
        names = tuple(metrics.get_scorer_names())
    except AttributeError:
        names = tuple(metrics.SCORERS.keys())

    if not check_scorer:
        return names

    found = scorer in names
    if not found and error == 'raise':
        # Reuse ``names``: re-reading ``metrics.SCORERS`` here would fail
        # whenever that attribute is unavailable.
        raise ValueError(
            f"Wrong scorer={scorer!r}. Supports only scorers:"
            f" {names}")

    return found
def naive_evaluation(
    clf: F,
    X: NDArray,
    y: ArrayLike,
    cv: int = 7,
    scoring: str = 'accuracy',
    display: str = 'off',
    **kws
):
    # NOTE: the user-facing docstring is attached right after the function
    # through ``naive_evaluation.__doc__``.
    #
    # Summary: cross-validate `clf` on (X, y), optionally print the
    # scores, and return ``(scores, scores.mean())``.
    cv_scores = cross_val_score(clf, X, y, cv=cv, scoring=scoring, **kws)

    show = display is True or display == 'on'
    if show:
        print('clf=:', clf.__class__.__name__)
        print('scores=:', cv_scores)
        print('scores.mean=:', cv_scores.mean())

    return cv_scores, cv_scores.mean()
# Docstring of ``naive_evaluation``, attached after the function definition.
naive_evaluation.__doc__ = """\
Quick scores evaluation using cross validation.

Parameters
----------
clf: callable
    Classifer for testing default data.
X: ndarray
    trainset data
y: array_like
    label data
cv: int
    KFold for data validation.
scoring: str
    type of error visualization.
display: str or bool,
    show the scores on the stdout when set to ``True`` or ``'on'``.
kws: dict,
    Additional keywords arguments passed to
    :func:`watex.exlib.sklearn.cross_val_score`.

Returns
---------
scores, mean_score: array_like, float
    scores after evaluation and mean of the scores

Examples
---------
>>> import watex as wx
>>> from watex.models.validation import naive_evaluation
>>> X, y = wx.fetch_data ('bagoue data prepared')
>>> clf = wx.sklearn.DecisionTreeClassifier()
>>> naive_evaluation(clf, X, y , cv =4 , display ='on' )
clf=: DecisionTreeClassifier
scores=: [0.6279 0.7674 0.7093 0.593 ]
scores.mean=: 0.6744186046511629
Out[57]: (array([0.6279, 0.7674, 0.7093, 0.593 ]), 0.6744186046511629)
"""
# deprecated in scikit-learn 0.21 to 0.23
# from sklearn.externals import joblib
# import sklearn.externals
# from abc import ABC,abstractmethod# ABCMeta
# class AttributeCkecker(ABC):
#     """ Check attributes and inherits from module `abc`
#     for Data validators.
#     Validate DataType mainly `X` train or test sets and `y` labels or
#     and any others params types.
# """ # def __set_name__(self, owner, name): # try: # self.private_name = '_' + name # except AttributeError: # warnings.warn('Object {owner!r} has not attribute {name!r}') # def __get__(self, obj, objtype =None): # return getattr(obj, self.private_name) # def __set__(self, obj, value): # self.validate(value) # setattr(obj, self.private_name, value) # @abstractmethod # def validate(self, value): # pass # class checkData (AttributeCkecker): # """ Descriptor to check data type `X` or `y` or else.""" # def __init__(self, Xdtypes): # self.Xdtypes =eval(Xdtypes) # def validate(self, value) : # """ Validate `X` and `y` type.""" # if not isinstance(value, self.Xdtypes): # raise TypeError( # f'Expected {value!r} to be one of {self.Xdtypes!r} type.') # class checkValueType_ (AttributeCkecker): # """ Descriptor to assert parameters values. Default assertion is # ``int`` or ``float``""" # def __init__(self, type_): # self.six =type_ # def validate(self, value): # """ Validate `cv`, `s_ix` parameters type""" # if not isinstance(value, self.six ): # raise ValueError(f'Expected {self.six} not {type(value)!r}') # class checkClass (AttributeCkecker): # def __init__(self, klass): # self.klass = klass # def validate(self, value): # """ Validate the base estimator whether is a class or not. """ # if not inspect.isclass(value): # raise TypeError('Estimator might be a class object ' # f'not {type(value)!r}.')