# Source code for watex.cases.processing

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>
#   Wed Jul 14 20:00:26 2021

from __future__ import (
    print_function ,
    division, 
    annotations
)
import copy
import warnings 
import inspect
import numpy as np 
import pandas as pd 

from .._docstring import  DocstringComponents, _core_docs
from .._watexlog import watexlog 
from ..decorators import visualize_valearn_curve
from ..exceptions import ( 
    FeatureError , 
    NotFittedError , 
    ProcessingError, 
    EstimatorError
  )
from ..exlib.sklearn  import ( 
    DecisionTreeClassifier, 
    KNeighborsClassifier, 
    OneHotEncoder, 
    SelectKBest,  
    SGDClassifier,
    SVC, 
    PolynomialFeatures, 
    RobustScaler, 
    ColumnTransformer,
    confusion_matrix , 
    classification_report, 
    f_classif, 
    train_test_split, 
    validation_curve ,
    SimpleImputer,
    Pipeline,
    _HAS_ENSEMBLE_,   
    ) 
from ..utils.coreutils import ( 
    _is_readable , 
    _assert_all_types
    )
from ..utils.funcutils import ( 
    format_notes, 
    repr_callable_obj, 
    smart_strobj_recognition, 
    smart_format
    )
from ..utils.mlutils import (
    formatModelScore, 
    findCatandNumFeatures,
    selectfeatures, 
    evalModel 
    )
from .._typing import ( 
    List, 
    Callable, 
    NDArray, 
    ArrayLike,
    F
    ) 
# Module-level logger.
_logger = watexlog().get_watex_logger(__name__)

# Registry mapping a short prefix to the estimator class it stands for.
d_estimators_ = dict(
    dtc=DecisionTreeClassifier,
    svc=SVC,
    sgd=SGDClassifier,
    knn=KNeighborsClassifier,
)

if _HAS_ENSEMBLE_:
    from ..exlib.sklearn import skl_ensemble_

    # Each ensemble prefix is paired with the object found at the same
    # position in `skl_ensemble_`.
    for es_, esf_ in zip(('rdf', 'ada', 'vtc', 'bag', 'stc'), skl_ensemble_):
        d_estimators_[es_] = esf_
        
# Docstring fragments reused by the class docstrings of this module.
_preproces_params = {
    "pipe_": """
pipe_:Callable, preprocessor object from :mod:`sklearn.pipeline`
    Pipeline can  be buit by your own pipeline with different transformer. 
    For base model prediction, it is possible to use the default pipeline.
    Call `get_default_pipe` to get the transformation list and steps. 
    """,
    "estimator_": """
estimator: Callable, F or :mod:`sklearn.metaestimator`
    Callable estimator method to fit the model:: 
        
        estimators= SGDClassifier(random_state=13)    
    """,
}

# Expose the shared core fragments under `core` and the local ones under `base`.
_param_docs = DocstringComponents.from_nested_components(
    core=_core_docs["params"],
    base=DocstringComponents(_preproces_params),
)
class Preprocessing:
    # NOTE: the class docstring is assigned after the class body through
    # ``Preprocessing.__doc__ = ...`` so that shared parameter descriptions
    # can be interpolated into it.

    def __init__(
        self,
        tname: str = 'flow',
        drop_features: List[str] = None,  # e.g. ['lwi']
        random_state: int = 42,
        default_estimator: F | str = 'svc',
        test_size: float = .2,
        verbose: int = 0,
    ):
        self._logging = watexlog().get_watex_logger(self.__class__.__name__)

        # constructor parameters
        self.tname = tname
        self.drop_features = drop_features
        self.random_state = random_state
        self.default_estimator = default_estimator
        self.test_size = test_size
        self.verbose = verbose

        # training / test sets populated by `fit`
        self.X = None
        self.y = None
        self.Xt = None
        self.yt = None

        # intermediate and result attributes
        self.features_ = None
        self.cat_features_ = None
        self.num_features_ = None
        self.y_ = None
        self.X_ = None
        self.estimator_ = None
        self.pipe_ = None
        self.ypred_ = None
        self.model_results_ = None
        self.base_score_ = None
        self.data_ = None
        self.model_ = None

    @property
    def data(self):
        """ Return the data previously stored through the setter. """
        return self.data_

    @data.setter
    def data(self, d):
        """ Store the result of passing `d` through `_is_readable`. """
        self.data_ = _is_readable(d)

    @property
    def features(self):
        """ Collect the list of features. """
        return self.features_

    @features.setter
    def features(self, feats):
        """ Set the features.

        A single string is wrapped into a one-item list; any other value
        makes the setter fall back to the columns of `data`.
        """
        # NOTE(review): a non-string `feats` is ignored in favour of the data
        # columns -- confirm this is intended rather than ``list(feats)``.
        if isinstance(feats, str):
            self.features_ = [feats]
        else:
            self.features_ = list(self.data.columns)

    def fit(self, X: NDArray = None, y: ArrayLike = None,
            **fit_params) -> 'Preprocessing':
        """ Read the dataset, encode the categorical features and populate
        the class attributes.

        Parameters
        -----------
        X: N-d array or DataFrame
            The feature set. A DataFrame is copied before being modified so
            the caller's object is left untouched.
        y: arraylike
            The target. Only used as given when `tname` is not a column
            of `X`.
        data: DataFrame or readable object, passed through ``fit_params``
            Data containing the features and the target `tname`. When
            given, it takes precedence over `X`.
        split_X_y: bool, default=True, passed through ``fit_params``
            Split the data into a training set (`X`, `y`) and a test set
            (`Xt`, `yt`). Otherwise the whole data is kept as the
            training set.

        Returns
        --------
        ``self``: `Preprocessing` instance for easy method chaining.

        Raises
        -------
        FeatureError
            If no usable feature set is given, or if a split is requested
            while no target is available.

        Examples
        ---------
        >>> from watex.cases.processing import Preprocessing
        >>> from watex.datasets import fetch_data
        >>> data = fetch_data('bagoue original').get('data=dfy2')
        >>> pc = Preprocessing(drop_features=['lwi', 'num', 'name']
        ...                    ).fit(data=data)
        >>> len(pc.X), len(pc.y), len(pc.Xt), len(pc.yt)
        ... (344, 344, 87, 87) # trainset (X,y) and testset (Xt, yt)

        """
        data = fit_params.pop('data', None)
        split_X_y = fit_params.pop('split_X_y', True)

        # Copy a user-supplied frame: the steps below assign encoded columns
        # and drop the target in place.
        self.X_ = X.copy() if isinstance(X, pd.DataFrame) else X
        self.y_ = y

        if data is not None:
            self.data = data
            self.X_ = self.data.copy()

        if not isinstance(self.X_, (pd.DataFrame, np.ndarray)):
            msg = f"Expect an nd-array not {type (self.X_).__name__!r}."
            if isinstance(self.X_, str):
                msg += "Use param 'data' in fit params to read the file"
            raise FeatureError(msg)

        if self.y_ is not None:
            self.y_ = _assert_all_types(self.y_, pd.Series, np.ndarray)

        if self.drop_features is not None:
            if isinstance(self.drop_features, str):
                self.drop_features = [self.drop_features]
            self.X_ = self.X_.drop(columns=self.drop_features)

        # find numerical and categorical features
        self.cat_features_, self.num_features_ = findCatandNumFeatures(
            self.X_)
        # encode categorical values if they exist
        self.X_[self.cat_features_] = self.X_[self.cat_features_].apply(
            lambda c: c.astype('category').cat.codes)

        # Extract the target from the features. When `y` was supplied
        # explicitly and `tname` is not a column of `X`, keep `y` as given
        # instead of failing on the missing column.
        target_in_X = (self.tname is not None
                       and self.tname in getattr(self.X_, 'columns', ()))
        if self.tname is not None and (target_in_X or self.y_ is None):
            self.y_ = selectfeatures(self.X_, features=self.tname)
            self.X_.drop(columns=self.tname, inplace=True)
            # the target is no longer a feature
            if self.tname in self.cat_features_:
                self.cat_features_.remove(self.tname)
            elif self.tname in self.num_features_:
                self.num_features_.remove(self.tname)
            # for consistency, encode the label y
            self.y_ = self.y_.astype('category').cat.codes

        if split_X_y:
            if self.y_ is None:
                warnings.warn("target name 'tname' is None. Cannot retrieve"
                              " the target 'y' from the dataset")
                raise FeatureError(
                    "'tname' is missing. specify the target name"
                    " before splitting the datasets.")
            self.X, self.Xt, self.y, self.yt = train_test_split(
                self.X_, self.y_,
                test_size=self.test_size,
                random_state=self.random_state,
            )
        else:
            # consider X and y as a training set
            self.X = copy.deepcopy(self.X_)
            self.y = copy.deepcopy(self.y_)

        return self

    @property
    def inspect(self):
        """ Check that the object is fitted.

        Returns ``1`` when `X` is set and raises `NotFittedError`
        otherwise.
        """
        if self.X is None:
            # `msg` is not defined by this class; fall back to a default
            # template so the intended NotFittedError is raised.
            template = getattr(
                self, 'msg',
                "{expobj.__class__.__name__} instance is not fitted yet."
                " Call 'fit' with appropriate arguments before using"
                " this method.")
            raise NotFittedError(template.format(expobj=self))
        return 1

    def makeModel(
        self,
        pipe: F = None,
        estimator: F = None,
    ) -> Callable[..., F]:
        """ Assemble a preprocessor and an estimator into a model.

        When one of them is ``None``, the default pipe and/or the default
        estimator is used, which might not be the one expected. Providing
        both is therefore suggested.

        Parameters
        -----------
        pipe: Callable, pipeline or preprocessor
            Preprocessing object placed as the first step of the model.
            Call `get_default_pipe` to get the default one.
        estimator: Callable, F or {sklearn estimator}
            Estimator placed as the last step of the model::

                estimator = SGDClassifier(random_state=13)

            Pre-configured estimators can be fetched by their prefix from
            the default dict, for instance::

                >>> from watex.cases.processing import Preprocessing
                >>> Preprocessing()._getdestimators()['dtc']
                ... DecisionTreeClassifier(max_depth=100, random_state=42)

        Returns
        ---------
        `model_`: Callable, {preprocessor + estimator}

        Raises
        -------
        ValueError
            If no estimator is available and `default_estimator` is not a
            known prefix.

        Examples
        ----------
        (1) Default preprocessor and estimator:

        >>> from watex.datasets import fetch_data
        >>> from watex.cases.processing import Preprocessing
        >>> pc = Preprocessing(tname='flow',
        ...                    drop_features=['lwi', 'name', 'num'])
        >>> data = fetch_data('bagoue original').get('data=dfy2')
        >>> pc.fit(data=data)
        >>> pc.makeModel()
        >>> pc.model_

        (2) Custom preprocessor:

        >>> from sklearn.pipeline import Pipeline
        >>> from sklearn.compose import ColumnTransformer
        >>> from sklearn.impute import SimpleImputer
        >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
        >>> from sklearn.linear_model import LogisticRegression
        >>> numeric_features = ['east', 'north', 'power', 'magnitude',
        ...                     'sfi', 'ohmS']
        >>> numeric_transformer = Pipeline(
        ...     steps=[("imputer", SimpleImputer(strategy="median")),
        ...            ("scaler", StandardScaler())])
        >>> categorical_features = ['shape', 'geol', 'type']
        >>> categorical_transformer = OneHotEncoder(handle_unknown="ignore")
        >>> preprocessor = ColumnTransformer(
        ...     transformers=[
        ...         ("num", numeric_transformer, numeric_features),
        ...         ("cat", categorical_transformer, categorical_features),
        ...     ])
        >>> pc.makeModel(pipe=preprocessor, estimator=LogisticRegression())
        >>> pc.model_

        """
        # Test identity rather than truthiness: the truth value of an
        # estimator object is not a reliable "was it given" signal.
        self.pipe_ = pipe if pipe is not None else self.get_default_pipe()

        if estimator is not None:
            self.estimator_ = estimator

        if self.estimator_ is None:
            if self.verbose:
                self._logging.info(
                    'Loading default parameters into estimators.')
                print("### -> Use default estimator instead ...")
            # keep the raw value for the error message
            des = copy.deepcopy(self.default_estimator)
            self.default_estimator = str(
                self.default_estimator).lower().strip()
            if self.default_estimator not in d_estimators_.keys():
                raise ValueError(f"Unknow default estimator :{des!r}")
            self.estimator_ = self._getdestimators()[self.default_estimator]

        self.model_ = Pipeline(
            steps=[
                ('preprocessor', self.pipe_),
                (f'{self.estimator_.__class__.__name__}', self.estimator_),
            ]
        )
        return self.model_

    def get_default_pipe(self):
        """ Make a default pipe to preprocess the data.

        The default pipeline is not exhaustive; to have full control of
        the data it is recommended to provide your own preprocessor.
        The result is stored in `pipe_` and returned.

        Default pipeline composition
        -----------------------------
        * numerical columns (`num_features_`):

          - ``SimpleImputer()`` to fill the missing values,
          - ``PolynomialFeatures(10, include_bias=False)`` for features
            engineering,
          - ``SelectKBest(f_classif, k=4)`` for features selection,
          - ``RobustScaler()`` for data scaling.

        * categorical columns (`cat_features_`):

          - ``SimpleImputer()`` to fill the missing values,
          - ``OneHotEncoder(handle_unknown="ignore")`` for encoding.

        Both sub-pipelines are combined in a ``ColumnTransformer``. No
        column selector is added since `fit` already separates the
        numerical and categorical columns.

        Returns
        --------
        `pipe_`: the ``ColumnTransformer`` preprocessor.
        """
        num_pipe = Pipeline(
            steps=[
                ('imputer', SimpleImputer()),
                ('polynomialfeatures',
                 PolynomialFeatures(10, include_bias=False)),
                ('selectors', SelectKBest(f_classif, k=4)),
                ('scalers', RobustScaler()),
            ]
        )
        cat_pipe = Pipeline(
            steps=[
                ('imputer', SimpleImputer()),
                ('onehotencoder', OneHotEncoder(handle_unknown="ignore")),
            ]
        )
        self.pipe_ = ColumnTransformer(
            transformers=[
                ('numpipe', num_pipe, self.num_features_),
                ('catpipe', cat_pipe, self.cat_features_),
            ]
        )
        return self.pipe_

    def baseEvaluation(self, model: F = None, eval_metric=False,
                       **kws) -> float:
        """ Evaluate a baseline model on the training and test sets.

        Parameters
        -----------
        model: Callable, {preprocessor + estimator}, or dict of them
            Model to evaluate. If ``None``, the model already stored in
            `model_` is used, or the default one from `makeModel` when
            there is none. If a dict of estimators is given, each value is
            evaluated and the results are dicts keyed by the class name
            of each estimator.
        eval_metric: bool, default=False
            Forwarded to `evalModel` as its ``eval`` argument.
        kws: dict
            Additional keyword arguments forwarded to `evalModel`.

        Returns
        ----------
        `self.base_score_`: the score of the single model, or a dict of
        scores when several estimators are given.

        Raises
        -------
        NotFittedError
            If `fit` has not been called yet.

        Notes
        ------
        After the call, `ypred_` holds the prediction(s) and
        `model_results_` the collected results. For a single model
        `model_results_` stores ``(score, prediction)`` whereas for a dict
        of estimators it stores ``(prediction, score)``.

        Examples
        ---------
        >>> from watex.datasets import fetch_data
        >>> from watex.cases.processing import Preprocessing
        >>> pc = Preprocessing(tname='flow',
        ...                    drop_features=['lwi', 'name', 'num'])
        >>> data = fetch_data('bagoue original').get('data=dfy2')
        >>> pc.fit(data=data)

        (1) -> default estimator

        >>> pc.baseEvaluation(eval_metric=True)
        ... 0.47126436781609193

        (2) -> multiple estimators

        >>> from watex.exlib.sklearn import (
        ...     RandomForestClassifier, SGDClassifier, SimpleImputer)
        >>> pc.X = SimpleImputer().fit_transform(pc.X)
        >>> pc.Xt = SimpleImputer().fit_transform(pc.Xt) # remove NaN values
        >>> pc.baseEvaluation(
        ...     model={'RandomForestClassifier': RandomForestClassifier(
        ...                n_estimators=200, random_state=0),
        ...            'SDGC': SGDClassifier(random_state=0)},
        ...     eval_metric=True)
        >>> pc.ypred_
        {'RandomForestClassifier': array([2, 1, 2, ..., 2, 0, 1], dtype=int8),
         'SGDClassifier': array([3, 3, 3, ..., 3, 3, 3], dtype=int8)}
        >>> pc.base_score_
        {'RandomForestClassifier': 0.7816091954022989,
         'SGDClassifier': 0.14942528735632185}

        """
        self.inspect

        self.model_results_ = {}
        if model is not None:
            self.model_ = model
        elif self.model_ is None:
            self.model_ = self.makeModel()

        # a plain dict has no `__dict__`, so only a single model object
        # enters this branch
        if hasattr(self.model_, '__dict__') and hasattr(
                self.model_, '__class__'):
            self.ypred_, self.base_score_ = evalModel(
                model=self.model_,
                X=self.X,
                y=self.y,
                Xt=self.Xt,
                yt=self.yt,
                eval=eval_metric,
                **kws)
            self.model_results_[f'{self.model_.__class__.__name__}'] = (
                self.base_score_, self.ypred_)
            return self.base_score_

        if isinstance(self.model_, dict):
            if self.verbose:
                print(self.model_)
            # when multiple estimators are given
            for est in self.model_.values():
                pred, score = evalModel(
                    model=est,
                    X=self.X,
                    y=self.y,
                    Xt=self.Xt,
                    yt=self.yt,
                    eval=eval_metric,
                    **kws)
                self.model_results_[f'{est.__class__.__name__}'] = (
                    pred, score)

            self.ypred_ = {
                k: p for k, (p, _) in self.model_results_.items()}
            self.base_score_ = {
                k: s for k, (_, s) in self.model_results_.items()}

        return self.base_score_

    def _getdestimators(self):
        """ Instantiate every registered default estimator.

        Returns a dict mapping each prefix of `d_estimators_` to an
        estimator built with its default hyperparameters. A prefix without
        dedicated defaults is built with ``random_state`` only.
        """
        rs = self.random_state

        # `BaggingClassifier` renamed `base_estimator` to `estimator` in
        # recent scikit-learn releases; use whichever keyword the installed
        # class accepts.
        bag_kw = 'base_estimator'
        bag_cls = d_estimators_.get('bag')
        if bag_cls is not None:
            try:
                if 'estimator' in inspect.signature(bag_cls).parameters:
                    bag_kw = 'estimator'
            except (TypeError, ValueError):
                pass

        defaults = dict(
            knn=dict(n_neighbors=10, metric='manhattan'),
            svc=dict(C=100, gamma=1e-3, random_state=rs),
            dtc=dict(max_depth=100, random_state=rs),
            rdf=dict(n_estimators=200, random_state=rs),
            bag={bag_kw: KNeighborsClassifier(), 'n_estimators': 100},
            sgd=dict(random_state=rs),
        )
        for key in ('vtc', 'stc'):
            defaults[key] = dict(
                estimators=[
                    ('sdg', SGDClassifier(random_state=rs)),
                    ('dtc', DecisionTreeClassifier(
                        max_depth=100, random_state=rs)),
                    ('knn', KNeighborsClassifier()),
                ]
            )

        return {
            key: factory(**defaults.get(key, dict(random_state=rs)))
            for key, factory in d_estimators_.items()
        }

    def __repr__(self):
        """ Pretty format for programmer guidance. """
        return repr_callable_obj(
            self, skip=('data', 'y', 'X', 'Xt', 'yt'))

    def __getattr__(self, name):
        """ Raise an `AttributeError` that suggests a close attribute name
        when `smart_strobj_recognition` finds one. """
        rv = smart_strobj_recognition(name, self.__dict__, deep=True)
        appender = "" if rv is None else f'. Do you mean {rv!r}'

        raise AttributeError(
            f'{self.__class__.__name__!r} object has no attribute {name!r}'
            f'{appender}{"" if rv is None else "?"}'
        )
# Class docstring assigned after the class body so that the shared parameter
# descriptions of `_param_docs` can be interpolated with `str.format`.
Preprocessing.__doc__ = """\
Base preprocessing class.

Give a baseline preprocessing model with a base score. Useful before
fiddling with the model hyperparameters.

Parameters
-------------
{params.core.tname}

drop_features: list or str, Optional
    List the useless `features` for predicting or list of column names
    to drop out.

random_state: int, default is ``42``
    The state of data shuffling. The default is ``42``.

default_estimator: str, default is ``'svc'``
    The prefix of the default estimator used for predicting the `tname`
    value when no estimator is given. Predefined default parameters are
    set for quick preprocessing:

    - 'dtc': DecisionTreeClassifier
    - 'svc': Support Vector Classifier
    - 'sgd': SGDClassifier
    - 'knn': KNeighborsClassifier
    - 'rdf': RandomForestClassifier
    - 'ada': AdaBoostClassifier
    - 'vtc': VotingClassifier
    - 'bag': BaggingClassifier
    - 'stc': StackingClassifier

    The ensemble prefixes ('rdf', 'ada', 'vtc', 'bag', 'stc') are only
    registered when the ensemble estimators are available.

test_size: float,
    The test set data size. Must be less than 1. The default is ``0.2``,
    i.e. 20% of the dataset.

{params.core.verbose}

Attributes
-----------
{params.core.X}
{params.core.y}
{params.core.Xt}
{params.core.yt}
{params.core.data}
{params.base.pipe_}
{params.base.estimator_}
{params.core.model}

cat_features_: list of str
    Categorical features list, found automatically by `fit`.

num_features_: list of str
    Numerical features list, found automatically by `fit`.

model: Callable, {{preprocessor + estimator}},
    A pipeline composed of a preprocessor and an estimator. If `model`
    is ``None``, the default model built from the default `preprocessor`
    and `estimator` is used.

Examples
---------
>>> from watex.cases.processing import Preprocessing
>>> from watex.datasets import fetch_data
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.linear_model import SGDClassifier
>>> from sklearn.impute import SimpleImputer
>>> data = fetch_data('bagoue original').get('data=dfy2')
>>> pc = Preprocessing(drop_features=['lwi', 'num', 'name']).fit(data=data)
>>> estimators = dict(
...     RandomForestClassifier=RandomForestClassifier(
...         n_estimators=200, random_state=0),
...     SGDClassifier=SGDClassifier(random_state=0))
>>> pc.X = SimpleImputer().fit_transform(pc.X)
>>> pc.Xt = SimpleImputer().fit_transform(pc.Xt) # remove NaN values
>>> pc.baseEvaluation(model=estimators, eval_metric=True)
>>> pc.base_score_  # scores keyed by estimator class name

""".format(
    params=_param_docs,
)
class Processing(Preprocessing):
    # NOTE: the class docstring is assigned after the class body through
    # ``Processing.__doc__ = ...``.

    def __init__(self, pipeline: F = None, estimator: F = None, **kws):
        super().__init__(**kws)

        # NOTE(review): `pipeline` is stored but not consumed by any method
        # of this class; set `preprocessor` to use a custom preprocessor.
        self.pipeline = pipeline
        self.estimator_ = estimator
        self.model_score_ = None
        self.model_prediction_ = None
        self.estimator_name_ = None
        self.processing_model_ = None

        # Initialised so the `auto`, `preprocessor` and `model` properties
        # can be read before their setters have run.
        self.auto_ = False
        self.preprocessor_ = None
        self.preprocesor_ = None  # historical spelling, kept in sync

    @property
    def auto(self):
        """ Whether the default composite estimator building has been
        triggered. """
        return self.auto_

    @auto.setter
    def auto(self, auto):
        """ Trigger the default composite estimator building.

        A falsy value is ignored. A truthy value runs `baseEvaluation`
        with the current model (the default one when none is set), then
        stores the score, the prediction and the pipe used.
        """
        if not auto:
            return
        self.auto_ = auto
        format_notes(
            text=''.join(
                [f'Automatic Option is set to ``{self.auto_}``.Composite',
                 ' estimator building is auto-triggered with default ',
                 'pipeline. The default estimation score should be displayed.',
                 ' ']),
            cover_str='*', inline=70, margin_space=0.05)

        self._logging.info(
            ' Automatic Option to design a default composite estimator'
            f' is triggered <`{self.auto_}``> with default pipeline.')
        warnings.warn(
            ' Automatic Option to design a composite estimator is '
            f' triggered <`auto={self.auto_}``> with default pipeline '
            'construction. The default estimation score should be '
            ' displayed.')

        self.inspect
        self.model_score_ = self.baseEvaluation(eval_metric=True)
        self.preprocessor_ = self.preprocesor_ = self.pipe_
        formatModelScore(self.model_score_, self.default_estimator)
        self.model_prediction_ = self.ypred_

    @property
    def processing_model(self):
        """ Get the default composite model. """
        return self.processing_model_

    @property
    def preprocessor(self):
        """ Preprocessor used to build the composite model. """
        return self.preprocessor_

    def _validate_estimator(self, e):
        """ Assert whether `e` looks like a valid estimator.

        `sklearn.utils.estimator_checks.check_estimator` is tried first.
        When it fails for any reason, `e` is still accepted as long as it
        has both `__dict__` and `__class__`; otherwise a warning is issued
        and `ProcessingError` is raised. Returns ``True`` on acceptance.
        """
        msg = (
            ":https://scikit-learn.org/stable/developers/develop.html &&"
            "https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline"
        )
        try:
            from sklearn.utils.estimator_checks import check_estimator
            check_estimator(e)
        except Exception:
            if not (hasattr(e, '__dict__') and hasattr(e, '__class__')):
                warnings.warn(
                    "'estimator does not adhere to sckit-learn conventions."
                    f" Refer to {msg!r} for more guidelines.")
                raise ProcessingError(f"wrong estimator. Refer to {msg}"
                                      " for furher details.")
        return True

    @preprocessor.setter
    def preprocessor(self, pipe):
        """ Set the preprocessor after validating it with
        `_validate_estimator`. """
        self._validate_estimator(pipe)
        self.preprocessor_ = self.preprocesor_ = pipe

    @property
    def model(self):
        """ Model built from the preprocessor and the estimator.

        The model is built with `makeModel` on first access and cached in
        `model_`.
        """
        if self.model_ is None:
            self.model_ = self.makeModel(pipe=self.preprocessor_,
                                         estimator=self.estimator_)
        return self.model_

    @property
    def estimator(self):
        """ Get the current estimator. """
        return self.estimator_

    @estimator.setter
    def estimator(self, e):
        """ Set the estimator.

        A string is taken as the prefix of a default estimator and raises
        `EstimatorError` when unknown. A dict is taken as a collection of
        estimators. Any other object is validated with
        `_validate_estimator`. `estimator_name_` is updated accordingly.
        """
        msg = ("A string value assumes to be a default estimator prefix.")
        if isinstance(e, str):
            if e not in d_estimators_.keys():
                raise EstimatorError(
                    msg +
                    f"Expect {e!r} being in {smart_format(d_estimators_.keys())}"
                )
            e = self._getdestimators()[e]
            name = e.__class__.__name__
        elif isinstance(e, dict):
            # many estimators: collect the class name of each of them
            name = [f'{es.__class__.__name__}' for es in e.values()]
        else:
            self._validate_estimator(e)
            name = e.__class__.__name__

        # store the new estimator together with its matching name
        self.estimator_ = e
        self.estimator_name_ = name

    @property
    def model_score(self):
        """ Evaluate `model` with `baseEvaluation` and return its score.

        The prediction is stored in `model_prediction_`. A failure while
        formatting the score only produces a warning.
        """
        self.model_score_ = self.baseEvaluation(
            self.model, eval_metric=True)
        self.model_prediction_ = self.ypred_
        try:
            formatModelScore(self.model_score_, self.estimator_name_)
        except Exception:
            self._logging.debug(
                f'{self.estimator_name_ !r} name not found')
            warnings.warn(
                f'Unable to find esimator {self.estimator_name_!r} name')

        return self.model_score_

    @model_score.setter
    def model_score(self, print_score):
        """ Update `estimator_name_`. """
        # NOTE(review): the string given is immediately replaced by the
        # class name of the estimator -- confirm this is intended.
        if isinstance(print_score, str):
            self.estimator_name_ = print_score
        try:
            self.estimator_name_ = self.estimator_.__class__.__name__
        except Exception:
            self.estimator_name_ = print_score

    @property
    def model_prediction(self):
        """ Get the stored model prediction. """
        return self.model_prediction_

    @visualize_valearn_curve(reason='valcurve', turn='off',
                             k=np.arange(1, 210, 10),
                             plot_style='line', savefig=None)
    def get_validation_curve(
        self,
        val_params: dict = None,
        switch_plot: str = 'off',
        preprocess_step: bool = False,
        train_pkws: dict = None,
        val_pkws: dict = None,
        **kws
    ):
        """ Compute the training and validation scores with
        `validation_curve`.

        Parameters
        -----------
        val_params: dict
            Keyword arguments of `validation_curve`. If ``None`` and
            `default_estimator` is ``'svc'``, the default is::

                val_params = {"param_range": np.arange(1,210,10),
                              "param_name": "C", "cv": 4,
                              "scoring": "accuracy"}

        switch_plot: str, default='off'
            Returned unchanged for the decorator.
        preprocess_step: bool, default=False
            When no estimator is set, trigger the default preprocessing
            (``auto=True``) to get one.
        train_pkws: dict
            Returned unchanged for the decorator (training curve).
        val_pkws: dict
            Returned unchanged for the decorator (validation curve).

        Returns
        ---------
        tuple ``(train_score, val_score, switch_plot, kk, pname,
        val_pkws, train_pkws)`` where the scores are arrays for a single
        estimator or dicts keyed like the estimator dict otherwise, `kk`
        is ``val_params['param_range']`` and `pname` is
        ``val_params['param_name']``.

        Raises
        -------
        NotFittedError
            If `fit` has not been called yet.
        ProcessingError
            If no estimator is available and `preprocess_step` is off, or
            if `val_params` is missing for a non-``'svc'`` default.

        Examples
        -------------
        >>> from watex.cases.processing import Processing
        >>> from watex.datasets import fetch_data
        >>> data = fetch_data('bagoue original').get('data=dfy2')
        >>> processObj = Processing(tname='flow',
        ...                         drop_features=['lwi', 'name', 'num'])
        >>> processObj.fit(data=data)
        >>> processObj.get_validation_curve(
        ...     switch_plot='on', preprocess_step=True)

        """
        dvalp = {"param_range": np.arange(1, 210, 10),
                 "param_name": "C",
                 "cv": 4,
                 "scoring": 'accuracy'
                 }
        self.inspect

        # `estimator_` always exists (set to None in __init__), so test its
        # value rather than its presence.
        if getattr(self, "estimator_", None) is None:
            if preprocess_step:
                if self.verbose:
                    print('---> Preprocessing step is enabled.')
                self._logging.info(
                    'By default, the`preprocessing_step` is activated.')
                self.auto = True
            else:
                warnings.warn("Expect one 'estimator' at least")
                self._logging.error("Expect one 'estimator' at least")
                raise ProcessingError(
                    "'Estimator' not found. Expect one "
                    "'estimator' at least or set `auto=True`")

        if val_params is None:
            if str(self.default_estimator).lower().strip() == 'svc':
                val_params = dvalp
                self._logging.debug(
                    f'Use default `SVM` params configurations <{val_params}>.')
                if inspect.isfunction(self.get_validation_curve):
                    _code = self.get_validation_curve.__code__
                    filename = _code.co_filename
                    lineno = _code.co_firstlineno + 1
                else:
                    filename = self.get_validation_curve.__module__
                    lineno = 1
                warnings.warn_explicit(
                    f'Use default `SVM` params configurations <{val_params}>.',
                    category=DeprecationWarning,
                    filename=filename, lineno=lineno)
            else:
                raise ProcessingError(
                    "None parameters are detected. Need validation parameters "
                    f" passed to kws 'val_params' for {self.estimator_}. Check the"
                    " list of available parameters with `estimator.get_params().keys()`."
                    " e.g for SVC , the ``val_params` arguments should be "
                    f" `val_params={dvalp}`"
                )

        if isinstance(self.estimator_, dict):
            self.model_dict = self.estimator_
        else:
            self.model_dict = {'___': self.estimator_}

        if len(self.model_dict) == 1:
            (single,) = self.model_dict.values()
            self.train_score, self.val_score = validation_curve(
                single, self.X, self.y, **val_params)
        else:
            # one entry per estimator; the containers must exist first
            self.train_score, self.val_score = {}, {}
            for mkey, mvalue in self.model_dict.items():
                train_sc, val_sc = validation_curve(
                    mvalue, self.X, self.y, **val_params)
                self.train_score[mkey] = train_sc
                self.val_score[mkey] = val_sc

        kk = val_params['param_range']
        pname = val_params["param_name"]
        return (self.train_score, self.val_score, switch_plot,
                kk, pname, val_pkws, train_pkws)

    def quick_estimation(self, estimator_name=None,
                         default_estimator: bool = False):
        """ Quickly fit the estimator on the training set and score it on
        the test set, without any preprocessing.

        :param estimator_name: prefix of a default estimator. Only used
            when no estimator is set; it then replaces
            `default_estimator` of the object.
        :param default_estimator: when no estimator is set, load the
            default estimator. If ``False`` in that case, a
            `ProcessingError` is raised.
        :returns: tuple of the test score and the test prediction. The
            confusion matrix and the classification report are stored in
            `confusion_matrix` and `classification_report`.

        :Example:

            >>> from watex.cases.processing import Processing
            >>> from watex.datasets import fetch_data
            >>> from watex.exlib.sklearn import DecisionTreeClassifier
            >>> data = fetch_data('bagoue original').get('data=dfy2')
            >>> processObj = Processing(
            ...     estimator=DecisionTreeClassifier(
            ...         max_depth=100, random_state=13),
            ...     drop_features=['lwi', 'name', 'num'])
            >>> processObj.fit(data=data)
            >>> score, prediction = processObj.quick_estimation()

        """
        self.inspect

        # `Xt`/`yt` always exist (set to None in __init__), so test their
        # value rather than their presence.
        if (getattr(self, "Xt", None) is None
                or getattr(self, "yt", None) is None):
            raise ProcessingError(
                "Missing of test data Xt and yt. Cannot estimate"
                " the prediction score. 'refit' the data by turning the"
                " parameter `split_X_y=True`.")

        if getattr(self, "estimator_", None) is None:
            if estimator_name is not None:
                self.default_estimator = estimator_name
            if not default_estimator:
                raise ProcessingError(
                    "Missing estimator. It should not be None.")
            des = copy.deepcopy(self.default_estimator)
            self.default_estimator = str(
                self.default_estimator).lower().strip()
            if self.default_estimator not in d_estimators_.keys():
                raise ValueError(f"Unknow default estimator :{des!r}")
            self.estimator_ = self._getdestimators()[self.default_estimator]

        self.estimator_.fit(self.X, self.y)
        self.model_score_ = self.estimator_.score(self.Xt, self.yt)
        self.model_prediction_ = self.estimator_.predict(self.Xt)

        self.confusion_matrix = confusion_matrix(
            self.yt, self.model_prediction_)
        self.classification_report = classification_report(
            self.yt, self.model_prediction_)

        return self.model_score_, self.model_prediction_
# Class docstring assigned after the class body so that the shared parameter
# descriptions of `_param_docs` can be interpolated with `str.format`.
Processing.__doc__ = """\
Processing class for managing baseline model evaluation and learning.

Manages the validation curves after fiddling a little bit with the
estimator hyperparameters. Processing is useful before the modeling step.
To process data, a default implementation is given to build the data
`preprocessor`. It consists of creating a model pipeline using different
transformers. If no preprocessor is set and `auto` is set to ``True``,
a default pipeline is created to run the base model evaluation.

Parameters
-------------
pipeline: Callable, F or dict of callable F
    Preprocessing steps encapsulated. The value is stored as given.

estimator: Callable,
    An object which manages the estimation and decoding of a model.
    Estimators must provide a fit method, and should provide `set_params`
    and `get_params`, although these are usually provided by inheritance
    from `base.BaseEstimator`.

{params.core.tname}

drop_features: list or str, Optional
    List the useless `features` for predicting or list of column names
    to drop out.

random_state: int, default is ``42``
    The state of data shuffling. The default is ``42``.

default_estimator: str, default is ``'svc'``
    The prefix of the default estimator used for predicting the `tname`
    value when no estimator is given. Predefined default parameters are
    set for quick preprocessing:

    - 'dtc': DecisionTreeClassifier
    - 'svc': Support Vector Classifier
    - 'sgd': SGDClassifier
    - 'knn': KNeighborsClassifier
    - 'rdf': RandomForestClassifier
    - 'ada': AdaBoostClassifier
    - 'vtc': VotingClassifier
    - 'bag': BaggingClassifier
    - 'stc': StackingClassifier

    The ensemble prefixes ('rdf', 'ada', 'vtc', 'bag', 'stc') are only
    registered when the ensemble estimators are available.

test_size: float,
    The test set data size. Must be less than 1. The default is ``0.2``,
    i.e. 20% of the dataset.

{params.core.verbose}

Attributes
-----------
{params.core.X}
{params.core.y}
{params.core.Xt}
{params.core.yt}
{params.core.data}
{params.base.pipe_}
{params.base.estimator_}
{params.core.model}

auto: bool, default is {{'False'}}
    Property, not a constructor argument. Setting it to ``True`` on a
    fitted object triggers the base model evaluation with the default
    pipeline.

cat_features_: list of str
    Categorical features list, found automatically by `fit`.

num_features_: list of str
    Numerical features list, found automatically by `fit`.

model: Callable, {{preprocessor + estimator}},
    A pipeline composed of a preprocessor and an estimator. If `model`
    is ``None``, the default model built from the default `preprocessor`
    and `estimator` is used.

model_score_: float/dict
    Model test score.

model_prediction_: array_like
    Model test prediction.

preprocessor_: Callable, F
    Preprocessor used for the default model scoring.

Examples
---------
>>> from watex.cases.processing import Processing
>>> from watex.datasets import fetch_data
>>> from watex.exlib.sklearn import (StandardScaler, RandomForestClassifier,
...     make_column_selector, PolynomialFeatures, SelectKBest, f_classif)
>>> data = fetch_data('bagoue original').get('data=dfy2')
>>> my_own_pipeline = {{'num_column_selector_':
...                         make_column_selector(dtype_include=np.number),
...                     'cat_column_selector_':
...                         make_column_selector(dtype_exclude=np.number),
...                     'features_engineering_':
...                         PolynomialFeatures(3, include_bias=True),
...                     'selectors_': SelectKBest(f_classif, k=4),
...                     'encodages_': StandardScaler()
...                     }}
>>> my_estimator = {{
...     'RandomForestClassifier': RandomForestClassifier(
...         n_estimators=200, random_state=0)
...     }}
>>> processObj = Processing(tname='flow',
...                         drop_features=['lwi', 'name', 'num'],
...                         pipeline=my_own_pipeline,
...                         estimator=my_estimator)
>>> processObj.fit(data=data)
>>> processObj.baseEvaluation(eval_metric=True)
... 0.4942528735632184

""".format(
    params=_param_docs,
)