Source code for watex.cases.processing

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>
#   Wed Jul 14 20:00:26 2021

from __future__ import (
    print_function ,
    division, 
    annotations
)
import copy
import warnings 
import inspect
import numpy as np 
import pandas as pd 

from .._docstring import  DocstringComponents, _core_docs
from .._watexlog import watexlog 
from ..decorators import visualize_valearn_curve
from ..exceptions import ( 
    FeatureError , 
    NotFittedError , 
    ProcessingError, 
    EstimatorError
  )
from ..exlib.sklearn  import ( 
    DecisionTreeClassifier, 
    KNeighborsClassifier, 
    OneHotEncoder, 
    SelectKBest,  
    SGDClassifier,
    SVC, 
    PolynomialFeatures, 
    RobustScaler, 
    ColumnTransformer,
    confusion_matrix , 
    classification_report, 
    f_classif, 
    train_test_split, 
    validation_curve ,
    SimpleImputer,
    Pipeline,
    _HAS_ENSEMBLE_,   
    ) 
from ..utils.coreutils import ( 
    _is_readable , 
    _assert_all_types
    )
from ..utils.funcutils import ( 
    format_notes, 
    repr_callable_obj, 
    smart_strobj_recognition, 
    smart_format
    )
from ..utils.mlutils import (
    formatModelScore, 
    findCatandNumFeatures,
    selectfeatures, 
    evalModel 
    )
from .._typing import ( 
    List, 
    Callable, 
    NDArray, 
    ArrayLike,
    F
    ) 
# Module-level logger registered under this module's name.
_logger = watexlog().get_watex_logger(__name__)

# Registry mapping a short prefix to the estimator class it stands for.
d_estimators_ = dict(
    dtc=DecisionTreeClassifier,
    svc=SVC,
    sgd=SGDClassifier,
    knn=KNeighborsClassifier,
)
if _HAS_ENSEMBLE_:
    from ..exlib.sklearn import skl_ensemble_

    # `zip` pairs each prefix with the ensemble class found at the same
    # position in `skl_ensemble_`.
    for es_, esf_ in zip(('rdf', 'ada', 'vtc', 'bag', 'stc'), skl_ensemble_):
        d_estimators_[es_] = esf_
        
# Docstring fragments shared by the classes of this module. Each value is a
# ready-to-insert parameter description; the mapping is wrapped into a
# `DocstringComponents` object below and reached from the class docstrings
# through the ``{params.base.<name>}`` placeholders.
_preproces_params =dict ( 
    pipe_ = """
pipe_:Callable, preprocessor object from :mod:`sklearn.pipeline`
    Pipeline can  be buit by your own pipeline with different transformer. 
    For base model prediction, it is possible to use the default pipeline.
    Call `get_default_pipe` to get the transformation list and steps. 
    """, 
    estimator_="""
estimator: Callable, F or :mod:`sklearn.metaestimator`
    Callable estimator method to fit the model:: 
        
        estimators= SGDClassifier(random_state=13)    
    """
    )
# Merge the package-wide parameter docs (``core``) with the module-specific
# fragments above (``base``) into one object used by ``str.format``.
_param_docs = DocstringComponents.from_nested_components(
    core=_core_docs["params"], 
    base=DocstringComponents(_preproces_params), 
    )

[docs]
class Preprocessing : 
    def __init__(self, 
                 tname:str ='flow', 
                 drop_features: List[str]=None,  #['lwi']
                 random_state: int =42 , 
                 default_estimator: F|str= 'svc', 
                 test_size: float =.2 ,                
                 verbose: int = 0 , 
                 ): 
        """ Store the parameters and reset every fitted attribute.

        The constructor arguments are kept as given; all the attributes 
        populated later (by `fit`, `makeModel`, `baseEvaluation` ...) 
        start as ``None``.
        """
        # logger named after the concrete class
        self._logging = watexlog().get_watex_logger(type(self).__name__)

        # -- constructor parameters, stored untouched
        self.tname = tname
        self.drop_features = drop_features
        self.random_state = random_state
        self.default_estimator = default_estimator
        self.test_size = test_size
        self.verbose = verbose

        # -- placeholders, all initialised to None
        for slot in (
                'X', 'y', 'Xt', 'yt',
                'features_', 'cat_features_', 'num_features_',
                'y_', 'X_',
                'estimator_', 'pipe_', 'ypred_',
                'model_results_', 'base_score_',
                'data_', 'model_',
                ):
            setattr(self, slot, None)
  
    @property 
    def data(self): 
        """ Return the dataset kept in ``data_`` (``None`` until it is set). """
        return self.data_ 
    
    @data.setter
    def data (self, d): 
        """ Store the dataset after passing it through ``_is_readable``. 

        Whatever ``_is_readable(d)`` returns is kept in ``data_``; `fit` 
        then uses that value as a :class:`pandas.DataFrame`.

        NOTE(review): ``_is_readable`` lives in ``utils.coreutils``; 
        presumably it also loads the data when `d` is a file path -- confirm.
        """

        self.data_ = _is_readable(d)
    

    @property 
    def features(self): 
        """ Collect the list of features"""
        return self.features_ 
    @features.setter 
    def features(self, feats): 
        """ Set the features once given"""
        
        if isinstance(feats , str):
            self.features_= [feats]
        else: self.features_ = list( self.data.columns )
            
        

[docs]
    def fit (self, 
             X:NDArray =None, 
             y:ArrayLike = None, 
             **fit_params
             ) -> 'Preprocessing': 
        """ 
        Read the dataset, encode the categorical features and populate 
        the fitted attributes.
        
        Either pass the whole dataset through the keyword `data` (the 
        target is then taken from the column `tname`), or pass the feature 
        set `X` and the target `y`. The input objects are never modified: 
        the method works on copies.
        
        Parameters 
        -----------
        X: DataFrame or N-d array, shape (M, N) 
            the feature set composed of M samples and N features. An array 
            is wrapped into a :class:`pandas.DataFrame`. 
        y: arraylike, shape (M,)
            the target. It is kept as given unless `tname` is the name of 
            a column of `X`; in that case the column is used as the target 
            and removed from the features. 
            
        data: DataFrame, shape (M, N) or anything the `data` setter reads 
            Dataset containing the M samples and the N features including 
            the target. If given, it takes precedence over `X`. 
        split_X_y: bool, default ``True``
            split the dataset into a training set {X, y} and a test set 
            {Xt, yt}. Otherwise the whole set is kept as the training set 
            in {X, y}.  
            
        Returns 
        --------
        ``self``: `Preprocessing` instance for easy method chaining.
        
        Raises
        -------
        FeatureError
            if no DataFrame/array can be built from the inputs, or if the 
            split is requested while no target is available.
        
        Examples
        ---------
        >>> from watex.cases.processing import Preprocessing 
        >>> from watex.datasets import fetch_data 
        >>> data = fetch_data('bagoue original').get('data=dfy2')
        >>> pc = Preprocessing (drop_features = ['lwi', 'num', 'name']
                                ).fit(data =data )
        >>> len(pc.X ),  len(pc.y), len(pc.Xt ),  len(pc.yt)
        ... (344, 344, 87, 87) # trainset (X,y) and testset (Xt, yt)

        """
        data = fit_params.pop('data', None)
        split_X_y = fit_params.pop('split_X_y', True)

        self.X_, self.y_ = X, y

        if data is not None:
            # `data` wins over `X`; the property setter reads it.
            self.data = data
            self.X_ = self.data.copy()
        elif isinstance(self.X_, pd.DataFrame):
            # the encoding and the in-place drop below must not alter the
            # frame owned by the caller.
            self.X_ = self.X_.copy()
        elif isinstance(self.X_, np.ndarray):
            # the next steps rely on the DataFrame API (drop, column
            # selection), so raw arrays are wrapped into a frame.
            self.X_ = pd.DataFrame(np.array(self.X_))

        if not isinstance(self.X_, pd.DataFrame):
            msg = f"Expect an nd-array not {type (self.X_).__name__!r}."
            raise FeatureError(
                (msg + " Use param 'data' in fit params to read the file")
                if isinstance(self.X_, str) else msg)

        if self.y_ is not None:
            self.y_ = _assert_all_types(self.y_, pd.Series, np.ndarray)

        if self.drop_features is not None:
            if isinstance(self.drop_features, str):
                self.drop_features = [self.drop_features]
            self.X_ = self.X_.drop(columns=self.drop_features)

        # find numerical and categorical features
        self.cat_features_, self.num_features_ = findCatandNumFeatures(
            self.X_)
        # encode the categorical values, if any, as integer codes
        if len(self.cat_features_) > 0:
            self.X_[self.cat_features_] = self.X_[self.cat_features_].apply(
                lambda c: c.astype('category').cat.codes)

        # Take the target out of the frame. A target given through `y` is
        # only replaced when `tname` really is one of the columns, so the
        # default `tname` no longer breaks `fit(X, y)`.
        take_target = self.tname is not None and (
            self.y_ is None or self.tname in self.X_.columns)
        if take_target:
            self.y_ = selectfeatures(self.X_, features=self.tname)
            self.X_.drop(columns=self.tname, inplace=True)
            # keep the feature lists in sync with the remaining columns
            if self.tname in self.cat_features_:
                self.cat_features_.remove(self.tname)
            elif self.tname in self.num_features_:
                self.num_features_.remove(self.tname)

        # for consistency the labels are always converted to integer codes;
        # skipped when there is no target so the explicit error below (and
        # not an AttributeError on None) is what the caller sees.
        if self.y_ is not None:
            if isinstance(self.y_, np.ndarray):
                # ndarray has no 'category' dtype: go through a Series
                self.y_ = pd.Series(np.ravel(self.y_))
            self.y_ = self.y_.astype('category').cat.codes

        if split_X_y:
            if self.y_ is None:
                warnings.warn("target name 'tname' is None. Cannot retrieve"
                              " the target 'y' from the dataset")
                raise FeatureError("'tname' is missing. specify the target name"
                                   " before splitting the datasets.")

            self.X, self.Xt, self.y, self.yt = train_test_split(
                self.X_, self.y_, test_size=self.test_size,
                random_state=self.random_state)
        else:
            # consider the whole X and y as the training set.
            self.X, self.y = copy.deepcopy(self.X_), copy.deepcopy(self.y_)

        return self 

    
    @property 
    def inspect(self): 
        """ Inspect data and trigger plot after checking the data entry. 
        Raises `NotFittedError` if ``self`` is not fitted yet."""
        if self.X is None: 
            raise NotFittedError(self.msg.format(
                expobj=self)
            )
        return 1
    
     

[docs]
    def makeModel( self, pipe: F=None, estimator:F=None, 
                     )-> Callable[..., F]:
        """
        Assemble the preprocessor and the estimator into a model. 
        
        The model is a :class:`sklearn.pipeline.Pipeline` made of two steps: 
        the preprocessor and the estimator. If one of them is not given, 
        the default one is used, which might not be the one expected. 
        Therefore providing both is suggested.
        
        Parameters 
        -----------
        pipe: Callable, pipeline or preprocessor 
            Preprocessor object, e.g. your own pipeline built with different 
            transformers. Refer to :class:`sklearn.pipeline.Pipeline` 
            for further details. If ``None``, `get_default_pipe` is called.
            
        estimator: Callable, F or {sklearn estimator}
            Estimator used to fit the model:: 
                
                estimators= SGDClassifier(random_state=13)
                
            If ``None`` and no estimator has been set before, the estimator 
            is built from the parameter `default_estimator`, which is 
            either an estimator object/class or one of the prefixes of the 
            default estimators ('sdg' and 'svm' are accepted as aliases 
            of 'sgd' and 'svc'). For instance to fetch the default 
            `DecisionTreeClassifier`:: 
                
                 >>> from watex.cases.processing import Preprocessing 
                 >>> Preprocessing()._getdestimators()['dtc']
                 ... DecisionTreeClassifier(max_depth=100, random_state=42)
        
        Returns 
        ---------
        `model_`: Callable, {preprocessor + estimator } 
        
        Raises
        -------
        ValueError
            if `default_estimator` is needed and is neither a known prefix 
            nor an object exposing a ``fit`` method.
        
        Examples 
        ----------
        (1) We can get the default preprocessor by merely calling: 

        >>> from watex.cases.processing import Preprocessing 
        >>> pc = Preprocessing (tname = 'flow', drop_features =['lwi', 'name', 'num'])
        >>> data = fetch_data ('bagoue original').get('data=dfy2')
        >>> pc.fit(data =data) 
        >>> pc.makeModel() # use default model and preprocessor 
        >>> pc.model_ 
                 
        (2)-> Or build your own preprocesor object using the example below: 

        >>> from sklearn.pipeline import Pipeline  
        >>> from sklearn.compose import ColumnTransformer
        >>> from sklearn.impute import SimpleImputer
        >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
        >>> from sklearn.linear_model import LogisticRegression
        >>> from watex.datasets import fetch_data 
        >>> from watex.cases.processing import Preprocessing 
        >>> pc = Preprocessing (tname = 'flow', drop_features =['lwi', 'name', 'num'])
        >>> numeric_features = ['east', 'north', 'power', 'magnitude', 'sfi', 'ohmS']
        >>> numeric_transformer = Pipeline(
            steps=[("imputer", SimpleImputer(strategy="median")), 
                   ("scaler", StandardScaler())]
            )
        >>> categorical_features = ['shape', 'geol', 'type']
        >>> categorical_transformer = OneHotEncoder(handle_unknown="ignore")
        >>> preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ])
        >>> pc.makeModel (pipe = preprocessor, 
                          estimator =  LogisticRegression()) 
        >>> pc.model_
        
        """
        # explicit None test: a preprocessor object must never be judged
        # on its truthiness (a Pipeline defines `__len__`).
        self.pipe_ = self.get_default_pipe() if pipe is None else pipe
        if estimator is not None:
            self.estimator_ = estimator

        # fall back to the default estimator
        if self.estimator_ is None:
            if self.verbose:
                self._logging.info('Loading default parameters into estimators.')
                print("### -> Use default estimator instead ...")

            des = self.default_estimator
            if isinstance(des, str):
                # documented spellings that differ from the registry keys
                aliases = {'sdg': 'sgd', 'svm': 'svc'}
                key = des.lower().strip()
                key = aliases.get(key, key)
                if key not in d_estimators_:
                    raise ValueError(
                        f"Unknown default estimator: {des!r}. Expect one"
                        f" of {sorted(d_estimators_)}")
                # keep the normalized prefix, as callers read it back
                self.default_estimator = key
                self.estimator_ = self._getdestimators()[key]
            elif hasattr(des, 'fit'):
                # an estimator class is instantiated with its own defaults,
                # an estimator object is used as it is
                self.estimator_ = des() if isinstance(des, type) else des
            else:
                raise ValueError(f"Unknown default estimator: {des!r}")

        self.model_ = Pipeline(
            steps=[('preprocessor', self.pipe_),
                   (f'{self.estimator_.__class__.__name__}', self.estimator_)
                   ]
            )
        return self.model_ 

    

[docs]
    def get_default_pipe(self ):
        """ Make the default preprocessor. 
        
        Create a preprocessor by assembling multiple transformers. The 
        default one is not exhaustive, so to have full control of the data 
        it is recommended to provide your own preprocessor. 
        
        The method stores the preprocessor in ``pipe_`` and returns it. It 
        is a :class:`sklearn.compose.ColumnTransformer` applying one 
        :class:`sklearn.pipeline.Pipeline` to the numerical features 
        (``num_features_``) and another one to the categorical features 
        (``cat_features_``); both lists are the ones collected by `fit`. 
                
        Default pipeline composition  
        -----------------------------
        * numerical features 
            - ``SimpleImputer()``: fills the missing values with the 
              default `strategy` of :class:`sklearn.impute.SimpleImputer` 
              (``mean``). 
            - ``PolynomialFeatures(10, include_bias=False)``: features 
              engineering. 
            - ``SelectKBest(f_classif, k=4)``: features selection with 
              the `f_classif` (ANOVA) test. 
            - ``RobustScaler()``: data scaling. 
            
        * categorical features 
            - ``SimpleImputer(strategy='most_frequent')``: a mean is 
              meaningless for categories (and fails on string values), 
              so the most frequent category is used instead. 
            - ``OneHotEncoder(handle_unknown="ignore")``: categories 
              encoding. 

        """
        # `fit` already separated the numerical and the categorical
        # columns, so no column selector is needed inside the pipelines.
        num_pipe = Pipeline(steps=[
            ('imputer', SimpleImputer()),
            ('polynomialfeatures', PolynomialFeatures(10, include_bias=False)),
            ('selectors', SelectKBest(f_classif, k=4)),
            ('scalers', RobustScaler()),
            ])
        cat_pipe = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehotencoder', OneHotEncoder(handle_unknown="ignore")),
            ])
        self.pipe_ = ColumnTransformer(transformers=[
            ('numpipe', num_pipe, self.num_features_),
            ('catpipe', cat_pipe, self.cat_features_),
            ])

        return self.pipe_  

        

[docs]
    def baseEvaluation(self, model:F=None, eval_metric=False, **kws
                 )->float: 
        """
        Evaluate a baseline model built from the preprocessing pipeline. 
        
        The model is fitted and scored by `evalModel` with the training 
        set (X, y) and the test set (Xt, yt). 
        
        Parameters 
        -----------
        model: Callable, {'preprocessor + estimator },
            A model is a scikit-learn estimator or a composite model built 
            from a Pipeline. If `model` is ``None``, the model already set 
            is used, or the default one from `makeModel`. `model` can be 
            a dict of multiple estimators. In that case the predictions and 
            the scores are dictionaries keyed by the **class name** of each 
            estimator (not by the key of the given dict), so two estimators 
            of the same class overwrite each other. 
     
        eval_metric: bool, 
            if set to ``True``, confusion matrix and classification report 
            scores are evaluated assuming the supervised learning is a 
            classification problem. *default* is ``False``. It is passed 
            to `evalModel` as `eval`. 
            
        scorer: str, Callable, 
            a scorer is a metric function for model evaluation. If given as 
            string it should be the prefix of the following metrics: 
                
                * "classification_report"     -> for classification_report,
                * 'precision_recall'          -> for precision_recall_curve,
                * "confusion_matrix"          -> for a confusion_matrix,
                * 'precision'                 -> for  precision_score,
                * "accuracy"                  -> for  accuracy_score
                * "mse"                       -> for mean_squared_error, 
                * "recall"                    -> for  recall_score, 
                * 'auc'                       -> for  roc_auc_score, 
                * 'roc'                       -> for  roc_curve 
                * 'f1'                        -> for f1_score,
                
            Other string prefix values should raise an error. 
            
        kws: dict, 
            Additional keyword arguments (including `scorer`) forwarded 
            to `evalModel`.
            
        Returns 
        ----------
        `self.base_score_` : base score after predicting; a dict of scores 
            when `model` is a dict. 
        
        Raises
        -------
        NotFittedError
            if the instance is not fitted yet (see `inspect`).
        
        Notes
        ------
        If no estimator is given, the *default* estimator is `svc`; 
        otherwise provide the prefix of the estimator to pick in the 
        default estimators through `default_estimator`. Get the default 
        estimators by calling `<instance>._getdestimators()`.
        
        ``model_results_`` keeps a tuple ``(score, prediction)`` for a 
        single model, whereas it keeps ``(prediction, score)`` per estimator 
        when a dict is given. NOTE(review): the two orders differ; kept as 
        is since code outside this method may read ``model_results_``.

        Examples 
        ---------
        >>> from watex.cases.processing import Preprocessing 
        >>> pc = Preprocessing (tname = 'flow', drop_features =['lwi', 'name', 'num'])
        >>> data = fetch_data ('bagoue original').get('data=dfy2')
        >>> pc.fit(data =data)
        
        (1) -> default estimator 
        >>> pc.baseEvaluation (eval_metric=True)
        ... 0.47126436781609193
        
        (2) -> multiples estimators 
        >>> from watex.exlib.sklearn import RandomForestClassifier , SGDClassifier, SimpleImputer 
        >>> estimators={'RandomForestClassifier':RandomForestClassifier
        		(n_estimators=200, random_state=0), 
        		'SDGC':SGDClassifier(random_state=0)}
        >>> pc.X= SimpleImputer().fit_transform(pc.X)
        >>> pc.Xt= SimpleImputer().fit_transform(pc.Xt) # remove NaN values 
        >>> pc.baseEvaluation(model=estimators, eval_metric =True)
        >>> pc.ypred_
        {'RandomForestClassifier': array([2, 1, 2, 2, 2, 2, 0, 1, 1, 2, 3, 
                ..., 2, 0, 0, 2, 0, 1], dtype=int8),
         'SGDClassifier': array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
                ..., 3, 3, 3, 3, 3, 3], dtype=int8)} 
        >>> pc.base_score_ 
        {'RandomForestClassifier': 0.7816091954022989,
         'SGDClassifier': 0.14942528735632185}
        
        """
        self.inspect 
        self.model_results_ = {}
        if model is not None:
            self.model_ = model
        elif self.model_ is None:
            self.model_ = self.makeModel()

        # ---> multiple estimators. Tested first: instances of dict
        # subclasses own a `__dict__` and would otherwise be mistaken for
        # a single model.
        if isinstance(self.model_, dict):
            if self.verbose:
                print(self.model_)
            for est in self.model_.values():
                ypred, score = evalModel(
                    model=est, X=self.X, y=self.y, Xt=self.Xt, yt=self.yt,
                    eval=eval_metric, **kws)
                self.model_results_[f'{est.__class__.__name__}'] = (
                    ypred, score)

            self.ypred_ = {
                k: yp for k, (yp, _) in self.model_results_.items()}
            self.base_score_ = {
                k: sc for k, (_, sc) in self.model_results_.items()}
            return self.base_score_

        # ---> single model
        if hasattr(self.model_, '__dict__'):
            self.ypred_, self.base_score_ = evalModel(
                model=self.model_, X=self.X, y=self.y, Xt=self.Xt,
                yt=self.yt, eval=eval_metric, **kws)

            self.model_results_[f'{self.model_.__class__.__name__}'] = (
                self.base_score_, self.ypred_)

        return self.base_score_  


    def _getdestimators (self): 
        """ Instantiate every default estimator with its default arguments. 

        Returns
        -------
        dict
            one entry per prefix registered in ``d_estimators_``; the value 
            is the estimator object built with the hyperparameters defined 
            below, or with ``random_state`` only when no specific defaults 
            are defined for the prefix.
        """
        seed = self.random_state

        # The keyword holding the base learner of the bagging estimator was
        # renamed from `base_estimator` to `estimator` in recent
        # scikit-learn versions: use the one the installed class accepts.
        base_kw = 'base_estimator'
        bag_cls = d_estimators_.get('bag')
        if bag_cls is not None:
            try:
                if 'estimator' in inspect.signature(bag_cls).parameters:
                    base_kw = 'estimator'
            except (TypeError, ValueError):
                # signature not introspectable: keep the historical keyword
                pass

        defaults = dict(
            knn=dict(n_neighbors=10, metric='manhattan'),
            svc=dict(C=100, gamma=1e-3, random_state=seed),
            dtc=dict(max_depth=100, random_state=seed),
            rdf=dict(n_estimators=200, random_state=seed),
            bag={base_kw: KNeighborsClassifier(), 'n_estimators': 100},
            # the key must be 'sgd' to match the prefix of the registry
            sgd=dict(random_state=seed),
            )

        # voting and stacking get the same (freshly built) base learners
        for key in ('vtc', 'stc'):
            defaults[key] = dict(estimators=[
                ('sdg', SGDClassifier(random_state=seed)),
                ('dtc', DecisionTreeClassifier(
                    max_depth=100, random_state=seed)),
                ('knn', KNeighborsClassifier()),
                ])

        ens = {}
        for key, func in d_estimators_.items():
            params = defaults.get(key)
            if params is None:
                # no specific defaults: only fix the random state
                params = dict(random_state=seed)
            ens[key] = func(**params)

        return ens 
     
    def __repr__(self):
        """ Build the representation with `repr_callable_obj`, leaving the 
        dataset and the train/test sets out of it. """
        hidden = ('data', 'y', 'X', 'Xt', 'yt')
        return repr_callable_obj(self, skip=hidden)
       
    def __getattr__(self, name):
        """ Called when the regular lookup of `name` fails: always raise 
        `AttributeError`, with a hint when `smart_strobj_recognition` 
        returns a candidate among the instance attributes. """
        candidate = smart_strobj_recognition(name, self.__dict__, deep=True)

        message = (
            f'{self.__class__.__name__!r} object has no attribute {name!r}')
        if candidate is not None:
            message += f'. Do you mean {candidate!r}?'

        raise AttributeError(message)


# The class docstring is assembled here so that the shared parameter
# descriptions of `_param_docs` can be interpolated with `str.format`;
# literal braces are therefore doubled.
Preprocessing.__doc__="""\
Base preprocessing class. 

Give a baseline preprocessing model with a base score. Useful before fiddling 
with the model hyperparameters. 

Parameters 
-------------
{params.core.tname}

drop_features: list or str, Optional
    List the useless `features` for predicting or list of column names to drop 
    out. 
random_state: int, default is ``42``
    The state of data shuffling. The default is ``42``.
    
default_estimator: str, callable, F or scikit-learn estimator 
    The default estimator name for predicting the tname value. Predefined 
    default estimator parameters are set for quick preprocessing. The 
    name is one of the following prefixes: 
        
    - 'dtc': DecisionTreeClassifier 
    - 'svc': Support Vector Classifier 
    - 'sgd': SGDClassifier
    - 'knn': KNeighborsClassifier
    - 'rdf': RandomForestClassifier 
    - 'ada': AdaBoostClassifier 
    - 'vtc': VotingClassifier
    - 'bag': BaggingClassifier 
    - 'stc': StackingClassifier
    
    The ensemble prefixes ('rdf', 'ada', 'vtc', 'bag', 'stc') are only 
    available when the ensemble estimators can be imported. If the 
    estimator is not given the default is ``'svc'``.
test_size: float,       
    The test set data size. Must be less than 1. The default test size is 
    ``0.2``, i.e. 20% of the dataset.      

{params.core.verbose} 

Attributes
-----------
{params.core.X}
{params.core.y}
{params.core.Xt}
{params.core.yt}
{params.core.data}
{params.base.pipe_}
{params.base.estimator_}
{params.core.model}

cat_features_: list of str
     list of the categorical features, found automatically by `fit`.           
num_features_ : list of str
     list of the numerical features, found automatically by `fit`. 
     
model: Callable, {{preprocessor + estimator }},
    A predefined pipeline, i.e. it can be a Pipeline you build on your own 
    with a different composite estimator.
    If `model` is ``None`` , use the default model from the default 
    `preprocessor` and `estimator`. 

Examples
--------- 
>>> from sklearn.ensemble import RandomForestClassifier 
>>> from sklearn.linear_model import SGDClassifier
>>> from sklearn.impute import SimpleImputer 
>>> from watex.datasets import fetch_data 
>>> from watex.cases.processing import Preprocessing 
>>> data = fetch_data('bagoue original').get('data=dfy2')
>>> pc = Preprocessing(drop_features=['lwi', 'name', 'num']).fit(data=data)
>>> estimators=dict(
...    RandomForestClassifier=RandomForestClassifier(
...        n_estimators=200, random_state=0), 
...    SGDClassifier=SGDClassifier(random_state=0))
>>> pc.X= SimpleImputer().fit_transform(pc.X)
>>> pc.Xt= SimpleImputer().fit_transform(pc.Xt) # remove NaN values 
>>> pc.baseEvaluation(model=estimators, eval_metric =True)
>>> pc.base_score_
{{'RandomForestClassifier': 0.7816091954022989,
 'SGDClassifier': 0.14942528735632185}}
""".format(
    params=_param_docs,
)  
    
   

[docs]
class Processing (Preprocessing) : 
    def __init__(self, pipeline: F = None, estimator: F = None, **kws):
        """Keep the user pipeline and estimator and reset the result slots.

        Parameters
        ----------
        pipeline: Callable, F or dict, optional
            Preprocessing steps, stored untouched in ``pipeline``.
        estimator: Callable, F or dict, optional
            Estimator(s), stored untouched in ``estimator_``.
        kws: dict
            Remaining keyword arguments, forwarded to the parent initializer.
        """
        super().__init__(**kws)

        self.pipeline = pipeline
        self.estimator_ = estimator

        # Result holders stay empty until an evaluation has been run.
        for slot in ('model_score_', 'model_prediction_',
                     'estimator_name_', 'processing_model_'):
            setattr(self, slot, None)
            
    
    @property 
    def auto (self): 
        """ Flag telling whether the default composite estimator `CE-SVC`
        build was requested.

        ``False`` is returned when the option has never been assigned, 
        instead of failing on a missing ``auto_`` attribute.
        """
        return getattr(self, 'auto_', False)
    
    @auto.setter 
    def auto (self, auto): 
        """ Trigger the `CE-SVC` building using default parameters with 
        default pipeline.

        Parameters
        ----------
        auto: bool
            When falsy, the value is recorded and nothing else happens.
            When truthy, a note and a warning are emitted, the base
            evaluation is run with ``eval_metric=True`` and the attributes
            ``model_score_``, ``preprocessor_`` and ``model_prediction_``
            are refreshed.
        """
        # Record the request first so the getter always mirrors the last
        # assigned value, ``False`` included.
        self.auto_ = auto
        if not auto:
            return

        format_notes(text= ''.join(
            [f'Automatic Option is set to ``{self.auto_}``.Composite',
            '  estimator building is auto-triggered with default ',
            'pipeline. The default estimation score should be displayed.',
            '  ']), 
            cover_str='*',inline = 70, margin_space = 0.05)
    
        self._logging.info(
            ' Automatic Option to design a default composite estimator'
            f' is triggered <`{self.auto_}``> with default pipeline.')
        warnings.warn(
            ' Automatic Option to design a composite estimator is '
            f' triggered <`auto={self.auto_}``> with default pipeline '
            'construction. The default estimation score should be '
            ' displayed.')

        # Property access kept for its side effect.
        # NOTE(review): presumably raises when the object is not fitted;
        # confirm against the parent class.
        self.inspect 

        self.model_score_ = self.baseEvaluation(eval_metric=True)
        # `preprocessor_` is the documented attribute, while the
        # `preprocessor` and `model` properties read the historical
        # spelling `preprocesor_`: keep both in sync.
        self.preprocessor_ = self.preprocesor_ = self.pipe_
        formatModelScore(self.model_score_, self.default_estimator)
        self.model_prediction_ = self.ypred_
        
    
    @property
    def processing_model(self):
        """Default composite model held by the instance.

        Exposes the ``processing_model_`` attribute as is.
        """
        composite = self.processing_model_
        return composite
    
    @property 
    def preprocessor (self): 
        """ Preprocessor used for the `composite_estimator` design.

        Returns
        -------
        The pipeline stored by the `preprocessor` setter when available,
        otherwise the one kept under ``preprocessor_`` (the attribute
        written by the `auto` setter), otherwise ``None``.
        """
        pipe = getattr(self, 'preprocesor_', None)
        if pipe is None:
            # The automatic build stores the default pipeline under the
            # correctly spelled attribute name.
            pipe = getattr(self, 'preprocessor_', None)
        return pipe
    
    def _validate_estimator (self, e):
        """ Assert whether estimator is valid referring to scikit-learn
        conventions.

        The object is first submitted to scikit-learn `check_estimator`. 
        If that check cannot be imported or fails, the object is still
        accepted as long as it exposes both ``__dict__`` and ``__class__``.

        Parameters
        ----------
        e: object
            Candidate estimator or transformer.

        Returns
        -------
        bool
            Always ``True`` when no error is raised.

        Raises
        ------
        ProcessingError
            If the scikit-learn check fails and `e` has no ``__dict__``.
        """
        msg = ( ":https://scikit-learn.org/stable/developers/develop.html &&"
            "https://scikit-learn.org/stable/modules/classes.html#module-sklearn.pipeline"
            )
        try : 
            from sklearn.utils.estimator_checks import check_estimator
            check_estimator(e )
        except Exception: 
            # `Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit are never swallowed here.
            if not  ( hasattr(e, '__dict__') and hasattr(
                       e, '__class__') ):
                warnings.warn("'estimator does not adhere to sckit-learn conventions."
                    f" Refer to {msg!r} for more guidelines.")
                raise ProcessingError(f"wrong estimator. Refer to {msg}"
                                      " for furher details.")
        return True 
    
    @preprocessor.setter 
    def preprocessor(self, pipe): 
        """ Create your preprocessor. If `preprocess` is given, it must be
        the collection of transformer and encoders which composed of
        the pipeline like:: 
            
            my_own_pipelines= {'num_column_selector_': make_column_selector(
                                        dtype_include=np.number),
            'cat_column_selector_': make_column_selector(
                                        dtype_exclude=np.number),
            'features_engineering_':PolynomialFeatures(3,
                                        include_bias=True),
            'selectors_': SelectKBest(f_classif, k=4), 
             'encodages_': StandardScaler()
                         }

        A single pipeline/transformer object is accepted as well. 

        Raises
        ------
        ProcessingError
            If the object (or one of the dict values) is rejected by 
            `_validate_estimator`.
        """
        if isinstance(pipe, dict):
            # A plain dict has no `__dict__` and would always be rejected
            # by the validator: check each step of the collection instead.
            for step in pipe.values():
                self._validate_estimator(step)
        else:
            self._validate_estimator(pipe) 
        self.preprocesor_= pipe  
    
    @property 
    def model (self):
        """ Composite model made of the preprocessor and the estimator.

        The model is built once with `makeModel` and cached in ``model_``.
        Missing ``model_`` or preprocessor attributes are read as ``None``
        rather than raising an ``AttributeError``.
        """
        if getattr(self, 'model_', None) is None: 
            pipe = getattr(self, 'preprocesor_', None)
            if pipe is None:
                # Fall back on the attribute written by the `auto` setter.
                pipe = getattr(self, 'preprocessor_', None)
            # NOTE(review): `pipe=None` presumably lets `makeModel` use its
            # default preprocessor; confirm against the parent class.
            self.model_ = self.makeModel(
                pipe= pipe, estimator=self.estimator_)
            
        return self.model_ 
    
    @property 
    def estimator (self): 
        """ Get your estimator or the existing default estimator """
        return self.estimator_ 
    
    @estimator.setter 
    def estimator (self, e): 
        """ Set the estimator and refresh its display name.

        Parameters
        ----------
        e: str, dict or estimator object
            - str: prefix of a default estimator (a key of the module
              registry ``d_estimators_``, matched case-insensitively); 
              the value is fetched from `_getdestimators`.
            - dict: several estimators; each value is validated and 
              ``estimator_name_`` becomes the list of class names.
            - any other object: validated with `_validate_estimator`.

        Raises
        ------
        EstimatorError
            If a string is given that is not a known default prefix.
        ProcessingError
            If an estimator object is rejected by `_validate_estimator`.
        """
        msg=("A string value assumes to be a default estimator prefix.")
        
        if isinstance (e, str): 
            key = e.lower().strip()
            if key not in d_estimators_: 
                raise EstimatorError( msg + 
                    f" Expect {e!r} being in {smart_format(d_estimators_.keys())}"
                    )
            e = self._getdestimators()[key]
            name = e.__class__.__name__
        elif isinstance(e, dict): 
            # Several estimators: every value must follow the scikit-learn
            # conventions.
            for es in e.values():
                self._validate_estimator(es)
            name = [f'{es.__class__.__name__}' for es in e.values()]
        else : 
            self._validate_estimator(e)
            name = e.__class__.__name__

        # Store the new estimator (the assignment was missing before) and
        # derive the name from it, not from the previous estimator.
        self.estimator_ = e
        self.estimator_name_ = name
        
    
    @property 
    def model_score(self): 
        """ Get the composite estimator score.

        Each access runs `baseEvaluation` on `model` with 
        ``eval_metric=True``, refreshes ``model_score_`` and 
        ``model_prediction_`` and tries to display the formatted score.
        A failure while formatting is only logged and warned about.
        """
        self.model_score_ = self.baseEvaluation(
            self.model , eval_metric=True )
        self.model_prediction_ = self.ypred_
        
        try : 
            formatModelScore(self.model_score_, self.estimator_name_)
        except Exception: 
            # Displaying the score is best-effort; never mask interrupts.
            self._logging.debug(
                f'{self.estimator_name_ !r} name not found')
            warnings.warn(
                f'Unable to find esimator {self.estimator_name_!r} name')
            
        return self.model_score_ 
    
    @model_score.setter 
    def model_score (self, print_score): 
        """ Set the name used when the score is displayed.

        A string is used as the estimator name as given. For any other 
        value the name is the class name of the current estimator, or the 
        value itself when no estimator is set.
        """
        if isinstance(print_score, str): 
            # An explicit name wins; it used to be overwritten right away.
            self.estimator_name_ = print_score 
            return

        current = getattr(self, 'estimator_', None)
        if current is not None:
            self.estimator_name_ = current.__class__.__name__
        else:
            self.estimator_name_ = print_score
        
    @property
    def model_prediction(self):
        """Prediction kept after the composite estimator was evaluated.

        Exposes the ``model_prediction_`` attribute as is.
        """
        predicted = self.model_prediction_
        return predicted
        

[docs]
    @visualize_valearn_curve(reason ='valcurve', turn='off', 
               k= np.arange(1,210,10), plot_style='line',savefig=None)               
    def get_validation_curve(
            self, 
            val_params:dict=None, 
            switch_plot:str= 'off', 
            preprocess_step:bool= False, 
            train_pkws: dict=None,
            val_pkws:dict =None, 
            **kws
            ):
        """ Compute the validation score and plot the validation curve if 
        the argument `turn` of decorator is switched to ``on``. 
        
        If the validation keyword arguments `val_params` do not contain a 
        `param_range` key, the default range ``np.arange(1, 210, 10)`` is 
        used.
            
        Parameters 
        -----------
        val_params: dict, optional
            `validation_curve` keywords arguments. If ``None`` and the 
            default estimator is ``svc``, the *default* is::
                
                val_params = {"param_name":'C', 
                             "param_range": np.arange(1,210,10), 
                             "cv":4, 
                             "scoring": 'accuracy'}

            The given dict is copied, never modified in place.
        switch_plot: str, default ='off' 
            Returned untouched; ``on`` or ``off`` for the validation plot.
        
        preprocess_step: bool, default=False 
            When no estimator is set, trigger the default composite 
            estimator building (``auto=True``) instead of raising. 
            
        train_pkws: dict, 
            keywords arguments passed to matplotlib.line/scatter plots for 
            training curve 
        val_pkws: dict , 
            keyword arguments passed to matplotlib.line/scatter plot for 
            validation curve. 
        kws: dict
            Accepted for compatibility; not used by this method.
            
        Returns 
        ---------
        tuple
            - `train_score`: array, or dict of arrays keyed like the 
              estimators dict when several estimators are set 
            - `val_score` : array or dict of validation scores 
            - `switch_plot`: ``on`` or ``off`` as given
            - `kk`: the validation `param_range` for plot
            - `pname`: the validation `param_name`
            - `val_pkws`, `train_pkws`: plot keyword arguments as given

        Raises
        ------
        ProcessingError
            If no estimator is available, or if `val_params` is ``None`` 
            while the default estimator is not ``svc``.
        
        Examples 
        -------------
            
        >>> from watex.cases.processing  import Processing 
        >>> from watex.datasets import fetch_data 
        >>> data = fetch_data ('bagoue original').get('data=dfy2')
        >>> processObj= Processing (tname = 'flow', 
        ...                         drop_features =['lwi', 'name', 'num'])  
        >>> processObj.fit(data=data )
        >>> processObj.get_validation_curve(
        ...     switch_plot='on', preprocess_step=True)
        """
        dvalp = {"param_range": np.arange(1,210,10), 
                   "param_name": "C", 
                   "cv":4, 
                   "scoring": 'accuracy'
        }
        
        self.inspect 
        
        # `estimator_` is always created by `__init__`, so test its value
        # rather than its mere existence.
        if getattr(self, "estimator_", None) is None: 
            if not preprocess_step : 
                warnings.warn("Expect one 'estimator' at least")
                self._logging.error("Expect one 'estimator' at least")
                raise ProcessingError( "'Estimator' not found. Expect one "
                                   "'estimator' at least or set `auto=True`")
            if self.verbose :
                print('---> Preprocessing step is enabled.')
                self._logging.info(
                    'By default, the`preprocessing_step` is activated.')
            self.auto =True 
            if getattr(self, "estimator_", None) is None:
                # The automatic build did not provide any estimator.
                raise ProcessingError( "'Estimator' not found. Expect one "
                                   "'estimator' at least or set `auto=True`")
                 
        if val_params is None:
            if str(self.default_estimator).lower().strip() !='svc': 
                raise ProcessingError( 
                    "None parameters are detected. Need validation parameters "
                    f" passed to kws 'val_params' for {self.estimator_}. Check the"
                    " list of available parameters with `estimator.get_params().keys()`."
                    " e.g for SVC , the ``val_params` arguments should be "
                    f" `val_params={dvalp}`"
                    )
            val_params = dict(dvalp)
            default_msg = (
                f'Use default `SVM` params configurations <{val_params}>.')
            self._logging.debug(default_msg)

            # Locate the undecorated function to report a meaningful 
            # file/line; a bound method is never a plain function.
            bound = self.get_validation_curve
            func = inspect.unwrap(getattr(bound, '__func__', bound))
            code = getattr(func, '__code__', None)
            if code is not None: 
                filename = code.co_filename
                lineno = code.co_firstlineno + 1
            else: 
                filename = getattr(bound, '__module__', __name__)
                lineno = 1

            warnings.warn_explicit(default_msg,
                                   category=DeprecationWarning, 
                                   filename =filename, lineno= lineno)
        else:
            # Work on a copy and apply the documented default range.
            val_params = dict(val_params)
            val_params.setdefault("param_range", dvalp["param_range"])

        if not isinstance(self.estimator_, dict) : 
            self.model_dict={'___':self.estimator_ }
        else : 
            self.model_dict = self.estimator_
        
        if len(self.model_dict) ==1:
            (single_estimator,) = self.model_dict.values()
            self.train_score, self.val_score = validation_curve(
                                    single_estimator,
                                    self.X, self.y,
                                    **val_params
                                    )
        else:
            # One entry per estimator; the containers must exist before 
            # being filled.
            self.train_score, self.val_score = {}, {}
            for mkey , mvalue in self.model_dict.items(): 
                trainScore, valScore = validation_curve(mvalue,
                                       self.X, self.y,
                                       **val_params)
                self.train_score [mkey] = trainScore
                self.val_score[mkey] = valScore 

        kk = val_params['param_range']

        pname = val_params["param_name"]
        
        return (self.train_score, self.val_score, switch_plot ,
                kk , pname,  val_pkws, train_pkws)     

    
        

[docs]
    def quick_estimation(self,  estimator_name =None,
                         default_estimator :bool =False ): 
        """ Quick run the model without any processing. 

        The estimator already set on the object is used when there is one.
        Otherwise a default estimator is picked from its prefix.
        
        :param estimator_name: str, prefix of a default estimator (a key 
            of the module registry ``d_estimators_``). Only considered 
            when no estimator is set; it then replaces the 
            ``default_estimator`` attribute.
                            
        :param default_estimator: bool, when ``True`` and no estimator is 
            set, use the estimator named by the ``default_estimator`` 
            attribute.

        :returns: tuple ``(model_score_, model_prediction_)``, the score 
            and the prediction on the test set. ``confusion_matrix`` and 
            ``classification_report`` attributes are set as well.

        :raises ProcessingError: if the test data are missing, or if no 
            estimator is set and none is requested.
        :raises ValueError: if the requested default estimator is unknown.
                                        
        :Example: 
        >>> from watex.cases.processing import Processing 
        >>> processObj = Processing(tname ='flow')
        >>> processObj.fit(data = 'data/geo_fdata/BagoueDataset2.xlsx')
        >>> processObj.quick_estimation(estimator_name='dtc')
        >>> processObj.model_score_
        >>> processObj.model_prediction_
        
        """
        self.inspect 
        
        # Test the values: the attributes may exist and still be None.
        if ( getattr (self, "Xt", None) is None 
                or getattr (self, 'yt', None) is None ): 
            raise ProcessingError(
                "Missing of test data Xt and yt. Cannot estimate" 
                " the prediction score. 'refit' the data by turning the"
                " parameter `split_X_y=True`.")
    
        # `estimator_` is always created by `__init__`; check its value.
        if getattr(self, "estimator_", None) is None: 
            if estimator_name is not None: 
                self.default_estimator= estimator_name 
            elif not default_estimator: 
                raise ProcessingError("Missing estimator. It should not be None.") 
            
            des = self.default_estimator
            key = str(des).lower().strip() 
            if key not in d_estimators_: 
                raise ValueError (f"Unknow default estimator :{des!r}")
                
            self.default_estimator = key
            self.estimator_ = self._getdestimators()[key] 
        
        self.estimator_.fit(self.X, self.y)
        
        self.model_score_ = self.estimator_.score(
            self.Xt, self.yt)
        self.model_prediction_ = self.estimator_.predict(
            self.Xt)
        
        self.confusion_matrix= confusion_matrix(self.yt,
                                   self.model_prediction_)
        self.classification_report= classification_report(self.yt,
                               self.model_prediction_)
        
        
        return self.model_score_ , self.model_prediction_


                
Processing.__doc__="""\
Processing class for managing baseline model evaluation and learning. 

Manages the validation curves after fiddling a little bit with the 
hyperparameters of an estimator. 

Processing is useful before the modeling step. To process data, a default 
implementation is given to build the data `preprocessor`. It consists of 
creating a model pipeline using different transformers. If no pipeline is 
set and auto is set to ``True``, a default pipeline is created through the 
`preprocessor` to run the base model evaluation. Indeed a `preprocessor` is 
a set of `transformers + estimators`.

Parameters 
-------------
auto: bool, default is ``False``
    Trigger the composite estimator. If ``True`` a composite `preprocessor` 
    is built and used for base model evaluation. *default* is ``False``.
pipeline: Callable, F or  dict of callable F            
   Preprocessing steps encapsulated. If not supplied, a default pipe is 
   used when auto is set to ``True``.   
     
estimator: Callable, 
    An object which manages the estimation and decoding of a model. Estimators 
    must provide a fit method, and should provide `set_params` and `get_params`, 
    although these are usually provided by inheritance from `base.BaseEstimator`.
    The core functionality of some estimators may also be available as a function.
       
{params.core.tname}

drop_features: list or str, Optional
    List the useless `features` for predicting or list of column names to drop 
    out. 
random_state: int, default is ``42``
    The state of data shuffling. The default is ``42``.
    
default_estimator: str, callable F or scikit-learn estimator 
    The default estimator name for predicting the tname value. Predefined 
    default estimator parameters are set and kept in cache for quick 
    preprocessing like: 
    - 'dtc': DecisionTreeClassifier 
    - 'svc': Support Vector Classifier 
    - 'sgd': SGDClassifier
    - 'knn': KNeighborsClassifier
    - 'rdf': RandomForestClassifier 
    - 'ada': AdaBoostClassifier 
    - 'vtc': VotingClassifier
    - 'bag': BaggingClassifier 
    - 'stc': StackingClassifier
    If estimator is not given the default is ``svc``.
test_size: float,       
    The test set data size. Must be less than 1. The default test size is 
    ``0.2`` i.e. 20% of the dataset.      

{params.core.verbose} 

Attributes
-----------
{params.core.X}
{params.core.y}
{params.core.Xt}
{params.core.yt}
{params.core.data}
{params.base.pipe_}
{params.base.estimator_}
{params.core.model}

cat_features_: list or str, Optional
     List of categorical features. If not given, it should be found 
     automatically.           
num_features_ : list of str, Optional
     List of numerical features. If not given, it should be found 
     automatically. 
     
model: Callable, {{preprocessor + estimator }},
    Use the predefined pipelines, i.e. it can be a Pipeline built on your 
    own with different composite estimators.
    If `model` is ``None`` , use the default model from the default 
    `preprocessor` and `estimator`. 
                                     
model_score_:  float/dict      
    Model test score. Observe your test model score using your composite 
    estimator for enhancement. 
model_prediction_: array_like      
    Observe your test model prediction as well as the composite estimator 
    enhancement.
preprocessor_: Callable , F       
    Composite pipelines and estimators used for the default model scoring.


Examples 
---------

>>> import numpy as np 
>>> from watex.datasets import fetch_data 
>>> from watex.cases.processing  import Processing
>>> from watex.exlib.sklearn import (StandardScaler,RandomForestClassifier, 
...                                  make_column_selector, PolynomialFeatures, 
...                                  SelectKBest, f_classif)  
>>> data = fetch_data ('bagoue original').get('data=dfy2')
>>> my_own_pipeline= {{'num_column_selector_': 
...                       make_column_selector(dtype_include=np.number),
...                'cat_column_selector_': 
...                    make_column_selector(dtype_exclude=np.number),
...                'features_engineering_':
...                    PolynomialFeatures(3,include_bias=True),
...                'selectors_': SelectKBest(f_classif, k=4), 
...               'encodages_': StandardScaler()
...                 }}
>>> my_estimator={{
...    'RandomForestClassifier':RandomForestClassifier(
...    n_estimators=200, random_state=0)
...    }}
>>> processObj= Processing (tname = 'flow', drop_features =['lwi', 'name', 'num'],
...                         pipeline= my_own_pipeline, estimator=my_estimator)  
>>> processObj.fit(data=data )
>>> processObj.baseEvaluation (eval_metric=True ) 
0.4942528735632184 # score is an ensemble score for both model 
  
""".format(
    params=_param_docs,
)