Source code for watex.view.mlplot

# -*- coding: utf-8 -*-
#   License: BSD-3-Clause
#   Author: LKouadio <etanoyau@gmail.com>

"""
:mod:`~watex.view.mlplot` is a set of plot templates for visualising and 
inspecting learning models. It gives users a quick way to visualize and 
evaluate models with :class:`~watex.view.EvalPlot`.
"""
from __future__ import annotations 
import re
import warnings
import inspect 
from abc import ABCMeta 
import copy 
import numpy as np 
import pandas as pd
import seaborn as sns 
from scipy.cluster.hierarchy import dendrogram 

import matplotlib as mpl 
import matplotlib.pyplot  as plt
import matplotlib.ticker as mticker
from matplotlib import cm 
from matplotlib.colors import BoundaryNorm

from .._watexlog import watexlog
from .._docstring import ( 
    _core_docs, 
    _baseplot_params, 
    DocstringComponents, 
    )
from ..analysis.dimensionality import nPCA
from ..decorators import  docSanitizer 
from ..exlib.sklearn import  ( 
    learning_curve , 
    silhouette_samples , 
    SimpleImputer, 
    StandardScaler, 
    MinMaxScaler, 
    train_test_split, 
    mean_squared_error, 
    KMeans
    ) 
from ..exceptions import ( 
    NotFittedError , 
    LearningError, 
    EstimatorError, 
    PlotError
    )
from ..metrics import ( 
    precision_recall_tradeoff, 
    ROC_curve, 
    confusion_matrix
    )
from ..property import BasePlot 
from .._typing import ( 
    Optional, 
    Tuple, 
    F,
    List,
    ArrayLike, 
    NDArray,
    DataFrame, 
    Series
    )
from ..utils.exmath import linkage_matrix 
from ..utils.hydroutils import check_flow_objectivity 
from ..utils.coreutils import _is_readable 
from ..utils.funcutils import ( 
    is_iterable,
    reshape, 
    to_numeric_dtypes, 
    smart_strobj_recognition, 
    repr_callable_obj , 
    str2columns, 
    make_ids
    )
from ..utils.mlutils import ( 
    exporttarget , 
    selectfeatures, 
    cattarget, 
    projection_validator, 
    )
from ..utils.plotutils import (
    _get_xticks_formatage, 
    # _format_ticks, 
    make_mpl_properties, 
    
    )
from ..utils.validator import ( 
    _check_consistency_size, 
    get_estimator_name , 
    array_to_frame, 
    check_array, 
    check_X_y, 
    check_y,
    )

# Module-level logger, named after this module.
_logger=watexlog.get_watex_logger(__name__)

#-----
# Add specific params to Evaldocs 

_eval_params = dict( 
    objective="""
objective: str, default=None, 
    The purpose of dataset; what probem do we intend to solve ?  
    Originally the package was designed for flow rate prediction. Thus,  
    if the `objective` is set to ``flow``, plot will behave like the flow 
    rate prediction purpose and in that case, some condition of target   
    values need to be fullfilled.  Furthermore, if the objective 
    is set to ``flow``, `label_values`` as well as the `litteral_classes`
    parameters need to be supplied to right encode the target according 
    to the hydraulic system requirement during the campaign for drinking 
    water supply. For any other purpose for the dataset, keep the objective  
    to ``None``. Default is ``None``.    
    """, 
    yp_ls="""
yp_ls: str, default='-', 
    Line style of `Predicted` label. Can be [ '-' | '.' | ':' ] 
    """, 
    yp_lw="""
yp_lw: str, default= 3
    Line weight of the `Predicted` plot
    """,
    yp_lc ="""
yp_lc: str or :func:`matplotlib.cm`, default= 'k'
    Line color of the `Prediction` plot. *default* is ``k``
    """, 
    yp_marker="""
yp_marker: str or :func:`matplotlib.markers`, default ='o'
    Style of marker in  of `Prediction` points. 
    """, 
    yp_markerfacecolor="""
yp_markerfacecolor: str or :func:`matplotlib.cm`, default='k'
    Facecolor of the `Predicted` label marker.
    """, 
    yp_markeredgecolor="""
yp_markeredgecolor: stror :func:`matplotlib.cm`,  default= 'r' 
    Edgecolor of the `Predicted` label marker.
    """, 
    yp_markeredgewidth="""
yp_markeredgewidth: int, default=2
    Width of the `Predicted`label marker.
    """, 
    rs="""
rs: str, default='--'
    Line style of `Recall` metric 
    """, 
    ps="""
ps: str, default='-'
    Line style of `Precision `metric
    """, 
    rc="""
rc: str, default=(.6,.6,.6)
    Recall metric colors 
    """, 
    pc="""
pc: str or :func:`matplotlib.cm`, default='k'
    Precision colors from Matplotlib colormaps. 
    """
    )

# Bundle the three documentation sources (core parameters, base-plot
# parameters and the evaluation-specific fragments) under one object whose
# nested components are exposed as ``core``, ``base`` and ``evdoc``.
_param_docs = DocstringComponents.from_nested_components(
    **{
        "core": _core_docs["params"],
        "base": DocstringComponents(_baseplot_params),
        "evdoc": DocstringComponents(_eval_params),
    }
)
#-------

[docs] class EvalPlot(BasePlot): def __init__(self, tname:str =None, encode_labels: bool=False, scale: str = None, cv: int =None, objective:str=None, prefix: str=None, label_values:List[int]=None, litteral_classes: List[str]=None, **kws ): self._logging= watexlog().get_watex_logger(self.__class__.__name__) self.tname=tname self.objective=objective self.scale=scale self.cv=cv self.prefix=prefix self.encode_labels=encode_labels self.litteral_classes=litteral_classes self.label_values=label_values # precision(p) and recall(r) # properties self.rs =kws.pop('rs', '--') self.ps =kws.pop('ps', '-') self.rc =kws.pop('rc', (.6, .6, .6)) self.pc =kws.pop('pc', 'k') # predicted properties self.yp_lc =kws.pop('yp_lc', 'k') self.yp_marker= kws.pop('yp_marker', 'o') self.yp_marker_edgecolor = kws.pop('yp_markeredgecolor', 'r') self.yp_lw = kws.pop('yp_lw', 3.) self.yp_ls=kws.pop('yp_ls', '-') self.yp_marker_facecolor =kws.pop('yp_markerfacecolor', 'k') self.yp_marker_edgewidth= kws.pop('yp_markeredgewidth', 2.) super().__init__(**kws) self.data_ =None self.X=None self.y= None self.target_=None @property def inspect(self): """ Inspect data and trigger plot after checking the data entry. Raises `NotFittedError` if `ExPlot` is not fitted yet.""" msg = ( "{expobj.__class__.__name__} instance is not fitted yet." " Call 'fit' with appropriate arguments before using" " this method" ) if self.X is None: raise NotFittedError(msg.format( expobj=self) ) return 1
[docs] def save (self, fig): """ savefigure if figure properties are given. """ if self.savefig is not None: fig.savefig (self.savefig,dpi = self.fig_dpi , bbox_inches = 'tight', orientation=self.fig_orientation ) plt.show() if self.savefig is None else plt.close ()
[docs] def fit(self, X: NDArray |DataFrame =None, y:ArrayLike =None, **fit_params ): """ Fit data and populate the attributes for plotting purposes. There is no conventional procedure for checking if a method is fitted. However, an class that is not fitted should raise :class:`watex.exceptions.NotFittedError` when a method is called. Parameters ------------ X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``) Training set; Denotes data that is observed at training and prediction time, used as independent variables in learning. When a matrix, each sample may be represented by a feature vector, or a vector of precomputed (dis)similarity with each training sample. :code:`X` may also not be a matrix, and may require a feature extractor or a pairwise metric to turn it into one before learning a model. y: array-like, shape (M, ) ``M=m-samples``, train target; Denotes data that may be observed at training time as the dependent variable in learning, but which is unavailable at prediction time, and is usually the target of prediction. data: Filepath or Dataframe or shape (M, N) from :class:`pandas.DataFrame`. Dataframe containing samples M and features N fit_params: dict Additional keywords arguments from :func:watex.utils.coreutils._is_readable` Return ------- ``self``: `EvalPlot` instance returns ``self`` for easy method chaining. 
""" data = fit_params.pop('data', None) columns = fit_params.pop ('columns', None) if data is not None: self.data_ = _is_readable(data) if self.data_ is not None: if self.tname is not None: self.target_, X = exporttarget( self.data_ , self.tname, inplace= True ) y = reshape (self.target_.values ) # for consistency if X is None: raise TypeError( "X array must not be None, or pass a filepath or " "dataframe object as keyword data argument to set 'X'.") # Create a pseudo frame" # if 'X' is not a dataframe X= array_to_frame(X, to_frame= True, input_name="X", force =True ) X = to_numeric_dtypes(X , columns = columns ) X = selectfeatures( X, include ='number') if len ( X.columns) ==0 : raise TypeError( " The module {self.__class__.__name__!r } expects dataframe " " 'X' with numerical features only. ") self.X = X self.y = np.array (y) return self
[docs] def transform (self, X, **t_params): """ Transform the data and imputs the numerical features. It is not convenient to use `transform` if user want to keep categorical values in the array Parameters ------------ X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``) Training set; Denotes data that is observed at training and prediction time, used as independent variables in learning. When a matrix, each sample may be represented by a feature vector, or a vector of precomputed (dis)similarity with each training sample. :code:`X` may also not be a matrix, and may require a feature extractor or a pairwise metric to turn it into one before learning a model. t_params: dict, Keyword arguments passed to :class:`sklearn.impute.SimpleImputer` for imputing the missing data; default strategy is 'most_frequent' or keywords arguments passed to :func:watex.utils.funcutils.to_numeric_dtypes` Return ------- X: NDArray |Dataframe , shape (M x N ) The transformed array or dataframe with numerical features """ self.X = X self.inspect strategy = t_params.pop('strategy', 'most_frequent') columns = list(self.X.columns ) imp = SimpleImputer(strategy = strategy, **t_params ) # create new dataframe X= imp.fit_transform(self.X ) if self.scale: if str(self.scale).find ('minmax') >=0 : sc = MinMaxScaler() else:sc =StandardScaler() X = sc.fit_transform(X) self.X = pd.DataFrame( X , columns = columns ) return self.X
[docs] def fit_transform (self, X, y= None , **fit_params ): """ Fit and transform at once. Parameters ------------ X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``) Training set; Denotes data that is observed at training and prediction time, used as independent variables in learning. When a matrix, each sample may be represented by a feature vector, or a vector of precomputed (dis)similarity with each training sample. :code:`X` may also not be a matrix, and may require a feature extractor or a pairwise metric to turn it into one before learning a model. Return ------- X: NDArray |Dataframe , shape (M x N ) The transformed array or dataframe with numerical features """ self.X = self.fit(X, y, **fit_params).transform(self.X ) return self.X
def _cat_codes_y (self, prefix:str =None ,values:List[int]=None, classes: List[str]=None, objective:str =None ): """ Encode y to hold the categorical values. Note that if objective is set to 'flow', the `values` need to be supplied, otherwise an error will raises. :param values: list of values to encoding the numerical target `y`. for instance ``values=[0, 1, 2]`` :param objective: str, relate to the flow rate prediction. Set to ``None`` for any other predictions. :param prefix: the prefix to add to the class labels. For instance, if the `prefix` equals to ``FR``, class labels will become:: [0, 1, 2] => [FR0, FR1, FR2] :classes: list of classes names to replace the default `FR` that is used to specify the flow rate. For instance, it can be:: [0, 1, 2] => [sf0, sf1, sf2] Returns -------- (self.y, classes): Array-like, list[int|str] Array of encoded labels and list of unique class label identifiers """ y =copy.deepcopy(self.y) if y is None : warnings.warn("Expect a target array. Missing y(target)" " is not allowed.") raise TypeError (" NoneType 'y' (target) can be categorized.") if objective =='flow': y, classes = check_flow_objectivity(y,values, classes) else : if self.target_ is not None: y = self.target_ else: y = pd.Series (y, name='none') values = values or self.label_values if values is not None: y = cattarget(y , labels = values, rename_labels= classes or self.litteral_classes ) else: y = y.astype('category').cat.codes # add prefix y = y.map(lambda o: prefix + str(o) ) if prefix else y classes = np.unique (y) return y , classes
[docs] def plotPCA( self, n_components:int =None, *, n_axes: int= None, #2, biplot:bool =False, pc1_label:str ='Axis 1', pc2_label:str='Axis 2', plot_dict:dict= None, **pca_kws )->'EvalPlot': """ Plot PCA component analysis using :class:`~.sklearn.decomposition`. PCA identifies the axis that accounts for the largest amount of variance in the train set `X`. It also finds a second axis orthogonal to the first one, that accounts for the largest amount of remaining variance. Parameters ----------- n_components: Number of dimension to preserve. If`n_components` is ranged between float 0. to 1., it indicates the number of variance ratio to preserve. If ``None`` as default value the number of variance to preserve is ``95%``. n_axes: Number of importance components to retrieve the variance ratio. Default is ``2``. The first two importance components with most variance ratio. biplot: bool, biplot plots PCA features importance (pc1 and pc2) and visualize the level of variance and direction of components for different variables. Refer to `Serafeim Loukas`_ pc1_label:str, default ='Axis 1' the first component with most variance held in 'Axis 1'. Can be modified to any other axis for instance 'Axis 3' to replace the component in 'Axis 1' to the one in `Axis 3` and so one. This will allow to visualize the position of each level of variance for each variable. pc2_label:str, default ='Axis 2', the second component with most variance held in 'Axis 2'. Can be modified to any other axis for instance 'Axis 6' to replace the component in 'Axis 2' to the one in `Axis 6` and so one. plot_dict: dict, dictionnary of font and properties for markers for each sample corresponding to the `label_values`. pca_kws: dict, additional keyword arguments passed to :class:`watex.analysis.dimensionality.nPCA` Return ------- ``self``: `EvalPlot` instance ``self`` for easy method chaining. 
Notes ------- By default, `nPCA` methods plots the first two principal components named `pc1_label` for axis 1 and `pc2_label` for axis 2. If you want to plot the first component `pc1` vs the third components`pc2` set the `pc2_label` to `Axis 3` and set the `n_components` to 3 that is the max reduced columns to retrieve, otherwise an users warning will be displayed. Commonly Algorithm should automatically detect the digit ``3`` in the litteral `pc1_labels` including Axis (e.g. 'Axis 3`) and will consider as the third component `pc3 `. The same process is available for other axis. Examples --------- >>> from watex.datasets import load_bagoue >>> from watex.view.mlplot import EvalPlot >>> X , y = load_bagoue(as_frame =True ) >>> b=EvalPlot(tname ='flow', encode_labels=True , scale = True ) >>> b.fit_transform (X, y) >>> b.plotPCA (n_components= 2 ) ... >>> # pc1 and pc2 labels > n_components -> raises user warnings >>> b.plotPCA (n_components= 2 , biplot=False, pc1_label='Axis 3', pc2_label='axis 4') ... UserWarning: Number of components and axes might be consistent; '2'and '4 are given; default two components are used. >>> b.plotPCA (n_components= 8 , biplot=False, pc1_label='Axis3', pc2_label='axis4') # works fine since n_components are greater to the number of axes ... EvalPlot(tname= None, objective= None, scale= True, ... 
, sns_height= 4.0, sns_aspect= 0.7, verbose= 0) """ self.inspect classes , y = self.litteral_classes, self.y classes = classes or np.unique (y) if plot_dict is None: D_COLORS = make_mpl_properties(1e3) plot_dict ={'y_colors': D_COLORS, 's':100.} if self.encode_labels: y, classes = self._cat_codes_y( self.prefix, self.label_values, self.litteral_classes, self.objective ) # go for PCA analysis pca= nPCA(self.X, n_components, n_axes =n_axes, return_X= False, **pca_kws) feature_importances_ = pca.feature_importances_ X_reduced = pca.X # for consistency # Get axis for plots from pca_labels n_axes = n_axes or pca.n_axes try: lbls =[int(re.findall("\d+", str_axes)[0]) for str_axes in [pc1_label, pc2_label]] except : # remove if dot '.'exists by replacing by lbls =[s.replace('.','s') for s in [pc1_label, pc2_label]] lbls=[int ( ''.join(filter(str.isdigit, js) ) ) for js in lbls] else: pca1_ix, pca2_ix = [i-1 for i in lbls] if pca1_ix <0 or pca2_ix<0: pca1_ix =0 pca2_ix = pca1_ix+1 if (pca1_ix >= n_axes) or (pca2_ix >= n_axes) : warnings.warn( "Number of components and axes might be" f" consistent; '{n_axes!r}'and '{max(lbls)!r}" " are given; default two components are used." 
) pca1_ix =0 pca2_ix = pca1_ix+1 pc1_label , pc2_label = 'Axis 1', 'Axis 2' # if pca1_ix or pca2 X_= np.c_[X_reduced[:, pca1_ix], X_reduced[:, pca2_ix]] # prepared defaults colors and defaults markers y_palettes = plot_dict ['y_colors'] if classes is not None: if len(y_palettes) > len(classes): # reduce the last colors y_palettes =y_palettes[:len(classes)] if len(y_palettes) < len(classes): # add black colors by default y_palettes += ['k' for k in range( len(classes) - len(y_palettes))] # --Plot Biplot if biplot: mpl.rcParams.update(mpl.rcParamsDefault) # reset ggplot style # Call the biplot function for only the first 2 PCs cmp_= np.concatenate((pca.components_[pca1_ix, :], pca.components_[pca2_ix, :])) try: biPlot(self, self.X, np.transpose(cmp_), y, classes=classes, colors=y_palettes ) except : # plot defaults configurations biPlot(self, X_reduced[:,:2], np.transpose(pca.components_[0:2, :]), y, classes=classes, colors=y_palettes ) plt.show() else : plt.show() return # concatenate reduced dataframe + y_target try: df_pca =pd.concat([ pd.DataFrame(X_,columns =[pc1_label, pc2_label]), pd.Series(y, name=self.tname)], axis =1) except TypeError: # force plot using the defauts first two componnets if # something goes wrong df_pca =pd.concat([ pd.DataFrame(X_reduced[:,:2], columns =[pc1_label, pc2_label]), pd.Series(y, name=self.tname)], axis =1) pca1_ix , pca2_ix =0,1 # Extract the name of the first components # and second components pca_axis_1 = feature_importances_[pca1_ix][1][0] pca_axis_2 = feature_importances_[pca2_ix][1][0] # Extract the name of the values of the first # component and second components in percentage. 
pca_axis_1_ratio = np.around( abs(feature_importances_[pca1_ix][2][0]),2) *1e2 pca_axis_2_ratio = np.around( abs(feature_importances_[pca2_ix][2][0]),2) *1e2 # create figure obj fig = plt.figure(figsize = self.fig_size) ax = fig.add_subplot(1,1,1) y_label = df_pca.iloc [:, -1].name # get the name of label for target , color in zip(classes, y_palettes): ix = df_pca[y_label] ==target ax.scatter(df_pca.loc[ix, pc1_label], df_pca.loc[ix, pc2_label], c= color, s= plot_dict['s']) # get the max values, set the center plot and set the # of the circle bounds. max_lim = np.ceil(abs(max([X_reduced[:, pca1_ix].max(), X_reduced[:, pca2_ix].max()]))) cercle = plt.Circle((0,0), max_lim, color='blue', fill=False) ax.add_artist(cercle) ax.set_ylim([-max_lim, max_lim]) ax.set_xlim([-max_lim, max_lim]) linev =plt.Line2D ((0, 0), (-max_lim, max_lim), color = self.lc, linewidth = self.lw, linestyle = self.ls, marker = self.marker, markeredgecolor = self.marker_edgecolor, markeredgewidth = self.marker_edgewidth, markerfacecolor = self.marker_facecolor , markersize = self.ms * self.fs ) lineh =plt.Line2D ((-max_lim, max_lim), (0, 0), color = self.lc, linewidth = self.lw, linestyle = self.ls , marker = self.marker, markeredgecolor = self.marker_edgecolor, markeredgewidth = self.marker_edgewidth, markerfacecolor = self.marker_facecolor, markersize = self.ms * self.fs ) #Create string label from pca_axis_1 x_axis_str = pc1_label +':'+ str(pca_axis_1) +' ({}%)'.format( pca_axis_1_ratio ) y_axis_str = pc2_label +':' + str(pca_axis_2) +' ({}%)'.format( pca_axis_2_ratio ) ax.set_xlabel( x_axis_str, color='k', fontsize = self.font_size * self.fs ) ax.set_ylabel(y_axis_str, color='k', fontsize = self.font_size * self.fs ) ax.set_title('PCA', fontsize = (self.font_size +1) * self.fs) ax.add_artist(linev) ax.add_artist(lineh) ax.legend(classes) ax.grid(color=self.lc, linestyle=self.ls, linewidth=self.lw/10 ) self.save(fig) return self
[docs] @docSanitizer() def plotPR( self, clf:F, label:int|str, kind:Optional[str]=None, method:Optional[str]=None, cvp_kws =None, **prt_kws )->'EvalPlot': """ Precision/recall (PR) and tradeoff plots. PR computes a score based on the decision function and plot the result as a score vs threshold. Parameters ----------- clf :callable, always as a function, classifier estimator A supervised predictor with a finite set of discrete possible output values. A classifier must supports modeling some of binary, targets. It must store a classes attribute after fitting. label: int, Specific class to evaluate the tradeoff of precision and recall. `label` needs to be specified and a value within the target. kind: str, ['threshold|'recall'], default='threshold' kind of PR plot. If kind is 'recall', method plots the precision VS the recall scores, otherwiwe the PR tradeoff is plotted against the 'threshold.' method: str Method to get scores from each instance in the trainset. Could be ``decison_funcion`` or ``predict_proba``. When using the scikit-Learn classifier, it generally has one of the method. Default is ``decision_function``. cvp_kws: dict, optional The :func:`sklearn.model_selection.cross_val_predict` keywords additional arguments prt_kws:dict, Additional keyword arguments passed to func:`watex.exlib.sklearn.precision_recall_tradeoff` Return ------- ``self``: `EvalPlot` instance ``self`` for easy method chaining. 
Examples --------- >>> from watex.exlib.sklearn import SGDClassifier >>> from watex.datasets.dload import load_bagoue >>> from watex.utils import cattarget >>> from watex.view.mlplot import EvalPlot >>> X , y = load_bagoue(as_frame =True ) >>> sgd_clf = SGDClassifier(random_state= 42) # our estimator >>> b= EvalPlot(scale = True , encode_labels=True) >>> b.fit_transform(X, y) >>> # binarize the label b.y >>> ybin = cattarget(b.y, labels= 2 ) # can also use labels =[0, 1] >>> b.y = ybin >>> # plot the Precision-recall tradeoff >>> b.plotPR(sgd_clf , label =1) # class=1 ... EvalPlot(tname= None, objective= None, scale= True, ... , sns_height= 4.0, sns_aspect= 0.7, verbose= 0) """ msg = ("Precision recall metric works for classification " "task; labels must be encoded refering to a particular" " class; set 'encode_labels' param to 'True' and " "provided a list of class integer unique identifier." ) kind = kind or 'threshold' kind=str(kind).lower().strip() if kind.lower().find('thres')>=0: kind = 'threshold' elif kind.lower().find('rec')>=0: kind = 'recall' if kind not in ('threshold', 'recall'): raise ValueError ("Invalid kind={0!r}. Expect {1!r} or {2!r}". format(kind, *('threshold', 'recall')) ) self.inspect # call precision if self.y is None: warnings.warn("Precision-recall deals with supervising learning" " methods which expects a target to be categorized." " Missing target is not allowed.") raise TypeError("Missing target 'y' is not allowed. Can not used" " the 'precision-recall' metric.") if not self.encode_labels : warnings.warn( msg + " Refer to <https://en.wikipedia.org/wiki/Machine_learning>" " for deep understanding." 
) raise LearningError (msg) prtObj = precision_recall_tradeoff( clf, self.X, self.y, cv =self.cv, label=label, method =method, cvp_kws=cvp_kws, **prt_kws) # create figure obj fig = plt.figure(figsize = self.fig_size) ax = fig.add_subplot(1,1,1) # for consistency set xlabel and ylabel xlabel = None ylabel = None if kind=='threshold': ax.plot(prtObj.thresholds, prtObj.precisions[:-1], color = self.pc, linewidth = self.lw, linestyle = self.ps, label = 'Precision', **self.plt_kws ) ax.plot(prtObj.thresholds, prtObj.recalls[:-1], color = self.rc, linewidth = self.lw, linestyle = self.rs , label = 'Recall', **self.plt_kws) xlabel = self.xlabel or 'Threshold' ylabel =self.ylabel or 'Score' elif kind =='recall': ax.plot(prtObj.recalls[:-1], prtObj.precisions[:-1], color = self.lc, linewidth = self.lw, linestyle = self.ls , label = 'Precision vs Recall', **self.plt_kws ) xlabel = self.xlabel or 'Recall' ylabel =self.ylabel or 'Precision' self.xlim =[0,1] ax.set_xlabel( xlabel, fontsize= .5 * self.font_size * self.fs ) ax.set_ylabel (ylabel, fontsize= .5 * self.font_size * self.fs) ax.tick_params(axis='both', labelsize=.5 * self.font_size * self.fs) if self.show_grid is True : if self.gwhich =='minor': ax.minorticks_on() ax.grid(self.show_grid, axis=self.gaxis, which = self.gwhich, color = self.gc, linestyle=self.gls, linewidth=self.glw, alpha = self.galpha ) if len(self.leg_kws) ==0 or 'loc' not in self.leg_kws.keys(): self.leg_kws['loc']='upper left' ax.legend(**self.leg_kws) if self.ylim is None: self.ylim = [0, 1] ax.set_ylim (self.ylim) if kind =='recall': ax.set_xlim (self.xlim) self.save(fig) return self
[docs] def plotROC( self, clfs, label: int |str, method: Optional[str]=None, cvp_kws:dict=None, **roc_kws )-> 'EvalPlot': """ Plot receiving operating characteric (ROC) classifiers. Can plot multiple classifiers at once. If multiple classifiers are given, each classifier must be a tuple of ``( <name>, classifier>, <method>)``. For instance, to plot the both :class:`sklearn.ensemble.RandomForestClassifier` and :class:`sklearn.linear_model.SGDClassifier` classifiers, they must be ranged as follow:: clfs =[ ('sgd', SGDClassifier(), "decision_function" ), ('forest', RandomForestClassifier(), "predict_proba") ] It is important to know whether the method 'predict_proba' is valid for the scikit-learn classifier, we want to plot its ROC curve. Parameters ----------- clfs :callables, always as a function, classifier estimators A supervised predictor with a finite set of discrete possible output values. A classifier must supports modeling some of binary, targets. It must store a classes attribute after fitting. label: int, Specific class to evaluate the tradeoff of precision and recall. `label` needs to be specified and a value within the target. kind: str, ['threshold|'recall'], default='threshold' kind of PR plot. If kind is 'recall', method plots the precision VS the recall scores, otherwiwe the PR tradeoff is plotted against the 'threshold.' method: str Method to get scores from each instance in the trainset. Could be ``decison_funcion`` or ``predict_proba``. When using the scikit-Learn classifier, it generally has one of the method. Default is ``decision_function``. cvp_kws: dict, optional The :func:`sklearn.model_selection.cross_val_predict` keywords additional arguments prt_kws:dict, Additional keyword arguments passed to func:`watex.exlib.sklearn.precision_recall_tradeoff` roc_kws: dict roc_curve additional keywords arguments. Return ------- ``self``: `EvalPlot` instance ``self`` for easy method chaining. 
Examples -------- (1) Plot ROC for single classifier >>> from watex.exlib.sklearn import ( SGDClassifier, RandomForestClassifier ) >>> from watex.datasets.dload import load_bagoue >>> from watex.utils import cattarget >>> from watex.view.mlplot import EvalPlot >>> X , y = load_bagoue(as_frame =True ) >>> sgd_clf = SGDClassifier(random_state= 42) # our estimator >>> b= EvalPlot(scale = True , encode_labels=True) >>> b.fit_transform(X, y) >>> # binarize the label b.y >>> ybin = cattarget(b.y, labels= 2 ) # can also use labels =[0, 1] >>> b.y = ybin >>> # plot the ROC >>> b.plotROC(sgd_clf , label =1) # class=1 ... EvalPlot(tname= None, objective= None, scale= True, ... , sns_height= 4.0, sns_aspect= 0.7, verbose= 0) (2)-> Plot ROC for multiple classifiers >>> b= EvalPlot(scale = True , encode_labels=True, lw =3., lc=(.9, 0, .8), font_size=7 ) >>> sgd_clf = SGDClassifier(random_state= 42) >>> forest_clf =RandomForestClassifier(random_state=42) >>> b.fit_transform(X, y) >>> # binarize the label b.y >>> ybin = cattarget(b.y, labels= 2 ) # can also use labels =[0, 1] >>> b.y = ybin >>> clfs =[('sgd', sgd_clf, "decision_function" ), ('forest', forest_clf, "predict_proba")] >>> b.plotROC (clfs =clfs , label =1 ) ... EvalPlot(tname= None, objective= None, scale= True, ... 
, sns_height= 4.0, sns_aspect= 0.7, verbose= 0) """ # if method not given as tuple if not isinstance(clfs, (list, tuple)): try : clfs =[(clfs.__name__, clfs, method)] except AttributeError: # type `clf` is ABCMeta clfs =[(clfs.__class__.__name__, clfs, method)] # loop and set the tuple of (clfname , clfvalue, clfmethod) # anc convert to list to support item assignments clfs = [list(pnclf) for pnclf in clfs] for i, (clfn, _clf, _) in enumerate(clfs) : if clfn is None or clfn =='': try: clfn = _clf.__name__ except AttributeError: # when type `clf` is ABCMeta clfn= _clf.__class__.__name__ clfs[i][0] = clfn # reconvert to tuple values clfs =[tuple(pnclf) for pnclf in clfs] # build multiples classifiers objects rocObjs =[ROC_curve( clf=_clf,X=self.X,y=self.y, cv =self.cv, label=label, method =meth, cvp_kws=cvp_kws,**roc_kws) for (name, _clf, meth) in clfs ] # create figure obj fig = plt.figure(figsize = self.fig_size) ax = fig.add_subplot(1,1,1) D_COLORS = make_mpl_properties(len(clfs)) D_STYLES= make_mpl_properties(len(clfs), prop= 'line') D_COLORS[0] = self.lc D_STYLES[0]= self.ls for ii, (name, _clf, _) in enumerate( clfs): ax.plot(rocObjs[ii].fpr, rocObjs[ii].tpr, label =name + ' (AUC={:.4f})'.format( rocObjs[ii].roc_auc_score), color =D_COLORS[ii], linestyle = D_STYLES[ii] , linewidth = self.lw ) xlabel = self.xlabel or 'False Positive Rate' ylabel = self.ylabel or 'True Positive Rate' self.xlim =[0,1] self.ylim =[0,1] ax.plot(self.xlim, self.ylim, ls= '--', color ='k') ax.set_xlim (self.xlim) ax.set_ylim (self.ylim) ax.set_xlabel( xlabel, fontsize= .5 * self.font_size * self.fs ) ax.set_ylabel (ylabel, fontsize= .5 * self.font_size * self.fs) ax.tick_params(axis='both', labelsize=.5 * self.font_size * self.fs) if self.show_grid is True : if self.gwhich =='minor': ax.minorticks_on() ax.grid(self.show_grid, axis=self.gaxis, which = self.gwhich, color = self.gc, linestyle=self.gls, linewidth=self.glw, alpha = self.galpha ) if len(self.leg_kws) ==0 or 'loc' not in 
self.leg_kws.keys(): self.leg_kws['loc']='lower right' ax.legend(**self.leg_kws) self.save(fig) return self
[docs] @docSanitizer() def plotConfusionMatrix( self, clf:F, *, kind:str =None, labels:List[int]=None, matshow_kws: dict=None, **conf_mx_kws )-> 'EvalPlot': """ Plot confusion matrix for error evaluation. A representation of the confusion matrix for error visualization. If kind is set ``map``, plot will give the number of confused instances/items. However when `kind` is set to ``error``, the number of items confused is explained as a percentage. Parameters ----------- clf :callable, always as a function, classifier estimator A supervised predictor with a finite set of discrete possible output values. A classifier must supports modeling some of binary, targets. It must store a classes attribute after fitting. labels: int, or list of int, optional Specific class to evaluate the tradeoff of precision and recall. `label` needs to be specified and a value within the target. plottype: str can be `map` or `error` to visualize the matshow of prediction and errors respectively. matshow_kws: dict matplotlib additional keywords arguments. conf_mx_kws: dict Additional confusion matrix keywords arguments. ylabel: list list of labels names to hold the name of each categories. Return ------- ``self``: `EvalPlot` instance ``self`` for easy method chaining. 
Examples -------- >>> from watex.datasets import fetch_data >>> from watex.utils.mlutils import cattarget >>> from watex.exlib.sklearn import SVC >>> from watex.view.mlplot import EvalPlot >>> X, y = fetch_data ('bagoue', return_X_y=True, as_frame =True) >>> # partition the target into 4 clusters-> just for demo >>> b= EvalPlot(scale =True, label_values = 4 ) >>> b.fit_transform (X, y) >>> # prepare our estimator >>> svc_clf = SVC(C=100, gamma=1e-2, kernel='rbf', random_state =42) >>> matshow_kwargs ={ 'aspect': 'auto', # 'auto'equal 'interpolation': None, 'cmap':'jet } >>> plot_kws ={'lw':3, 'lc':(.9, 0, .8), 'font_size':15., 'cb_format':None, 'xlabel': 'Predicted classes', 'ylabel': 'Actual classes', 'font_weight':None, 'tp_labelbottom':False, 'tp_labeltop':True, 'tp_bottom': False } >>> b.plotConfusionMatrix(clf=svc_clf, matshow_kws = matshow_kwargs, **plot_kws) >>> svc_clf = SVC(C=100, gamma=1e-2, kernel='rbf', ... random_state =42) >>> # replace the integer identifier with litteral string >>> b.litteral_classes = ['FR0', 'FR1', 'FR2', 'FR3'] >>> b.plotConfusionMatrix(svc_clf, matshow_kws=matshow_kwargs, kind='error', **plot_kws) """ self.inspect kind = str (kind).lower().strip() if kind.find ('error')>=0 or kind.find('fill diagonal')>=0 : kind ='error' else: kind ='map' matshow_kws= matshow_kws or dict() # gives a gray color to matshow # if is given as matshow keywords arguments # then remove it _check_cmap = 'cmap' in matshow_kws.keys() if not _check_cmap or len(matshow_kws)==0: matshow_kws['cmap']= plt.cm.gray labels = labels or self.label_values y = self.y if labels is not None: # labels = labels_validator(self.y, labels) y, labels =self._cat_codes_y(values = labels, ) # for plotting purpose, change the labels to hold # the string litteral class names. 
labels = self.litteral_classes or labels # get yticks one it is a classification prof confObj =confusion_matrix(clf=clf, X=self.X, y=y, cv=self.cv, # **conf_mx_kws ) # create figure obj fig = plt.figure(figsize = self.fig_size) ax = fig.add_subplot(1,1,1) if kind =='map' : cax = ax.matshow(confObj.conf_mx, **matshow_kws) if self.cb_label is None: self.cb_label='Items confused' if kind in ('error', 'fill diagonal'): cax = ax.matshow(confObj.norm_conf_mx, **matshow_kws) self.cb_label = self.cb_label or 'Error' cbax= fig.colorbar(cax, **self.cb_props) ax.set_xlabel( self.xlabel, fontsize= self.font_size ) if labels is not None: xticks_loc = list(ax.get_xticks()) yticks_loc = list(ax.get_yticks()) ax.xaxis.set_major_locator(mticker.FixedLocator(xticks_loc)) ax.xaxis.set_major_formatter(mticker.FixedFormatter( [''] + list (labels))) ax.yaxis.set_major_locator(mticker.FixedLocator(yticks_loc)) ax.yaxis.set_major_formatter(mticker.FixedFormatter( [''] + list (labels))) self.ylabel = self.ylabel or 'Actual classes' self.xlabel = self.xlabel or 'Predicted classes' ax.set_ylabel (self.ylabel, fontsize= self.font_size *3 ) ax.set_xlabel (self.xlabel, fontsize= self.font_size *3 ) ax.tick_params(axis=self.tp_axis, labelsize= self.font_size *3 , bottom=self.tp_bottom, top=self.tp_top, labelbottom=self.tp_labelbottom, labeltop=self.tp_labeltop ) if self.tp_labeltop: ax.xaxis.set_label_position('top') cbax.ax.tick_params(labelsize=self.font_size * 3 ) cbax.set_label(label=self.cb_label, size=self.font_size * 3 , weight=self.font_weight) plt.xticks(rotation = self.rotate_xlabel) plt.yticks(rotation = self.rotate_ylabel) self.save(fig) return self
def __repr__(self):
    """Return the programmer-oriented representation of the object.

    Formatting is delegated to ``repr_callable_obj``; the ``y`` and ``X``
    entries are passed in its ``skip`` argument.
    """
    return repr_callable_obj(self, skip=('y', 'X'))


def __getattr__(self, name):
    """Raise an informative error for an attribute that is not found.

    Python only calls this hook after the regular lookup failed. Asking
    for the fitted attributes ``data_`` or ``X_`` before they exist raises
    ``NotFittedError``; any other name raises ``AttributeError``, with a
    "Do you mean ...?" hint when ``smart_strobj_recognition`` proposes a
    close match among the instance attributes.
    """
    attrs = self.__dict__
    klass = self.__class__.__name__

    # guard: fitted attributes requested before fitting
    if name.endswith('_') and name not in attrs and name in ('data_', 'X_'):
        raise NotFittedError(
            f'Fit the {klass!r} object first'
        )

    rv = smart_strobj_recognition(name, attrs, deep=True)
    if rv is None:
        appender, closing = "", ""
    else:
        appender, closing = f'. Do you mean {rv!r}', "?"

    raise AttributeError(
        f'{klass!r} object has no attribute {name!r}'
        f'{appender}{closing}'
    )
# The class documentation is attached after the class definition so that the
# shared parameter descriptions stored in ``_param_docs`` can be interpolated.
# Literal braces inside the text are doubled because of ``str.format``.
EvalPlot.__doc__ ="""\
Metrics, dimensionality and model evaluation plots.  

Inherited from :class:`BasePlot`. Dimensional reduction and metric plots. 
The class works only with numerical features. 

.. admonition:: Discouraged

    Continuous target values for plotting classification metrics are 
    discouraged. However, we encourage users to prepare their dataset 
    before using the :class:`EvalPlot` methods. This is recommended to have 
    full control of the expected results. Indeed, most of the metric plots 
    implemented here work with supervised methods and especially deal 
    with classification problems. So, the convenient way is for users to 
    discretize/categorize the target (class labels) before the `fit`. 
    If that is not the case, as in the demonstration examples under each 
    method implementation, we first need to categorize the continuous 
    labels. The choice is twofold: either providing individual class 
    labels as a list of integers using the method 
    :meth:`EvalPlot._cat_codes_y` or specifying the number of clusters 
    that the target must hold. Commonly the latter choice is useful for 
    a test or academic purpose. In practice, with a real dataset, it is 
    discouraged to use this kind of target partition since it is far away 
    from reality and will yield unexpected misinterpretation. 

Parameters 
-----------
{params.core.X}
{params.core.y}
{params.core.tname}
{params.evdoc.objective}

encode_labels: bool, default=False,  
    label encoding works with the `label_values` parameter. 
    If `y` holds continuous numerical values, we could turn the 
    regression into a classification by setting `encode_labels` to 
    ``True``. If the value is set to ``True`` and the values of the labels 
    are not given, a unique identifier is created which may not fit the 
    exact needs of the users. So it is recommended to set this parameter 
    in combination with `label_values`. For instance:: 

        encode_labels=True ; label_values =3 

    indicates that the target `y` values should be categorized to hold 
    the integer identifiers equal to ``[0 , 1, 2]``. `y` is split into 
    three subsets where:: 

        classes (c) = [ c{{0}} <= y. min(), y.min() < c {{1}}< y.max(), >=y.max {{2}}]

    This auto-splitting may not fit the exact classification of the 
    target so it is recommended to set `label_values` as a list of 
    class labels. For instance `label_values=[0 , 1, 2]` and so on. 

scale: str, ['StandardScaler'|'MinMaxScaler'], default ='StandardScaler'
    kind of feature scaling to apply on numerical features. Note that when 
    using PCA, it is recommended to turn `scale` to ``True`` and use 
    `fit_transform` rather than only the `fit` method. Note that the 
    `transform` method also handles the missing nan values in the data 
    where the default strategy for filling is ``most_frequent``.

{params.core.cv}

prefix: str, optional 
    literal string to prefix the integer identifier labels. 

label_values: list of int, optional 
    works with the `encode_labels` parameter. It indicates the different 
    class labels. Refer to the explanation of `encode_labels`. 

litteral_classes: list or str, optional 
    Works when objective is ``flow``. Replaces the integer class names by 
    their literal strings. For instance:: 

        label_values =[0, 1, 3, 6]
        litteral_classes = ['rate0', 'rate1', 'rate2', 'rate3']

{params.evdoc.yp_ls}
{params.evdoc.yp_lw}
{params.evdoc.yp_lc}
{params.evdoc.rs}
{params.evdoc.ps}
{params.evdoc.rc}
{params.evdoc.pc}
{params.evdoc.yp_marker}
{params.evdoc.yp_markerfacecolor}
{params.evdoc.yp_markeredgecolor}
{params.evdoc.yp_markeredgewidth}
{params.base.savefig}
{params.base.fig_dpi}
{params.base.fig_num}
{params.base.fig_size}
{params.base.fig_orientation}
{params.base.fig_title}
{params.base.fs}
{params.base.ls}
{params.base.lc}
{params.base.lw}
{params.base.alpha}
{params.base.font_weight}
{params.base.font_style}
{params.base.font_size}
{params.base.ms}
{params.base.marker}
{params.base.marker_facecolor}
{params.base.marker_edgecolor}
{params.base.marker_edgewidth}
{params.base.xminorticks}
{params.base.yminorticks}
{params.base.bins}
{params.base.xlim}
{params.base.ylim}
{params.base.xlabel}
{params.base.ylabel}
{params.base.rotate_xlabel}
{params.base.rotate_ylabel}
{params.base.leg_kws}
{params.base.plt_kws}
{params.base.glc}
{params.base.glw}
{params.base.galpha}
{params.base.gaxis}
{params.base.gwhich}
{params.base.tp_axis}
{params.base.tp_labelsize}
{params.base.tp_bottom}
{params.base.tp_labelbottom}
{params.base.tp_labeltop}
{params.base.cb_orientation}
{params.base.cb_aspect}
{params.base.cb_shrink}
{params.base.cb_pad}
{params.base.cb_anchor}
{params.base.cb_panchor}
{params.base.cb_label}
{params.base.cb_spacing}
{params.base.cb_drawedges}

Notes 
--------
This module works with numerical data, i.e. the data must contain 
numerical features only. If categorical values are included in the 
dataset, they should be removed and the size of the data should be 
chunked during the fit methods. 

""".format(
    params=_param_docs,
)
# xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# create a shadow class to hold the font and matplotlib properties
# from 'EvalPlot` and giving an option for saving figure
_b= EvalPlot ()
# the shadow class copies the instance attributes of a default EvalPlot
# as class attributes, so the module-level functions below can read and
# overwrite plot properties through ``pobj``.
pobj = type ('Plot', (BasePlot, ), {**_b.__dict__} )
# ``save`` is the method bound to the ``_b`` instance above.
setattr(pobj, 'save', _b.save )
# redefine the pobj doc
pobj.__doc__="""\
Shadow plotting class that holds the :class:`~watex.property.BasePlot`
parameters. 

Each matplotlib property can be modified as an attribute of the 
:class:`~watex.view.pobj` object. For instance:: 

    >>> pobj.ls ='-.' # change the line style 
    >>> pobj.fig_size = (7, 5) # change the figure size 
    >>> pobj.lw=7. # change the linewidth 

.. seealso:: 

    Refer to :class:`~watex.property.BasePlot` for parameter details. 

"""
# xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
def plotProjection(
    X: DataFrame | NDArray, 
    Xt: DataFrame | NDArray =None, *, 
    columns: List[str] =None, 
    test_kws: dict =None,  
    **baseplot_kws 
    ): 
    """ Visualize train and test dataset based on the geographical coordinates.
    
    Since there is geographical information (latitude/longitude or
    easting/northing), it is a good idea to create a scatterplot of 
    all instances to visualize data.
    
    Parameters 
    ---------
    X:  Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
        training set; Denotes data that is observed at training and prediction 
        time, used as independent variables in learning. The notation 
        is uppercase to denote that it is ordinarily a matrix. When a matrix, 
        each sample may be represented by a feature vector, or a vector of 
        precomputed (dis)similarity with each training sample. 

    Xt: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``), optional
        Shorthand for "test set"; data that is observed at testing and 
        prediction time, used as independent variables in learning. The 
        notation is uppercase to denote that it is ordinarily a matrix.
        When ``None`` (default), only the training locations are drawn.

    columns: list of str or index, optional 
        columns is useful when a dataframe is given with a dimension size 
        greater than 2. If such data is passed to `X` or `Xt`, columns must
        hold the names to consider as 'easting', 'northing' when UTM 
        coordinates are given or 'latitude' , 'longitude' when latlon are 
        given. It is forwarded to ``projection_validator``, which selects 
        the 'x' and 'y' coordinates.
        
    test_kws: dict, optional
        keyword arguments passed to :func:`matplotlib.pyplot.scatter` 
        for the test locations. When omitted, red markers with black faces 
        are used, at half the marker size of the training locations.
        
    baseplot_kws: dict, 
        All the keyword arguments passed to the property  
        :class:`watex.property.BasePlot` class; each one is set as an 
        attribute of the shared ``pobj`` object. The two extra keys 
        ``trainlabel`` and ``testlabel`` are consumed here and used as 
        legend labels.
        
    Examples
    --------
    >>> from watex.datasets import fetch_data 
    >>> from watex.view.mlplot import plotProjection 
    >>> # Discard all the non-numeric data 
    >>> # then input numerical data 
    >>> from watex.utils import to_numeric_dtypes, naive_imputer
    >>> X, Xt, *_ = fetch_data ('bagoue', split_X_y =True, as_frame =True) 
    >>> X =to_numeric_dtypes(X, pop_cat_features=True )
    >>> X= naive_imputer(X)
    >>> Xt = to_numeric_dtypes(Xt, pop_cat_features=True )
    >>> Xt= naive_imputer(Xt)
    >>> plot_kws = dict (fig_size=(8, 12),
                     lc='k',
                     marker='o',
                     lw =3.,
                     font_size=15.,
                     xlabel= 'easting (m) ',
                     ylabel='northing (m)' , 
                     markerfacecolor ='k', 
                     markeredgecolor='r',
                     alpha =1., 
                     markeredgewidth=2., 
                     show_grid =True,
                     galpha =0.2, 
                     glw=.5, 
                     rotate_xlabel =90.,
                     fs =3.,
                     s =None )
    >>> plotProjection( X, Xt , columns= ['east', 'north'], 
                        trainlabel='train location', 
                        testlabel='test location', **plot_kws
                       )
    """
    # legend labels are not BasePlot properties: take them out first
    trainlabel = baseplot_kws.pop('trainlabel', None)
    testlabel = baseplot_kws.pop('testlabel', None)

    # update the shared plot properties
    for key, value in baseplot_kws.items():
        setattr(pobj, key, value)

    # check arrays; the test set is optional so it is only validated
    # when it is supplied
    X = check_array(
        X,
        input_name="X",
        to_frame=True,
        )
    if Xt is not None:
        Xt = check_array(
            Xt,
            input_name="Xt",
            to_frame=True,
            )
    # validate the projections.
    xy, xynames = projection_validator(X, Xt, columns)
    x, y, xt, yt = xy
    # only the training names are used as default axis labels; the test
    # names must not overwrite them
    xname, yname, _, _ = xynames

    # NOTE(review): ceil(min)/floor(max) narrows the training extent whereas
    # the test extent below uses floor(min)/ceil(max) -- confirm intended.
    pobj.xlim = [np.ceil(min(x)), np.floor(max(x))]
    pobj.ylim = [np.ceil(min(y)), np.floor(max(y))]

    xpad = abs((x - x.mean()).min()) / 5.
    ypad = abs((y - y.mean()).min()) / 5.

    if Xt is not None:
        # widen the limits so that the test locations are visible too
        min_x, max_x = xt.min(), xt.max()
        min_y, max_y = yt.min(), yt.max()

        pobj.xlim = [min([pobj.xlim[0], np.floor(min_x)]),
                     max([pobj.xlim[1], np.ceil(max_x)])]
        pobj.ylim = [min([pobj.ylim[0], np.floor(min_y)]),
                     max([pobj.ylim[1], np.ceil(max_y)])]

    pobj.xlim = [pobj.xlim[0] - xpad, pobj.xlim[1] + xpad]
    pobj.ylim = [pobj.ylim[0] - ypad, pobj.ylim[1] + ypad]

    # create figure obj
    fig = plt.figure(figsize=pobj.fig_size)
    ax = fig.add_subplot(1, 1, 1)

    # user-defined labels take precedence over the detected column names
    xname = pobj.xlabel or xname
    yname = pobj.ylabel or yname

    if pobj.s is None:
        pobj.s = pobj.fs * 40

    ax.scatter(x, y,
               color=pobj.lc,
               s=pobj.s if not pobj.s else pobj.fs * pobj.s,
               alpha=pobj.alpha,
               marker=pobj.marker,
               edgecolors=pobj.marker_edgecolor,
               linewidths=pobj.lw,
               linestyles=pobj.ls,
               facecolors=pobj.marker_facecolor,
               label=trainlabel
               )

    if Xt is not None:
        # test markers are drawn at half of the training marker size
        if pobj.s is not None:
            pobj.s /= 2
        test_kws = test_kws or dict(
            color='r', s=pobj.s, alpha=pobj.alpha,
            marker=pobj.marker, edgecolors='r',
            linewidths=pobj.lw, linestyles=pobj.ls,
            facecolors='k'
            )
        ax.scatter(xt, yt,
                   label=testlabel,
                   **test_kws
                   )

    ax.set_xlim(pobj.xlim)
    ax.set_ylim(pobj.ylim)
    ax.set_xlabel(xname, fontsize=pobj.font_size)
    ax.set_ylabel(yname, fontsize=pobj.font_size)
    ax.tick_params(axis='both', labelsize=pobj.font_size)
    plt.xticks(rotation=pobj.rotate_xlabel)
    plt.yticks(rotation=pobj.rotate_ylabel)

    if pobj.show_grid is True:
        ax.grid(pobj.show_grid,
                axis=pobj.gaxis,
                which=pobj.gwhich,
                color=pobj.gc,
                linestyle=pobj.gls,
                linewidth=pobj.glw,
                alpha=pobj.galpha
                )
        if pobj.gwhich == 'minor':
            ax.minorticks_on()

    # default legend location when the user did not choose one
    if len(pobj.leg_kws) == 0 or 'loc' not in pobj.leg_kws.keys():
        pobj.leg_kws['loc'] = 'upper left'
    ax.legend(**pobj.leg_kws)

    pobj.save(fig)
def plotModel(
    yt: ArrayLike |Series, 
    ypred:ArrayLike |Series=None,
    *, 
    clf:F=None, 
    Xt:DataFrame|NDArray=None, 
    predict:bool =False, 
    prefix:Optional[str]=None, 
    index:List[int|str] =None, 
    fill_between:bool=False, 
    labels:List[str]=None, 
    return_ypred:bool=False, 
    **baseplot_kws 
    ): 
    """ Plot model 'y' (true labels) versus 'ypred' (predicted) from test 
    data.
    
    The plot shows where the estimator/classifier fails to predict 
    the target correctly.
    
    Parameters
    ----------
    yt:array-like, shape (M, ) ``M=m-samples``,
        test target; Denotes data that may be observed at training time 
        as the dependent variable in learning, but which is unavailable 
        at prediction time, and is usually the target of prediction. 
        
    ypred:array-like, shape (M, ) ``M=m-samples``
        Array of the predicted labels. It has the same number of samples as 
        the test data 'Xt' 
        
    clf :callable, always as a function, classifier estimator
        A supervised predictor with a finite set of discrete possible 
        output values. A classifier must support modeling some of binary 
        targets. It must store a classes attribute after fitting.
        
    Xt: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``)
        Shorthand for "test set"; data that is observed at testing and 
        prediction time, used as independent variables in learning. The 
        notation is uppercase to denote that it is ordinarily a matrix.
        
    prefix: str, optional 
        literal string to prefix the samples/examples considered as 
        tick labels in the abscissa. For instance:: 
            
            index =[0, 2, 4, 7]
            prefix ='b' --> index =['b0', 'b2', 'b4', 'b7']

    predict: bool, default=False, 
        Expected to be 'True' when the user wants to predict the array 
        'ypred' and plot at the same time. Otherwise, can be set to 'False' 
        and use the 'ypred' data already predicted. Note that, if 'True', an  
        estimator/classifier must be provided as well as the test data 'Xt', 
        otherwise an error will occur. 
        
    index: array_like, optional
        list of integer values or strings expected to be the index of 'Xt' 
        and 'yt' turned into pandas dataframe and series respectively. Note 
        that if one of them already has an index and a new index is given, 
        the latter must be consistent. This is useful when data are 
        provided as ndarray rather than a dataframe. 
        
    fill_between: bool 
        Draw a line through the actual classes i.e the true labels. 
        
    labels: list of str or int, Optional
       list of label names to hold the name of each category. They are 
       applied only when their number equals the number of unique values 
       in 'yt'; otherwise a warning is issued.
       
    return_ypred: bool, default=False 
        return the predicted 'ypred' if 'True' else nothing. 
    
    baseplot_kws: dict, 
        All the keyword arguments passed to the property  
        :class:`watex.property.BasePlot` class; each one is set as an 
        attribute of the shared ``pobj`` object.
 
    Returns 
    -------
    ypred: array-like or None 
        the predicted labels when `return_ypred` is ``True``.
        
    Examples
    --------
    (1)-> Prepare our data - Use analysis data of Bagoue dataset 
            since data is already scaled and imputed
            
    >>> from watex.exlib.sklearn  import SVC 
    >>> from watex.datasets import fetch_data 
    >>> from watex.view import plotModel 
    >>> from watex.utils.mlutils import split_train_test_by_id
    >>> X, y = fetch_data('bagoue analysis' ) 
    >>> _, Xtest = split_train_test_by_id(X, 
                                          test_ratio=.3 ,  # 30% in test set 
                                          keep_colindex= False
                                        )
    >>> _, ytest = split_train_test_by_id(y, .3 , keep_colindex =False) 
    
    (2)-> prepare our demo estimator and plot the predicted model 
    
    >>> svc_clf = SVC(C=100, gamma=1e-2, kernel='rbf', random_state =42) 
    >>> base_plot_params ={
                        'lw' :3.,                  # line width 
                        'lc':(.9, 0, .8), 
                        'ms':7.,                
                        'yp_marker' :'o', 
                        'fig_size':(12, 8),
                        'font_size':15.,
                        'xlabel': 'Test examples',
                        'ylabel':'Flow categories' ,
                        'marker':'o', 
                        'markeredgecolor':'k', 
                        'markerfacecolor':'b', 
                        'markeredgewidth':3, 
                        'yp_markerfacecolor' :'k', 
                        'yp_markeredgecolor':'r', 
                        'alpha' :1., 
                        'yp_markeredgewidth':2.,
                        'show_grid' :True,          
                        'galpha' :0.2,              
                        'glw':.5,                   
                        'rotate_xlabel' :90.,
                        'fs' :3.,                   
                        's' :20 ,                  
                   }
    >>> plotModel(yt= ytest ,
                   Xt=Xtest , 
                   predict =True , # predict the result (estimator fit)
                   clf=svc_clf ,  
                   fill_between= False, 
                   prefix ='b', 
                   labels=['FR0', 'FR1', 'FR2', 'FR3'], # replace 'y' labels. 
                   **base_plot_params 
                   )
    >>> # plot show where the model failed to predict the target 'yt'
    
    """
    def format_ticks(ind, tick_number):
        """ Tick formatter used with 'FuncFormatter(func)': label only the 
        tick positions that are multiples of 7 with the matching entry of 
        'index'; every other tick gets an empty label. """
        # matplotlib may hand over float or out-of-range tick positions:
        # never index 'index' with them.
        if ind % 7 != 0:
            return ''
        position = int(ind)
        if 0 <= position < len(index):
            return '{}'.format(index[position])
        return ''

    #xxxxxxxxxxxxxxxx update base plot keyword arguments xxxxxxxxxxxxxx
    for key, value in baseplot_kws.items():
        setattr(pobj, key, value)
    # index is used for displaying the examples label in x-abscissa  
    # for instance index = ['b4, 'b5', 'b11',  ... ,'b425', 'b427', 'b430']
    Xt, yt, index, clf, ypred = _chk_predict_args(
        Xt, yt, index, clf, ypred, predict=predict
        )
    if prefix is not None:
        index = np.array([f'{prefix}' + str(item) for item in index])

    # create figure obj
    fig = plt.figure(figsize=pobj.fig_size)
    ax = fig.add_subplot(1, 1, 1)
    # control the size of predicted items
    pobj.s = pobj.s or pobj.fs * 30
    # plot observed data (test label = actual); drawn larger so that a
    # correct prediction appears inside the observed marker
    ax.scatter(x=index,
               y=yt,
               color=pobj.lc,
               s=pobj.s * 10,
               alpha=pobj.alpha,
               marker=pobj.marker,
               edgecolors=pobj.marker_edgecolor,
               linewidths=pobj.lw,
               linestyles=pobj.ls,
               facecolors=pobj.marker_facecolor,
               label='Observed'
               )
    # plot the predicted target
    ax.scatter(x=index,
               y=ypred,
               color=pobj.yp_lc,
               s=pobj.s / 2,
               alpha=pobj.alpha,
               marker=pobj.yp_marker,
               edgecolors=pobj.yp_marker_edgecolor,
               linewidths=pobj.yp_lw,
               linestyles=pobj.yp_ls,
               facecolors=pobj.yp_marker_facecolor,
               label='Predicted'
               )

    if fill_between:
        ax.plot(yt,
                c=pobj.lc,
                ls=pobj.ls,
                lw=pobj.lw,
                alpha=pobj.alpha
                )
    if pobj.ylabel is None:
        pobj.ylabel = 'Categories '
    if pobj.xlabel is None:
        pobj.xlabel = 'Test data'

    if labels is not None:
        if not is_iterable(labels):
            labels = [labels]
        n_categories = len(np.unique(yt))
        if len(labels) != n_categories:
            warnings.warn(
                "Number of categories in 'yt' and labels must be consistent."
                f" Expected {n_categories}, got {len(labels)}")
        else:
            ax.set_yticks(np.unique(yt))
            ax.set_yticklabels(labels)

    ax.set_ylabel(pobj.ylabel, fontsize=pobj.font_size)
    ax.set_xlabel(pobj.xlabel, fontsize=pobj.font_size)

    if pobj.tp_axis is None or pobj.tp_axis == 'both':
        # matplotlib only accepts 'x', 'y' or 'both': map None to 'both'
        ax.tick_params(axis=pobj.tp_axis or 'both',
                       labelsize=pobj.tp_labelsize * 5,
                       )
    elif pobj.tp_axis in ('x', 'y'):
        other_axis = 'y' if pobj.tp_axis == 'x' else 'x'
        ax.tick_params(axis=pobj.tp_axis,
                       labelsize=pobj.tp_labelsize * 5,
                       )
        # the remaining axis keeps the regular font size
        ax.tick_params(axis=other_axis,
                       labelsize=pobj.font_size,
                       )
    # thin out the tick labels (one every 7 positions) once there are 
    # at least 14 samples
    if len(yt) >= 14:
        ax.xaxis.set_major_formatter(plt.FuncFormatter(format_ticks))

    plt.xticks(rotation=pobj.rotate_xlabel)
    plt.yticks(rotation=pobj.rotate_ylabel)

    if pobj.show_grid:
        ax.grid(pobj.show_grid,
                axis=pobj.gaxis,
                which=pobj.gwhich,
                color=pobj.gc,
                linestyle=pobj.gls,
                linewidth=pobj.glw,
                alpha=pobj.galpha
                )
        if pobj.gwhich == 'minor':
            ax.minorticks_on()

    # default legend location when the user did not choose one
    if len(pobj.leg_kws) == 0 or 'loc' not in pobj.leg_kws.keys():
        pobj.leg_kws['loc'] = 'upper left'
    ax.legend(**pobj.leg_kws)

    pobj.save(fig)

    return ypred if return_ypred else None
def plot_reg_scoring(
    reg, X, y, test_size=None, random_state =42, scoring ='mse',
    return_errors: bool=False,
    **baseplot_kws
    ):
    # NOTE: the user documentation is attached below through ``__doc__`` so
    # that the shared parameter descriptions can be interpolated.
    #xxxxxxxxxxxxxxxx update base plot keyword arguments
    for key, value in baseplot_kws.items():
        setattr(pobj, key, value)

    scoring = str(scoring or 'mse').lower().strip()
    if scoring not in ('mse', 'rmse'):
        raise ValueError("Acceptable scorings are 'mse' or 'rmse';"
                         f" got {scoring!r}")
    # the learning curve needs to fit the model and to predict with it
    if not (hasattr(reg, 'fit') and hasattr(reg, 'predict')):
        raise TypeError(f"{reg!r} isn't a model estimator.")

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    train_errors, val_errors = [], []
    # fit on growing subsets of the training set and score each fit
    for m in range(1, len(y_train)):
        try:
            reg.fit(X_train[:m], y_train[:m])
        except ValueError:
            # skip subsets the estimator refuses, e.g.
            # <The number of classes has to be greater
            # than one; got 1 class>
            continue

        y_train_pred = reg.predict(X_train[:m])
        y_val_pred = reg.predict(X_val)
        train_errors.append(mean_squared_error(
            y_train_pred, y_train[:m]))
        val_errors.append(
            mean_squared_error(y_val_pred, y_val))

    if scoring == 'rmse':
        train_errors = np.sqrt(train_errors)
        val_errors = np.sqrt(val_errors)

    if pobj.ylabel is None:
        pobj.ylabel = 'Score'
    if pobj.xlabel is None:
        pobj.xlabel = 'Training set size'

    # create figure obj
    fig = plt.figure(figsize=pobj.fig_size)
    ax = fig.add_subplot(1, 1, 1)

    # default colour and line style of the validation curve, only set
    # when the user did not provide them
    for nv, vv in zip(('vlc', 'vls'), ('b', ':')):
        if not hasattr(pobj, nv):
            setattr(pobj, nv, vv)

    ax.plot(train_errors,
            color=pobj.lc,
            linewidth=pobj.lw,
            linestyle=pobj.ls,
            label='training set',
            **pobj.plt_kws)
    ax.plot(val_errors,
            color=pobj.vlc,
            linewidth=pobj.lw,
            linestyle=pobj.vls,
            label='validation set',
            **pobj.plt_kws)

    _remaining_plot_roperties(pobj, ax, fig=fig)

    return (train_errors, val_errors) if return_errors else None


plot_reg_scoring.__doc__ ="""\
Plot regressor learning curves using (root-)mean squared error scoring. 

Use the hold-out cross-validation technique for score evaluation [1]_. 

Parameters 
-----------
reg: callable, always as a function
    A regression estimator; Estimators must provide a fit method, and 
    should provide `set_params` and `get_params`, although these are usually 
    provided by inheritance from `base.BaseEstimator`. The estimated model 
    is stored in public and private attributes on the estimator instance, 
    facilitating decoding through prediction and transformation methods. 
    The core functionality of some estimators may also be available as
    a ``function``.
    
{params.core.X}
{params.core.y}

scoring: str, ['mse'|'rmse'], default ='mse'
    kind of error to visualize on the regression learning curve. 

{params.core.test_size}
{params.core.random_state}

return_errors: bool, default='False'
    returns training errors and validation errors. 
    
baseplot_kws: dict, 
    All the keyword arguments passed to the property  
    :class:`watex.property.BasePlot` class.  
    
Returns 
--------
(train_errors, val_errors): Tuple, 
    training scores and validation scores if `return_errors` is set to 
    ``True``, otherwise returns nothing   
    
Raises 
-------
ValueError 
    if `scoring` is neither ``'mse'`` nor ``'rmse'``. 
TypeError 
    if `reg` has no ``fit`` or ``predict`` method. 
    
Examples 
---------
>>> from watex.datasets import fetch_data 
>>> from watex.view.mlplot import plot_reg_scoring 
>>> # Note that for the demo, we import SVC rather than LinearSVR since the 
>>> # problem of Bagoue dataset is a classification rather than regression.
>>> # if regression is used instead, a convergence problem will occur. 
>>> from watex.exlib.sklearn import SVC 
>>> X, y = fetch_data('bagoue analysed')# got the preprocessed and imputed data
>>> svm =SVC() 
>>> t_errors, v_errors =plot_reg_scoring(svm, X, y, return_errors=True)

Notes  
------
The hold-out technique is the classic and most popular approach for 
estimating the generalization performance of machine learning models. The 
dataset is split into training and test sets. The former is used for the 
model training whereas the latter is used for model performance evaluation. 
However, in typical machine learning we are also interested in tuning and 
comparing different parameter settings to further improve the performance 
on the given classification or regression problem, for which we want the 
optimal values of the hyperparameters. Thus, reusing the same dataset over 
and over again during the model selection is not recommended since it will 
become a part of the training data and then the model will be more likely 
to overfit. Because of this issue, the plain hold-out cross validation is 
not a good learning practice. A better way to use the hold-out method is to 
separate the data into three parts: the training set, the validation set 
and the test dataset. See more in [2]_. 

References 
------------
.. [1] Pedregosa, F., Varoquaux, G., Gramfort, A., Michel, V., Thirion, B., 
    Grisel, O., Blondel, M., et al. (2011) Scikit-learn: Machine learning in 
    Python. J. Mach. Learn. Res., 12, 2825–2830.
.. [2] Raschka, S. & Mirjalili, V. (2019) Python Machine Learning. 
    (J. Malysiak, S. Jain, J. Lovell, C. Nelson, S. D’silva & R. Atitkar, 
    Eds.), 3rd ed., Packt.
""".format(params = _param_docs)
def plot_model_scores(models, scores=None, cv_size=None, **baseplot_kws): 
    # NOTE: the user documentation is attached below through ``__doc__``.
    #xxxxxxxxxxxxxxxx set base plot keywords arguments
    for key, value in baseplot_kws.items():
        setattr(pobj, key, value)

    if isinstance(models, str):
        models = str2columns(models)
    if not is_iterable(models):
        models = [models]

    if scores is not None:
        scores = is_iterable(scores, exclude_string=True, transform=True)
        if len(models) != len(scores):
            raise TypeError(
                "Fined-tuned model and scores sizes must be consistent;"
                f" got {len(models)!r} and {len(scores)} respectively.")
    else:
        # check whether scores are appended to model
        try:
            scores = [score for _, score in models]
        except (TypeError, ValueError) as err:
            raise TypeError(
                "Missing score(s). Scores are needed for each model."
                ) from err
        models = [model for model, _ in models]

    # for item assignments, use list instead.
    models = [[bn, bscore] for bn, bscore in zip(models, scores)]
    for ii, (model, _) in enumerate(models):
        # never evaluate the truthiness of an estimator: some of them
        # define __len__ and fail (or are empty) before being fitted
        if model is None:
            models[ii][0] = 'None'
        elif not isinstance(model, str):
            models[ii][0] = type(model).__name__

    # the smallest number of validation scores among the models is
    # always needed for the bound check below
    cv_size_min = min(len(model_scores) for _, model_scores in models)
    if cv_size is None:
        cv_size = cv_size_min
    try:
        cv_size = int(cv_size)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"Expect a number for 'cv', got {type(cv_size).__name__!r}."
            ) from err
    if cv_size < 1:
        raise ValueError(
            f"cv must contain at least one positive value, got {cv_size}")
    if cv_size > cv_size_min:
        raise ValueError(f"Size for cv is too large; expect {cv_size_min}"
                         f" as a maximum size, got {cv_size}")
    # shrink to the number of validation to keep the same size for all
    # given models
    models = [(modelname, modelval[:cv_size])
              for modelname, modelval in models]

    # customize plots with colors lines and styles: one entry per model
    # (the first one comes from the base plot properties)
    n_models = len(models)
    line_colors = [pobj.lc] + list(make_mpl_properties(n_models))
    line_styles = [pobj.ls] + list(make_mpl_properties(n_models, 'line'))

    # create figure obj and change style
    # if sns_style is passed as base_plot_params
    fig = plt.figure(figsize=pobj.fig_size)
    ax = fig.add_subplot(1, 1, 1)
    if pobj.sns_style is not None:
        sns.set_style(pobj.sns_style)

    folds = np.arange(cv_size) + 1
    for (modelname, modelval), color, style in zip(
            models, line_colors, line_styles):
        ax.plot(folds,
                modelval,
                color=color,
                linewidth=pobj.lw,
                linestyle=style,
                label=modelname,
                )
    _remaining_plot_roperties(pobj, ax, xlim=pobj.xlim,
                              ylim=pobj.ylim, fig=fig
                              )
    pobj.save(fig)


plot_model_scores.__doc__="""\
Use cross validation to get an estimation of the model performance 
generalization.

It visualizes the fine-tuned model scores versus the cross validation 
folds. 

Parameters 
----------
models: list of callables, always as functions,   
    list of estimator names; can also be pairs of estimators and 
    validation scores. For instance estimators and scores can be arranged 
    as:: 
        
        models =[('SVM', scores_svm), ('LogRegress', scores_logregress), ...]
        
    If that arrangement is passed to the `models` parameter then there is 
    no need to pass the score values of each estimator in `scores`. 
    
    Note that a model is an object which manages the estimation and 
    decoding. The model is estimated as a deterministic function of:

        * parameters provided in object construction or with set_params;
        * the global numpy.random random state if the estimator’s 
            random_state parameter is set to None; and
        * any data or sample properties passed to the most recent call to 
            fit, fit_transform or fit_predict, or data similarly passed in 
            a sequence of calls to partial_fit.
            
scores: array like 
    list of scores on different validation sets. If scores are given, 
    set only the name of the estimators passed to `models` like:: 
        
        models =['SVM', 'LogRegress', ...]
        scores=[scores_svm, scores_logregress, ...]

cv_size: float or int,
    The number of folds used for validation. If different models have 
    different cross validation sizes, the minimum size is used and the 
    scores of each model are cut to match it. A value larger than that 
    minimum raises a ``ValueError``. 
    
baseplot_kws: dict, 
    All the keyword arguments passed to the property  
    :class:`watex.property.BasePlot` class.  
    
Examples 
---------
(1) -> Score is appended to the model 

>>> from watex.exlib.sklearn import SVC 
>>> from watex.view.mlplot import plot_model_scores
>>> import numpy as np 
>>> svc_model = SVC() 
>>> fake_scores = np.random.permutation (np.arange (0, 1,  .05))
>>> plot_model_scores([(svc_model, fake_scores )])
... 

(2) -> Use model and score separately 

>>> plot_model_scores([svc_model],scores =[fake_scores] )
>>> # customize plot by passing keywords properties 
>>> base_plot_params ={
                    'lw' :3.,                  
                    'lc':(.9, 0, .8), 
                    'ms':7.,                
                    'fig_size':(12, 8),
                    'font_size':15.,
                    'xlabel': 'samples',
                    'ylabel':'scores' ,
                    'marker':'o', 
                    'alpha' :1., 
                    'yp_markeredgewidth':2.,
                    'show_grid' :True,          
                    'galpha' :0.2,              
                    'glw':.5,                   
                    'rotate_xlabel' :90.,
                    'fs' :3.,                   
                    's' :20 ,
                    'sns_style': 'darkgrid', 
               }
>>> plot_model_scores([svc_model],scores =[fake_scores] , 
                      **base_plot_params ) 
"""
def plotDendroheat(
    df: DataFrame |NDArray, 
    columns: List[str] =None, 
    labels:Optional[List[str]] =None,
    metric:str ='euclidean',  
    method:str ='complete', 
    kind:str = 'design', 
    cmap:str ='hot_r', 
    fig_size:Tuple[int] =(8, 8), 
    facecolor:str ='white', 
    **kwd
): 
    """
    Attaches dendrogram to a heat map. 
    
    Hierarchical dendrograms are often used in combination with a heat map 
    which allows us to represent the individual values in the data array or 
    matrix containing our training examples with a color code. 
    
    Parameters 
    ------------
    df: dataframe or NDArray of (n_samples, n_features) 
        dataframe or Ndarray. If an array is given, the column names can 
        be specified through `columns` to match the array shape 1. 
        
    columns: list 
        list of labels to name each column of the array of 
        (n_samples, n_features). Its length must equal the number of 
        columns of `df`. If a dataframe is given, there is no need to 
        specify the columns; names that all exist in the dataframe 
        select/reorder its columns, otherwise the columns are renamed 
        in order. 
        
    labels : list, optional
        Not used by the current implementation: the dendrogram ticks are 
        hidden and the heat-map rows are labelled from the data index. 
        
    metric : str or callable, default is {'euclidean'}
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`.
        If ``X`` is the distance array itself, use "precomputed" as the metric.
        Precomputed distance matrices must have 0 along the diagonal.
        
    method : str, optional, default is {'complete'}
        The linkage algorithm to use. See the ``Linkage Methods`` section below 
        for full descriptions in :func:`watex.utils.exmath.linkage_matrix`
        
    kind: str, ['squareform'|'condense'|'design'], default is {'design'}
        kind of approach to summing up the linkage matrix. 
        Indeed, a condensed distance matrix is a flat array containing the 
        upper triangular of the distance matrix. This is the form that ``pdist`` 
        returns. Alternatively, a collection of :math:`m` observation vectors 
        in :math:`n` dimensions may be passed as an :math:`m` by :math:`n` 
        array. All elements of the condensed distance matrix must be finite, 
        i.e., no NaNs or infs.
        Alternatively, we could use the ``squareform`` distance matrix to 
        yield different distance values than expected. 
        The ``design`` approach uses the complete input example matrix, also 
        called 'design matrix', to lead to a correct linkage matrix similar 
        to `squareform` and `condense``. 
        
    cmap: str , default is {'hot_r'}
        matplotlib color map 
     
    fig_size: str , Tuple , default is {(8, 8)}
        the size of the figure 
        
    facecolor: str , default is {"white"}
        Matplotlib facecolor 
  
    kwd: dict 
        additional keyword arguments passed to 
        :func:`scipy.cluster.hierarchy.dendrogram` 
        
    Examples
    ---------
    >>> # (1) -> Use random data
    >>> import numpy as np 
    >>> import pandas as pd 
    >>> from watex.view.mlplot import plotDendroheat
    >>> np.random.seed(123) 
    >>> variables =['X', 'Y', 'Z'] ; labels =['ID_0', 'ID_1', 'ID_2',
                                             'ID_3', 'ID_4']
    >>> X= np.random.random_sample ([5,3]) *10 
    >>> df =pd.DataFrame (X, columns =variables, index =labels)
    >>> plotDendroheat (df)
    >>> # (2) -> Use Bagoue data 
    >>> from watex.datasets import load_bagoue  
    >>> X, y = load_bagoue (as_frame=True )
    >>> X =X[['magnitude', 'power', 'sfi']].astype(float) # convert to float
    >>> plotDendroheat (X )
    
    """
    df = check_array(
        df,
        input_name="Data 'df' ",
        to_frame=True,
        )
    if columns is not None:
        if isinstance(columns, str):
            columns = [columns]
        if len(columns) != df.shape[1]:
            raise TypeError("X and columns must be consistent,"
                            f" got {len(columns)} instead of {df.shape [1]}"
                            )
        if isinstance(df, pd.DataFrame) and set(columns).issubset(
                df.columns):
            # existing names: select/reorder the columns
            df = pd.DataFrame(data=df, columns=columns)
        else:
            # new names: rename by position. Passing a frame with unknown
            # names to the constructor would yield all-NaN columns.
            df = pd.DataFrame(
                data=np.asarray(df),
                index=getattr(df, 'index', None),
                columns=columns,
                )

    # create a new figure object and define x axis position 
    # and y position, width, height of the dendrogram via the 
    # add_axes attributes. Furthermore, we rotate the dendrogram
    # to 90 degree counter-clockwise. 
    fig = plt.figure(figsize=fig_size, facecolor=facecolor)
    axd = fig.add_axes([.09, .1, .2, .6])

    row_cluster = linkage_matrix(df=df,
                                 metric=metric,
                                 method=method,
                                 kind=kind,
                                 )
    orient = 'left'  # use orientation 'right for matplotlib version < v1.5.1
    mpl_version = mpl.__version__.split('.')
    if mpl_version[0] == '1':
        if mpl_version[1] == '5':
            if float(mpl_version[2]) < 1.:
                orient = 'right'

    r = dendrogram(row_cluster, orientation=orient,
                   **kwd)
    # 2. reorder the data in our initial dataframe according 
    # to the clustering label that can be accessed by a dendrogram 
    # which is essentially a Python dictionary via the key 'leaves' 
    leaf_order = r['leaves'][::-1]
    is_frame = hasattr(df, 'columns')
    df_rowclust = df.iloc[leaf_order] if is_frame else df[leaf_order]

    # 3. construct the heatmap from the reordered dataframe and position 
    # it next to the dendrogram 
    axm = fig.add_axes([.23, .1, .63, .6])
    cax = axm.matshow(df_rowclust,
                      interpolation='nearest',
                      cmap=cmap,
                      )
    # 4. modify the aesthetic of the dendrogram by removing the axis 
    # ticks and hiding the axis spines. Also we add a color bar and 
    # assign the feature and data record names to the x and y axis  
    # tick labels, respectively 
    axd.set_xticks([])  # set ticks invisible 
    axd.set_yticks([])
    for spine in axd.spines.values():
        spine.set_visible(False)

    fig.colorbar(cax)
    xticks_loc = list(axm.get_xticks())
    yticks_loc = list(axm.get_yticks())

    if is_frame:
        col_names = list(df_rowclust.columns)
        row_names = list(df_rowclust.index)
    else:
        col_names = [f"{i+1}" for i in range(df.shape[1])]
        # the rows were reordered above: the labels must follow them
        row_names = [f"{i}" for i in leaf_order]

    # the leading empty label matches the first tick location returned by 
    # matplotlib before the data range 
    axm.xaxis.set_major_locator(mticker.FixedLocator(xticks_loc))
    axm.xaxis.set_major_formatter(mticker.FixedFormatter(
        [''] + col_names))

    axm.yaxis.set_major_locator(mticker.FixedLocator(yticks_loc))
    axm.yaxis.set_major_formatter(mticker.FixedFormatter(
        [''] + row_names))

    plt.show()
def plotDendrogram(
    df: DataFrame,
    columns: List[str] = None,
    labels: ArrayLike = None,
    metric: str = 'euclidean',
    method: str = 'complete',
    kind: str = None,
    return_r: bool = False,
    verbose: bool = False,
    **kwd
):
    r""" Visualizes the linkage matrix in the results of dendrogram.

    The data is restricted to `columns` (when `df` has columns), converted
    with ``to_numeric_dtypes`` using ``pop_cat_features=True``, validated
    with ``check_array`` and clustered with
    :func:`watex.utils.exmath.linkage_matrix`; the resulting linkage matrix
    is drawn with :func:`scipy.cluster.hierarchy.dendrogram`.

    Parameters
    -----------
    df: dataframe or NDArray of (n_samples, n_features)
        Data to cluster.

    columns: list of str, optional
        Column names. If `df` exposes a ``columns`` attribute, only these
        columns are kept. The names are also forwarded to
        ``linkage_matrix``.

    labels: ndarray, optional
        Leaf labels forwarded to
        :func:`scipy.cluster.hierarchy.dendrogram`. By default the index of
        the original observation is used to label the leaf nodes.

    metric: str or callable, default='euclidean'
        Distance metric forwarded to ``linkage_matrix``. It also names the
        distance axis of the plot.

    method: str, default='complete'
        Linkage algorithm forwarded to ``linkage_matrix``.

    kind: str, ['squareform'|'condense'|'design'], optional
        Approach forwarded to ``linkage_matrix``; ``None`` (or any falsy
        value) means ``'design'``.

    return_r: bool, default=False
        Return the dictionary produced by ``dendrogram`` if ``True``,
        otherwise return nothing.

    verbose: int, bool, default=False
        Forwarded to ``to_numeric_dtypes``.

    kwd: dict
        Additional keyword arguments passed to
        :func:`scipy.cluster.hierarchy.dendrogram`.

    Returns
    -------
    r : dict or None
        The dictionary of data structures computed by
        :func:`scipy.cluster.hierarchy.dendrogram` to render the plot
        (keys ``'color_list'``, ``'icoord'``, ``'dcoord'``, ``'ivl'``,
        ``'leaves'`` and ``'leaves_color_list'``) when `return_r` is
        ``True``; ``None`` otherwise.

    Examples
    ----------
    >>> from watex.datasets import load_iris
    >>> from watex.view import plotDendrogram
    >>> data = load_iris ()
    >>> X =data.data[:, :2]
    >>> plotDendrogram (X, columns =['X1', 'X2' ] )

    """
    if hasattr(df, 'columns') and columns is not None:
        df = df[columns]

    df = to_numeric_dtypes(df, pop_cat_features=True, verbose=verbose)
    df = check_array(df, input_name="Data 'df' ", to_frame=True)

    kind = kind or 'design'
    row_cluster = linkage_matrix(
        df=df, columns=columns, metric=metric, method=method, kind=kind)
    r = dendrogram(row_cluster, labels=labels, **kwd)

    # Name the distance axis after the metric actually used instead of a
    # hard-coded "Euclidian"; callables and precomputed distances have no
    # meaningful metric name, so fall back to a neutral label.
    if isinstance(metric, str) and metric != 'precomputed':
        distance_label = f"{metric.capitalize()} distance"
    else:
        distance_label = "Distance"

    # set the label before tightening so the layout accounts for it
    plt.ylabel(distance_label)
    plt.tight_layout()
    plt.show()

    return r if return_r else None
def plotSilhouette(
    X: NDArray | DataFrame,
    labels: ArrayLike = None,
    prefit: bool = True,
    n_clusters: int = 3,
    n_init: int = 10,
    max_iter: int = 300,
    random_state: int = None,
    tol: float = 1e-4,
    metric: str = 'euclidean',
    **kwd
):
    r"""
    Quantify the quality of clustering samples with a silhouette plot.

    Either plots the silhouette of already predicted `labels`
    (``prefit=True``) or first clusters `X` with K-Means
    (``prefit=False``) and plots the silhouette of the predicted clusters.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training instances to cluster.

    labels : array-like 1d of shape (n_samples,), optional
        Label values for each sample. Must expose ``__array__`` and have
        as many entries as `X` has samples.

    prefit : bool, default=True
        Whether predicted `labels` are passed directly. If ``True``,
        `labels` is required. If ``False``, the labels are computed from
        `X` with ``KMeans.fit_predict``. When `labels` is given together
        with ``prefit=False``, a warning is issued and the given labels
        are used.

    n_clusters : int, default=3
        The number of clusters to form. Used only if ``prefit=False``.

    n_init : int, default=10
        Number of times the k-means algorithm is run with different
        centroid seeds. Used only if ``prefit=False``.

    max_iter : int, default=300
        Maximum number of iterations of the k-means algorithm for a single
        run. Used only if ``prefit=False``.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for centroid initialization.
        Used only if ``prefit=False``.

    tol : float, default=1e-4
        Relative tolerance with regards to Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence. Used only if ``prefit=False``.

    metric : str or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. Forwarded to the silhouette computation.

    **kwd : optional keyword parameters
        Any further parameters are passed on with `metric` to the
        silhouette computation.

    Raises
    ------
    TypeError
        If `labels` is not array-like, if `X` and `labels` have different
        numbers of samples, or if `labels` is ``None`` while `prefit` is
        ``True``.

    Notes
    -------
    The silhouette coefficient is bound between -1 and 1. It is computed
    in three steps:

    * calculate the **cluster cohesion**, :math:`a^{(i)}`, as the average
      distance between an example, :math:`x^{(i)}`, and all the other
      points of the same cluster;
    * calculate the **cluster separation**, :math:`b^{(i)}`, as the
      average distance between the example :math:`x^{(i)}` and all the
      examples of the nearest cluster;
    * calculate the silhouette, :math:`s^{(i)}`, as the difference between
      the cluster separation and cohesion divided by the greater of the
      two:

      .. math::

          s^{(i)}=\frac{b^{(i)} - a^{(i)}}{max {{b^{(i)},a^{(i)} }}}

    Examples
    --------
    >>> from watex.datasets import load_hlogs
    >>> from watex.view.mlplot import plotSilhouette
    >>> # use resistivity and gamma for this demo
    >>> X_res_gamma = load_hlogs().frame[['resistivity', 'gamma_gamma']]

    (1) Plot silhouette with 'prefit' set to 'False'
    >>> plotSilhouette (X_res_gamma, prefit =False)

    """
    if not prefit and labels is not None:
        warnings.warn("'labels' is given while 'prefit' is 'False'. "
                      "'prefit' will be set to 'True'.")
        prefit = True

    if labels is not None:
        if not hasattr(labels, '__array__'):
            raise TypeError("Labels (target 'y') expects an array-like: "
                            f"{type(labels).__name__!r}")
        labels = check_y(labels, to_frame=True)
        # len() is ambiguous on sparse matrices; prefer the row count
        n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
        if len(labels) != n_samples:
            raise TypeError("X and labels must have a consistency size. "
                            f"{n_samples} and {len(labels)} respectively.")

    if prefit and labels is None:
        raise TypeError("Labels can not be None, while 'prefit' is 'True'"
                        " Turn 'prefit' to 'False' or provide the labels "
                        "instead.")
    if not prefit:
        km = KMeans(
            n_clusters=n_clusters,
            init='k-means++',
            n_init=n_init,
            max_iter=max_iter,
            tol=tol,
            random_state=random_state,
        )
        labels = km.fit_predict(X)

    return _plotSilhouette(X, labels, metric=metric, **kwd)
def _plotSilhouette(X, labels, metric='euclidean', **kwds):
    r"""Plot quantifying the quality of clustering silhouette

    One horizontal bar is drawn per sample, grouped and coloured by
    cluster and sorted by silhouette value inside each cluster. A dashed
    red vertical line marks the mean silhouette value over all samples.

    Parameters
    ---------
    X : array-like of shape (n_samples_a, n_samples_a) if metric == \
            "precomputed" or (n_samples_a, n_features) otherwise
        An array of pairwise distances between samples, or a feature array.

    labels : array-like of shape (n_samples,)
        Label values for each sample. Converted to a flat NumPy array
        before use.

    metric : str or callable, default='euclidean'
        The metric forwarded to ``silhouette_samples``.

    **kwds : optional keyword parameters
        Any further parameters are passed to ``silhouette_samples``.

    Examples
    ---------
    >>> import numpy as np
    >>> from watex.exlib.sklearn import KMeans
    >>> from watex.datasets import load_iris
    >>> from watex.view.mlplot import plotSilhouette
    >>> d= load_iris ()
    >>> X= d.data [:, 0][:, np.newaxis] # take the first axis
    >>> km= KMeans (n_clusters =3 , init='k-means++', n_init =10 ,
                    max_iter = 300 ,
                    tol=1e-4,
                    random_state =0
                    )
    >>> y_km = km.fit_predict(X)
    >>> plotSilhouette (X, y_km)

    See also
    ---------
    watex.utils.plotutils.plot_silhouette: Plot naive silhouette

    Notes
    ------
    The silhouette coefficient is bound between -1 and 1. It is computed
    in three steps:

    * calculate the **cluster cohesion**, :math:`a^{(i)}`, as the average
      distance between an example, :math:`x^{(i)}`, and all the other
      points of the same cluster;
    * calculate the **cluster separation**, :math:`b^{(i)}`, as the
      average distance between the example :math:`x^{(i)}` and all the
      examples of the nearest cluster;
    * calculate the silhouette, :math:`s^{(i)}`, as the difference between
      the cluster separation and cohesion divided by the greater of the
      two:

      .. math::

          s^{(i)}=\frac{b^{(i)} - a^{(i)}}{max {{b^{(i)},a^{(i)} }}}

    """
    # A flat ndarray makes ``labels == c`` an element-wise mask of the
    # right shape even for plain lists or column-shaped label containers.
    labels = np.asarray(labels).ravel()
    cluster_labels = np.unique(labels)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = silhouette_samples(
        X, labels=labels, metric=metric, **kwds)

    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = np.sort(silhouette_vals[labels == c])
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(float(i) / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals,
                 height=1.0,
                 edgecolor='none',
                 color=color,
                 )
        # tick in the middle of the band occupied by this cluster
        yticks.append((y_ax_lower + y_ax_upper) / 2.)
        y_ax_lower += len(c_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color='red', linestyle='--')

    # Numeric cluster ids are shown 1-based; non-numeric labels (e.g.
    # strings) cannot be shifted, so they are displayed unchanged instead
    # of failing after the whole plot has been drawn.
    try:
        tick_labels = cluster_labels + 1
    except TypeError:
        tick_labels = cluster_labels
    plt.yticks(yticks, tick_labels)
    plt.ylabel("Cluster")
    plt.xlabel("Silhouette coefficient")
    plt.tight_layout()

    plt.show()
def plotLearningInspections(
    models: List[object],
    X: NDArray, y: ArrayLike,
    fig_size: Tuple[int] = (22, 18),
    cv: int = None,
    savefig: Optional[str] = None,
    titles=None,
    subplot_kws=None,
    **kws
):
    """ Inspect multiple models from their learning curves.

    For every model, one column of three plots is generated by
    :func:`plotLearningInspection`: the test and training learning curve,
    the training samples vs fit times curve and the fit times vs score
    curve. The mean convergence score and the mean validation score over
    all models are printed.

    Parameters
    ----------
    models : list of estimator instances
        Estimators to inspect. At least two are expected; a single model
        is rejected.
    X : array-like of shape (n_samples, n_features)
        Training vector.
    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X``.
    fig_size : tuple, default=(22, 18)
        Size of the figure.
    cv : int, cross-validation generator or an iterable, default=None
        Cross-validation splitting strategy forwarded to
        :func:`plotLearningInspection`. ``None`` keeps the default of that
        function (5-fold).
    savefig : str, default=None
        Path to save the figure to. When given, the figure is saved with
        ``dpi=300`` and closed instead of being shown.
    titles : str or list, default=None
        Titles of the models. Missing titles are replaced by the estimator
        names.
    subplot_kws : dict, default=None
        Keyword arguments for ``Figure.subplots_adjust``. Defaults to
        ``left=0.0625, right=0.95, wspace=0.1, hspace=.5``.
    kws : dict
        Additional keyword arguments passed to
        :func:`plotLearningInspection`.

    Returns
    ----------
    axes : ndarray of Matplotlib axes of shape (3, n_models)

    Raises
    ------
    PlotError
        If a single model is given.

    See also
    ---------
    plotLearningInspection: Inspect single model

    Examples
    ---------
    >>> from watex.datasets import fetch_data
    >>> from watex.models.premodels import p
    >>> from watex.view.mlplot import plotLearningInspections
    >>> # import sparse matrix from Bagoue dataset
    >>> X, y = fetch_data ('bagoue prepared')
    >>> # import the two pretrained models from SVM
    >>> models = [p.SVM.rbf.best_estimator_ , p.SVM.poly.best_estimator_]
    >>> plotLearningInspections (models , X, y, ylim=(0.7, 1.01) )

    """
    models = is_iterable(models, exclude_string=True, transform=True)
    titles = list(is_iterable(titles, exclude_string=True, transform=True))
    # pad with None so that every model has a title slot
    titles += [None] * (len(models) - len(titles))

    if len(models) == 1:
        msg = (f"{plotLearningInspection.__module__}."
               f"{plotLearningInspection.__qualname__}"
               )
        raise PlotError("For a single model inspection, use the"
                        f" function {msg!r} instead."
                        )

    # `cv` used to be computed here but never handed to the single-model
    # plot, so the argument had no effect. Forward it when given; None
    # keeps the 5-fold default of `plotLearningInspection`, which is what
    # was effectively applied before.
    if cv is not None:
        kws = {**kws, 'cv': cv}

    fig, axes = plt.subplots(3, len(models), figsize=fig_size)
    subplot_kws = subplot_kws or dict(
        left=0.0625, right=0.95, wspace=0.1, hspace=.5)
    fig.subplots_adjust(**subplot_kws)

    cscores, vscores = [], []
    for kk, model in enumerate(models):
        title = titles[kk] or get_estimator_name(model)
        # one column of three axes per model
        _, csc, vsc = plotLearningInspection(
            model, X=X, y=y, axes=axes[:, kk], title=title, **kws)
        cscores.append(csc)
        vscores.append(vsc)

    print("*" * 77)
    print("Convergence scores =", np.around(np.mean(cscores), 4))
    print("validation scores =", np.around(np.nanmean(vscores), 4))
    print("*" * 77)

    if savefig:
        fig.savefig(savefig, dpi=300)
        plt.close()
    else:
        plt.show()

    return axes
def plotLearningInspection(
    model,
    X,
    y,
    axes=None,
    ylim=None,
    cv=5,
    n_jobs=None,
    train_sizes=None,
    display_legend=True,
    title=None,
):
    """Inspect model from its learning curve.

    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    model : estimator instance
        Estimator passed to ``learning_curve``.

    X : array-like of shape (n_samples, n_features)
        Training vector. Validated with ``check_X_y`` (sparse accepted).

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X``.

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves. When ``None`` a new
        ``1 x 3`` figure of size ``(20, 5)`` is created.

    ylim : tuple of shape (2,), default=None
        Minimum and maximum y-values of the learning-curve plot,
        e.g. (ymin, ymax).

    cv : int, cross-validation generator or an iterable, default=5
        Cross-validation splitting strategy forwarded to
        ``learning_curve``.

    n_jobs : int or None, default=None
        Number of jobs forwarded to ``learning_curve``.

    train_sizes : array-like of shape (n_ticks,), default=None
        Relative or absolute numbers of training examples used to generate
        the learning curve. ``None`` or an empty sequence means
        ``np.linspace(0.1, 1.0, 5)``.

    display_legend : bool, default=True
        Display the legend of the learning-curve plot.

    title : str, default=None
        Title of the learning-curve plot. Defaults to the estimator name.
        When a title is given, the two other plots refer to
        "the model" instead of the estimator name.

    Returns
    ----------
    axes : array-like of Matplotlib axes
        The three axes used for plotting.
    convergence_score : float
        Mean of the training and cross-validation scores obtained at the
        largest training size.
    test_scores_mean : ndarray of shape (n_ticks,)
        Mean cross-validation score for each training size.

    Examples
    ----------
    >>> from watex.datasets import fetch_data
    >>> from watex.models import p
    >>> from watex.view.mlplot import plotLearningInspection
    >>> # import sparse matrix from Bagoue datasets
    >>> X, y = fetch_data ('bagoue prepared')
    >>> # import the pretrained Radial Basis Function (RBF) from SVM
    >>> plotLearningInspection (p.SVM.rbf.best_estimator_  , X, y )

    """
    # `train_sizes or default` raises on multi-element NumPy arrays
    # ("truth value of an array is ambiguous"), so test explicitly.
    if train_sizes is None or np.size(train_sizes) == 0:
        train_sizes = np.linspace(0.1, 1.0, 5)

    X, y = check_X_y(X, y, accept_sparse=True, to_frame=True)

    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title or get_estimator_name(model))
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        model,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # mean of all train and validation scores at the largest training size
    convergence_score = np.mean([train_scores[-1], test_scores[-1]])

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes[0].fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes[0].hlines(
        convergence_score,
        train_sizes[0],
        train_sizes[-1],
        color="gray",
        linestyle="--",
        label="Convergence score",
    )
    axes[0].plot(
        train_sizes,
        train_scores_mean,
        "o-",
        color="r",
        label="Training score",
    )
    axes[0].plot(
        train_sizes,
        test_scores_mean,
        "o-",
        color="g",
        label="Cross-validation score",
    )

    if display_legend:
        axes[0].legend(loc="best")

    # name used in the titles of the two remaining plots
    title_name = 'the model' if title else get_estimator_name(model)

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, "o-")
    axes[1].fill_between(
        train_sizes,
        fit_times_mean - fit_times_std,
        fit_times_mean + fit_times_std,
        alpha=0.1,
    )
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title(f"Scalability of {title_name}")

    # Plot fit_time vs score, ordered by increasing fit time
    fit_time_argsort = fit_times_mean.argsort()
    fit_time_sorted = fit_times_mean[fit_time_argsort]
    test_scores_mean_sorted = test_scores_mean[fit_time_argsort]
    test_scores_std_sorted = test_scores_std[fit_time_argsort]
    axes[2].grid()
    axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-")
    axes[2].fill_between(
        fit_time_sorted,
        test_scores_mean_sorted - test_scores_std_sorted,
        test_scores_mean_sorted + test_scores_std_sorted,
        alpha=0.1,
    )
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title(f"Performance of {title_name}")

    return axes, convergence_score, test_scores_mean
#XXX
def plot_matshow(
    arr,
    /,
    labelx: List[str] = None,
    labely: List[str] = None,
    matshow_kws=None,
    **baseplot_kws
):
    # The public docstring of this function is attached right after the
    # definition through ``plot_matshow.__doc__``.

    # Forward every extra keyword argument to the shared plot object.
    for prop_name, prop_value in baseplot_kws.items():
        setattr(pobj, prop_name, prop_value)

    arr = check_array(
        arr,
        to_frame=True,
        input_name="Array 'arr'"
    )
    if not matshow_kws:
        matshow_kws = dict()

    # Draw the matrix and its colorbar on a single-axes figure.
    fig = plt.figure(figsize=pobj.fig_size)
    ax = fig.add_subplot(1, 1, 1)
    image = ax.matshow(arr, **matshow_kws)
    colorbar = fig.colorbar(image, **pobj.cb_props)

    if pobj.cb_label is None:
        pobj.cb_label = ''
    ax.set_xlabel(pobj.xlabel, fontsize=pobj.font_size)

    # Optional category names along each axis.
    if labelx is not None:
        ax = _check_labelxy(labelx, arr, ax)
    if labely is not None:
        ax = _check_labelxy(labely, arr, ax, axis='y')

    # Replace unset axis names with empty strings on the plot object.
    if pobj.ylabel is None:
        pobj.ylabel = ''
    if pobj.xlabel is None:
        pobj.xlabel = ''

    ax.set_ylabel(pobj.ylabel, fontsize=pobj.font_size)
    ax.tick_params(
        axis=pobj.tp_axis,
        labelsize=pobj.font_size,
        bottom=pobj.tp_bottom,
        top=pobj.tp_top,
        labelbottom=pobj.tp_labelbottom,
        labeltop=pobj.tp_labeltop
    )
    if pobj.tp_labeltop:
        # Keep the axis name next to the tick labels when they sit on top.
        ax.xaxis.set_label_position('top')

    colorbar.ax.tick_params(labelsize=pobj.font_size)
    colorbar.set_label(
        label=pobj.cb_label,
        size=pobj.font_size,
        weight=pobj.font_weight
    )

    plt.xticks(rotation=pobj.rotate_xlabel)
    plt.yticks(rotation=pobj.rotate_ylabel)

    pobj.save(fig)
plot_matshow.__doc__ ="""\
Quick matrix visualization using matplotlib.pyplot.matshow.

Parameters
----------
arr: 2D ndarray, 
    matrix of n rows and m columns. 

labelx: list of str, optional 
    list of label names that express the name of each category on the 
    x-axis. It should be consistent with the number of columns of `arr`. 
        
labely: list of str, optional 
    list of label names that express the name of each category on the 
    y-axis. It should be consistent with the number of rows of `arr`.

matshow_kws: dict
    Additional keyword arguments for 
    :meth:`matplotlib.axes.Axes.matshow`.

baseplot_kws: dict 
    Extra keyword arguments. Each of them is set as an attribute of the 
    base plot object ``pobj`` before plotting (e.g. ``font_size``, 
    ``xlabel``, ``ylabel``, ``cb_label``). 

Examples
---------
>>> import numpy as np
>>> from watex.view.mlplot import plot_matshow 
>>> matshow_kwargs ={
...     'aspect': 'auto',
...     'interpolation': None,
...     'cmap':'copper_r', 
...     }
>>> baseplot_kws ={'lw':3, 
...            'lc':(.9, 0, .8), 
...            'font_size':15., 
...            'cb_format':None,
...            #'cb_label':'Rate of prediction',
...            'xlabel': 'Predicted flow classes',
...            'ylabel': 'Geological rocks',
...            'font_weight':None,
...            'tp_labelbottom':False,
...            'tp_labeltop':True,
...            'tp_bottom': False
...            }
>>> labelx =['FR0', 'FR1', 'FR2', 'FR3', 'Rates'] 
>>> labely =['VOLCANO-SEDIM. SCHISTS', 'GEOSYN. GRANITES', 
...              'GRANITES', '1.0', 'Rates']
>>> array2d = np.array([(1. , .5, 1. ,1., .9286), 
...                     (.5,  .8, 1., .667, .7692),
...                     (.7, .81, .7, .5, .7442),
...                     (.667, .75, 1., .75, .82),
...                     (.9091, 0.8064, .7, .8667, .7931)])
>>> plot_matshow(array2d, labelx, labely, matshow_kwargs,**baseplot_kws )  

"""
def biPlot(
    self,
    Xr: NDArray,
    components: NDArray,
    y: ArrayLike,
    classes: ArrayLike = None,
    markers: List[str] = None,
    colors: List[str] = None,
):
    """
    The biplot is the best way to visualize all-in-one following a PCA
    analysis.

    There is an implementation in R but there is no standard implementation
    in Python.

    Parameters
    -----------
    self: :class:`watex.property.BasePlot`.
        Matplotlib property from `BasePlot` instances. Default `BasePlot`
        instance is given as a `pobj` instance and can be loaded for
        plotting purpose as::

            >>> from watex.view import pobj

        To change some default plot properties like line width or style,
        both can be set before running the script as follow ::

            >>> pobj.lw = 2. ; pobj.ls=':' # and so on

    Xr: NDArray of transformed X.
        the PCA projected data scores on n-given components. The first two
        columns are used as the PC1 and PC2 coordinates.

    components: NDArray, shape (n_components, n_eigenvectors ),
        the eigenvectors of the PCA. If the number of columns of `Xr`
        equals the number of rows of `components`, the matrix is transposed
        so that each row describes one variable.

    y: Array-like,
        the target composing the class labels. It is flattened to a 1d
        array before building the per-class masks.

    classes: array-like, optional
        class categories or class labels to plot. When not given, the
        unique values of `y` are used.

    markers: list of str, optional
        Matplotlib markers, one per class. Generated automatically when
        not given.

    colors: list of str, optional
        Matplotlib colors, one per class. Generated automatically when
        not given.

    Examples
    ---------
    >>> from watex.analysis import nPCA
    >>> from watex.datasets import fetch_data
    >>> from watex.view import biPlot, pobj  # pobj is Baseplot instance
    >>> X, y = fetch_data ('bagoue pca' )  # fetch pca data
    >>> pca= nPCA (X, n_components= 2 , return_X= False ) # return PCA object
    >>> components = pca.components_ [:2, :] # for two components
    >>> biPlot (pobj, pca.X, components , y ) # pca.X is the reduced dim X
    >>> # to change for instance line width (lw) or style (ls)
    >>> # just use the baseplotobject (pobj)

    References
    -----------
    Originally written by `Serafeim Loukas`_, serafeim.loukas@epfl.ch
    and was edited to fit the :term:`watex` package API.

    .. _Serafeim Loukas: https://towardsdatascience.com/...-python-7c274582c37e

    """
    Xr = check_array(
        Xr,
        to_frame=False,
        input_name="X reduced 'Xr'"
    )
    components = check_array(
        components,
        to_frame=False,
        input_name="PCA components"
    )
    Xr = np.array(Xr)
    components = np.array(components)
    # A plain list compared with ``==`` yields a single boolean rather than
    # an element-wise mask, so always work with a flat ndarray target.
    y = np.asarray(y).ravel()

    xs = Xr[:, 0]  # projection on PC1
    ys = Xr[:, 1]  # projection on PC2

    if Xr.shape[1] == components.shape[0]:
        # components is given as (n_components, n_variables): transpose it
        # so that each row holds the scores of one variable.
        components = components.T

    n_vars = components.shape[0]  # number of variables

    fig = plt.figure(figsize=self.fig_size, dpi=self.fig_dpi)

    if classes is None:
        classes = np.unique(y)
    n_classes = len(classes)

    if colors is None:
        # one color per group
        colors = make_mpl_properties(n_classes)
    colors = [colors[i] for i in range(n_classes)]

    if markers is None:
        markers = make_mpl_properties(n_classes, prop='marker')
    markers = [markers[i] for i in range(n_classes)]

    for pos, label in enumerate(classes):
        mask = y == label
        plt.scatter(xs[mask], ys[mask], c=colors[pos], marker=markers[pos])

    for i in range(n_vars):
        # plot as arrows the variable scores
        # (each variable has a score for PC1 and one for PC2)
        pc1, pc2 = components[i, 0], components[i, 1]
        plt.arrow(0, 0, pc1, pc2,
                  color=self.lc,
                  alpha=self.alpha,
                  linestyle=self.ls,
                  linewidth=self.lw,
                  overhang=0.2)
        plt.text(pc1 * 1.15, pc2 * 1.15,
                 "Var" + str(i + 1),
                 color='k',
                 ha='center',
                 va='center',
                 fontsize=self.font_size
                 )

    plt.xlabel(self.xlabel or "PC1", size=self.font_size)
    plt.ylabel(self.ylabel or "PC2", size=self.font_size)

    # The axes are symmetric around the origin, so size them from the
    # largest absolute score; using only the maximum would clip points
    # whose negative scores exceed the positive ones.
    limx = int(np.abs(xs).max()) + 1
    limy = int(np.abs(ys).max()) + 1
    plt.xlim([-limx, limx])
    plt.ylim([-limy, limy])
    plt.grid()
    plt.tick_params(axis='both', which='both', labelsize=self.font_size)

    self.save(fig)
# if self.savefig is not None:
#     savefigure (plt, self.savefig, dpi = self.fig_dpi )

def _remaining_plot_roperties(self, ax, xlim=None, ylim=None, fig=None):
    """Append the remaining lines properties such as xlabel, grid,
    legend and ticks parameters.

    Gathered in one place so that the plotting functions do not repeat
    the same finishing steps.

    :param self: plot object holding the plot properties (labels, font
        sizes, grid and legend settings) and the ``save`` method.
    :param ax: matplotlib.pyplot.axis
    :param (xlim, ylim): Limit of x-axis and y-axis
    :param fig: Matplotlib.figure name.

    :return: self- Plot object.
    """
    # Unset labels are stored back as empty strings on the plot object.
    if self.xlabel is None:
        self.xlabel = ''
    if self.ylabel is None:
        self.ylabel = ''

    if xlim is not None:
        ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)

    label_size = .5 * self.font_size * self.fs
    ax.set_xlabel(self.xlabel, fontsize=label_size)
    ax.set_ylabel(self.ylabel, fontsize=label_size)
    ax.tick_params(axis='both', labelsize=label_size)

    if self.show_grid is True:
        if self.gwhich == 'minor':
            ax.minorticks_on()
        ax.grid(self.show_grid,
                axis=self.gaxis,
                which=self.gwhich,
                color=self.gc,
                linestyle=self.gls,
                linewidth=self.glw,
                alpha=self.galpha
                )

    # Fall back to an automatic legend placement when none is requested.
    self.leg_kws.setdefault('loc', 'best')
    ax.legend(**self.leg_kws)

    self.save(fig)

    return self


def _chk_predict_args(Xt, yt, *args, predict=False):
    """ Validate arguments passed for model prediction.

    :param Xt: ndarray|DataFrame, test data
    :param yt: array-like, pandas serie for test label
    :param args: exactly three positional values ``(index, clf, ypred)``.
    :param predict: bool, expect a prediction or not.
    :returns: Tuple (Xt, yt, index , clf , ypred )- tuple of :
        * Xt : test data
        * yt : test label data
        * index :index to fit the samples in the dataframe or the
            shape [0] of ndarray
        * clf: the predictor or estimator
        * ypred: the estimator predicted values
    :raises TypeError: if `index` is not iterable, if `Xt` is missing
        while a prediction is requested, or if `yt` and `ypred` sizes
        differ.
    :raises EstimatorError: if a prediction is requested without a valid
        estimator.
    """
    # index is used for displayed the examples label in x-abscissa
    # for instance index = ['b4, 'b5', 'b11', ... ,'b425', 'b427', 'b430']
    index, clf, ypred = args

    if index is not None:
        # control len of index and len of y
        if not is_iterable(index):
            raise TypeError(
                "Index is an iterable object with the same length "
                f"as 'y', got {type(index).__name__!r}")

        if len(yt) != len(index):
            warnings.warn(
                f"Expect an index size be consistent with 'y' size={len(yt)},"
                f" got '{len(index)}'. Given index can not be used."
            )
            index = None
        else:
            if isinstance(yt, (pd.Series, pd.DataFrame)):
                if not np.all(yt.index.isin(index)):
                    warnings.warn(
                        "Given index values are mismatched. Note that for "
                        "overlaying the model plot, 'Xt' indexes must be "
                        "identical to the one in target 'yt'. The indexes"
                        " provided are wrong and should be resetted."
                    )
                    index = yt.index
                    # ``values`` is a property, not a method.
                    yt = yt.values
            yt = pd.Series(yt, index=index)

    if predict:
        if clf is None:
            warnings.warn("An estimator/classifier is needed for prediction."
                          " Got Nonetype.")
            raise EstimatorError("No estimator detected. Could not predict 'y'")
        if Xt is None:
            raise TypeError(
                "Test data 'Xt' is needed for prediction. Got nothing")

        # Accept any object exposing ``fit`` and ``predict`` (the only two
        # methods used below) in addition to callables, classes and
        # instances of ABCMeta-based classes.
        has_estimator_api = hasattr(clf, 'fit') and hasattr(clf, 'predict')
        if (
            not has_estimator_api
            and not hasattr(clf, '__call__')
            and not inspect.isclass(clf)
            and type(clf.__class__) != ABCMeta
        ):
            raise EstimatorError(
                f"{clf.__class__.__name__!r} is not an estimator/classifier."
                " 'y' prediction is aborted!")

        # NOTE(review): the estimator is (re)fitted on the test data before
        # predicting on that same data -- confirm this is intended.
        clf.fit(Xt, yt)
        ypred = clf.predict(Xt)

        if isinstance(Xt, (pd.DataFrame, pd.Series)):
            if index is None:
                index = Xt.index

    if isinstance(yt, pd.Series):
        index = yt.index.astype('>U12')

    if index is None:
        # take default values if indexes are not given
        index = np.arange(len(yt))

    if len(yt) != len(ypred):
        raise TypeError("'ypred'(predicted) and 'yt'(true target) sizes must"
                        f" be consistent. Expected {len(yt)}, got {len(ypred)}")

    return Xt, yt, index, clf, ypred


def _check_labelxy(lablist, ar, ax, axis='x'):
    """ Assert whether the x and y labels given for setting the ticklabels
    are consistent.

    If consistent, function set x or y labels along the x or y axis
    of the given array. Otherwise a warning is issued and the axis is left
    unchanged.

    :param lablist: list, list of the label to set along x/y axis
    :param ar: arraylike 2d, array to set x/y axis labels
    :param ax: matplotlib.pyplot.Axes,
    :param axis: str, default="x", kind of axis to set the label.
    :return: the given `ax`.
    """
    warn_msg = ("labels along axis {axis} and arr dimensions must be"
                " consistent. Expects {shape}, got {len_label}")

    if axis == 'x':
        set_ticks, set_ticklabels = ax.set_xticks, ax.set_xticklabels
    else:
        set_ticks, set_ticklabels = ax.set_yticks, ax.set_yticklabels

    if lablist is None:
        return ax

    lablist = is_iterable(lablist, exclude_string=True, transform=True)

    # x tick labels run along the columns (shape[1]) and y tick labels
    # along the rows (shape[0]) of the matrix.
    expected = np.shape(ar)[1 if axis == 'x' else 0]

    if len(lablist) != expected:
        warnings.warn(warn_msg.format(
            axis=axis,
            shape=expected,
            len_label=len(lablist))
        )
    else:
        set_ticks(np.arange(0, expected))
        set_ticklabels(lablist)

    return ax
def plot2d(
    ar,
    y=None,
    x=None,
    distance=50.,
    stnlist=None,
    prefix='S',
    how='py',
    to_log10=False,
    plot_contours=False,
    top_label='',
    **baseplot_kws
):
    """Two dimensional template for visualization matrices.

    It is a wrappers that can plot any matrice by customizing the position
    X and y. By default X is considering as stations and y the resistivity
    log data.

    Parameters
    -----------
    ar: Array-like 2D, shape (M, N)
        2D array for plotting. For instance, it can be a 2D resistivity
        collected at all stations (N) and all frequency (M)
    y: array-like, default=None
        Y-coordinates. It should have the length M, the number of rows of
        `ar`. If not given, the row positions ``0..M-1`` are used.

    x: array-like, default=None,
        X-coordinates. It should have the length N, the number of columns
        of `ar`. Note that if `x` is given, the `distance` is not needed.

    distance: float
        The step between two stations. If given, it creates an array of
        position for plotting purpose. Default value is ``50`` meters.
        A null or missing value falls back to a step of ``1``.

    stnlist: list of str
        List of stations names. If given, it should have the same length
        as the number of columns N of `ar`.

    prefix: str
        string value to add as prefix of given id. Prefix can be the site
        name. Default is ``S``.

    how: str
        Mode to index the station. Default is 'Python indexing' i.e.
        the counting of stations would starts by 0. Any other mode will
        start the counting by 1.

    to_log10: bool, default=False
        Recompute the `ar` in logarithm base 10 values. When ``True``,
        a given ``y`` is converted to log10 too.
    plot_contours: bool, default=False
        Plot the contours map. Is available only if the ``plt_style`` is
        set to ``pcolormesh``.

    top_label: str,
        Name of the top label.

    baseplot_kws: dict,
        All all the keywords arguments passed to the property
        :class:`watex.property.BasePlot` class. Each of them is set as an
        attribute of the base plot object ``pobj``.

    Returns
    -------
    axe: <AxesSubplot> object

    Raises
    ------
    ValueError
        If `y`, `x` or the station names do not match the shape of `ar`.
    TypeError
        If `distance` cannot be converted to a float.

    Examples
    --------
    >>> import numpy as np
    >>> import watex
    >>> np.random.seed (42)
    >>> data = np.random.randn ( 15, 20 )
    >>> data_nan = data.copy()
    >>> data_nan [2, 1] = np.nan; data_nan[4, 2]= np.nan; data_nan[6, 3]=np.nan
    >>> watex.view.mlplot.plot2d (data )
    <AxesSubplot: ...>
    >>> watex.view.mlplot.plot2d (data_nan , plt_style = 'imshow',
    ...                           fig_size = (10, 4))

    """
    # update base plot keyword arguments
    for prop_name, prop_value in baseplot_kws.items():
        setattr(pobj, prop_name, prop_value)

    if y is not None and len(y) != ar.shape[0]:
        raise ValueError("'y' array must have an identical number "
                         f" of row of 2D array: {ar.shape[0]}")
    if x is not None and len(x) != ar.shape[1]:
        raise ValueError(" 'x' array must have the same number "
                         f" of columns of 2D array: {ar.shape[1]}")

    # Step between stations; a null/missing distance falls back to 1.
    try:
        step = float(distance) if distance else 1.
    except (TypeError, ValueError):
        raise TypeError(
            f'Expect a float value not {type(distance).__name__!r}')

    # put value to log10 if True
    if to_log10:
        ar = np.log10(ar)  # assume the resistivity data
        y = np.log10(y) if y is not None else y  # assume the frequency data

    # Explicit ``None`` tests: the truth value of a NumPy array is ambiguous.
    y = np.arange(ar.shape[0]) if y is None else np.asarray(y)
    x = np.arange(ar.shape[1]) * step if x is None else np.asarray(x)

    if stnlist is None or len(stnlist) == 0:
        stn = make_ids(x, prefix, how=how)
    else:
        stn = stnlist

    if stn is not None:
        stn = np.array(stn)

        if not _check_consistency_size(stn, x, error="ignore"):
            raise ValueError("The list of stations and positions must be"
                             f" consistent. {len(stn)} and {len(x)}"
                             " were given respectively")

    # make figure
    fig, axe = plt.subplots(1,
                            figsize=pobj.fig_size,
                            num=pobj.fig_num,
                            dpi=pobj.fig_dpi
                            )

    cmap = plt.get_cmap(pobj.cmap)

    if pobj.plt_style not in ('pcolormesh', 'imshow'):
        warnings.warn(f"Unrecognized plot style {pobj.plt_style!r}."
                      " Expect ['pcolormesh'|'imshow']."
                      " 'pcolormesh' ( default) is used instead.")
        pobj.plt_style = 'pcolormesh'

    if pobj.plt_style == 'pcolormesh':
        X, Y = np.meshgrid(x, y)
        if plot_contours:
            levels = mticker.MaxNLocator(nbins=15).tick_values(
                np.nanmin(ar), np.nanmax(ar))
            # vmin and vmax are not supported when a norm is passed
            pkws = dict(
                norm=BoundaryNorm(levels, ncolors=cmap.N, clip=True))
        else:
            pkws = dict(vmax=np.nanmax(ar), vmin=np.nanmin(ar))

        ax = axe.pcolormesh(X, Y, np.flipud(ar),
                            shading=pobj.plt_shading,
                            cmap=cmap,
                            **pkws
                            )
        if plot_contours:
            # contours are *point* based plots, so convert
            # our bound into point centers
            dx, dy = 0.05, 0.05
            axe.contourf(X + dx / 2.,
                         Y + dy / 2., np.flipud(ar),
                         levels=levels,
                         cmap=cmap
                         )

    if pobj.plt_style == 'imshow':
        ax = axe.imshow(ar,
                        interpolation=pobj.imshow_interp,
                        cmap=cmap,
                        aspect=pobj.fig_aspect,
                        origin='lower',
                        extent=(np.nanmin(x),
                                np.nanmax(x),
                                np.nanmin(y),
                                np.nanmax(y))
                        )

    # set axis limit
    axe.set_ylim(np.nanmin(y), np.nanmax(y))
    axe.set_xlim(np.nanmin(x), np.nanmax(x))

    cbl = 'log_{10}' if to_log10 else ''
    axe.set_xlabel(pobj.xlabel or 'Distance(m)',
                   fontdict={
                       'size': 1.5 * pobj.font_size,
                       'weight': pobj.font_weight}
                   )

    axe.set_ylabel(pobj.ylabel or f"{cbl}Frequency$[Hz]$",
                   fontdict={
                       'size': 1.5 * pobj.font_size,
                       'weight': pobj.font_weight})
    if pobj.show_grid is True:
        axe.minorticks_on()
        axe.grid(color='k', ls=':', lw=0.25, alpha=0.7,
                 which='major')

    labex = pobj.cb_label or f"{cbl}App.Res$[Ω.m]$"

    cb = fig.colorbar(ax, ax=axe)
    cb.ax.yaxis.tick_left()
    cb.ax.tick_params(axis='y', direction='in', pad=2.,
                      labelsize=pobj.font_size)

    cb.set_label(labex, fontdict={'size': 1.2 * pobj.font_size,
                                  'style': pobj.font_style})

    # --> set second axis, used to display the station names on top
    axe2 = axe.twiny()
    axe2.set_xticks(range(len(x)), minor=False)

    # set ticks params to reformat the size
    axe.tick_params(labelsize=pobj.font_size)
    axe2.tick_params(labelsize=pobj.font_size)
    # get xticks and format labels using the auto detection
    _get_xticks_formatage(axe2, stn, fmt='S{:02}', auto=True,
                          rotation=pobj.rotate_xlabel)

    axe2.set_xlabel(top_label, fontdict={
        'style': pobj.font_style,
        'size': 1.5 * pobj.font_size,
        'weight': pobj.font_weight},
    )

    fig.suptitle(pobj.fig_title, ha='left',
                 fontsize=15 * pobj.fs,
                 verticalalignment='center',
                 style=pobj.font_style,
                 bbox=dict(boxstyle='round', facecolor='moccasin')
                 )

    plt.tight_layout()

    if pobj.savefig is not None:
        fig.savefig(pobj.savefig, dpi=pobj.fig_dpi,
                    orientation=pobj.orient)

    # Display interactively unless the figure was written to disk.
    if pobj.savefig is None:
        plt.show()
    else:
        plt.close(fig=fig)

    return axe