Source code for watex.analysis.decomposition

# -*- coding: utf-8 -*-
#   Licence:BSD 3-Clause
#   Author: LKouadio <etanoyau@gmail.com>
#   Copyright (c) 2021-2022
"""
Decomposition 
================

Steps behing the principal component analysis (PCA) and matrices decomposition 
"""

from warnings import warn 
import numpy as np 
import matplotlib.pyplot as plt 
from matplotlib.colors import ListedColormap 
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA 
from .._docstring import _core_docs 
from ..utils.funcutils import _assert_all_types 
# ---
__all__=[
    "extract_pca", 
    "decision_region", 
    "feature_transformation", 
    "total_variance_ratio" , 
    "linear_discriminant_analysis"
    ]

[docs]def extract_pca (X): # standize the features sc = StandardScaler() X= sc.fit_transform(X) # constructing the covariance matrix cov_mat = np.cov(X.T) eigen_vals, eigen_vecs = np.linalg.eig (cov_mat) return eigen_vals, eigen_vecs, X
extract_pca.__doc__="""\ A naive approach to extract PCA from training set X Parameters ---------- {params.X} Returns -------- Tuple (eigen_vals, eigen_vecs, Xsc): Eigen values , eigen vectors and Xsc scaled (standardized) Examples --------- >>> from watex.exlib.sklearn import SimpleImputer >>> from watex.utils import selectfeatures >>> from watex.datasets import fetch_data >>> from watex.analysis import extract_pca >>> data= fetch_data("bagoue original").get('data=dfy1') # encoded flow categories >>> y = data.flow ; X= data.drop(columns='flow') >>> # select the numerical features >>> X =selectfeatures(X, include ='number') >>> # imputed the missing data >>> X = SimpleImputer().fit_transform(X) >>> eigval, eigvecs, _ = extract_pca(X) >>> eigval ... array([2.09220756, 1.43940464, 0.20251943, 1.08913226, 0.97512157, 0.85749283, 0.64907948, 0.71364687]) Notes ------- All consequent principal component (pc) will have the larget variance given the constraint that these component are uncorrelated (orthogonal) to other pc - even if the inputs features are corralated , the resulting of pc will be mutually orthogonal (uncorelated). Note that the PCA directions are highly sensistive to data scaling and we need to standardize the features prior to PCA if the features were measured on different scales and we assign equal importances of all features the numpy function was designed to operate on both symetric and non-symetric squares matrices. However you may find it return complex eigenvalues in certains casesA related function, `numpy.linalg.eigh` has been implemented to decompose Hermetian matrices which is numerically more stable to work with symetric matrices such as the covariance matrix. `numpy.linalg.eigh` always returns real eigh eigenvalues """.format(params = _core_docs["params"] )
[docs]def total_variance_ratio (X, view =False): eigen_vals, eigen_vcs, _ = extract_pca(X) tot =sum(eigen_vals) # sorting the eigen values by decreasing # order to rank the eigen_vectors var_exp = list(map( lambda x: x/ tot , sorted (eigen_vals, reverse =True))) #var_exp = [(i/tot) for i in sorted (eigen_vals, reverse =True)] cum_var_exp = np.cumsum (var_exp) if view: plt.bar (range(1, len(eigen_vals)+1), var_exp , alpha =.5, align='center', label ='Individual explained variance') plt.step (range(1, len(eigen_vals)+1), cum_var_exp, where ='mid', label="Cumulative explained variance") plt.ylabel ("Explained variance ratio") plt.xlabel ('Principal component analysis') plt.legend (loc ='best') plt.tight_layout() plt.show () return cum_var_exp
total_variance_ratio.__doc__ ="""\ Compute the total variance ratio. Is the ratio of an eigenvalues :math:`\\lambda_j`, as simply the fraction of and eigen value, :math:`\\lambda_j` and the total sum of the eigen values as: .. math:: \\text{explained_variance_ratio}= \\frac{\\lambda_j}{\\sum{j=1}^{d} \\lambda_j} Using numpy cumsum function, we can then calculate the cumulative sum of explained variance which can be plot if `plot` is set to ``True`` via matplotlib set function. Parameters -------------- X: Nd-array, shape(M, N) Array of training set with M examples and N-features view: bool, default {'False'} give an overview of the total explained variance. Returns --------- cum_var_exp : array-like Cumulative sum of variance total explained. Examples ---------- >>> from watex.analysis import total_variance_ratio >>> # Use the X value in the example of `extract_pca` function >>> cum_var = total_variance_ratio(X, view=True) >>> cum_var ... array([0.26091916, 0.44042728, 0.57625294, 0.69786032, 0.80479823, 0.89379712, 0.97474381, 1. ]) """
[docs]def feature_transformation ( X, y=None, n_components =2, positive_class=1, view =False): # select k vectors which correspond to the k largest # eigenvalues , where k is the dimesionality of the new # subspace (k<=d) eigen_vals, eigen_vecs, X = extract_pca(X) # -> sorting the eigen values by decreasing order eigen_pairs = [ (np.abs(eigen_vals[i]) , eigen_vecs[:, i]) for i in range(len(eigen_vals))] eigen_pairs.sort(key =lambda k :k[0], reverse =True) # collect two eigen vectors that correspond to the two largest # eigenvalues to capture about 60% of the variance of this datasets if n_components !=2 : #XXX TODO: transform component > 2 warn("N-component > 2 is not implemented yet.", UserWarning) w= np.hstack((eigen_pairs[0][1][:, np.newaxis], eigen_pairs [1][1][:, np.newaxis]) ) # In pratice the number of principal component has to be # determined by a tradeoff between computational efficiency # and the performance of the classifier. #-> transform X onto a PCA subspace( the pc one on two) X_transf = X.dot(w) if view: from ..utils.plotutils import make_mpl_properties if y is None: raise TypeError("Missing the target `y`") # markers = tuple (D_MARKERS [:len(np.unique (y))]) # colors = tuple (D_COLORS [:len(np.unique (y))]) colors = make_mpl_properties(len(np.unique (y))) markers = make_mpl_properties ( len(np.unique (y)), 'markers') if positive_class not in np.unique (y): raise ValueError( f"'{positive_class}' does not match any label " "of the class. The positive class must be an " "integer label within the class values" ) for l, c, m in zip(np.unique (y),colors, markers ): plt.scatter(X_transf[y==positive_class, 0], X_transf[y==positive_class, 1], c= c, label =l, marker=m) plt.xlabel ('PC1') plt.ylabel ('PC2') plt.legend(loc='best') plt.tight_layout() plt.show() return X_transf
feature_transformation.__doc__="""\ Transform X into new principal components after decomposing the covariances matrices. Parameters ----------- {params.X} {params.y} n_components: int, default=2 Number of components with most total variance ratio. positive_class: int, class label as an integer indenfier within the class representation. view: bool, default {{'False'}} give an overview of the total explained variance. Returns --------- X_transf : nd-array X PCA training set transformed. Examples --------- >>> from watex.analysis import feature_transformation >>> # Use the X, y value in the example of `extract_pca` function >>> Xtransf = feature_transformation(X, y=y, positive_class = 2 , view =True) >>> Xtransf[0] ... array([-1.0168034 , 2.56417088]) """.format(params = _core_docs["params"] ) def _decision_region (X, y, clf, resolution =.02 , ax =None ): """ visuzalize the decision region """ from ..utils.plotutils import make_mpl_properties # setup marker generator and colors map colors = tuple (make_mpl_properties(len(np.unique (y)))) markers = tuple (make_mpl_properties ( len(np.unique (y)), 'markers') ) cmap = ListedColormap(colors[:len(np.unique(y))]) # plot the decision surface x1_min , x1_max = X[:, 0].min() -1, X[:, 0].max() +1 x2_min , x2_max = X[:, 1].min() -1, X[:, 1].max() +1 xx1 , xx2 = np.meshgrid(np.arange (x1_min, x1_max, resolution), np.arange (x2_min, x2_max, resolution) ) z= clf.predict(np.array ([xx1.ravel(), xx2.ravel()]).T) z= z.reshape (xx1.shape) if ax is None: fig, ax= plt.subplots() ax.contourf (xx1, xx2, z, alpha =.4, cmap =cmap ) ax.set_xlim(xx1.min(), xx1.max()) ax.set_ylim(xx2.min(), xx2.max()) # plot the examples by classes for idx, cl in enumerate (np.unique (y)): ax.scatter (x= X[y ==cl, 0] , y = X[y==cl, 1], alpha =.6 , color = cmap(idx), edgecolors='black', marker = markers[idx], label=cl ) return ax
[docs]def decision_region ( X, y, clf, Xt =None, yt=None, random_state = 42, test_size = .3 , scaling =True, split =False, n_components =2 , view ='X', resolution =.02, return_expl_variance_ratio =False, return_axe =False, axe =None, **kws ): view = str(view).lower().strip() if view in ('xt', 'test'): view ='test' elif view in ('x', 'train'): view = 'train' else: view =None if split : X, Xt, y, yt = train_test_split(X, y, random_state =random_state, test_size =test_size, **kws) pca = PCA (n_components = n_components) # scale data sc = StandardScaler() X = sc.fit_transform(X) if scaling else X if Xt is not None: Xt = sc.transform(Xt) if scaling else Xt # dimension reduction X_pca = pca.fit_transform(X) Xt_pca = pca.transform (Xt) if ( Xt is not None) else None # fitting the classifier clf model on the reduced datasets clf.fit(X_pca, y ) # now plot the decision regions if view is not None: if view =='train': ax = _decision_region( X_pca, y, clf = clf,resolution = resolution, ax= axe) if view =='test': if Xt_pca is None: raise TypeError("Cannot plot missing test sets (Xt, yt)") ax = _decision_region(Xt_pca, yt, clf=clf, resolution =resolution, ax =axe ) ax.set_xlabel("PC1") ax.set_ylabel ("PC2") ax.legend (loc= 'lower left') plt.show () if return_expl_variance_ratio : pca =PCA(n_components =None ) X_pca = pca.fit_transform(X) return pca.explained_variance_ratio_ return ax if return_axe else X_pca
decision_region.__doc__="""\ View decision regions for the training data reduced to two principal component axes. Parameters ----------- {params.X} {params.y} {params.Xt} {params.yt} {params.clf} random_state: int, default {{42}} state of shuffling the data test_size: float < 1 , default {{.3}} the size to keep remainder data into the test set . split: bool, False Split (X,y) data into a training and test sets(Xt, yt). Here, it value is triggered to ``True``, we assume (X, y) previously given are all the whole dataset with target `y`. n_components: int, float 2 , default {{2}} the number of principal component to retrieve. If value is given as a ratio for instance '.95' i.e. the ratio of keeping variance is 95% and the `n_components can be get using the attributes scikit-learn getter as `<estimator>.n_components_` view: str , ['X', 'Xt', None] the kind of vizualization. 'X', 'Xt' mean the training and test set decision region visualization respectively. If set to ``None``(default), the view are muted. resolution: float, default{{.02}} level of the extension of numpy meshgrip to tighting layout the plot. return_expl_variance_ratio: bool, default is {{False}} returns the PCA variance ratio explaines of all principal components. return_axes: bool, default=False, Return matplotlib object axe ax: Matplotlib.Axes object, optional If not supplied, it is created. kws: dict Additional keywords arguments passed to the scikit-learn function :func:`sklearn.model_selection.train_test_split` Returns --------- nd-array | arraylike (return_expl_variance_ratio=True) X PCA training set transformed or PCA explained variance ratio. Examples --------- >>> from watex.datasets import fetch_data >>> from watex.exlib.sklearn import SimpleImputer, LogisticRegression >>> from watex.analysis.decomposition import decision_region >>> data= fetch_data("bagoue original").get('data=dfy1') # encoded flow categories >>> y = data.flow ; X= data.drop(columns='flow') >>> # select the numerical features >>> X =selectfeatures(X, include ='number') >>> # imputed the missing data >>> X = SimpleImputer().fit_transform(X) >>> lr_clf = LogisticRegression(multi_class ='ovr', random_state =1, solver ='lbfgs') >>> Xpca= decision_region(X, y, clf=lr_clf, split = True, view ='Xt') # test set view >>> Xpca[0] ... array([-1.02925449, 1.42195127]) """.format(params = _core_docs["params"] )
[docs]def linear_discriminant_analysis ( X, y, n_components = 2 , view=False, verbose = 0 , return_X=True, ): n_components = int (_assert_all_types (n_components, int, float )) # standardize the features eigen_vals, eigen_vcs, X = extract_pca(X) # compute the mean vectors which will use to # construct the within classes scatter matrix np.set_printoptions(precision=4) mean_vecs =list() for label in range (1, len(np.unique (y))): mean_vecs.append (np.mean(X[y==label], axis =0 )) if verbose : print('MV %s: %s\n' %(label , mean_vecs[label -1])) # compute the within class Sw using the mean vector; # calculated by summing up the individual scatter matrices d = X.shape [1] # number of features SW =np.zeros ((d, d )) #========================================================================== # for label , mv in zip (range (1 , len(np.unique (y))), mean_vecs): # class_scatter = np.zeros ((d, d) ) # for row in X [y==label ]: # row, mv= row.reshape (d, 1), mv.reshape (d, 1) # class_scatter += (row - mv ).dot(row - mv).T # SW += class_scatter # if verbose : # print("within-class scatter matrix: %sx %s" % ( # SW.shape [0], SW.shape[1])) # the assumption that we are making when we are computing the # scatter matrices is that the labels in the training datasets # are uniformly distributed. Howeverm, if we print the number # of class labels, we see that this assumtions is violated. # for instance : # >>> print('class label distributions: %s' % np.bincount (y)[1:]) #========================================================================== # the better way is to scale the individual scatter matrices, before # sum them up as scatter matrix SW. when dividing the scatter matrices # by the number of classes examples , we can see that computig the # scatter matrix is the same as covariance matrix for label , mv in zip (range (1 , len(np.unique (y))), mean_vecs): class_scatter =np.cov(X[y==label].T ) SW += class_scatter if verbose : print("Scaled within-class scatter matrix: %sx %s" % ( SW.shape [0], SW.shape[1])) # compute between classes scatter matrix SB mean_overall = np.mean (X, axis = 0 ) # include the example from c classes SB = np.zeros ((d, d)) for i, mean_vec in enumerate( mean_vecs ): n = X[y == i+1 , :].shape [0] mean_vec = mean_vec.reshape (d, 1) # make column vector mean_overall = mean_overall.reshape (d, 1) SB += n * (mean_vec - mean_overall ).dot ((mean_vec - mean_overall).T) if verbose : if verbose : print("Between within-class scatter matrix: %sx%s" % ( SB.shape [0], SB.shape[1])) # select discriminant for the new feature subspace eigen_vals, eigen_vecs = np.linalg.eig (np.linalg.inv (SW).dot(SB)) # sort the eigen value in descending order eigen_pairs = [ (np.abs (eigen_vals[i]), eigen_vecs [:, i]) for i in range(len(eigen_vals )) ] eigen_pairs = sorted (eigen_pairs, key =lambda k: k[0], reverse =True ) if verbose : print("Eigen values in descending order:\n") for eig_val in eigen_pairs : print(eig_val[0]) if view : tot = sum(eigen_vals.real ) discr = [(i/tot ) for i in sorted (eigen_vals.real , reverse =True ) ] cum_discr = np.cumsum(discr) plt.bar (range (1 , 1+ X.shape [1]), discr , alpha =.5 , align= 'center', label ='Individual "discriminability"' ) plt.step (range (1 , 1+ X.shape [1]), cum_discr , where ='mid', label ='Cumulative "discriminality"') plt.ylabel ('"Discriminability" ratio') plt.xlabel ("Linear discriminants") plt.ylim ([ -0.1 , 1.1 ]) plt.legend (loc ='best') plt.tight_layout() plt.show () # stack the two most discrimative eigen columns # to create the transformation matrix W for two components # W = np.hstack((eigen_pairs[0][1][:, np.newaxis].real, # eigen_pairs[1][1][:, np.newaxis].real ) # ) W0 = np.hstack(tuple (eigen_pairs[k][1][:, np.newaxis].real for k in range (len(eigen_pairs))) ) if n_components > W0.shape [1]: warn (f"n_component '{n_components}' is larger than the most " f"discriminant vector '{W0.shape [1]}'") n_components = W0.shape [1] W = W0 [:, :n_components] if verbose: print("07 first values of W:\n", W[:5, :]) # we can now transform the training dataset # from matrix W return X.dot(W) if return_X else W
linear_discriminant_analysis.__doc__ ="""\ Linear Discriminant Analysis `LDA`. LDA is used as a technique for feature extraction to increase the computational efficiency and reduce the degree of overfitting due to the curse of dimensionnality in non-regularized models. The general concept behind LDA is very similar to the principal component analysis (PCA), but whereas PCA attempts to find the orthogonal component axes of minimum variance in a dataset, the goal in LDA is to find the features subspace that optimize class separability. The main steps requiered to perform LDA are summarized below: * Standardize the d-dimensional datasets (d is the number of features) * For each class , compute the d-dimensional mean vectors. Thus for each mean feature value, :math:`\\mu_m` with respect to the examples of class :math:`i`: .. math:: m_i = \\frac{1}{n_i} \\sum{x\\in D_i} x_m * Construct the between-class scatter matrix, :math:`S_B` and the within class scatter matrix, :math:`S_W`. Individual scatter matrices are scalled :math:`S_i` before we sum them up as scatter matrix :math:`S_W` as: .. math:: \\sum{i} = \\frac{1}{n_i}S_i \\sum{i} = \\frac{1}{n_i} \\sum{x\\in D_i} (x-m_i)(x-m_i)^T The within-class is also called the covariance matrix, thus we can compute the between class scatter_matrix :math:`S_B`. .. math:: S_B= \\sum{i}^{n_i}(m_i-m) (m_i-m)^T where :math:`m` is the overall mean that is computed , including examples from all classes. * Compute the eigenvectors and corresponding eigenvalues of the matrix :math:`S_W^{-1}S_B`. * Sort the eigenvalues by decreasing order to rank the corresponding eigenvectors * Choose the :math:`k` eigenvectors that correspond to the :math:`k` largest eigenvalues to construct :math:`dxk`-dimensional transformation matrix, :math:`W`; the eigenvectors are the columns of this matrix. * project the examples onto the new_features subspaces using the transformation matrix :math:`W`. Parameters ----------- X: Ndarray ( M x N matrix where ``M=m-samples``, & ``N=n-features``) Training set; Denotes data that is observed at training and prediction time, used as independent variables in learning. When a matrix, each sample may be represented by a feature vector, or a vector of precomputed (dis)similarity with each training sample. :code:`X` may also not be a matrix, and may require a feature extractor or a pairwise metric to turn it into one before learning a model. y: array-like, shape (M, ) ``M=m-samples``, train target; Denotes data that may be observed at training time as the dependent variable in learning, but which is unavailable at prediction time, and is usually the target of prediction. n_components: int, default =2 Number of components considered as the most discriminative eigen vector. return_X: bool, default =True return the transformed training set from `n_components`. view: bool ,default =False, Visualize the LDA plot. If set to ``True``, the plot is triggered. Returns -------- X or W: ndarray (n_samples, 2 ) The transformed train set (X) or matrix (W) from the most discriminative eigenvector columns Examples ---------- >>> from watex.datasets import fetch_data >>> from watex.exlib.sklearn import SimpleImputer, LogisticRegression >>> from watex.analysis.decomposition import linear_discriminant_analysis >>> data= fetch_data("bagoue original").get('data=dfy1') # encoded flow >>> y = data.flow ; X= data.drop(columns='flow') >>> # select the numerical features >>> X =selectfeatures(X, include ='number') >>> # imputed the missing data >>> X = SimpleImputer().fit_transform(X) >>> Xtr= linear_discriminant_analysis (X, y , view =True) """