Source code for watex.analysis.factor

# -*- coding: utf-8 -*-
"""
Model selection with Probabilistic PCA and Factor Analysis (FA)
====================================================================

Probabilistic PCA and Factor Analysis are probabilistic models. The consequence 
is that the likelihood of new data can be used for model selection and 
covariance estimation. Here we compare PCA and FA with cross-validation on 
low rank data corrupted with homoscedastic noise 
(noise variance is the same for each feature) or heteroscedastic noise 
(noise variance is the different for each feature). In a second step we compare 
the model likelihood to the likelihoods obtained from shrinkage covariance 
estimators.

One can observe that with homoscedastic noise both FA and PCA succeed in 
recovering the size of the low rank subspace. The likelihood with PCA is 
higher than FA in this case. However PCA fails and overestimates the rank 
when heteroscedastic noise is present. Under appropriate circumstances the 
low rank models are more likely than shrinkage models.

The automatic estimation from Automatic Choice of Dimensionality for PCA. 
NIPS 2000: 598-604 by Thomas P. Minka is also compared.

# Authors: Alexandre Gramfort & Denis A. Engemann
# License: BSD 3 clause
edited by LKouadio on Tue Oct 11 16:54:26 2022
"""

import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg
from sklearn.decomposition import PCA, FactorAnalysis 
from sklearn.covariance import  ShrunkCovariance, LedoitWolf 
from sklearn.model_selection import GridSearchCV, cross_val_score
from .._docstring import _core_docs

__all__=[ 
    "LW_score", 
    "shrunk_cov_score", 
    "compute_scores", 
    "pcavsfa", 
    "make_scedastic_data", 
    ]


[docs]
def compute_scores(X, n_features , n_components = 5):
    n_components = np.arange(0, n_features, n_components)
    pca = PCA(svd_solver='full')
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))

    return pca_scores, fa_scores


compute_scores.__doc__ ="""\
Compute PCA score and Factor Analysis scores from training X. 
  
Parameters 
-----------
{params.X}

n_features: int, 
    number of features that composes X 
n_components: int, default {{5}}
    number of component to retrieve. 
Returns 
---------
Tuple (pca_scores, fa_scores): 
    Scores from PCA and FA  from transformed X 
""".format(params =_core_docs["params"])


[docs]
def shrunk_cov_score(X):
    shrinkages = np.logspace(-2, 0, 30)  # Fit the models
    cv = GridSearchCV(ShrunkCovariance(), {'shrinkage': shrinkages})
    return np.mean(cross_val_score(cv.fit(X).best_estimator_, X))


shrunk_cov_score.__doc__="""\
shrunk the covariance scores.
 
Parameters 
-----------
{params.X} 

Returns
-----------
score: score of covariance estimator (best ) with shrinkage

""".format(
    params =_core_docs["params"]
)

[docs]
def LW_score(X, store_precision=True, assume_centered=False,  **kws):
    r"""Models score from Ledoit-Wolf.
    
    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data will be centered before computation.

    block_size : int, default=1000
        Size of blocks into which the covariance matrix will be split
        during its Ledoit-Wolf estimation. This is purely a memory
        optimization and does not affect results.
        
    Notes
    -----
    The regularised covariance is:
        
    .. math::
        
        (1 - text{shrinkage}) * \text{cov} + \text{shrinkage} * \mu * \text{np.identity(n_features)}
    
    where :math:`\mu = \text{trace(cov)} / n_{features}`
    and shrinkage is given by the Ledoit and Wolf formula
    
    See also
    ----------
        LedoitWolf
        
    References
    ----------
    "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices",
    Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,
    February 2004, pages 365-411.
    
    """
    return np.mean(cross_val_score(LedoitWolf(**kws), X))



[docs]
def pcavsfa (
        X,
        #n_samples, n_features, 
        rank =10 , sigma =1. , n_components =5, 
        random_state = 42 , verbose =0 , view =False, 
  ):
    # options for n_components
    n_samples, n_features = len(X),  X.shape[1]
    n_components = np.arange(0, n_features, n_components) 
    
    rng = np.random.RandomState(random_state)
    U, _, _ = linalg.svd(rng.randn(n_features, n_features))
    
    #X = np.dot(rng.randn(n_samples, rank), U[:, :rank].T)
    X = np.dot(rng.randn(n_samples, n_features), U[:, :rank].T)
    # Adding homoscedastic noise
    X_homo = X + sigma * rng.randn(n_samples, n_features)

    # Adding heteroscedastic noise
    sigmas = sigma * rng.rand(n_features) + sigma / 2.
    X_hetero = X + rng.randn(n_samples, n_features) * sigmas


    for X, title in [(X_homo, 'Homoscedastic Noise'),
                     (X_hetero, 'Heteroscedastic Noise')]:
        pca_scores, fa_scores = compute_scores(X, n_features)
        n_components_pca = n_components[np.argmax(pca_scores)]
        n_components_fa = n_components[np.argmax(fa_scores)]
    
        pca = PCA(svd_solver='full', n_components='mle')
        pca.fit(X)
        n_components_pca_mle = pca.n_components_
        
        if verbose:
            print("best n_components by PCA CV = %d" % n_components_pca)
            print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
            print("best n_components by PCA MLE = %d" % n_components_pca_mle)
    
        
        plt.figure()
        plt.plot(n_components, pca_scores, 'b', label='PCA scores')
        plt.plot(n_components, fa_scores, 'r', label='FA scores')
        plt.axvline(rank, color='g', label='TRUTH: %d' % rank, linestyle='-')
        plt.axvline(n_components_pca, color='b',
                    label='PCA CV: %d' % n_components_pca, linestyle='--')
        plt.axvline(n_components_fa, color='r',
                    label='FactorAnalysis CV: %d' % n_components_fa,
                    linestyle='--')
        plt.axvline(n_components_pca_mle, color='k',
                    label='PCA MLE: %d' % n_components_pca_mle, linestyle='--')
    
        # compare with other covariance estimators
        plt.axhline(shrunk_cov_score(X), color='violet',
                    label='Shrunk Covariance MLE', linestyle='-.')
        plt.axhline(LW_score(X), color='orange',
                    label='LedoitWolf MLE' % n_components_pca_mle, linestyle='-.')
    
        plt.xlabel('nb of components')
        plt.ylabel('CV scores')
        plt.legend(loc='lower right')
        plt.title(title)
    
    plt.show()
    return pca_scores, fa_scores

    
pcavsfa.__doc__="""\
Compute PCA score and Factor Analysis scores from training X and compare  
probabilistic PCA and Factor Analysis  models.
  
Parameters 
-----------
{params.X}

n_features: int, 
    number of features that composes X 
n_components: int, default {{5}}
    number of component to retrieve. 
rank: int, default{{10}}
    Bounding for ranking 
sigma: float, default {{1.}}
    data pertubator ratio for adding heteroscedastic noise
random_state: int , default {{42}}
    Determines random number generation for dataset shuffling. Pass an int
    for reproducible output across multiple function calls.
    
{params.verbose}

Returns 
---------
Tuple (pca_scores, fa_scores): 
    Scores from PCA and FA  from transformed X 
""".format(
    params =_core_docs["params"]
)    
    
    

[docs]
def make_scedastic_data (
        n_samples= 1000, n_features=50, rank =  10, sigma=1., 
        random_state =42
   ): 
    """ Generate a sampling data for probabilistic PCA and Factor Analysis for  
    model comparison. 
    
    By default: 
        nsamples    = 1000 
        n_features  = 50  
        rank        =10 
        
    Returns 
    ----------
    * X: sampling data 
    * X_homo: sampling data with homoscedastic noise
    * X_hetero: sampling with heteroscedastic noise
    * n_components: number of components  50 features. 
    
    """
    # Create the data
    rng = np.random.RandomState(random_state )
    U, _, _ = linalg.svd(rng.randn(n_features, n_features))
    X = np.dot(rng.randn(n_samples, rank), U[:, :rank].T)
    
    # Adding homoscedastic noise
    X_homo = X + sigma * rng.randn(n_samples, n_features)
    
    # Adding heteroscedastic noise
    sigmas = sigma * rng.rand(n_features) + sigma / 2.
    X_hetero = X + rng.randn(n_samples, n_features) * sigmas

    # Fit the models
    n_components = np.arange(0, n_features, 5)  # options for n_components
    
    return X, X_homo, X_hetero , n_components