Source code for watex.analysis.factor

# -*- coding: utf-8 -*-
"""
Model selection with Probabilistic PCA and Factor Analysis (FA)
====================================================================

Probabilistic PCA and Factor Analysis are probabilistic models. The consequence 
is that the likelihood of new data can be used for model selection and 
covariance estimation. Here we compare PCA and FA with cross-validation on 
low rank data corrupted with homoscedastic noise 
(noise variance is the same for each feature) or heteroscedastic noise 
(noise variance is the different for each feature). In a second step we compare 
the model likelihood to the likelihoods obtained from shrinkage covariance 
estimators.

One can observe that with homoscedastic noise both FA and PCA succeed in 
recovering the size of the low rank subspace. The likelihood with PCA is 
higher than FA in this case. However PCA fails and overestimates the rank 
when heteroscedastic noise is present. Under appropriate circumstances the 
low rank models are more likely than shrinkage models.

The automatic estimation from Automatic Choice of Dimensionality for PCA. 
NIPS 2000: 598-604 by Thomas P. Minka is also compared.

# Authors: Alexandre Gramfort & Denis A. Engemann
# License: BSD 3 clause
edited by LKouadio on Tue Oct 11 16:54:26 2022
"""

import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg
from sklearn.decomposition import PCA, FactorAnalysis 
from sklearn.covariance import  ShrunkCovariance, LedoitWolf 
from sklearn.model_selection import GridSearchCV, cross_val_score
from .._docstring import _core_docs

__all__=[ 
    "LW_score", 
    "shrunk_cov_score", 
    "compute_scores", 
    "pcavsfa", 
    "make_scedastic_data", 
    ]

[docs] def compute_scores(X, n_features , n_components = 5): n_components = np.arange(0, n_features, n_components) pca = PCA(svd_solver='full') fa = FactorAnalysis() pca_scores, fa_scores = [], [] for n in n_components: pca.n_components = n fa.n_components = n pca_scores.append(np.mean(cross_val_score(pca, X))) fa_scores.append(np.mean(cross_val_score(fa, X))) return pca_scores, fa_scores
compute_scores.__doc__ ="""\ Compute PCA score and Factor Analysis scores from training X. Parameters ----------- {params.X} n_features: int, number of features that composes X n_components: int, default {{5}} number of component to retrieve. Returns --------- Tuple (pca_scores, fa_scores): Scores from PCA and FA from transformed X """.format(params =_core_docs["params"])
[docs] def shrunk_cov_score(X): shrinkages = np.logspace(-2, 0, 30) # Fit the models cv = GridSearchCV(ShrunkCovariance(), {'shrinkage': shrinkages}) return np.mean(cross_val_score(cv.fit(X).best_estimator_, X))
shrunk_cov_score.__doc__="""\ shrunk the covariance scores. Parameters ----------- {params.X} Returns ----------- score: score of covariance estimator (best ) with shrinkage """.format( params =_core_docs["params"] )
[docs] def LW_score(X, store_precision=True, assume_centered=False, **kws): r"""Models score from Ledoit-Wolf. Parameters ---------- store_precision : bool, default=True Specify if the estimated precision is stored. assume_centered : bool, default=False If True, data will not be centered before computation. Useful when working with data whose mean is almost, but not exactly zero. If False (default), data will be centered before computation. block_size : int, default=1000 Size of blocks into which the covariance matrix will be split during its Ledoit-Wolf estimation. This is purely a memory optimization and does not affect results. Notes ----- The regularised covariance is: .. math:: (1 - text{shrinkage}) * \text{cov} + \text{shrinkage} * \mu * \text{np.identity(n_features)} where :math:`\mu = \text{trace(cov)} / n_{features}` and shrinkage is given by the Ledoit and Wolf formula See also ---------- LedoitWolf References ---------- "A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices", Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2, February 2004, pages 365-411. """ return np.mean(cross_val_score(LedoitWolf(**kws), X))
[docs] def pcavsfa ( X, #n_samples, n_features, rank =10 , sigma =1. , n_components =5, random_state = 42 , verbose =0 , view =False, ): # options for n_components n_samples, n_features = len(X), X.shape[1] n_components = np.arange(0, n_features, n_components) rng = np.random.RandomState(random_state) U, _, _ = linalg.svd(rng.randn(n_features, n_features)) #X = np.dot(rng.randn(n_samples, rank), U[:, :rank].T) X = np.dot(rng.randn(n_samples, n_features), U[:, :rank].T) # Adding homoscedastic noise X_homo = X + sigma * rng.randn(n_samples, n_features) # Adding heteroscedastic noise sigmas = sigma * rng.rand(n_features) + sigma / 2. X_hetero = X + rng.randn(n_samples, n_features) * sigmas for X, title in [(X_homo, 'Homoscedastic Noise'), (X_hetero, 'Heteroscedastic Noise')]: pca_scores, fa_scores = compute_scores(X, n_features) n_components_pca = n_components[np.argmax(pca_scores)] n_components_fa = n_components[np.argmax(fa_scores)] pca = PCA(svd_solver='full', n_components='mle') pca.fit(X) n_components_pca_mle = pca.n_components_ if verbose: print("best n_components by PCA CV = %d" % n_components_pca) print("best n_components by FactorAnalysis CV = %d" % n_components_fa) print("best n_components by PCA MLE = %d" % n_components_pca_mle) plt.figure() plt.plot(n_components, pca_scores, 'b', label='PCA scores') plt.plot(n_components, fa_scores, 'r', label='FA scores') plt.axvline(rank, color='g', label='TRUTH: %d' % rank, linestyle='-') plt.axvline(n_components_pca, color='b', label='PCA CV: %d' % n_components_pca, linestyle='--') plt.axvline(n_components_fa, color='r', label='FactorAnalysis CV: %d' % n_components_fa, linestyle='--') plt.axvline(n_components_pca_mle, color='k', label='PCA MLE: %d' % n_components_pca_mle, linestyle='--') # compare with other covariance estimators plt.axhline(shrunk_cov_score(X), color='violet', label='Shrunk Covariance MLE', linestyle='-.') plt.axhline(LW_score(X), color='orange', label='LedoitWolf MLE' % n_components_pca_mle, linestyle='-.') plt.xlabel('nb of components') plt.ylabel('CV scores') plt.legend(loc='lower right') plt.title(title) plt.show() return pca_scores, fa_scores
pcavsfa.__doc__="""\ Compute PCA score and Factor Analysis scores from training X and compare probabilistic PCA and Factor Analysis models. Parameters ----------- {params.X} n_features: int, number of features that composes X n_components: int, default {{5}} number of component to retrieve. rank: int, default{{10}} Bounding for ranking sigma: float, default {{1.}} data pertubator ratio for adding heteroscedastic noise random_state: int , default {{42}} Determines random number generation for dataset shuffling. Pass an int for reproducible output across multiple function calls. {params.verbose} Returns --------- Tuple (pca_scores, fa_scores): Scores from PCA and FA from transformed X """.format( params =_core_docs["params"] )
[docs] def make_scedastic_data ( n_samples= 1000, n_features=50, rank = 10, sigma=1., random_state =42 ): """ Generate a sampling data for probabilistic PCA and Factor Analysis for model comparison. By default: nsamples = 1000 n_features = 50 rank =10 Returns ---------- * X: sampling data * X_homo: sampling data with homoscedastic noise * X_hetero: sampling with heteroscedastic noise * n_components: number of components 50 features. """ # Create the data rng = np.random.RandomState(random_state ) U, _, _ = linalg.svd(rng.randn(n_features, n_features)) X = np.dot(rng.randn(n_samples, rank), U[:, :rank].T) # Adding homoscedastic noise X_homo = X + sigma * rng.randn(n_samples, n_features) # Adding heteroscedastic noise sigmas = sigma * rng.rand(n_features) + sigma / 2. X_hetero = X + rng.randn(n_samples, n_features) * sigmas # Fit the models n_components = np.arange(0, n_features, 5) # options for n_components return X, X_homo, X_hetero , n_components