# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Principal Component Analysis for dimensionality reduction.
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np

from .._base import _BaseModel


class PrincipalComponentAnalysis(_BaseModel):
    """
    Principal Component Analysis Class

    Parameters
    ----------
    n_components : int (default: None)
        The number of principal components for transformation.
        Keeps the original dimensions of the dataset if `None`.

    solver : str (default: 'svd')
        Method for performing the matrix decomposition.
        {'eigen', 'svd'}

    whitening : bool (default: False)
        Performs whitening such that the covariance matrix of
        the transformed data will be the identity matrix.

    Attributes
    ----------
    w_ : array-like, shape=[n_features, n_components]
        Projection matrix

    e_vals_ : array-like, shape=[n_features]
        Eigenvalues in sorted order.

    e_vecs_ : array-like, shape=[n_features, n_features]
        Eigenvectors in sorted order.

    e_vals_normalized_ : array-like, shape=[n_features]
        Normalized eigenvalues such that they sum up to 1.
        This is equal to what's often referred to as
        "explained variance ratios."

    loadings_ : array-like, shape=[n_features, n_features]
        The factor loadings of the original variables onto
        the principal components. The columns are the principal
        components, and the rows are the feature loadings.
        For instance, the first column contains the loadings onto
        the first principal component. Note that the signs may
        be flipped depending on whether you use the 'eigen' or
        'svd' solver; this does not affect the interpretation
        of the loadings, though.

    Examples
    --------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/feature_extraction/PrincipalComponentAnalysis/
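
    A minimal usage sketch (illustrative addition; the data below is
    arbitrary and not taken from the linked user guide):

    >>> import numpy as np
    >>> X = np.random.RandomState(123).rand(10, 4)
    >>> pca = PrincipalComponentAnalysis(n_components=2)
    >>> X_pca = pca.fit(X).transform(X)
    >>> X_pca.shape
    (10, 2)
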
"""

    def __init__(self, n_components=None, solver='svd', whitening=False):
        valid_solver = {'eigen', 'svd'}
        if solver not in valid_solver:
            raise AttributeError('Must be in %s. Found %s'
                                 % (valid_solver, solver))
        self.solver = solver
        if n_components is not None and n_components < 1:
            raise AttributeError('n_components must be >= 1 or None')
        self.n_components = n_components
        self._is_fitted = False
        self.whitening = whitening

    def fit(self, X, y=None):
        """Learn model from training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] (default: None)
            Ignored; only included for API consistency.

        Returns
        -------
        self : object

        """
        self._is_fitted = False
        self._check_arrays(X=X)
        self._fit(X=X)
        self._is_fitted = True
        return self

    def _fit(self, X):
        n_samples = X.shape[0]
        n_features = X.shape[1]

        # Cap the number of components at the feature dimensionality
        if self.n_components is None or self.n_components > n_features:
            n_components = n_features
        else:
            n_components = self.n_components

        if self.solver == 'eigen':
            cov_mat = self._covariance_matrix(X)
            self.e_vals_, self.e_vecs_ = \
                self._decomposition(cov_mat, n_samples)
        elif self.solver == 'svd':
            self.e_vals_, self.e_vecs_ = self._decomposition(X, n_samples)

        self.w_ = self._projection_matrix(eig_vals=self.e_vals_,
                                          eig_vecs=self.e_vecs_,
                                          whitening=self.whitening,
                                          n_components=n_components)

        # Normalize the eigenvalues so that they sum to 1
        # (explained variance ratios)
        tot = np.sum(self.e_vals_)
        self.e_vals_normalized_ = np.array([(i / tot)
                                            for i in sorted(self.e_vals_,
                                                            reverse=True)])
        self.loadings_ = self._loadings()
        return self
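
    # Illustrative note (added; not in the original source): after `fit`,
    # `e_vals_normalized_` holds the explained variance ratios, which sum
    # to 1:
    #
    #     rng = np.random.RandomState(0)
    #     X = rng.rand(20, 3)
    #     pca = PrincipalComponentAnalysis().fit(X)
    #     assert np.allclose(np.sum(pca.e_vals_normalized_), 1.0)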

    def transform(self, X):
        """Apply the linear transformation on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        X_projected : np.ndarray, shape = [n_samples, n_components]
            Projected training vectors.

        """
        self._check_arrays(X=X)
        if not hasattr(self, 'w_'):
            raise AttributeError('Object has not been fitted, yet.')
        transformed = X.dot(self.w_)
        if self.whitening:
            # Rescale each component by 1/sqrt(eigenvalue) so that the
            # projected features have unit variance
            norm = np.diag(1. / np.sqrt(self.e_vals_[:self.w_.shape[1]]))
            transformed = norm.dot(transformed.T).T
        return transformed
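
    # Illustrative property check (added; assumes the training data has a
    # full-rank covariance matrix): with `whitening=True`, the covariance
    # of the transformed data is (numerically) the identity matrix.
    # Covariance is shift-invariant, so this holds even though `transform`
    # does not re-center X:
    #
    #     rng = np.random.RandomState(1)
    #     X = rng.rand(50, 3)
    #     Z = PrincipalComponentAnalysis(whitening=True).fit(X).transform(X)
    #     assert np.allclose(np.cov(Z, rowvar=False), np.eye(3))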

    def _covariance_matrix(self, X):
        # Sample covariance matrix of the mean-centered data
        # (with the unbiased n - 1 denominator)
        mean_vec = np.mean(X, axis=0)
        cov_mat = (X - mean_vec).T.dot((X - mean_vec)) / (X.shape[0] - 1)
        return cov_mat
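
    # Sanity-check sketch (added; not in the original source): for 2-D
    # input, `_covariance_matrix` matches NumPy's estimator with
    # `rowvar=False` (which also uses the n - 1 denominator):
    #
    #     rng = np.random.RandomState(2)
    #     X = rng.rand(8, 3)
    #     pca = PrincipalComponentAnalysis()
    #     assert np.allclose(pca._covariance_matrix(X),
    #                        np.cov(X, rowvar=False))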

    def _decomposition(self, mat, n_samples):
        if self.solver == 'eigen':
            e_vals, e_vecs = np.linalg.eig(mat)
        elif self.solver == 'svd':
            # Only SVD on mean-centered data is equivalent to
            # PCA via the covariance matrix (note that computing
            # the covariance matrix will implicitly center
            # the data)
            mat_centered = mat - mat.mean(axis=0)
            u, s, v = np.linalg.svd(mat_centered.T)
            e_vecs, e_vals = u, s
            # Convert singular values into eigenvalues of the
            # covariance matrix
            e_vals = e_vals ** 2 / (n_samples - 1)
            # Pad with zero eigenvalues if there are fewer singular
            # values than eigenvectors (i.e., if n_samples < n_features)
            if e_vals.shape[0] < e_vecs.shape[1]:
                new_e_vals = np.zeros(e_vecs.shape[1])
                new_e_vals[:e_vals.shape[0]] = e_vals
                e_vals = new_e_vals

        # Sort eigenpairs by eigenvalue in descending order
        sort_idx = np.argsort(e_vals)[::-1]
        e_vals, e_vecs = e_vals[sort_idx], e_vecs[:, sort_idx]
        return e_vals, e_vecs
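
    # Illustrative check (added): both solvers recover the same eigenvalues
    # up to numerical precision (eigenvector signs may differ, which is
    # expected and harmless):
    #
    #     rng = np.random.RandomState(3)
    #     X = rng.rand(20, 5)
    #     e1 = PrincipalComponentAnalysis(solver='eigen').fit(X).e_vals_
    #     e2 = PrincipalComponentAnalysis(solver='svd').fit(X).e_vals_
    #     assert np.allclose(e1, e2)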

    def _loadings(self):
        """Compute factor loadings"""
        return self.e_vecs_ * np.sqrt(self.e_vals_)
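
    # Illustrative property (added; relies on the eigenvectors having unit
    # norm): since loadings are eigenvectors scaled by sqrt(eigenvalues),
    # the squared loadings in each column sum to that component's eigenvalue:
    #
    #     rng = np.random.RandomState(4)
    #     X = rng.rand(30, 4)
    #     pca = PrincipalComponentAnalysis().fit(X)
    #     assert np.allclose((pca.loadings_ ** 2).sum(axis=0), pca.e_vals_)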

    def _projection_matrix(self, eig_vals, eig_vecs, whitening, n_components):
        # Stack the top `n_components` eigenvectors column-wise.
        # Note that the whitening rescaling itself is applied later,
        # in `transform`.
        matrix_w = np.vstack([eig_vecs[:, i] for i in range(n_components)]).T
        return matrix_w