# mlxtend Machine Learning Library Extensions
#
# A function for plotting a PCA correlation circle
# File Author: Gabriel Azevedo Ferreira <az.fe.gabriel@gmail.com>
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from mlxtend.externals.adjust_text import adjust_text
from mlxtend.feature_extraction import PrincipalComponentAnalysis


def corr2_coeff(A, B):
"""
    Compute row-wise Pearson correlation coefficients between the rows of A
    and the rows of B and return them as a NumPy array.
"""
A, B = np.array(A), np.array(B)
    # Row-wise mean of input arrays & subtract from input arrays themselves
A_mA = A - A.mean(1)[:, None]
B_mB = B - B.mean(1)[:, None]
# Sum of squares across rows
ssA = (A_mA**2).sum(1)
ssB = (B_mB**2).sum(1)
# Finally get corr coeff
return np.dot(A_mA, B_mB.T) / np.sqrt(np.dot(ssA[:, None], ssB[None]))
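
# A minimal usage sketch (comments only, not executed): `corr2_coeff` treats
# each row as a variable and each column as an observation. For example,
#     A = np.array([[1., 2., 3.],
#                   [2., 4., 6.]])      # 2 variables, 3 samples each
#     corr2_coeff(A, A)
# would return a 2x2 matrix of pairwise Pearson correlations (all 1.0 here,
# since the second row is an exact multiple of the first).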


def create_correlation_table(A, B, names_cols_A, names_cols_B):
"""
    Compute correlation coefficients and return them as a DataFrame.

    A and B: 2d array like.
        The columns represent the different variables and the rows are
        the samples of those variables.
    names_cols_A/B : names to be added to the resulting pandas DataFrame.

    return: pandas DataFrame with the correlations. Columns and index
        represent the different variables of A and B (respectively).
"""
# corrs = np.corrcoef(np.transpose(A), np.transpose(B)
# )[len(names_cols_A):, :len(names_cols_A)]
corrs = corr2_coeff(A.T, B.T).T
df_corrs = pd.DataFrame(corrs,
columns=names_cols_A, index=names_cols_B)
return df_corrs
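
# Hedged usage sketch (comments only): with scores and data arranged as
# samples x variables, a call such as
#     create_correlation_table(X_pca, X, ['Dim 1', 'Dim 2'], variables_names)
# returns a DataFrame whose columns are the PCA dimensions and whose index
# holds the original variable names, each cell being their Pearson
# correlation.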


def plot_pca_correlation_graph(X, variables_names, dimensions=(1, 2),
figure_axis_size=6, X_pca=None,
explained_variance=None):
"""
    Compute the PCA for X and plot the correlation graph.

Parameters
----------
    X : 2d array like.
        The columns represent the different variables and the rows are the
        samples of those variables.
variables_names : array like
Name of the columns (the variables) of X
    dimensions : tuple with two elements.
        Dimensions to be plotted (x, y).
    figure_axis_size : int or float
        Size of the final frame. The figure created is a square with length
        and width equal to `figure_axis_size`.
X_pca : np.ndarray, shape = [n_samples, n_components].
Optional.
`X_pca` is the matrix of the transformed components from X.
If not provided, the function computes PCA automatically using
mlxtend.feature_extraction.PrincipalComponentAnalysis
        Expected `n_components >= max(dimensions)`.
explained_variance : 1 dimension np.ndarray, length = n_components
Optional.
        `explained_variance` are the eigenvalues from the diagonalized
        covariance matrix of the PCA transformation.
        If not provided, the function computes PCA independently.
        Expected `n_components == X.shape[1]`.

    Returns
    -------
    matplotlib_figure : matplotlib Figure
        The figure containing the correlation circle plot.
    correlation_matrix : pandas DataFrame
        Correlations between the original variables (rows) and the plotted
        principal-component dimensions (columns).

    Examples
    --------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/plotting/plot_pca_correlation_graph/
"""
X = np.array(X)
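    # Mean-center each column (variable) before running PCA.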
X = X - X.mean(axis=0)
n_comp = max(dimensions)
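    # If neither the transformed scores nor the eigenvalues are supplied,
    # compute the PCA here; otherwise both must be given and consistent.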
if (X_pca is None) and (explained_variance is None):
pca = PrincipalComponentAnalysis(n_components=n_comp)
pca.fit(X)
X_pca = pca.transform(X)
explained_variance = pca.e_vals_
    elif (X_pca is not None) and (explained_variance is None):
        raise ValueError("If `X_pca` is not None, the `explained_variance`"
                         " values should not be `None`.")
    elif (X_pca is None) and (explained_variance is not None):
        raise ValueError("If `explained_variance` is not None, the `X_pca`"
                         " values should not be `None`.")
elif (X_pca is not None) and (explained_variance is not None):
if X_pca.shape[1] != len(explained_variance):
raise ValueError(f"Number of principal components must "
f"match the number "
f"of eigenvalues. Got "
f"{X_pca.shape[1]} "
f"!= "
f"{len(explained_variance)}")
if X_pca.shape[1] < n_comp:
raise ValueError(f"Input array `X_pca` contains fewer principal"
f" components than expected based on `dimensions`."
f" Got {X_pca.shape[1]} components in X_pca, expected"
f" at least `max(dimensions)={n_comp}`.")
    if len(explained_variance) < n_comp:
        raise ValueError(f"Input array `explained_variance` contains fewer"
                         f" elements than expected. Got"
                         f" {len(explained_variance)} elements, expected"
                         f" `X.shape[1]={X.shape[1]}`.")
corrs = create_correlation_table(X_pca, X, ['Dim ' + str(i + 1) for i in
range(n_comp)],
variables_names)
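    # Total sample variance of the data; dividing each eigenvalue by this
    # total turns the eigenvalues into percentages of explained variance.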
tot = sum(X.var(0)) * X.shape[0] / (X.shape[0] - 1)
explained_var_ratio = [(i / tot) * 100 for i in explained_variance]
# Plotting circle
fig_res = plt.figure(figsize=(figure_axis_size, figure_axis_size))
    circle1 = plt.Circle((0, 0), radius=1, color='k', fill=False)
    fig_res.gca().add_artist(circle1)
# Plotting arrows
texts = []
for name, row in corrs.iterrows():
x = row['Dim ' + str(dimensions[0])]
y = row['Dim ' + str(dimensions[1])]
plt.arrow(0.0, 0.0, x, y, color='k', length_includes_head=True,
head_width=.05)
plt.plot([0.0, x], [0.0, y], 'k-')
texts.append(plt.text(x, y, name, fontsize=2 * figure_axis_size))
    # Plotting horizontal and vertical dashed lines
plt.plot([-1.1, 1.1], [0, 0], 'k--')
plt.plot([0, 0], [-1.1, 1.1], 'k--')
# Adjusting text
adjust_text(texts)
# Setting limits and title
plt.xlim((-1.1, 1.1))
plt.ylim((-1.1, 1.1))
plt.title("Correlation Circle", fontsize=figure_axis_size * 3)
plt.xlabel("Dim " + str(dimensions[0]) + " (%s%%)" %
str(explained_var_ratio[dimensions[0] - 1])[:4],
fontsize=figure_axis_size * 2)
plt.ylabel("Dim " + str(dimensions[1]) + " (%s%%)" %
str(explained_var_ratio[dimensions[1] - 1])[:4],
fontsize=figure_axis_size * 2)
return fig_res, corrs
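

# A minimal, hedged usage sketch (not part of the original module): it builds
# synthetic data just to show how `plot_pca_correlation_graph` is called; the
# names `X_demo` and `feature_names` below are illustrative only.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X_demo = rng.normal(size=(100, 4))
    feature_names = ['var%d' % i for i in range(1, 5)]
    figure, correlation_matrix = plot_pca_correlation_graph(
        X_demo, feature_names, dimensions=(1, 2), figure_axis_size=6)
    print(correlation_matrix)
    plt.show()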