471 lines
13 KiB
Python
471 lines
13 KiB
Python
import random
|
|
from typing import TYPE_CHECKING, Dict, List, Optional, Set
|
|
|
|
import matplotlib.lines as mlines
|
|
import matplotlib.patches as patches
|
|
import numpy as np
|
|
|
|
from pandas._typing import Label
|
|
|
|
from pandas.core.dtypes.missing import notna
|
|
|
|
from pandas.io.formats.printing import pprint_thing
|
|
from pandas.plotting._matplotlib.style import get_standard_colors
|
|
from pandas.plotting._matplotlib.tools import create_subplots, set_ticks_props
|
|
|
|
if TYPE_CHECKING:
|
|
from matplotlib.axes import Axes
|
|
from matplotlib.figure import Figure
|
|
|
|
from pandas import DataFrame, Series
|
|
|
|
|
|
def scatter_matrix(
|
|
frame: "DataFrame",
|
|
alpha=0.5,
|
|
figsize=None,
|
|
ax=None,
|
|
grid=False,
|
|
diagonal="hist",
|
|
marker=".",
|
|
density_kwds=None,
|
|
hist_kwds=None,
|
|
range_padding=0.05,
|
|
**kwds,
|
|
):
|
|
df = frame._get_numeric_data()
|
|
n = df.columns.size
|
|
naxes = n * n
|
|
fig, axes = create_subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)
|
|
|
|
# no gaps between subplots
|
|
fig.subplots_adjust(wspace=0, hspace=0)
|
|
|
|
mask = notna(df)
|
|
|
|
marker = _get_marker_compat(marker)
|
|
|
|
hist_kwds = hist_kwds or {}
|
|
density_kwds = density_kwds or {}
|
|
|
|
# GH 14855
|
|
kwds.setdefault("edgecolors", "none")
|
|
|
|
boundaries_list = []
|
|
for a in df.columns:
|
|
values = df[a].values[mask[a].values]
|
|
rmin_, rmax_ = np.min(values), np.max(values)
|
|
rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0
|
|
boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
|
|
|
|
for i, a in enumerate(df.columns):
|
|
for j, b in enumerate(df.columns):
|
|
ax = axes[i, j]
|
|
|
|
if i == j:
|
|
values = df[a].values[mask[a].values]
|
|
|
|
# Deal with the diagonal by drawing a histogram there.
|
|
if diagonal == "hist":
|
|
ax.hist(values, **hist_kwds)
|
|
|
|
elif diagonal in ("kde", "density"):
|
|
from scipy.stats import gaussian_kde
|
|
|
|
y = values
|
|
gkde = gaussian_kde(y)
|
|
ind = np.linspace(y.min(), y.max(), 1000)
|
|
ax.plot(ind, gkde.evaluate(ind), **density_kwds)
|
|
|
|
ax.set_xlim(boundaries_list[i])
|
|
|
|
else:
|
|
common = (mask[a] & mask[b]).values
|
|
|
|
ax.scatter(
|
|
df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds
|
|
)
|
|
|
|
ax.set_xlim(boundaries_list[j])
|
|
ax.set_ylim(boundaries_list[i])
|
|
|
|
ax.set_xlabel(b)
|
|
ax.set_ylabel(a)
|
|
|
|
if j != 0:
|
|
ax.yaxis.set_visible(False)
|
|
if i != n - 1:
|
|
ax.xaxis.set_visible(False)
|
|
|
|
if len(df.columns) > 1:
|
|
lim1 = boundaries_list[0]
|
|
locs = axes[0][1].yaxis.get_majorticklocs()
|
|
locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
|
|
adj = (locs - lim1[0]) / (lim1[1] - lim1[0])
|
|
|
|
lim0 = axes[0][0].get_ylim()
|
|
adj = adj * (lim0[1] - lim0[0]) + lim0[0]
|
|
axes[0][0].yaxis.set_ticks(adj)
|
|
|
|
if np.all(locs == locs.astype(int)):
|
|
# if all ticks are int
|
|
locs = locs.astype(int)
|
|
axes[0][0].yaxis.set_ticklabels(locs)
|
|
|
|
set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
|
|
|
|
return axes
|
|
|
|
|
|
def _get_marker_compat(marker):
|
|
if marker not in mlines.lineMarkers:
|
|
return "o"
|
|
return marker
|
|
|
|
|
|
def radviz(
|
|
frame: "DataFrame",
|
|
class_column,
|
|
ax: Optional["Axes"] = None,
|
|
color=None,
|
|
colormap=None,
|
|
**kwds,
|
|
) -> "Axes":
|
|
import matplotlib.pyplot as plt
|
|
|
|
def normalize(series):
|
|
a = min(series)
|
|
b = max(series)
|
|
return (series - a) / (b - a)
|
|
|
|
n = len(frame)
|
|
classes = frame[class_column].drop_duplicates()
|
|
class_col = frame[class_column]
|
|
df = frame.drop(class_column, axis=1).apply(normalize)
|
|
|
|
if ax is None:
|
|
ax = plt.gca()
|
|
ax.set_xlim(-1, 1)
|
|
ax.set_ylim(-1, 1)
|
|
|
|
to_plot: Dict[Label, List[List]] = {}
|
|
colors = get_standard_colors(
|
|
num_colors=len(classes), colormap=colormap, color_type="random", color=color
|
|
)
|
|
|
|
for kls in classes:
|
|
to_plot[kls] = [[], []]
|
|
|
|
m = len(frame.columns) - 1
|
|
s = np.array(
|
|
[
|
|
(np.cos(t), np.sin(t))
|
|
for t in [2.0 * np.pi * (i / float(m)) for i in range(m)]
|
|
]
|
|
)
|
|
|
|
for i in range(n):
|
|
row = df.iloc[i].values
|
|
row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
|
|
y = (s * row_).sum(axis=0) / row.sum()
|
|
kls = class_col.iat[i]
|
|
to_plot[kls][0].append(y[0])
|
|
to_plot[kls][1].append(y[1])
|
|
|
|
for i, kls in enumerate(classes):
|
|
ax.scatter(
|
|
to_plot[kls][0],
|
|
to_plot[kls][1],
|
|
color=colors[i],
|
|
label=pprint_thing(kls),
|
|
**kwds,
|
|
)
|
|
ax.legend()
|
|
|
|
ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none"))
|
|
|
|
for xy, name in zip(s, df.columns):
|
|
|
|
ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray"))
|
|
|
|
if xy[0] < 0.0 and xy[1] < 0.0:
|
|
ax.text(
|
|
xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small"
|
|
)
|
|
elif xy[0] < 0.0 and xy[1] >= 0.0:
|
|
ax.text(
|
|
xy[0] - 0.025,
|
|
xy[1] + 0.025,
|
|
name,
|
|
ha="right",
|
|
va="bottom",
|
|
size="small",
|
|
)
|
|
elif xy[0] >= 0.0 and xy[1] < 0.0:
|
|
ax.text(
|
|
xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small"
|
|
)
|
|
elif xy[0] >= 0.0 and xy[1] >= 0.0:
|
|
ax.text(
|
|
xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small"
|
|
)
|
|
|
|
ax.axis("equal")
|
|
return ax
|
|
|
|
|
|
def andrews_curves(
|
|
frame: "DataFrame",
|
|
class_column,
|
|
ax: Optional["Axes"] = None,
|
|
samples: int = 200,
|
|
color=None,
|
|
colormap=None,
|
|
**kwds,
|
|
) -> "Axes":
|
|
import matplotlib.pyplot as plt
|
|
|
|
def function(amplitudes):
|
|
def f(t):
|
|
x1 = amplitudes[0]
|
|
result = x1 / np.sqrt(2.0)
|
|
|
|
# Take the rest of the coefficients and resize them
|
|
# appropriately. Take a copy of amplitudes as otherwise numpy
|
|
# deletes the element from amplitudes itself.
|
|
coeffs = np.delete(np.copy(amplitudes), 0)
|
|
coeffs.resize(int((coeffs.size + 1) / 2), 2)
|
|
|
|
# Generate the harmonics and arguments for the sin and cos
|
|
# functions.
|
|
harmonics = np.arange(0, coeffs.shape[0]) + 1
|
|
trig_args = np.outer(harmonics, t)
|
|
|
|
result += np.sum(
|
|
coeffs[:, 0, np.newaxis] * np.sin(trig_args)
|
|
+ coeffs[:, 1, np.newaxis] * np.cos(trig_args),
|
|
axis=0,
|
|
)
|
|
return result
|
|
|
|
return f
|
|
|
|
n = len(frame)
|
|
class_col = frame[class_column]
|
|
classes = frame[class_column].drop_duplicates()
|
|
df = frame.drop(class_column, axis=1)
|
|
t = np.linspace(-np.pi, np.pi, samples)
|
|
used_legends: Set[str] = set()
|
|
|
|
color_values = get_standard_colors(
|
|
num_colors=len(classes), colormap=colormap, color_type="random", color=color
|
|
)
|
|
colors = dict(zip(classes, color_values))
|
|
if ax is None:
|
|
ax = plt.gca()
|
|
ax.set_xlim(-np.pi, np.pi)
|
|
for i in range(n):
|
|
row = df.iloc[i].values
|
|
f = function(row)
|
|
y = f(t)
|
|
kls = class_col.iat[i]
|
|
label = pprint_thing(kls)
|
|
if label not in used_legends:
|
|
used_legends.add(label)
|
|
ax.plot(t, y, color=colors[kls], label=label, **kwds)
|
|
else:
|
|
ax.plot(t, y, color=colors[kls], **kwds)
|
|
|
|
ax.legend(loc="upper right")
|
|
ax.grid()
|
|
return ax
|
|
|
|
|
|
def bootstrap_plot(
|
|
series: "Series",
|
|
fig: Optional["Figure"] = None,
|
|
size: int = 50,
|
|
samples: int = 500,
|
|
**kwds,
|
|
) -> "Figure":
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
# TODO: is the failure mentioned below still relevant?
|
|
# random.sample(ndarray, int) fails on python 3.3, sigh
|
|
data = list(series.values)
|
|
samplings = [random.sample(data, size) for _ in range(samples)]
|
|
|
|
means = np.array([np.mean(sampling) for sampling in samplings])
|
|
medians = np.array([np.median(sampling) for sampling in samplings])
|
|
midranges = np.array(
|
|
[(min(sampling) + max(sampling)) * 0.5 for sampling in samplings]
|
|
)
|
|
if fig is None:
|
|
fig = plt.figure()
|
|
x = list(range(samples))
|
|
axes = []
|
|
ax1 = fig.add_subplot(2, 3, 1)
|
|
ax1.set_xlabel("Sample")
|
|
axes.append(ax1)
|
|
ax1.plot(x, means, **kwds)
|
|
ax2 = fig.add_subplot(2, 3, 2)
|
|
ax2.set_xlabel("Sample")
|
|
axes.append(ax2)
|
|
ax2.plot(x, medians, **kwds)
|
|
ax3 = fig.add_subplot(2, 3, 3)
|
|
ax3.set_xlabel("Sample")
|
|
axes.append(ax3)
|
|
ax3.plot(x, midranges, **kwds)
|
|
ax4 = fig.add_subplot(2, 3, 4)
|
|
ax4.set_xlabel("Mean")
|
|
axes.append(ax4)
|
|
ax4.hist(means, **kwds)
|
|
ax5 = fig.add_subplot(2, 3, 5)
|
|
ax5.set_xlabel("Median")
|
|
axes.append(ax5)
|
|
ax5.hist(medians, **kwds)
|
|
ax6 = fig.add_subplot(2, 3, 6)
|
|
ax6.set_xlabel("Midrange")
|
|
axes.append(ax6)
|
|
ax6.hist(midranges, **kwds)
|
|
for axis in axes:
|
|
plt.setp(axis.get_xticklabels(), fontsize=8)
|
|
plt.setp(axis.get_yticklabels(), fontsize=8)
|
|
plt.tight_layout()
|
|
return fig
|
|
|
|
|
|
def parallel_coordinates(
|
|
frame: "DataFrame",
|
|
class_column,
|
|
cols=None,
|
|
ax: Optional["Axes"] = None,
|
|
color=None,
|
|
use_columns=False,
|
|
xticks=None,
|
|
colormap=None,
|
|
axvlines: bool = True,
|
|
axvlines_kwds=None,
|
|
sort_labels: bool = False,
|
|
**kwds,
|
|
) -> "Axes":
|
|
import matplotlib.pyplot as plt
|
|
|
|
if axvlines_kwds is None:
|
|
axvlines_kwds = {"linewidth": 1, "color": "black"}
|
|
|
|
n = len(frame)
|
|
classes = frame[class_column].drop_duplicates()
|
|
class_col = frame[class_column]
|
|
|
|
if cols is None:
|
|
df = frame.drop(class_column, axis=1)
|
|
else:
|
|
df = frame[cols]
|
|
|
|
used_legends: Set[str] = set()
|
|
|
|
ncols = len(df.columns)
|
|
|
|
# determine values to use for xticks
|
|
if use_columns is True:
|
|
if not np.all(np.isreal(list(df.columns))):
|
|
raise ValueError("Columns must be numeric to be used as xticks")
|
|
x = df.columns
|
|
elif xticks is not None:
|
|
if not np.all(np.isreal(xticks)):
|
|
raise ValueError("xticks specified must be numeric")
|
|
elif len(xticks) != ncols:
|
|
raise ValueError("Length of xticks must match number of columns")
|
|
x = xticks
|
|
else:
|
|
x = list(range(ncols))
|
|
|
|
if ax is None:
|
|
ax = plt.gca()
|
|
|
|
color_values = get_standard_colors(
|
|
num_colors=len(classes), colormap=colormap, color_type="random", color=color
|
|
)
|
|
|
|
if sort_labels:
|
|
classes = sorted(classes)
|
|
color_values = sorted(color_values)
|
|
colors = dict(zip(classes, color_values))
|
|
|
|
for i in range(n):
|
|
y = df.iloc[i].values
|
|
kls = class_col.iat[i]
|
|
label = pprint_thing(kls)
|
|
if label not in used_legends:
|
|
used_legends.add(label)
|
|
ax.plot(x, y, color=colors[kls], label=label, **kwds)
|
|
else:
|
|
ax.plot(x, y, color=colors[kls], **kwds)
|
|
|
|
if axvlines:
|
|
for i in x:
|
|
ax.axvline(i, **axvlines_kwds)
|
|
|
|
ax.set_xticks(x)
|
|
ax.set_xticklabels(df.columns)
|
|
ax.set_xlim(x[0], x[-1])
|
|
ax.legend(loc="upper right")
|
|
ax.grid()
|
|
return ax
|
|
|
|
|
|
def lag_plot(
|
|
series: "Series", lag: int = 1, ax: Optional["Axes"] = None, **kwds
|
|
) -> "Axes":
|
|
# workaround because `c='b'` is hardcoded in matplotlib's scatter method
|
|
import matplotlib.pyplot as plt
|
|
|
|
kwds.setdefault("c", plt.rcParams["patch.facecolor"])
|
|
|
|
data = series.values
|
|
y1 = data[:-lag]
|
|
y2 = data[lag:]
|
|
if ax is None:
|
|
ax = plt.gca()
|
|
ax.set_xlabel("y(t)")
|
|
ax.set_ylabel(f"y(t + {lag})")
|
|
ax.scatter(y1, y2, **kwds)
|
|
return ax
|
|
|
|
|
|
def autocorrelation_plot(
|
|
series: "Series", ax: Optional["Axes"] = None, **kwds
|
|
) -> "Axes":
|
|
import matplotlib.pyplot as plt
|
|
|
|
n = len(series)
|
|
data = np.asarray(series)
|
|
if ax is None:
|
|
ax = plt.gca()
|
|
ax.set_xlim(1, n)
|
|
ax.set_ylim(-1.0, 1.0)
|
|
mean = np.mean(data)
|
|
c0 = np.sum((data - mean) ** 2) / float(n)
|
|
|
|
def r(h):
|
|
return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0
|
|
|
|
x = np.arange(n) + 1
|
|
y = [r(loc) for loc in x]
|
|
z95 = 1.959963984540054
|
|
z99 = 2.5758293035489004
|
|
ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey")
|
|
ax.axhline(y=z95 / np.sqrt(n), color="grey")
|
|
ax.axhline(y=0.0, color="black")
|
|
ax.axhline(y=-z95 / np.sqrt(n), color="grey")
|
|
ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey")
|
|
ax.set_xlabel("Lag")
|
|
ax.set_ylabel("Autocorrelation")
|
|
ax.plot(x, y, **kwds)
|
|
if "label" in kwds:
|
|
ax.legend()
|
|
ax.grid()
|
|
return ax
|