573 lines
18 KiB
Python
573 lines
18 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Literal,
|
|
NamedTuple,
|
|
)
|
|
import warnings
|
|
|
|
from matplotlib.artist import setp
|
|
import numpy as np
|
|
|
|
from pandas._libs import lib
|
|
from pandas.util._decorators import cache_readonly
|
|
from pandas.util._exceptions import find_stack_level
|
|
|
|
from pandas.core.dtypes.common import is_dict_like
|
|
from pandas.core.dtypes.generic import ABCSeries
|
|
from pandas.core.dtypes.missing import remove_na_arraylike
|
|
|
|
import pandas as pd
|
|
import pandas.core.common as com
|
|
|
|
from pandas.io.formats.printing import pprint_thing
|
|
from pandas.plotting._matplotlib.core import (
|
|
LinePlot,
|
|
MPLPlot,
|
|
)
|
|
from pandas.plotting._matplotlib.groupby import create_iter_data_given_by
|
|
from pandas.plotting._matplotlib.style import get_standard_colors
|
|
from pandas.plotting._matplotlib.tools import (
|
|
create_subplots,
|
|
flatten_axes,
|
|
maybe_adjust_figure,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Collection
|
|
|
|
from matplotlib.axes import Axes
|
|
from matplotlib.figure import Figure
|
|
from matplotlib.lines import Line2D
|
|
|
|
from pandas._typing import MatplotlibColor
|
|
|
|
|
|
def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> None:
|
|
"""Set the tick labels of a given axis.
|
|
|
|
Due to https://github.com/matplotlib/matplotlib/pull/17266, we need to handle the
|
|
case of repeated ticks (due to `FixedLocator`) and thus we duplicate the number of
|
|
labels.
|
|
"""
|
|
ticks = ax.get_xticks() if is_vertical else ax.get_yticks()
|
|
if len(ticks) != len(labels):
|
|
i, remainder = divmod(len(ticks), len(labels))
|
|
assert remainder == 0, remainder
|
|
labels *= i
|
|
if is_vertical:
|
|
ax.set_xticklabels(labels, **kwargs)
|
|
else:
|
|
ax.set_yticklabels(labels, **kwargs)
|
|
|
|
|
|
class BoxPlot(LinePlot):
    """Box plot implemented on top of ``LinePlot``.

    ``return_type`` selects what ``result`` exposes after plotting: the
    axes, the artist dict produced by ``Axes.boxplot``, or both bundled
    in a ``BP`` namedtuple.
    """

    @property
    def _kind(self) -> Literal["box"]:
        return "box"

    _layout_type = "horizontal"

    _valid_return_types = (None, "axes", "dict", "both")

    class BP(NamedTuple):
        # Bundles the axes with the dict returned by ``Axes.boxplot``;
        # built when ``return_type == "both"``.
        ax: Axes
        lines: dict[str, list[Line2D]]

    def __init__(self, data, return_type: str = "axes", **kwargs) -> None:
        """Validate ``return_type`` and initialise via ``MPLPlot.__init__``.

        Raises
        ------
        ValueError
            If ``return_type`` is not one of ``_valid_return_types``.
        """
        if return_type not in self._valid_return_types:
            raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}")

        self.return_type = return_type
        # LinePlot.__init__ is deliberately bypassed because it may fill nan.
        MPLPlot.__init__(self, data, **kwargs)  # pylint: disable=non-parent-init-called

        if not self.subplots:
            return

        # With subplots, turn off sharing of the labelled axis; otherwise
        # every subplot would show the label of the last column.
        if self.orientation == "vertical":
            self.sharex = False
        else:
            self.sharey = False

    # error: Signature of "_plot" incompatible with supertype "MPLPlot"
    @classmethod
    def _plot(  # type: ignore[override]
        cls, ax: Axes, y: np.ndarray, column_num=None, return_type: str = "axes", **kwds
    ):
        """Draw a boxplot of ``y`` on ``ax``.

        Returns a pair ``(ret, bp)`` where ``bp`` is the value returned by
        ``ax.boxplot`` and ``ret`` is ``bp``, a ``BP`` namedtuple, or ``ax``
        depending on ``return_type``.
        """
        ys: np.ndarray | list[np.ndarray]
        if y.ndim == 2:
            ys = []
            for row in y:
                cleaned = remove_na_arraylike(row)
                # GH 8181: boxplot fails with empty arrays, so a row with
                # nothing left after NA removal is replaced by a single NaN.
                ys.append(cleaned if cleaned.size > 0 else np.array([np.nan]))
        else:
            ys = remove_na_arraylike(y)
        bp = ax.boxplot(ys, **kwds)

        if return_type == "dict":
            primary = bp
        elif return_type == "both":
            primary = cls.BP(ax=ax, lines=bp)
        else:
            primary = ax
        return primary, bp

    def _validate_color_args(self, color, colormap):
        """Check the ``color`` argument and return it (or None if not given).

        Warns when ``colormap`` is supplied together with ``color`` and
        raises ``ValueError`` for unknown keys in a ``color`` dict.
        """
        if color is lib.no_default:
            return None

        if colormap is not None:
            warnings.warn(
                "'color' and 'colormap' cannot be used "
                "simultaneously. Using 'color'",
                stacklevel=find_stack_level(),
            )

        if isinstance(color, dict):
            allowed = ["boxes", "whiskers", "medians", "caps"]
            for name in color:
                if name not in allowed:
                    raise ValueError(
                        f"color dict contains invalid key '{name}'. "
                        f"The key must be either {allowed}"
                    )
        return color

    @cache_readonly
    def _color_attrs(self):
        # Default palette: three standard colors are requested; entry 0 is
        # used for boxes, whiskers and caps and entry 2 for medians (see the
        # properties below). Flier colors are not handled here because they
        # can be specified through the ``sym`` keyword.
        return get_standard_colors(num_colors=3, colormap=self.colormap, color=None)

    @cache_readonly
    def _boxes_c(self):
        # Default color for the boxes.
        return self._color_attrs[0]

    @cache_readonly
    def _whiskers_c(self):
        # Default color for the whiskers (same entry as the boxes).
        return self._color_attrs[0]

    @cache_readonly
    def _medians_c(self):
        # Default color for the median lines.
        return self._color_attrs[2]

    @cache_readonly
    def _caps_c(self):
        # Default color for the caps (same entry as the boxes).
        return self._color_attrs[0]

    def _get_colors(
        self,
        num_colors=None,
        color_kwds: dict[str, MatplotlibColor]
        | MatplotlibColor
        | Collection[MatplotlibColor]
        | None = "color",
    ) -> None:
        """Do nothing; box colors are resolved in ``maybe_color_bp``."""
        return None

    def maybe_color_bp(self, bp) -> None:
        """Resolve the four element colors and apply them to ``bp``.

        A dict ``self.color`` supplies per-element colors with the class
        defaults filling missing keys; any other truthy value is used for
        every element, and a falsy value falls back to the defaults.
        """
        chosen = self.color
        if isinstance(chosen, dict):
            color_tup = (
                chosen.get("boxes", self._boxes_c),
                chosen.get("whiskers", self._whiskers_c),
                chosen.get("medians", self._medians_c),
                chosen.get("caps", self._caps_c),
            )
        else:
            # Other types are forwarded to matplotlib
            # If None, use default colors
            color_tup = (
                chosen or self._boxes_c,
                chosen or self._whiskers_c,
                chosen or self._medians_c,
                chosen or self._caps_c,
            )

        maybe_color_bp(bp, color_tup=color_tup, **self.kwds)

    def _make_plot(self, fig: Figure) -> None:
        """Draw the boxplot(s) and store the outcome in ``self._return_obj``."""
        if not self.subplots:
            # Single axes: all columns are drawn side by side.
            y = self.data.values.T
            ax = self._get_ax(0)
            plot_kwds = self.kwds.copy()

            ret, bp = self._plot(
                ax, y, column_num=0, return_type=self.return_type, **plot_kwds
            )
            self.maybe_color_bp(bp)
            self._return_obj = ret

            labels = [pprint_thing(name) for name in self.data.columns]
            if not self.use_index:
                # Replace the column names by their positions.
                labels = [pprint_thing(pos) for pos in range(len(labels))]
            _set_ticklabels(
                ax=ax, labels=labels, is_vertical=self.orientation == "vertical"
            )
            return

        # One axes per iterated item; results are collected in a Series.
        self._return_obj = pd.Series(dtype=object)

        # Re-create iterated data if `by` is assigned by users
        if self.by is not None:
            data = create_iter_data_given_by(self.data, self._kind)
        else:
            data = self.data

        # error: Argument "data" to "_iter_data" of "MPLPlot" has
        # incompatible type "object"; expected "DataFrame |
        # dict[Hashable, Series | DataFrame]"
        for idx, (label, y) in enumerate(self._iter_data(data=data)):  # type: ignore[arg-type]
            ax = self._get_ax(idx)
            plot_kwds = self.kwds.copy()

            if self.by is not None:
                # With `by`, the subplot title names the item (just like
                # df.boxplot) and y must be transposed to provide the right
                # input.
                y = y.T
                ax.set_title(pprint_thing(label))

                # The tick labels become the unique grouped values instead
                # of the label, which is already used as the subtitle.
                # error: "Index" has no attribute "levels"; maybe "nlevels"?
                levels = self.data.columns.levels  # type: ignore[attr-defined]
                ticklabels = [pprint_thing(col) for col in levels[0]]
            else:
                ticklabels = [pprint_thing(label)]

            ret, bp = self._plot(
                ax, y, column_num=idx, return_type=self.return_type, **plot_kwds
            )
            self.maybe_color_bp(bp)
            self._return_obj[label] = ret
            _set_ticklabels(
                ax=ax, labels=ticklabels, is_vertical=self.orientation == "vertical"
            )

    def _make_legend(self) -> None:
        """Do nothing; no legend is created for box plots."""

    def _post_plot_logic(self, ax: Axes, data) -> None:
        """Apply ``xlabel`` / ``ylabel`` to ``ax`` when they are set."""
        # GH 45465: make sure that the boxplot doesn't ignore xlabel/ylabel
        if self.xlabel:
            ax.set_xlabel(pprint_thing(self.xlabel))
        if self.ylabel:
            ax.set_ylabel(pprint_thing(self.ylabel))

    @property
    def orientation(self) -> Literal["horizontal", "vertical"]:
        """Orientation derived from the ``vert`` keyword (default True)."""
        return "vertical" if self.kwds.get("vert", True) else "horizontal"

    @property
    def result(self):
        """Parent ``result`` when ``return_type`` is None, else ``_return_obj``."""
        if self.return_type is None:
            return super().result
        return self._return_obj
|
|
|
|
|
|
def maybe_color_bp(bp, color_tup, **kwds) -> None:
    """Color the boxplot artists unless the user styled them explicitly.

    Parameters
    ----------
    bp : mapping
        Artists keyed by "boxes", "whiskers", "medians" and "caps".
    color_tup : sequence
        Colors for boxes, whiskers, medians and caps, in that order.
    **kwds
        Plot keywords; a truthy ``boxprops``, ``whiskerprops``,
        ``medianprops`` or ``capprops`` entry disables the default color
        for the corresponding artists.
    """
    # GH#30346, when users specifying those arguments explicitly, our defaults
    # for these four kwargs should be overridden; if not, use Pandas settings
    element_specs = (
        ("boxes", "boxprops"),
        ("whiskers", "whiskerprops"),
        ("medians", "medianprops"),
        ("caps", "capprops"),
    )
    for position, (artist_key, props_key) in enumerate(element_specs):
        if kwds.get(props_key):
            continue
        setp(bp[artist_key], color=color_tup[position], alpha=1)
|
|
|
|
|
|
def _grouped_plot_by_column(
    plotf,
    data,
    columns=None,
    by=None,
    numeric_only: bool = True,
    grid: bool = False,
    figsize: tuple[float, float] | None = None,
    ax=None,
    layout=None,
    return_type=None,
    **kwargs,
):
    """Draw one subplot per column, each showing the groups defined by ``by``.

    Parameters
    ----------
    plotf : callable
        Called as ``plotf(keys, values, ax, xlabel=..., ylabel=..., **kwargs)``
        for every column; its return values are collected.
    data : object
        Must provide ``groupby`` and, when ``columns`` is None,
        ``_get_numeric_data``.
    columns : optional
        Columns to plot. When None, the numeric columns minus ``by`` are used.
    by : optional
        Grouping passed to ``data.groupby``.
    numeric_only : bool, default True
        Not referenced in this function body.
    grid : bool, default False
        Passed to ``ax.grid`` for every subplot.
    figsize, ax, layout
        Forwarded to ``create_subplots``.
    return_type : optional
        When None the axes from ``create_subplots`` are returned; otherwise
        a Series of the ``plotf`` results indexed by ``columns``.
    **kwargs
        ``sharex``, ``sharey``, ``xlabel`` and ``ylabel`` are consumed here;
        everything else is forwarded to ``plotf``.
    """
    grouped = data.groupby(by, observed=False)
    if columns is None:
        if not isinstance(by, (list, tuple)):
            by = [by]
        columns = data._get_numeric_data().columns.difference(by)
    naxes = len(columns)
    fig, axes = create_subplots(
        naxes=naxes,
        sharex=kwargs.pop("sharex", True),
        sharey=kwargs.pop("sharey", True),
        figsize=figsize,
        ax=ax,
        layout=layout,
    )

    _axes = flatten_axes(axes)

    # GH 45465: move the "by" label based on "vert"
    xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None)
    if kwargs.get("vert", True):
        xlabel = xlabel or by
    else:
        ylabel = ylabel or by

    ax_values = []

    for i, col in enumerate(columns):
        ax = _axes[i]
        gp_col = grouped[col]
        keys, values = zip(*gp_col)
        re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs)
        ax.set_title(col)
        ax_values.append(re_plotf)
        ax.grid(grid)

    result = pd.Series(ax_values, index=columns, copy=False)

    # Return axes in multiplot case, maybe revisit later # 985
    if return_type is None:
        result = axes

    # When ``columns`` was given, ``by`` is still whatever the caller passed
    # and may be a scalar label without a length (e.g. an int column name);
    # calling ``len`` on it unconditionally would raise TypeError.
    try:
        n_by = len(by)
    except TypeError:
        byline = by
    else:
        byline = by[0] if n_by == 1 else by
    fig.suptitle(f"Boxplot grouped by {byline}")
    maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)

    return result
|
|
|
|
|
|
def boxplot(
    data,
    column=None,
    by=None,
    ax=None,
    fontsize: int | None = None,
    rot: int = 0,
    grid: bool = True,
    figsize: tuple[float, float] | None = None,
    layout=None,
    return_type=None,
    **kwds,
):
    """Draw a boxplot of ``data``, optionally grouped by ``by``.

    Parameters
    ----------
    data : Series or DataFrame-like
        A Series is converted to a one-column frame named "x".
    column : label or list/tuple of labels, optional
        Column(s) to plot.
    by : optional
        When not None, plotting is delegated to ``_grouped_plot_by_column``.
    ax : optional
        Axes to draw on. When None and ``by`` is None, ``plt.gca()`` is used.
    fontsize : int, optional
        Tick label size, applied when not None.
    rot : int, default 0
        Rotation passed on when setting the tick labels.
    grid : bool, default True
        Passed to ``ax.grid``.
    figsize : tuple of float, optional
        Figure size.
    layout : optional
        Only supported together with ``by``.
    return_type : {None, "axes", "dict", "both"}
        Selects what is returned for each plot; None is treated as "axes"
        when ``by`` is None.
    **kwds
        ``color`` is consumed here; the remaining keywords end up in
        ``Axes.boxplot``.

    Raises
    ------
    ValueError
        For an invalid ``return_type``, an invalid key in a ``color`` dict,
        ``layout`` given without ``by``, or when there are no numeric
        columns to plot.
    """
    import matplotlib.pyplot as plt

    # validate return_type:
    if return_type not in BoxPlot._valid_return_types:
        raise ValueError("return_type must be {'axes', 'dict', 'both'}")

    if isinstance(data, ABCSeries):
        data = data.to_frame("x")
        column = "x"

    def _get_colors():
        # Returns the colors for boxes, whiskers, medians and caps, in the
        # order expected by maybe_color_bp.
        # num_colors=3 is required because the defaults are taken from
        # positions 0 and 2; if colors are not provided, the same defaults
        # as DataFrame.plot.box are used, with "k" for the caps.
        defaults = get_standard_colors(num_colors=3)
        # A plain list is used on purpose: a NumPy string array has a fixed
        # item width, so assigning a longer user color (e.g. "DodgerBlue"
        # when the defaults are 7-character hex codes) would be silently
        # truncated, and tuple colors could not be stored at all.
        result = [defaults[0], defaults[0], defaults[2], "k"]

        colors = kwds.pop("color", None)
        if colors:
            if is_dict_like(colors):
                # replace colors in result with user-specified colors
                # taken from the colors dict parameter
                # "boxes" value placed in position 0, "whiskers" in 1, etc.
                valid_keys = ["boxes", "whiskers", "medians", "caps"]
                key_to_index = dict(zip(valid_keys, range(4)))
                for key, value in colors.items():
                    if key in valid_keys:
                        result[key_to_index[key]] = value
                    else:
                        raise ValueError(
                            f"color dict contains invalid key '{key}'. "
                            f"The key must be either {valid_keys}"
                        )
            else:
                # A single color applies to every element.
                result = [colors] * len(result)

        return result

    def plot_group(keys, values, ax: Axes, **kwds):
        # GH 45465: xlabel/ylabel need to be popped out before plotting happens
        xlabel, ylabel = kwds.pop("xlabel", None), kwds.pop("ylabel", None)
        if xlabel:
            ax.set_xlabel(pprint_thing(xlabel))
        if ylabel:
            ax.set_ylabel(pprint_thing(ylabel))

        keys = [pprint_thing(x) for x in keys]
        values = [np.asarray(remove_na_arraylike(v), dtype=object) for v in values]
        bp = ax.boxplot(values, **kwds)
        if fontsize is not None:
            ax.tick_params(axis="both", labelsize=fontsize)

        # GH 45465: x/y are flipped when "vert" changes
        _set_ticklabels(
            ax=ax, labels=keys, is_vertical=kwds.get("vert", True), rotation=rot
        )
        maybe_color_bp(bp, color_tup=colors, **kwds)

        # Return axes in multiplot case, maybe revisit later # 985
        if return_type == "dict":
            return bp
        elif return_type == "both":
            return BoxPlot.BP(ax=ax, lines=bp)
        else:
            return ax

    # Must run before plotting: it also removes "color" from ``kwds`` so the
    # keyword is not forwarded to ``Axes.boxplot``.
    colors = _get_colors()
    if column is None:
        columns = None
    elif isinstance(column, (list, tuple)):
        columns = column
    else:
        columns = [column]

    if by is not None:
        # Prefer array return type for 2-D plots to match the subplot layout
        # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580
        result = _grouped_plot_by_column(
            plot_group,
            data,
            columns=columns,
            by=by,
            grid=grid,
            figsize=figsize,
            ax=ax,
            layout=layout,
            return_type=return_type,
            **kwds,
        )
    else:
        if return_type is None:
            return_type = "axes"
        if layout is not None:
            raise ValueError("The 'layout' keyword is not supported when 'by' is None")

        if ax is None:
            # Apply the requested figure size only while the axes is fetched.
            rc = {"figure.figsize": figsize} if figsize is not None else {}
            with plt.rc_context(rc):
                ax = plt.gca()
        data = data._get_numeric_data()
        naxes = len(data.columns)
        if naxes == 0:
            raise ValueError(
                "boxplot method requires numerical columns, nothing to plot."
            )
        if columns is None:
            columns = data.columns
        else:
            data = data[columns]

        result = plot_group(columns, data.values.T, ax, **kwds)
        ax.grid(grid)

    return result
|
|
|
|
|
|
def boxplot_frame(
    self,
    column=None,
    by=None,
    ax=None,
    fontsize: int | None = None,
    rot: int = 0,
    grid: bool = True,
    figsize: tuple[float, float] | None = None,
    layout=None,
    return_type=None,
    **kwds,
):
    """Plot ``self`` with ``boxplot`` and redraw if pyplot is interactive.

    Every argument is forwarded unchanged to ``boxplot``, whose return
    value is returned as is.
    """
    import matplotlib.pyplot as plt

    plotted = boxplot(
        self,
        column=column,
        by=by,
        ax=ax,
        fontsize=fontsize,
        grid=grid,
        rot=rot,
        figsize=figsize,
        layout=layout,
        return_type=return_type,
        **kwds,
    )
    plt.draw_if_interactive()
    return plotted
|
|
|
|
|
|
def boxplot_frame_groupby(
    grouped,
    subplots: bool = True,
    column=None,
    fontsize: int | None = None,
    rot: int = 0,
    grid: bool = True,
    ax=None,
    figsize: tuple[float, float] | None = None,
    layout=None,
    sharex: bool = False,
    sharey: bool = True,
    **kwds,
):
    """Draw boxplots for a grouped frame.

    With ``subplots is True`` each group is drawn on its own axes and a
    Series of the per-group results, keyed by group key, is returned.
    Otherwise the groups are combined into a single frame that is plotted
    with one ``boxplot`` call, whose result is returned.
    """
    if subplots is not True:
        group_keys, frames = zip(*grouped)
        if grouped.axis == 0:
            combined = pd.concat(frames, keys=group_keys, axis=1)
        elif len(frames) > 1:
            combined = frames[0].join(frames[1:])
        else:
            combined = frames[0]

        # GH 16748, DataFrameGroupby fails when subplots=False and `column`
        # argument is assigned. The combined frame has MultiIndex columns
        # after groupby, so the group keys and the requested column(s) are
        # paired up to select the subset to plot.
        if column is not None:
            column = com.convert_to_list_like(column)
            paired = pd.MultiIndex.from_product([group_keys, column])
            column = list(paired.values)

        return combined.boxplot(
            column=column,
            fontsize=fontsize,
            rot=rot,
            grid=grid,
            ax=ax,
            figsize=figsize,
            layout=layout,
            **kwds,
        )

    n_groups = len(grouped)
    fig, axes = create_subplots(
        naxes=n_groups,
        squeeze=False,
        ax=ax,
        sharex=sharex,
        sharey=sharey,
        figsize=figsize,
        layout=layout,
    )
    axes = flatten_axes(axes)

    ret = pd.Series(dtype=object)

    # One subplot per group, titled with the group key.
    for (key, group), sub_ax in zip(grouped, axes):
        drawn = group.boxplot(
            ax=sub_ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds
        )
        sub_ax.set_title(pprint_thing(key))
        ret.loc[key] = drawn
    maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
    return ret
|