3RNN/Lib/site-packages/pandas/core/methods/describe.py
2024-05-26 19:49:15 +02:00

417 lines
12 KiB
Python

"""
Module responsible for execution of NDFrame.describe() method.
Method NDFrame.describe() delegates actual execution to function describe_ndframe().
"""
from __future__ import annotations
from abc import (
ABC,
abstractmethod,
)
from typing import (
TYPE_CHECKING,
Callable,
cast,
)
import numpy as np
from pandas._libs.tslibs import Timestamp
from pandas._typing import (
DtypeObj,
NDFrameT,
npt,
)
from pandas.util._validators import validate_percentile
from pandas.core.dtypes.common import (
is_bool_dtype,
is_numeric_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.arrays.floating import Float64Dtype
from pandas.core.reshape.concat import concat
from pandas.io.formats.format import format_percentiles
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Sequence,
)
from pandas import (
DataFrame,
Series,
)
def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Build the summary statistics for a Series or a DataFrame.

    Entry point used by ``pandas.core.generic.NDFrame.describe()``.

    Parameters
    ----------
    obj : DataFrame or Series
        Object to summarise.
    include : 'all', list-like of dtypes or None (default), optional
        Data types to keep in the result. Not forwarded for ``Series``.
    exclude : list-like of dtypes or None (default), optional
        Data types to drop from the result. Not forwarded for ``Series``.
    percentiles : list-like of numbers, optional
        Percentiles to report, each between 0 and 1. When ``None`` the
        25th, 50th and 75th percentiles are used.

    Returns
    -------
    DataFrame or Series
        Description of ``obj``, of the same kind as the input.
    """
    # Percentiles are normalised first so that invalid values are reported
    # before any describer is constructed.
    refined = _refine_percentiles(percentiles)

    if obj.ndim == 1:
        # One-dimensional input: ``include``/``exclude`` do not apply.
        series_describer = SeriesDescriber(obj=cast("Series", obj))
        return cast(NDFrameT, series_describer.describe(percentiles=refined))

    frame_describer = DataFrameDescriber(
        obj=cast("DataFrame", obj),
        include=include,
        exclude=exclude,
    )
    return cast(NDFrameT, frame_describer.describe(percentiles=refined))
class NDFrameDescriberAbstract(ABC):
    """Common base for the Series and DataFrame describers.

    Parameters
    ----------
    obj : Series or DataFrame
        Object that the concrete describer will summarise.
    """

    def __init__(self, obj: DataFrame | Series) -> None:
        # Stored as-is; subclasses read it back through ``self.obj``.
        self.obj = obj

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Produce the description of the wrapped object.

        Concrete subclasses must override this method.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
class SeriesDescriber(NDFrameDescriberAbstract):
    """Describer for a single Series."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe the wrapped Series.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        # The dtype of the series decides which 1-D routine does the work.
        return select_describe_func(self.obj)(self.obj, percentiles)
class DataFrameDescriber(NDFrameDescriberAbstract):
    """Describer for a DataFrame, applied column by column.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        Data types to keep in the result.
    exclude : list-like of dtypes or None
        Data types to drop from the result.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    obj: DataFrame

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
    ) -> None:
        self.include = include
        self.exclude = exclude

        has_no_columns = obj.ndim == 2 and obj.columns.size == 0
        if has_no_columns:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and assemble the results.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        selected = self._select_data()

        # One description Series per selected column, each produced by the
        # routine matching that column's dtype.
        per_column: list[Series] = [
            select_describe_func(column)(column, percentiles)
            for _, column in selected.items()
        ]

        # Align every description on one shared row order before joining
        # them side by side.
        row_order = reorder_columns(per_column)
        aligned = [desc.reindex(row_order, copy=False) for desc in per_column]
        summary = concat(aligned, axis=1, sort=False)
        summary.columns = selected.columns.copy()
        return summary

    def _select_data(self) -> DataFrame:
        """Select columns to be described.

        Raises
        ------
        ValueError
            If ``include`` is ``'all'`` while ``exclude`` is not ``None``.
        """
        include, exclude = self.include, self.exclude

        if include is None and exclude is None:
            # Default: keep numeric and datetime columns, unless there are
            # none, in which case every column is described.
            default_include: list[npt.DTypeLike] = [np.number, "datetime"]
            numeric_like = self.obj.select_dtypes(include=default_include)
            if len(numeric_like.columns) == 0:
                return self.obj
            return numeric_like

        if include == "all":
            if exclude is not None:
                msg = "exclude must be None when include is 'all'"
                raise ValueError(msg)
            return self.obj

        return self.obj.select_dtypes(include=include, exclude=exclude)
def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Compute a display-friendly row order shared by all descriptions.

    The indexes of the given descriptions are visited from shortest to
    longest, and each label is kept at the position where it is first seen.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column descriptions whose index labels are to be merged.

    Returns
    -------
    list of Hashable
        Every distinct index label, in first-seen order.
    """
    shortest_first = sorted((desc.index for desc in ldesc), key=len)
    # A dict keeps insertion order and drops repeated keys, which gives
    # "first occurrence wins" de-duplication in a single pass.
    first_seen = dict.fromkeys(
        label for index in shortest_first for label in index
    )
    return list(first_seen)
def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
"""Describe series containing numerical data.
Parameters
----------
series : Series
Series to be described.
percentiles : list-like of numbers
The percentiles to include in the output.
"""
from pandas import Series
formatted_percentiles = format_percentiles(percentiles)
stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
d = (
[series.count(), series.mean(), series.std(), series.min()]
+ series.quantile(percentiles).tolist()
+ [series.max()]
)
# GH#48340 - always return float on non-complex numeric data
dtype: DtypeObj | None
if isinstance(series.dtype, ExtensionDtype):
if isinstance(series.dtype, ArrowDtype):
if series.dtype.kind == "m":
# GH53001: describe timedeltas with object dtype
dtype = None
else:
import pyarrow as pa
dtype = ArrowDtype(pa.float64())
else:
dtype = Float64Dtype()
elif series.dtype.kind in "iufb":
# i.e. numeric but exclude complex dtype
dtype = np.dtype("float")
else:
dtype = None
return Series(d, index=stat_index, name=series.name, dtype=dtype)
def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Summarise a series of categorical (non-numeric) values.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Values for ``count``, ``unique``, ``top`` and ``freq``.
    """
    from pandas import Series

    counts = data.value_counts()
    # Entries with a zero count are not counted as observed values.
    n_unique = len(counts[counts != 0])

    if n_unique > 0:
        # The first entry of the counts is reported as the top value.
        most_common, occurrences = counts.index[0], counts.iloc[0]
        result_dtype = None
    else:
        # Nothing was observed: fill 'top' and 'freq' with NaN so that the
        # output keeps the same shape, and force object dtype.
        most_common = occurrences = np.nan
        result_dtype = "object"

    return Series(
        [data.count(), n_unique, most_common, occurrences],
        index=["count", "unique", "top", "freq"],
        name=data.name,
        dtype=result_dtype,
    )
def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Summarise a timestamp series as if it were categorical.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Values for ``count``, ``unique``, ``top`` and ``freq``, followed by
        ``first`` and ``last`` when at least one value was observed.
    """
    from pandas import Series

    counts = data.value_counts()
    n_unique = len(counts[counts != 0])

    labels = ["count", "unique", "top", "freq"]
    values: list[float | Timestamp] = [data.count(), n_unique]

    if n_unique == 0:
        # Nothing was observed: fill 'top' and 'freq' with NaN so that the
        # output keeps the same shape, and force object dtype.
        values.extend([np.nan, np.nan])
        return Series(values, index=labels, name=data.name, dtype="object")

    tz = data.dt.tz
    # NOTE(review): these integers are handed to Timestamp without a unit;
    # presumably the data has nanosecond resolution - confirm.
    as_int64 = data.dropna().values.view("i8")

    top = Timestamp(counts.index[0])
    if top.tzinfo is not None and tz is not None:
        # Don't tz_localize(None) if key is already tz-aware
        top = top.tz_convert(tz)
    else:
        top = top.tz_localize(tz)

    labels.extend(["first", "last"])
    values.extend(
        [
            top,
            counts.iloc[0],
            Timestamp(as_int64.min(), tz=tz),
            Timestamp(as_int64.max(), tz=tz),
        ]
    )
    return Series(values, index=labels, name=data.name, dtype=None)
def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Summarise a datetime64 series.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Values for count, mean, min, the requested percentiles and max,
        indexed by those statistic names.
    """
    # GH-30164
    from pandas import Series

    labels = ["count", "mean", "min", *format_percentiles(percentiles), "max"]
    values = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return Series(values, index=labels, name=data.name)
def select_describe_func(
    data: Series,
) -> Callable:
    """Pick the 1-D describe routine matching the dtype of ``data``.

    Parameters
    ----------
    data : Series
        Series to be described.

    Returns
    -------
    Callable
        One of the ``describe_*_1d`` functions of this module.
    """
    dtype = data.dtype

    # The bool check comes before the numeric check, so boolean data is
    # described as categorical.
    if is_bool_dtype(dtype):
        return describe_categorical_1d
    if is_numeric_dtype(data):
        return describe_numeric_1d
    if dtype.kind == "M" or isinstance(dtype, DatetimeTZDtype):
        return describe_timestamp_1d
    if dtype.kind == "m":
        # Timedeltas go through the numeric routine.
        return describe_numeric_1d
    return describe_categorical_1d
def _refine_percentiles(
percentiles: Sequence[float] | np.ndarray | None,
) -> npt.NDArray[np.float64]:
"""
Ensure that percentiles are unique and sorted.
Parameters
----------
percentiles : list-like of numbers, optional
The percentiles to include in the output.
"""
if percentiles is None:
return np.array([0.25, 0.5, 0.75])
# explicit conversion of `percentiles` to list
percentiles = list(percentiles)
# get them all to be in [0, 1]
validate_percentile(percentiles)
# median should always be included
if 0.5 not in percentiles:
percentiles.append(0.5)
percentiles = np.asarray(percentiles)
# sort and check for duplicates
unique_pcts = np.unique(percentiles)
assert percentiles is not None
if len(unique_pcts) < len(percentiles):
raise ValueError("percentiles cannot contain duplicates")
return unique_pcts