"""
|
||
DataFrame
|
||
---------
|
||
An efficient 2D container for potentially mixed-type time series or other
|
||
labeled data series.
|
||
|
||
Similar to its R counterpart, data.frame, except providing automatic data
|
||
alignment and a host of useful data manipulation methods having to do with the
|
||
labeling information
|
||
"""
|
||
from __future__ import annotations

import collections
from collections import abc
import datetime
from io import StringIO
import itertools
import mmap
from textwrap import dedent
from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    AnyStr,
    Dict,
    FrozenSet,
    Hashable,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
    cast,
    overload,
)
import warnings

import numpy as np
import numpy.ma as ma

from pandas._config import get_option

from pandas._libs import algos as libalgos, lib, properties
from pandas._libs.lib import no_default
from pandas._typing import (
    AggFuncType,
    ArrayLike,
    Axes,
    Axis,
    CompressionOptions,
    Dtype,
    FilePathOrBuffer,
    FrameOrSeriesUnion,
    IndexKeyFunc,
    Label,
    Level,
    Renamer,
    StorageOptions,
    ValueKeyFunc,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
    Appender,
    Substitution,
    deprecate_kwarg,
    doc,
    rewrite_axis_style_signature,
)
from pandas.util._validators import (
    validate_axis_style_args,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    construct_1d_arraylike_from_scalar,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_box_datetimelike,
    maybe_cast_to_datetime,
    maybe_casted_values,
    maybe_convert_platform,
    maybe_downcast_to_dtype,
    maybe_infer_to_datetimelike,
    maybe_upcast,
    validate_numeric_casting,
)
from pandas.core.dtypes.common import (
    ensure_int64,
    ensure_platform_int,
    infer_dtype_from_object,
    is_bool_dtype,
    is_dataclass,
    is_datetime64_any_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_named_tuple,
    is_object_dtype,
    is_scalar,
    is_sequence,
    pandas_dtype,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms, common as com, generic, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.aggregation import (
    aggregate,
    reconstruct_func,
    relabel_result,
    transform,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import extract_array
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    PeriodIndex,
    ensure_index,
    ensure_index_from_sequences,
)
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
    arrays_to_mgr,
    dataclasses_to_dicts,
    get_names_from_index,
    init_dict,
    init_ndarray,
    masked_rec_array_to_mgr,
    reorder_arrays,
    sanitize_index,
    to_arrays,
)
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.sorting import get_group_index, lexsort_indexer, nargsort

from pandas.io.common import get_handle
from pandas.io.formats import console, format as fmt
from pandas.io.formats.info import BaseInfo, DataFrameInfo
import pandas.plotting

if TYPE_CHECKING:
    from typing import Literal

    from pandas.core.groupby.generic import DataFrameGroupBy

    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by.

    - if `axis` is 0 or `'index'` then `by` may contain index
      levels and/or column labels.
    - if `axis` is 1 or `'columns'` then `by` may contain column
      levels and/or index labels.""",
    "optional_labels": """labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    "optional_axis": """axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
}

_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data.
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

      .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0.
Support for merging named Series objects was added in version 0.24.0.

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1
     a  b
0  foo  1
1  bar  2
>>> df2
     a  c
0  foo  3
1  baz  4

>>> df1.merge(df2, how='inner', on='a')
     a  b  c
0  foo  1  3

>>> df1.merge(df2, how='left', on='a')
     a  b    c
0  foo  1  3.0
1  bar  2  NaN

>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
>>> df2 = pd.DataFrame({'right': [7, 8]})
>>> df1
  left
0  foo
1  bar
>>> df2
   right
0      7
1      8

>>> df1.merge(df2, how='cross')
  left  right
0  foo      7
1  foo      8
2  bar      7
3  bar      8
"""


# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame, OpsMixin):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, dataclass or list-like objects. If
        data is a dict, column order follows insertion-order.

        .. versionchanged:: 0.25.0
           If data is a list of dicts, column order follows insertion-order.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided.
    columns : Index or array-like
        Column labels to use for resulting frame. Will default to
        RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool, default False
        Copy data from inputs. Only affects DataFrame / 2d ndarray input.

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_table : Read general delimited file into DataFrame.
    read_clipboard : Read text from clipboard into DataFrame.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9

    Constructing DataFrame from dataclass:

    >>> from dataclasses import make_dataclass
    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
       x  y
    0  0  0
    1  0  3
    2  2  3
    """

    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
    _typ = "dataframe"
    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)

    @property
    def _constructor(self) -> Type[DataFrame]:
        return DataFrame

    _constructor_sliced: Type[Series] = Series
    _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([])
    _accessors: Set[str] = {"sparse"}

    @property
    def _constructor_expanddim(self):
        # GH#31549 raising NotImplementedError on a property causes trouble
        # for `inspect`
        def constructor(*args, **kwargs):
            raise NotImplementedError("Not supported for DataFrames!")

        return constructor

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data=None,
        index: Optional[Axes] = None,
        columns: Optional[Axes] = None,
        dtype: Optional[Dtype] = None,
        copy: bool = False,
    ):
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._mgr

        if isinstance(data, BlockManager):
            if index is None and columns is None and dtype is None and copy is False:
                # GH#33357 fastpath
                NDFrame.__init__(self, data)
                return

            mgr = self._init_mgr(
                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
            )

        elif isinstance(data, dict):
            mgr = init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)

            # a masked array
            else:
                mask = ma.getmaskarray(data)
                if mask.any():
                    data, fill_value = maybe_upcast(data, copy=True)
                    data.soften_mask()  # set hardmask False if it was True
                    data[mask] = fill_value
                else:
                    data = data.copy()
                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                data_columns = list(data.dtype.names)
                data = {k: data[k] for k in data_columns}
                if columns is None:
                    columns = data_columns
                mgr = init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, "name", None) is not None:
                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
            else:
                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

        # For data is list-like, or Iterable (will consume into list)
        elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
            if not isinstance(data, (abc.Sequence, ExtensionArray)):
                data = list(data)
            if len(data) > 0:
                if is_dataclass(data[0]):
                    data = dataclasses_to_dicts(data)
                if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
                    arrays, columns = to_arrays(data, columns, dtype=dtype)
                    columns = ensure_index(columns)

                    # set the index
                    if index is None:
                        if isinstance(data[0], Series):
                            index = get_names_from_index(data)
                        elif isinstance(data[0], Categorical):
                            index = ibase.default_index(len(data[0]))
                        else:
                            index = ibase.default_index(len(data))

                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
                else:
                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
            else:
                mgr = init_dict({}, index, columns, dtype=dtype)
        # For data is scalar
        else:
            if index is None or columns is None:
                raise ValueError("DataFrame constructor not properly called!")

            if not dtype:
                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

            # For data is a scalar extension dtype
            if is_extension_array_dtype(dtype):

                values = [
                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
                    for _ in range(len(columns))
                ]
                mgr = arrays_to_mgr(values, columns, index, columns, dtype=None)
            else:
                # Attempt to coerce to a numpy array
                try:
                    arr = np.array(data, dtype=dtype, copy=copy)
                except (ValueError, TypeError) as err:
                    exc = TypeError(
                        "DataFrame constructor called with "
                        f"incompatible data and dtype: {err}"
                    )
                    raise exc from err

                if arr.ndim != 0:
                    raise ValueError("DataFrame constructor not properly called!")

                shape = (len(index), len(columns))
                values = np.full(shape, arr)

                mgr = init_ndarray(
                    values, index, columns, dtype=values.dtype, copy=False
                )

        NDFrame.__init__(self, mgr)
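
        # Illustrative sketch (not from the pandas test suite): the dispatch
        # above routes each kind of input down a different branch --
        # BlockManager fastpath, dict, masked array, ndarray/Series, generic
        # iterable, and finally the scalar branch, which is the only one that
        # requires both ``index`` and ``columns`` to be given:
        #
        # >>> pd.DataFrame({"a": [1, 2]})                      # dict branch
        # >>> pd.DataFrame(np.zeros((2, 2)))                   # ndarray branch
        # >>> pd.DataFrame(iter([[1, 2], [3, 4]]))             # iterable branch
        # >>> pd.DataFrame(0, index=range(2), columns=["a"])   # scalar branch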

    # ----------------------------------------------------------------------

    @property
    def axes(self) -> List[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> Tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape : Tuple of array dimensions.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if self._mgr.any_extension_types:
            return len({block.dtype for block in self._mgr.blocks}) == 1
        else:
            return not self._is_mixed_type

    @property
    def _can_fast_transpose(self) -> bool:
        """
        Can we transpose this DataFrame without creating any new array objects.
        """
        if self._mgr.any_extension_types:
            # TODO(EA2D) special case would be unnecessary with 2D EAs
            return False
        return len(self._mgr.blocks) == 1
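
        # Illustrative note (an assumption based on the block check above,
        # not a documented guarantee): a frame built from one homogeneous
        # ndarray is backed by a single 2D block, so transposing can reuse
        # the underlying array, while mixed dtypes cannot:
        #
        # >>> pd.DataFrame(np.ones((2, 3)))._can_fast_transpose
        # True
        # >>> pd.DataFrame({"a": [1], "b": [1.5]})._can_fast_transpose
        # False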

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if not (max_rows is None):  # unlimited rows
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width

    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        buf = StringIO("")
        if self._info_repr():
            self.info(buf=buf)
            return buf.getvalue()

        max_rows = get_option("display.max_rows")
        min_rows = get_option("display.min_rows")
        max_cols = get_option("display.max_columns")
        max_colwidth = get_option("display.max_colwidth")
        show_dimensions = get_option("display.show_dimensions")
        if get_option("display.expand_frame_repr"):
            width, _ = console.get_console_size()
        else:
            width = None
        self.to_string(
            buf=buf,
            max_rows=max_rows,
            min_rows=min_rows,
            max_cols=max_cols,
            line_width=width,
            max_colwidth=max_colwidth,
            show_dimensions=show_dimensions,
        )

        return buf.getvalue()

    def _repr_html_(self) -> Optional[str]:
        """
        Return a html representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO("")
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
return "<pre>" + val + "</pre>"
|
||
|
||
if get_option("display.notebook_repr_html"):
|
||
max_rows = get_option("display.max_rows")
|
||
min_rows = get_option("display.min_rows")
|
||
max_cols = get_option("display.max_columns")
|
||
show_dimensions = get_option("display.show_dimensions")
|
||
|
||
formatter = fmt.DataFrameFormatter(
|
||
self,
|
||
columns=None,
|
||
col_space=None,
|
||
na_rep="NaN",
|
||
formatters=None,
|
||
float_format=None,
|
||
sparsify=None,
|
||
justify=None,
|
||
index_names=True,
|
||
header=True,
|
||
index=True,
|
||
bold_rows=True,
|
||
escape=True,
|
||
max_rows=max_rows,
|
||
min_rows=min_rows,
|
||
max_cols=max_cols,
|
||
show_dimensions=show_dimensions,
|
||
decimal=".",
|
||
)
|
||
return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
|
||
else:
|
||
return None
|
||
|
||
@Substitution(
|
||
header_type="bool or sequence",
|
||
header="Write out the column names. If a list of strings "
|
||
"is given, it is assumed to be aliases for the "
|
||
"column names",
|
||
col_space_type="int, list or dict of int",
|
||
col_space="The minimum width of each column",
|
||
)
|
||
@Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
|
||
def to_string(
|
||
self,
|
||
buf: Optional[FilePathOrBuffer[str]] = None,
|
||
columns: Optional[Sequence[str]] = None,
|
||
col_space: Optional[int] = None,
|
||
header: Union[bool, Sequence[str]] = True,
|
||
index: bool = True,
|
||
na_rep: str = "NaN",
|
||
formatters: Optional[fmt.FormattersType] = None,
|
||
float_format: Optional[fmt.FloatFormatType] = None,
|
||
sparsify: Optional[bool] = None,
|
||
index_names: bool = True,
|
||
justify: Optional[str] = None,
|
||
max_rows: Optional[int] = None,
|
||
min_rows: Optional[int] = None,
|
||
max_cols: Optional[int] = None,
|
||
show_dimensions: bool = False,
|
||
decimal: str = ".",
|
||
line_width: Optional[int] = None,
|
||
max_colwidth: Optional[int] = None,
|
||
encoding: Optional[str] = None,
|
||
) -> Optional[str]:
|
||
"""
|
||
Render a DataFrame to a console-friendly tabular output.
|
||
%(shared_params)s
|
||
line_width : int, optional
|
||
Width to wrap a line in characters.
|
||
max_colwidth : int, optional
|
||
Max width to truncate each column in characters. By default, no limit.
|
||
|
||
.. versionadded:: 1.0.0
|
||
encoding : str, default "utf-8"
|
||
Set character encoding.
|
||
|
||
.. versionadded:: 1.0
|
||
%(returns)s
|
||
See Also
|
||
--------
|
||
to_html : Convert DataFrame to HTML.
|
||
|
||
Examples
|
||
--------
|
||
>>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
|
||
>>> df = pd.DataFrame(d)
|
||
>>> print(df.to_string())
|
||
col1 col2
|
||
0 1 4
|
||
1 2 5
|
||
2 3 6
|
||
"""
|
||
from pandas import option_context
|
||
|
||
with option_context("display.max_colwidth", max_colwidth):
|
||
formatter = fmt.DataFrameFormatter(
|
||
self,
|
||
columns=columns,
|
||
col_space=col_space,
|
||
na_rep=na_rep,
|
||
formatters=formatters,
|
||
float_format=float_format,
|
||
sparsify=sparsify,
|
||
justify=justify,
|
||
index_names=index_names,
|
||
header=header,
|
||
index=index,
|
||
min_rows=min_rows,
|
||
max_rows=max_rows,
|
||
max_cols=max_cols,
|
||
show_dimensions=show_dimensions,
|
||
decimal=decimal,
|
||
)
|
||
return fmt.DataFrameRenderer(formatter).to_string(
|
||
buf=buf,
|
||
encoding=encoding,
|
||
line_width=line_width,
|
||
)
|
||
|
||
    # ----------------------------------------------------------------------

    @property
    def style(self) -> Styler:
        """
        Returns a Styler object.

        Contains methods for building a styled HTML representation of the DataFrame.

        See Also
        --------
        io.formats.style.Styler : Helps style a DataFrame or Series according to the
            data with HTML and CSS.
        """
        from pandas.io.formats.style import Styler

        return Styler(self)

    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                    'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
             species  population
    panda       bear        1864
    polar       bear       22000
    koala  marsupial       80000
    >>> for label, content in df.items():
    ...     print(f'label: {label}')
    ...     print(f'content: {content}', sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """

    @Appender(_shared_docs["items"])
    def items(self) -> Iterable[Tuple[Label, Series]]:
        if self.columns.is_unique and hasattr(self, "_item_cache"):
            for k in self.columns:
                yield k, self._get_item_cache(k)
        else:
            for i, k in enumerate(self.columns):
                yield k, self._ixs(i, axis=1)

    @Appender(_shared_docs["items"])
    def iteritems(self) -> Iterable[Tuple[Label, Series]]:
        yield from self.items()

    def iterrows(self) -> Iterable[Tuple[Label, Series]]:
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Yields
        ------
        index : label or tuple of label
            The index of the row. A tuple for a `MultiIndex`.
        data : Series
            The data of the row as a Series.

        See Also
        --------
        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.
        """
        columns = self.columns
        klass = self._constructor_sliced
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k)
            yield k, s

    def itertuples(self, index: bool = True, name: Optional[str] = "Pandas"):
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Returns
        -------
        iterator
            An object to iterate over namedtuples for each row in the
            DataFrame with the first field possibly being the index and
            following fields being the column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore.
        On Python versions < 3.7 regular tuples are returned for DataFrames
        with a large number of columns (>254).

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
        """
        arrays = []
        fields = list(self.columns)
        if index:
            arrays.append(self.index)
            fields.insert(0, "Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        if name is not None:
            # https://github.com/python/mypy/issues/9046
            # error: namedtuple() expects a string literal as the first argument
            itertuple = collections.namedtuple(  # type: ignore[misc]
                name, fields, rename=True
            )
            return map(itertuple._make, zip(*arrays))

        # fallback to regular tuples
        return zip(*arrays)
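
        # Illustrative sketch of the ``rename=True`` behavior noted in the
        # docstring (assumed output, in the spirit of the examples above):
        # labels that are not valid Python identifiers are replaced by
        # positional names.
        #
        # >>> df = pd.DataFrame({"num legs": [4], "class": ["mammal"]})
        # >>> next(df.itertuples())
        # Pandas(Index=0, _1=4, _2='mammal')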

    def __len__(self) -> int:
        """
        Returns length of info axis, but here we use the index.
        """
        return len(self.index)

    def dot(self, other):
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other`` in Python >= 3.5.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other in a DataFrame or a np.array.

        See Also
        --------
        Series.dot: Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication. In addition, the column names of
        DataFrame and the index of other must contain the same values, as they
        will be aligned prior to the multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
           0  1
        0  1  4
        1  2  2

        Note that the dot method gives the same result as @

        >>> df @ other
           0  1
        0  1  4
        1  2  2

        The dot method also works if other is an np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
           0  1
        0  1  4
        1  2  2

        Note how shuffling of the objects does not change the result.

        >>> s2 = s.reindex([1, 0, 2, 3])
        >>> df.dot(s2)
        0    -4
        1     5
        dtype: int64
        """
        if isinstance(other, (Series, DataFrame)):
            common = self.columns.union(other.index)
            if len(common) > len(self.columns) or len(common) > len(other.index):
                raise ValueError("matrices are not aligned")

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right._values
        else:
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError(
                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
                )

        if isinstance(other, DataFrame):
            return self._constructor(
                np.dot(lvals, rvals), index=left.index, columns=other.columns
            )
        elif isinstance(other, Series):
            return self._constructor_sliced(np.dot(lvals, rvals), index=left.index)
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index)
            else:
                return self._constructor_sliced(result, index=left.index)
        else:  # pragma: no cover
            raise TypeError(f"unsupported type: {type(other)}")

    def __matmul__(self, other):
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        return self.dot(other)

    def __rmatmul__(self, other):
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        try:
            return self.T.dot(np.transpose(other)).T
        except ValueError as err:
            if "shape mismatch" not in str(err):
                raise
            # GH#21581 give exception message for original shapes
            msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
            raise ValueError(msg) from err

    # ----------------------------------------------------------------------
    # IO methods (to / from other formats)

    @classmethod
    def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> DataFrame:
        """
        Construct DataFrame from dict of array-like or dicts.

        Creates DataFrame object from dictionary by columns or by index
        allowing dtype specification.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.
        orient : {'columns', 'index'}, default 'columns'
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise if the keys should be rows, pass 'index'.
        dtype : dtype, default None
            Data type to force, otherwise infer.
        columns : list, default None
            Column labels to use when ``orient='index'``. Raises a ValueError
            if used with ``orient='columns'``.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_records : DataFrame from structured ndarray, sequence
            of tuples or dicts, or DataFrame.
        DataFrame : DataFrame object creation using constructor.

        Examples
        --------
        By default the keys of the dict become the DataFrame columns:

        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Specify ``orient='index'`` to create the DataFrame using dictionary
        keys as rows:

        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data, orient='index')
               0  1  2  3
        row_1  3  2  1  0
        row_2  a  b  c  d

        When using the 'index' orientation, the column names can be
        specified manually:

        >>> pd.DataFrame.from_dict(data, orient='index',
        ...                        columns=['A', 'B', 'C', 'D'])
               A  B  C  D
        row_1  3  2  1  0
        row_2  a  b  c  d
        """
        index = None
        orient = orient.lower()
        if orient == "index":
            if len(data) > 0:
                # TODO speed up Series case
                if isinstance(list(data.values())[0], (Series, dict)):
                    data = _from_nested_dict(data)
                else:
                    data, index = list(data.values()), list(data.keys())
        elif orient == "columns":
            if columns is not None:
                raise ValueError("cannot use columns parameter with orient='columns'")
        else:  # pragma: no cover
            raise ValueError("only recognize index or columns for orient")

        return cls(data, index=index, columns=columns, dtype=dtype)

    def to_numpy(
        self, dtype=None, copy: bool = False, na_value=lib.no_default
    ) -> np.ndarray:
        """
        Convert the DataFrame to a NumPy array.

        .. versionadded:: 0.24.0

        By default, the dtype of the returned array will be the common NumPy
        dtype of all types in the DataFrame. For example, if the dtypes are
        ``float16`` and ``float32``, the results dtype will be ``float32``.
        This may require copying data and coercing values, which may be
        expensive.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the dtypes of the DataFrame columns.

            .. versionadded:: 1.1.0

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.to_numpy : Similar method for Series.

        Examples
        --------
        >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
        array([[1, 3],
               [2, 4]])

        With heterogeneous data, the lowest common type will have to
        be used.

        >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
        >>> df.to_numpy()
        array([[1. , 3. ],
               [2. , 4.5]])

        For a mix of numeric and non-numeric types, the output array will
        have object dtype.

        >>> df['C'] = pd.date_range('2000', periods=2)
        >>> df.to_numpy()
        array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
        """
        self._consolidate_inplace()
        result = self._mgr.as_array(
            transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value
        )
        if result.dtype is not dtype:
            result = np.array(result, dtype=dtype, copy=False)

        return result
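
        # Illustrative use of ``na_value`` (an assumed example, not from the
        # docstring above): replace missing entries while converting, e.g.
        # for a nullable-integer column:
        #
        # >>> df = pd.DataFrame({"A": pd.array([1, None], dtype="Int64")})
        # >>> df.to_numpy(dtype="float64", na_value=np.nan)
        # array([[ 1.],
        #        [nan]])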

    def to_dict(self, orient="dict", into=dict):
        """
        Convert the DataFrame to a dictionary.

        The type of the key-value pairs can be customized with the parameters
        (see below).

        Parameters
        ----------
        orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
            Determines the type of the values of the dictionary.

            - 'dict' (default) : dict like {column -> {index -> value}}
            - 'list' : dict like {column -> [values]}
            - 'series' : dict like {column -> Series(values)}
            - 'split' : dict like
              {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
            - 'records' : list like
              [{column -> value}, ... , {column -> value}]
            - 'index' : dict like {index -> {column -> value}}

            Abbreviations are allowed. `s` indicates `series` and `sp`
            indicates `split`.

        into : class, default dict
            The collections.abc.Mapping subclass used for all Mappings
            in the return value. Can be the actual class or an empty
            instance of the mapping type you want. If you want a
            collections.defaultdict, you must pass it initialized.

        Returns
        -------
        dict, list or collections.abc.Mapping
            Return a collections.abc.Mapping object representing the DataFrame.
            The resulting transformation depends on the `orient` parameter.

        See Also
        --------
        DataFrame.from_dict: Create a DataFrame from a dictionary.
        DataFrame.to_json: Convert a DataFrame to JSON format.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2],
        ...                    'col2': [0.5, 0.75]},
        ...                   index=['row1', 'row2'])
        >>> df
              col1  col2
        row1     1  0.50
        row2     2  0.75
        >>> df.to_dict()
        {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}

        You can specify the return orientation.

        >>> df.to_dict('series')
        {'col1': row1    1
                 row2    2
        Name: col1, dtype: int64,
        'col2': row1    0.50
                row2    0.75
        Name: col2, dtype: float64}

        >>> df.to_dict('split')
        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
         'data': [[1, 0.5], [2, 0.75]]}

        >>> df.to_dict('records')
        [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]

        >>> df.to_dict('index')
        {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}

        You can also specify the mapping type.

        >>> from collections import OrderedDict, defaultdict
        >>> df.to_dict(into=OrderedDict)
        OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
                     ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])

        If you want a `defaultdict`, you need to initialize it:

        >>> dd = defaultdict(list)
        >>> df.to_dict('records', into=dd)
        [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
         defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
        """
        if not self.columns.is_unique:
            warnings.warn(
                "DataFrame columns are not unique, some columns will be omitted.",
                UserWarning,
                stacklevel=2,
            )
        # GH16122
        into_c = com.standardize_mapping(into)

        orient = orient.lower()
        # GH32515
        if orient.startswith(("d", "l", "s", "r", "i")) and orient not in {
            "dict",
            "list",
            "series",
            "split",
            "records",
            "index",
        }:
            warnings.warn(
                "Using short name for 'orient' is deprecated. Only the "
"options: ('dict', list, 'series', 'split', 'records', 'index') "
|
||
"will be used in a future version. Use one of the above "
|
||
"to silence this warning.",
|
||
FutureWarning,
|
||
)
|
||
|
||
if orient.startswith("d"):
|
||
orient = "dict"
|
||
elif orient.startswith("l"):
|
||
orient = "list"
|
||
elif orient.startswith("sp"):
|
||
orient = "split"
|
||
elif orient.startswith("s"):
|
||
orient = "series"
|
||
elif orient.startswith("r"):
|
||
orient = "records"
|
||
elif orient.startswith("i"):
|
||
orient = "index"
|
||
|
||
if orient == "dict":
|
||
return into_c((k, v.to_dict(into)) for k, v in self.items())
|
||
|
||
elif orient == "list":
|
||
return into_c((k, v.tolist()) for k, v in self.items())
|
||
|
||
elif orient == "split":
|
||
return into_c(
|
||
(
|
||
("index", self.index.tolist()),
|
||
("columns", self.columns.tolist()),
|
||
(
|
||
"data",
|
||
[
|
||
list(map(maybe_box_datetimelike, t))
|
||
for t in self.itertuples(index=False, name=None)
|
||
],
|
||
),
|
||
)
|
||
)
|
||
|
||
elif orient == "series":
|
||
return into_c((k, maybe_box_datetimelike(v)) for k, v in self.items())
|
||
|
||
elif orient == "records":
|
||
columns = self.columns.tolist()
|
||
rows = (
|
||
dict(zip(columns, row))
|
||
for row in self.itertuples(index=False, name=None)
|
||
)
|
||
return [
|
||
into_c((k, maybe_box_datetimelike(v)) for k, v in row.items())
|
||
for row in rows
|
||
]
|
||
|
||
elif orient == "index":
|
||
if not self.index.is_unique:
|
||
raise ValueError("DataFrame index must be unique for orient='index'.")
|
||
return into_c(
|
||
(t[0], dict(zip(self.columns, t[1:])))
|
||
for t in self.itertuples(name=None)
|
||
)
|
||
|
||
else:
|
||
raise ValueError(f"orient '{orient}' not understood")
|
||
|
||
    def to_gbq(
        self,
        destination_table,
        project_id=None,
        chunksize=None,
        reauth=False,
        if_exists="fail",
        auth_local_webserver=False,
        table_schema=None,
        location=None,
        progress_bar=True,
        credentials=None,
    ) -> None:
        """
        Write a DataFrame to a Google BigQuery table.

        This function requires the `pandas-gbq package
        <https://pandas-gbq.readthedocs.io>`__.

        See the `How to authenticate with Google BigQuery
        <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
        guide for authentication instructions.

        Parameters
        ----------
        destination_table : str
            Name of table to be written, in the form ``dataset.tablename``.
        project_id : str, optional
            Google BigQuery Account project ID. Optional when available from
            the environment.
        chunksize : int, optional
            Number of rows to be inserted in each chunk from the dataframe.
            Set to ``None`` to load the whole dataframe at once.
        reauth : bool, default False
            Force Google BigQuery to re-authenticate the user. This is useful
            if multiple accounts are used.
        if_exists : str, default 'fail'
            Behavior when the destination table exists. Value can be one of:

            ``'fail'``
                If table exists raise pandas_gbq.gbq.TableCreationError.
            ``'replace'``
                If table exists, drop it, recreate it, and insert data.
            ``'append'``
                If table exists, insert data. Create if does not exist.
        auth_local_webserver : bool, default False
            Use the `local webserver flow`_ instead of the `console flow`_
            when getting user credentials.

            .. _local webserver flow:
                https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
            .. _console flow:
                https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console

            *New in version 0.2.0 of pandas-gbq*.
        table_schema : list of dicts, optional
            List of BigQuery table fields to which the DataFrame columns
            conform, e.g. ``[{'name': 'col1', 'type':
            'STRING'},...]``. If schema is not provided, it will be
            generated according to dtypes of DataFrame columns. See
            BigQuery API documentation on available names of a field.

            *New in version 0.3.1 of pandas-gbq*.
        location : str, optional
            Location where the load job should run. See the `BigQuery locations
            documentation
            <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
            list of available locations. The location must match that of the
            target dataset.

            *New in version 0.5.0 of pandas-gbq*.
        progress_bar : bool, default True
            Use the library `tqdm` to show the progress bar for the upload,
            chunk by chunk.

            *New in version 0.5.0 of pandas-gbq*.
        credentials : google.auth.credentials.Credentials, optional
            Credentials for accessing Google APIs. Use this parameter to
            override default credentials, such as to use Compute Engine
            :class:`google.auth.compute_engine.Credentials` or Service
            Account :class:`google.oauth2.service_account.Credentials`
            directly.

            *New in version 0.8.0 of pandas-gbq*.

            .. versionadded:: 0.24.0

        See Also
        --------
        pandas_gbq.to_gbq : This function in the pandas-gbq library.
        read_gbq : Read a DataFrame from Google BigQuery.
        """
        from pandas.io import gbq

        gbq.to_gbq(
            self,
            destination_table,
            project_id=project_id,
            chunksize=chunksize,
            reauth=reauth,
            if_exists=if_exists,
            auth_local_webserver=auth_local_webserver,
            table_schema=table_schema,
            location=location,
            progress_bar=progress_bar,
            credentials=credentials,
        )
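
        # Hedged usage sketch (assumes the optional pandas-gbq package is
        # installed and credentials are configured; "my_dataset.my_table"
        # and the project id below are placeholders, not real resources):
        #
        # >>> df.to_gbq("my_dataset.my_table",
        # ...           project_id="my-project",
        # ...           if_exists="append")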

    @classmethod
    def from_records(
        cls,
        data,
        index=None,
        exclude=None,
        columns=None,
        coerce_float=False,
        nrows=None,
    ) -> DataFrame:
        """
        Convert structured or record ndarray to DataFrame.

        Creates a DataFrame object from a structured ndarray, sequence of
        tuples or dicts, or DataFrame.

        Parameters
        ----------
        data : structured ndarray, sequence of tuples or dicts, or DataFrame
            Structured input data.
        index : str, list of fields, array-like
            Field of array to use as the index, alternately a specific set of
            input labels to use.
        exclude : sequence, default None
            Columns or fields to exclude.
        columns : sequence, default None
            Column names to use. If the passed data do not have names
            associated with them, this argument provides names for the
            columns. Otherwise this argument indicates the order of the columns
            in the result (any names not found in the data will become all-NA
            columns).
        coerce_float : bool, default False
            Attempt to convert values of non-string, non-numeric objects (like
            decimal.Decimal) to floating point, useful for SQL result sets.
        nrows : int, default None
            Number of rows to read if data is an iterator.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_dict : DataFrame from dict of array-like or dicts.
        DataFrame : DataFrame object creation using constructor.

        Examples
        --------
        Data can be provided as a structured ndarray:

        >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
        ...                 dtype=[('col_1', 'i4'), ('col_2', 'U1')])
        >>> pd.DataFrame.from_records(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Data can be provided as a list of dicts:

        >>> data = [{'col_1': 3, 'col_2': 'a'},
        ...         {'col_1': 2, 'col_2': 'b'},
        ...         {'col_1': 1, 'col_2': 'c'},
        ...         {'col_1': 0, 'col_2': 'd'}]
        >>> pd.DataFrame.from_records(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Data can be provided as a list of tuples with corresponding columns:

        >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
        >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d
        """
        # Make a copy of the input columns so we can modify it
        if columns is not None:
            columns = ensure_index(columns)

        if is_iterator(data):
            if nrows == 0:
                return cls()

            try:
                first_row = next(data)
            except StopIteration:
                return cls(index=index, columns=columns)

            dtype = None
            if hasattr(first_row, "dtype") and first_row.dtype.names:
                dtype = first_row.dtype

            values = [first_row]

            if nrows is None:
                values += data
            else:
                values.extend(itertools.islice(data, nrows - 1))

            if dtype is not None:
                data = np.array(values, dtype=dtype)
            else:
                data = values

        if isinstance(data, dict):
            if columns is None:
                columns = arr_columns = ensure_index(sorted(data))
                arrays = [data[k] for k in columns]
            else:
                arrays = []
                arr_columns_list = []
                for k, v in data.items():
                    if k in columns:
                        arr_columns_list.append(k)
                        arrays.append(v)

                arrays, arr_columns = reorder_arrays(arrays, arr_columns_list, columns)

        elif isinstance(data, (np.ndarray, DataFrame)):
            arrays, columns = to_arrays(data, columns)
            if columns is not None:
                columns = ensure_index(columns)
            arr_columns = columns
        else:
            arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float)

            arr_columns = ensure_index(arr_columns)
            if columns is not None:
                columns = ensure_index(columns)
            else:
                columns = arr_columns

        if exclude is None:
            exclude = set()
        else:
            exclude = set(exclude)

        result_index = None
        if index is not None:
            if isinstance(index, str) or not hasattr(index, "__iter__"):
                i = columns.get_loc(index)
                exclude.add(index)
                if len(arrays) > 0:
                    result_index = Index(arrays[i], name=index)
                else:
                    result_index = Index([], name=index)
            else:
                try:
                    index_data = [arrays[arr_columns.get_loc(field)] for field in index]
                except (KeyError, TypeError):
                    # raised by get_loc, see GH#29258
                    result_index = index
                else:
                    result_index = ensure_index_from_sequences(index_data, names=index)
                    exclude.update(index)

        if any(exclude):
            arr_exclude = [x for x in exclude if x in arr_columns]
            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]

            arr_columns = arr_columns.drop(arr_exclude)
            columns = columns.drop(exclude)

        mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)

        return cls(mgr)

    def to_records(
        self, index=True, column_dtypes=None, index_dtypes=None
    ) -> np.recarray:
        """
        Convert DataFrame to a NumPy record array.

        Index will be included as the first field of the record array if
        requested.

        Parameters
        ----------
        index : bool, default True
            Include index in resulting record array, stored in 'index'
            field or using the index label, if set.
        column_dtypes : str, type, dict, default None
            .. versionadded:: 0.24.0

            If a string or type, the data type to store all columns. If
            a dictionary, a mapping of column names and indices (zero-indexed)
            to specific data types.
        index_dtypes : str, type, dict, default None
            .. versionadded:: 0.24.0

            If a string or type, the data type to store all index levels. If
            a dictionary, a mapping of index level names and indices
            (zero-indexed) to specific data types.

            This mapping is applied only if `index=True`.

        Returns
        -------
        numpy.recarray
            NumPy ndarray with the DataFrame labels as fields and each row
            of the DataFrame as entries.

        See Also
        --------
        DataFrame.from_records: Convert structured or record ndarray
            to DataFrame.
        numpy.recarray: An ndarray that allows field access using
            attributes, analogous to typed columns in a
            spreadsheet.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
        ...                   index=['a', 'b'])
        >>> df
           A     B
        a  1  0.50
        b  2  0.75
        >>> df.to_records()
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])

        If the DataFrame index has no label then the recarray field name
        is set to 'index'. If the index has a label then this is used as the
        field name:

        >>> df.index = df.index.rename("I")
        >>> df.to_records()
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])

        The index can be excluded from the record array:

        >>> df.to_records(index=False)
        rec.array([(1, 0.5 ), (2, 0.75)],
                  dtype=[('A', '<i8'), ('B', '<f8')])

        Data types can be specified for the columns:

        >>> df.to_records(column_dtypes={"A": "int32"})
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])

        As well as for the index:

        >>> df.to_records(index_dtypes="<S2")
        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
                  dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])

        >>> index_dtypes = f"<S{df.index.str.len().max()}"
        >>> df.to_records(index_dtypes=index_dtypes)
        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
                  dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
        """
        if index:
            if isinstance(self.index, MultiIndex):
                # array of tuples to numpy cols. copy copy copy
                ix_vals = list(map(np.array, zip(*self.index._values)))
            else:
                ix_vals = [self.index.values]

            arrays = ix_vals + [
                np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
            ]

            count = 0
            index_names = list(self.index.names)

            if isinstance(self.index, MultiIndex):
                for i, n in enumerate(index_names):
                    if n is None:
                        index_names[i] = f"level_{count}"
                        count += 1
            elif index_names[0] is None:
                index_names = ["index"]

            names = [str(name) for name in itertools.chain(index_names, self.columns)]
        else:
            arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
            names = [str(c) for c in self.columns]
            index_names = []

        index_len = len(index_names)
        formats = []

        for i, v in enumerate(arrays):
            index = i

            # When the names and arrays are collected, we
            # first collect those in the DataFrame's index,
            # followed by those in its columns.
            #
            # Thus, the total length of the array is:
            # len(index_names) + len(DataFrame.columns).
            #
            # This check allows us to see whether we are
            # handling a name / array in the index or column.
            if index < index_len:
                dtype_mapping = index_dtypes
                name = index_names[index]
            else:
                index -= index_len
                dtype_mapping = column_dtypes
                name = self.columns[index]

            # We have a dictionary, so we get the data type
            # associated with the index or column (which can
            # be denoted by its name in the DataFrame or its
            # position in DataFrame's array of indices or
            # columns, whichever is applicable).
            if is_dict_like(dtype_mapping):
                if name in dtype_mapping:
                    dtype_mapping = dtype_mapping[name]
                elif index in dtype_mapping:
                    dtype_mapping = dtype_mapping[index]
                else:
                    dtype_mapping = None

            # If no mapping can be found, use the array's
            # dtype attribute for formatting.
            #
            # A valid dtype must either be a type or
            # string naming a type.
            if dtype_mapping is None:
                formats.append(v.dtype)
            elif isinstance(dtype_mapping, (type, np.dtype, str)):
                formats.append(dtype_mapping)
            else:
                element = "row" if i < index_len else "column"
                msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
                raise ValueError(msg)

        return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})

    @classmethod
    def _from_arrays(
        cls,
        arrays,
        columns,
        index,
        dtype: Optional[Dtype] = None,
        verify_integrity: bool = True,
    ) -> DataFrame:
        """
        Create DataFrame from a list of arrays corresponding to the columns.

        Parameters
        ----------
        arrays : list-like of arrays
            Each array in the list corresponds to one column, in order.
        columns : list-like, Index
            The column names for the resulting DataFrame.
        index : list-like, Index
            The row labels for the resulting DataFrame.
        dtype : dtype, optional
            Optional dtype to enforce for all arrays.
        verify_integrity : bool, default True
            Validate and homogenize all input. If set to False, it is assumed
            that all elements of `arrays` are actual arrays as they will be
            stored in a block (numpy ndarray or ExtensionArray), have the same
            length as and are aligned with the index, and that `columns` and
            `index` are ensured to be an Index object.

        Returns
        -------
        DataFrame
        """
        if dtype is not None:
            dtype = pandas_dtype(dtype)

        mgr = arrays_to_mgr(
            arrays,
            columns,
            index,
            columns,
            dtype=dtype,
            verify_integrity=verify_integrity,
        )
        return cls(mgr)

    @doc(storage_options=generic._shared_docs["storage_options"])
    @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
    def to_stata(
        self,
        path: FilePathOrBuffer,
        convert_dates: Optional[Dict[Label, str]] = None,
        write_index: bool = True,
        byteorder: Optional[str] = None,
        time_stamp: Optional[datetime.datetime] = None,
        data_label: Optional[str] = None,
        variable_labels: Optional[Dict[Label, str]] = None,
        version: Optional[int] = 114,
        convert_strl: Optional[Sequence[Label]] = None,
        compression: CompressionOptions = "infer",
        storage_options: StorageOptions = None,
    ) -> None:
        """
        Export DataFrame object to Stata dta format.

        Writes the DataFrame to a Stata dataset file.
        "dta" files contain a Stata dataset.

        Parameters
        ----------
        path : str, buffer or path object
            String, path object (pathlib.Path or py._path.local.LocalPath) or
            object implementing a binary write() function. If using a buffer
            then the buffer will not be automatically closed after the file
            data has been written.

            .. versionchanged:: 1.0.0

            Previously this was "fname"

        convert_dates : dict
            Dictionary mapping columns containing datetime types to stata
            internal format to use when writing the dates. Options are 'tc',
            'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
            or a name. Datetime columns that do not have a conversion type
            specified will be converted to 'tc'. Raises NotImplementedError if
            a datetime column has timezone information.
        write_index : bool
            Write the index to Stata dataset.
        byteorder : str
            Can be ">", "<", "little", or "big". default is `sys.byteorder`.
        time_stamp : datetime
            A datetime to use as file creation date. Default is the current
            time.
        data_label : str, optional
            A label for the data set. Must be 80 characters or smaller.
        variable_labels : dict
            Dictionary containing columns as keys and variable labels as
            values. Each label must be 80 characters or smaller.
        version : {{114, 117, 118, 119, None}}, default 114
            Version to use in the output dta file. Set to None to let pandas
            decide between 118 or 119 formats depending on the number of
            columns in the frame. Version 114 can be read by Stata 10 and
            later. Version 117 can be read by Stata 13 or later. Version 118
            is supported in Stata 14 and later. Version 119 is supported in
            Stata 15 and later. Version 114 limits string variables to 244
            characters or fewer while versions 117 and later allow strings
            with lengths up to 2,000,000 characters. Versions 118 and 119
            support Unicode characters, and version 119 supports more than
            32,767 variables.

            Version 119 should usually only be used when the number of
            variables exceeds the capacity of dta format 118. Exporting
            smaller datasets in format 119 may have unintended consequences,
            and, as of November 2020, Stata SE cannot read version 119 files.

            .. versionchanged:: 1.0.0

                Added support for formats 118 and 119.

        convert_strl : list, optional
            List of column names to convert to string columns to Stata StrL
            format. Only available if version is 117. Storing strings in the
            StrL format can produce smaller dta files if strings have more than
            8 characters and values are repeated.
        compression : str or dict, default 'infer'
            For on-the-fly compression of the output dta. If string, specifies
            compression mode. If dict, value at key 'method' specifies
            compression mode. Compression mode must be one of {{'infer', 'gzip',
            'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and
            `fname` is path-like, then detect compression from the following
            extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
            compression). If dict and compression mode is one of {{'zip',
            'gzip', 'bz2'}}, or inferred as one of the above, other entries
            passed as additional compression options.

            .. versionadded:: 1.1.0

        {storage_options}

            .. versionadded:: 1.2.0

        Raises
        ------
        NotImplementedError
            * If datetimes contain timezone information
            * Column dtype is not representable in Stata
        ValueError
            * Columns listed in convert_dates are neither datetime64[ns]
              nor datetime.datetime
            * Column listed in convert_dates is not in DataFrame
            * Categorical label contains more than 32,000 characters

        See Also
        --------
        read_stata : Import Stata data files.
        io.stata.StataWriter : Low-level writer for Stata data files.
        io.stata.StataWriter117 : Low-level writer for version 117 files.

        Examples
        --------
        >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
        ...                               'parrot'],
        ...                    'speed': [350, 18, 361, 15]}})
        >>> df.to_stata('animals.dta')  # doctest: +SKIP
        """
        if version not in (114, 117, 118, 119, None):
            raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
        if version == 114:
            if convert_strl is not None:
                raise ValueError("strl is not supported in format 114")
            from pandas.io.stata import StataWriter as statawriter
        elif version == 117:
            # mypy: Name 'statawriter' already defined (possibly by an import)
            from pandas.io.stata import (  # type: ignore[no-redef]
                StataWriter117 as statawriter,
            )
        else:  # versions 118 and 119
            # mypy: Name 'statawriter' already defined (possibly by an import)
            from pandas.io.stata import (  # type: ignore[no-redef]
                StataWriterUTF8 as statawriter,
            )

        kwargs: Dict[str, Any] = {}
        if version is None or version >= 117:
            # strl conversion is only supported >= 117
            kwargs["convert_strl"] = convert_strl
        if version is None or version >= 118:
            # Specifying the version is only supported for UTF8 (118 or 119)
            kwargs["version"] = version

        # mypy: Too many arguments for "StataWriter"
        writer = statawriter(  # type: ignore[call-arg]
            path,
            self,
            convert_dates=convert_dates,
            byteorder=byteorder,
            time_stamp=time_stamp,
            data_label=data_label,
            write_index=write_index,
            variable_labels=variable_labels,
            compression=compression,
            storage_options=storage_options,
            **kwargs,
        )
        writer.write_file()

    @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
    def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None:
        """
        Write a DataFrame to the binary Feather format.

        Parameters
        ----------
        path : str or file-like object
            If a string, it will be used as Root Directory path.
        **kwargs :
            Additional keywords passed to :func:`pyarrow.feather.write_feather`.
            Starting with pyarrow 0.17, this includes the `compression`,
            `compression_level`, `chunksize` and `version` keywords.

            .. versionadded:: 1.1.0
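
        Examples
        --------
        A minimal call (the output file name is an illustrative placeholder;
        requires pyarrow):

        >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
        >>> df.to_feather("frame.feather")  # doctest: +SKIP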
        """
        from pandas.io.feather_format import to_feather

        to_feather(self, path, **kwargs)

    @doc(
        Series.to_markdown,
        klass=_shared_doc_kwargs["klass"],
        storage_options=_shared_docs["storage_options"],
        examples="""Examples
        --------
        >>> df = pd.DataFrame(
        ...     data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
        ... )
        >>> print(df.to_markdown())
        |    | animal_1   | animal_2   |
        |---:|:-----------|:-----------|
        |  0 | elk        | dog        |
        |  1 | pig        | quetzal    |

        Output markdown with a tabulate option.

        >>> print(df.to_markdown(tablefmt="grid"))
        +----+------------+------------+
        |    | animal_1   | animal_2   |
        +====+============+============+
        |  0 | elk        | dog        |
        +----+------------+------------+
        |  1 | pig        | quetzal    |
        +----+------------+------------+
        """,
    )
    def to_markdown(
        self,
        buf: Optional[Union[IO[str], str]] = None,
        mode: str = "wt",
        index: bool = True,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> Optional[str]:
        if "showindex" in kwargs:
            warnings.warn(
                "'showindex' is deprecated. Only 'index' will be used "
                "in a future version. Use 'index' to silence this warning.",
                FutureWarning,
                stacklevel=2,
            )

        kwargs.setdefault("headers", "keys")
        kwargs.setdefault("tablefmt", "pipe")
        kwargs.setdefault("showindex", index)
        tabulate = import_optional_dependency("tabulate")
        result = tabulate.tabulate(self, **kwargs)
        if buf is None:
            return result

        with get_handle(buf, mode, storage_options=storage_options) as handles:
            assert not isinstance(handles.handle, (str, mmap.mmap))
            handles.handle.writelines(result)
        return None

    @doc(storage_options=generic._shared_docs["storage_options"])
    @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
    def to_parquet(
        self,
        path: Optional[FilePathOrBuffer] = None,
        engine: str = "auto",
        compression: Optional[str] = "snappy",
        index: Optional[bool] = None,
        partition_cols: Optional[List[str]] = None,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> Optional[bytes]:
        """
        Write a DataFrame to the binary parquet format.

        This function writes the dataframe as a `parquet file
        <https://parquet.apache.org/>`_. You can choose different parquet
        backends, and have the option of compression. See
        :ref:`the user guide <io.parquet>` for more details.

        Parameters
        ----------
        path : str or file-like object, default None
            If a string, it will be used as Root Directory path
            when writing a partitioned dataset. By file-like object,
            we refer to objects with a write() method, such as a file handle
            (e.g. via builtin open function) or io.BytesIO. The engine
            fastparquet does not accept file-like objects. If path is None,
            a bytes object is returned.

            .. versionchanged:: 1.2.0

            Previously this was "fname"

        engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
            Parquet library to use. If 'auto', then the option
            ``io.parquet.engine`` is used. The default ``io.parquet.engine``
            behavior is to try 'pyarrow', falling back to 'fastparquet' if
            'pyarrow' is unavailable.
        compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
            Name of the compression to use. Use ``None`` for no compression.
        index : bool, default None
            If ``True``, include the dataframe's index(es) in the file output.
            If ``False``, they will not be written to the file.
            If ``None``, similar to ``True`` the dataframe's index(es)
            will be saved. However, instead of being saved as values,
            the RangeIndex will be stored as a range in the metadata so it
            doesn't require much space and is faster. Other indexes will
            be included as columns in the file output.

            .. versionadded:: 0.24.0

        partition_cols : list, optional, default None
            Column names by which to partition the dataset.
            Columns are partitioned in the order they are given.
            Must be None if path is not a string.

            .. versionadded:: 0.24.0

        {storage_options}

            .. versionadded:: 1.2.0

        **kwargs
            Additional arguments passed to the parquet library. See
            :ref:`pandas io <io.parquet>` for more details.

        Returns
        -------
        bytes if no path argument is provided else None

        See Also
        --------
        read_parquet : Read a parquet file.
        DataFrame.to_csv : Write a csv file.
        DataFrame.to_sql : Write to a sql table.
        DataFrame.to_hdf : Write to hdf.

        Notes
        -----
        This function requires either the `fastparquet
        <https://pypi.org/project/fastparquet>`_ or `pyarrow
        <https://arrow.apache.org/docs/python/>`_ library.

        Examples
        --------
        >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
        >>> df.to_parquet('df.parquet.gzip',
        ...               compression='gzip')  # doctest: +SKIP
        >>> pd.read_parquet('df.parquet.gzip')  # doctest: +SKIP
           col1  col2
        0     1     3
        1     2     4

        If you want to get a buffer to the parquet content you can use a io.BytesIO
        object, as long as you don't use partition_cols, which creates multiple files.

        >>> import io
        >>> f = io.BytesIO()
        >>> df.to_parquet(f)
        >>> f.seek(0)
        0
        >>> content = f.read()
        """
        from pandas.io.parquet import to_parquet

        return to_parquet(
            self,
            path,
            engine,
            compression=compression,
            index=index,
            partition_cols=partition_cols,
            storage_options=storage_options,
            **kwargs,
        )

    @Substitution(
        header_type="bool",
        header="Whether to print column labels, default True",
        col_space_type="str or int, list or dict of int or str",
        col_space="The minimum width of each column in CSS length "
        "units. An int is assumed to be px units.\n\n"
        "            .. versionadded:: 0.25.0\n"
        "                Ability to use str",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_html(
        self,
        buf=None,
        columns=None,
        col_space=None,
        header=True,
        index=True,
        na_rep="NaN",
        formatters=None,
        float_format=None,
        sparsify=None,
        index_names=True,
        justify=None,
        max_rows=None,
        max_cols=None,
        show_dimensions=False,
        decimal=".",
        bold_rows=True,
        classes=None,
        escape=True,
        notebook=False,
        border=None,
        table_id=None,
        render_links=False,
        encoding=None,
    ):
        """
        Render a DataFrame as an HTML table.
        %(shared_params)s
        bold_rows : bool, default True
            Make the row labels bold in the output.
        classes : str or list or tuple, default None
            CSS class(es) to apply to the resulting html table.
        escape : bool, default True
            Convert the characters <, >, and & to HTML-safe sequences.
        notebook : {True, False}, default False
            Whether the generated HTML is for IPython Notebook.
        border : int
            A ``border=border`` attribute is included in the opening
            `<table>` tag. Default ``pd.options.display.html.border``.
        encoding : str, default "utf-8"
            Set character encoding.

            .. versionadded:: 1.0

        table_id : str, optional
            A CSS id is included in the opening `<table>` tag if specified.
        render_links : bool, default False
            Convert URLs to HTML links.

            .. versionadded:: 0.24.0
        %(returns)s
        See Also
        --------
        to_string : Convert DataFrame to a string.
        """
        if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
            raise ValueError("Invalid value for justify parameter")

        formatter = fmt.DataFrameFormatter(
            self,
            columns=columns,
            col_space=col_space,
            na_rep=na_rep,
            header=header,
            index=index,
            formatters=formatters,
            float_format=float_format,
            bold_rows=bold_rows,
            sparsify=sparsify,
            justify=justify,
            index_names=index_names,
            escape=escape,
            decimal=decimal,
            max_rows=max_rows,
            max_cols=max_cols,
            show_dimensions=show_dimensions,
        )
        # TODO: a generic formatter would belong in DataFrameFormatter
        return fmt.DataFrameRenderer(formatter).to_html(
            buf=buf,
            classes=classes,
            notebook=notebook,
            border=border,
            encoding=encoding,
            table_id=table_id,
            render_links=render_links,
        )

    # ----------------------------------------------------------------------
    @Substitution(
        klass="DataFrame",
        type_sub=" and columns",
        max_cols_sub=dedent(
            """\
            max_cols : int, optional
                When to switch from the verbose to the truncated output. If the
                DataFrame has more than `max_cols` columns, the truncated output
                is used. By default, the setting in
                ``pandas.options.display.max_info_columns`` is used."""
        ),
        show_counts_sub=dedent(
            """\
            show_counts : bool, optional
                Whether to show the non-null counts. By default, this is shown
                only if the DataFrame is smaller than
                ``pandas.options.display.max_info_rows`` and
                ``pandas.options.display.max_info_columns``. A value of True always
                shows the counts, and False never shows the counts.
            null_counts : bool, optional
                .. deprecated:: 1.2.0
                    Use show_counts instead."""
        ),
        examples_sub=dedent(
            """\
            >>> int_values = [1, 2, 3, 4, 5]
            >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
            >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
            >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
            ...                    "float_col": float_values})
            >>> df
               int_col text_col  float_col
            0        1    alpha       0.00
            1        2     beta       0.25
            2        3    gamma       0.50
            3        4    delta       0.75
            4        5  epsilon       1.00

            Prints information of all columns:

            >>> df.info(verbose=True)
            <class 'pandas.core.frame.DataFrame'>
            RangeIndex: 5 entries, 0 to 4
            Data columns (total 3 columns):
             #   Column     Non-Null Count  Dtype
            ---  ------     --------------  -----
             0   int_col    5 non-null      int64
             1   text_col   5 non-null      object
             2   float_col  5 non-null      float64
            dtypes: float64(1), int64(1), object(1)
            memory usage: 248.0+ bytes

            Prints a summary of the column count and dtypes but not per column
            information:

            >>> df.info(verbose=False)
            <class 'pandas.core.frame.DataFrame'>
            RangeIndex: 5 entries, 0 to 4
            Columns: 3 entries, int_col to float_col
            dtypes: float64(1), int64(1), object(1)
            memory usage: 248.0+ bytes

            Pipe output of DataFrame.info to a buffer instead of sys.stdout,
            get the buffer content and write it to a text file:

            >>> import io
            >>> buffer = io.StringIO()
            >>> df.info(buf=buffer)
            >>> s = buffer.getvalue()
            >>> with open("df_info.txt", "w",
            ...           encoding="utf-8") as f:  # doctest: +SKIP
            ...     f.write(s)
            260

            The `memory_usage` parameter allows deep introspection mode, especially
            useful for big DataFrames and fine-tuning memory optimization:

            >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
            >>> df = pd.DataFrame({
            ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
            ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
            ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
            ... })
            >>> df.info()
            <class 'pandas.core.frame.DataFrame'>
            RangeIndex: 1000000 entries, 0 to 999999
            Data columns (total 3 columns):
             #   Column    Non-Null Count    Dtype
            ---  ------    --------------    -----
             0   column_1  1000000 non-null  object
             1   column_2  1000000 non-null  object
             2   column_3  1000000 non-null  object
            dtypes: object(3)
            memory usage: 22.9+ MB

            >>> df.info(memory_usage='deep')
            <class 'pandas.core.frame.DataFrame'>
            RangeIndex: 1000000 entries, 0 to 999999
            Data columns (total 3 columns):
             #   Column    Non-Null Count    Dtype
            ---  ------    --------------    -----
             0   column_1  1000000 non-null  object
             1   column_2  1000000 non-null  object
             2   column_3  1000000 non-null  object
            dtypes: object(3)
            memory usage: 165.9 MB"""
        ),
        see_also_sub=dedent(
            """\
            DataFrame.describe: Generate descriptive statistics of DataFrame
                columns.
            DataFrame.memory_usage: Memory usage of DataFrame columns."""
        ),
        version_added_sub="",
    )
    @doc(BaseInfo.render)
    def info(
        self,
        verbose: Optional[bool] = None,
        buf: Optional[IO[str]] = None,
        max_cols: Optional[int] = None,
        memory_usage: Optional[Union[bool, str]] = None,
        show_counts: Optional[bool] = None,
        null_counts: Optional[bool] = None,
    ) -> None:
        if null_counts is not None:
            if show_counts is not None:
                raise ValueError("null_counts used with show_counts. Use show_counts.")
            warnings.warn(
                "null_counts is deprecated. Use show_counts instead",
                FutureWarning,
                stacklevel=2,
            )
            show_counts = null_counts
        info = DataFrameInfo(
            data=self,
            memory_usage=memory_usage,
        )
        info.render(
            buf=buf,
            max_cols=max_cols,
            verbose=verbose,
            show_counts=show_counts,
        )

    def memory_usage(self, index=True, deep=False) -> Series:
        """
        Return the memory usage of each column in bytes.

        The memory usage can optionally include the contribution of
        the index and elements of `object` dtype.

        This value is displayed in `DataFrame.info` by default. This can be
        suppressed by setting ``pandas.options.display.memory_usage`` to False.

        Parameters
        ----------
        index : bool, default True
            Specifies whether to include the memory usage of the DataFrame's
            index in returned Series. If ``index=True``, the memory usage of
            the index is the first item in the output.
        deep : bool, default False
            If True, introspect the data deeply by interrogating
            `object` dtypes for system-level memory consumption, and include
            it in the returned values.

        Returns
        -------
        Series
            A Series whose index is the original column names and whose values
            are the memory usage of each column in bytes.

        See Also
        --------
        numpy.ndarray.nbytes : Total bytes consumed by the elements of an
            ndarray.
        Series.memory_usage : Bytes consumed by a Series.
        Categorical : Memory-efficient array for string values with
            many repeated values.
        DataFrame.info : Concise summary of a DataFrame.

        Examples
        --------
        >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
        >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
        ...              for t in dtypes])
        >>> df = pd.DataFrame(data)
        >>> df.head()
           int64  float64  complex128 object  bool
        0      1      1.0    1.0+0.0j      1  True
        1      1      1.0    1.0+0.0j      1  True
        2      1      1.0    1.0+0.0j      1  True
        3      1      1.0    1.0+0.0j      1  True
        4      1      1.0    1.0+0.0j      1  True

        >>> df.memory_usage()
        Index           128
        int64         40000
        float64       40000
        complex128    80000
        object        40000
        bool           5000
        dtype: int64

        >>> df.memory_usage(index=False)
        int64         40000
        float64       40000
        complex128    80000
        object        40000
        bool           5000
        dtype: int64

        The memory footprint of `object` dtype columns is ignored by default;
        pass ``deep=True`` to include it:

        >>> df.memory_usage(deep=True)
        Index           128
        int64         40000
        float64       40000
        complex128    80000
        object       180000
        bool           5000
        dtype: int64

        Use a Categorical for efficient storage of an object-dtype column with
        many repeated values.

        >>> df['object'].astype('category').memory_usage(deep=True)
        5244
        """
        result = self._constructor_sliced(
            [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
            index=self.columns,
        )
        if index:
            result = self._constructor_sliced(
                self.index.memory_usage(deep=deep), index=["Index"]
            ).append(result)
        return result

    def transpose(self, *args, copy: bool = False) -> DataFrame:
        """
        Transpose index and columns.

        Reflect the DataFrame over its main diagonal by writing rows as columns
        and vice-versa. The property :attr:`.T` is an accessor to the method
        :meth:`transpose`.

        Parameters
        ----------
        *args : tuple, optional
            Accepted for compatibility with NumPy.
        copy : bool, default False
            Whether to copy the data after transposing, even for DataFrames
            with a single dtype.

            Note that a copy is always required for mixed dtype DataFrames,
            or for DataFrames with any extension types.

        Returns
        -------
        DataFrame
            The transposed DataFrame.

        See Also
        --------
        numpy.transpose : Permute the dimensions of a given array.

        Notes
        -----
        Transposing a DataFrame with mixed dtypes will result in a homogeneous
        DataFrame with the `object` dtype. In such a case, a copy of the data
        is always made.

        Examples
        --------
        **Square DataFrame with homogeneous dtype**

        >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
        >>> df1 = pd.DataFrame(data=d1)
        >>> df1
           col1  col2
        0     1     3
        1     2     4

        >>> df1_transposed = df1.T  # or df1.transpose()
        >>> df1_transposed
              0  1
        col1  1  2
        col2  3  4

        When the dtype is homogeneous in the original DataFrame, we get a
        transposed DataFrame with the same dtype:

        >>> df1.dtypes
        col1    int64
        col2    int64
        dtype: object
        >>> df1_transposed.dtypes
        0    int64
        1    int64
        dtype: object

        **Non-square DataFrame with mixed dtypes**

        >>> d2 = {'name': ['Alice', 'Bob'],
        ...       'score': [9.5, 8],
        ...       'employed': [False, True],
        ...       'kids': [0, 0]}
        >>> df2 = pd.DataFrame(data=d2)
        >>> df2
            name  score  employed  kids
        0  Alice    9.5     False     0
        1    Bob    8.0      True     0

        >>> df2_transposed = df2.T  # or df2.transpose()
        >>> df2_transposed
                      0     1
        name      Alice   Bob
        score       9.5   8.0
        employed  False  True
        kids          0     0

        When the DataFrame has mixed dtypes, we get a transposed DataFrame with
        the `object` dtype:

        >>> df2.dtypes
        name         object
        score       float64
        employed       bool
        kids          int64
        dtype: object
        >>> df2_transposed.dtypes
        0    object
        1    object
        dtype: object
        """
        nv.validate_transpose(args, {})
        # construct the args

        dtypes = list(self.dtypes)
        if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]):
            # We have EAs with the same dtype. We can preserve that dtype in transpose.
            dtype = dtypes[0]
            arr_type = dtype.construct_array_type()
            values = self.values

            new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
            result = self._constructor(
                dict(zip(self.index, new_values)), index=self.columns
            )

        else:
            new_values = self.values.T
            if copy:
                new_values = new_values.copy()
            result = self._constructor(
                new_values, index=self.columns, columns=self.index
            )

        return result.__finalize__(self, method="transpose")

    @property
    def T(self) -> DataFrame:
        return self.transpose()

    # ----------------------------------------------------------------------
    # Indexing Methods

    def _ixs(self, i: int, axis: int = 0):
        """
        Parameters
        ----------
        i : int
        axis : int

        Notes
        -----
        If slice passed, the resulting data will be a view.
        """
        # irow
        if axis == 0:
            new_values = self._mgr.fast_xs(i)

            # if we are a copy, mark as such
            copy = isinstance(new_values, np.ndarray) and new_values.base is None
            result = self._constructor_sliced(
                new_values,
                index=self.columns,
                name=self.index[i],
                dtype=new_values.dtype,
            )
            result._set_is_copy(self, copy=copy)
            return result

        # icol
        else:
            label = self.columns[i]

            values = self._mgr.iget(i)
            result = self._box_col_values(values, i)

            # this is a cached value, mark it so
            result._set_as_cached(label, self)

            return result

    def _get_column_array(self, i: int) -> ArrayLike:
        """
        Get the values of the i'th column (ndarray or ExtensionArray, as stored
        in the Block)
        """
        return self._mgr.iget_values(i)

    def _iter_column_arrays(self) -> Iterator[ArrayLike]:
        """
        Iterate over the arrays of all columns in order.
        This returns the values as stored in the Block (ndarray or ExtensionArray).
        """
        for i in range(len(self.columns)):
            yield self._get_column_array(i)

    def __getitem__(self, key):
        key = lib.item_from_zerodim(key)
        key = com.apply_if_callable(key, self)

        if is_hashable(key):
            # shortcut if the key is in columns
            if self.columns.is_unique and key in self.columns:
                if isinstance(self.columns, MultiIndex):
                    return self._getitem_multilevel(key)
                return self._get_item_cache(key)

        # Do we have a slicer (on rows)?
        indexer = convert_to_index_sliceable(self, key)
        if indexer is not None:
            if isinstance(indexer, np.ndarray):
                indexer = lib.maybe_indices_to_slice(
                    indexer.astype(np.intp, copy=False), len(self)
                )
            # either we have a slice or we have a string that can be converted
            # to a slice for partial-string date indexing
            return self._slice(indexer, axis=0)

        # Do we have a (boolean) DataFrame?
        if isinstance(key, DataFrame):
            return self.where(key)

        # Do we have a (boolean) 1d indexer?
        if com.is_bool_indexer(key):
            return self._getitem_bool_array(key)

        # We are left with two options: a single key, and a collection of keys,
        # We interpret tuples as collections only for non-MultiIndex
        is_single_key = isinstance(key, tuple) or not is_list_like(key)

        if is_single_key:
            if self.columns.nlevels > 1:
                return self._getitem_multilevel(key)
            indexer = self.columns.get_loc(key)
            if is_integer(indexer):
                indexer = [indexer]
        else:
            if is_iterator(key):
                key = list(key)
            indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]

        # take() does not accept boolean indexers
        if getattr(indexer, "dtype", None) == bool:
            indexer = np.where(indexer)[0]

        data = self._take_with_is_copy(indexer, axis=1)

        if is_single_key:
            # What does looking for a single key in a non-unique index return?
            # The behavior is inconsistent. It returns a Series, except when
            # - the key itself is repeated (test on data.shape, #9519), or
            # - we have a MultiIndex on columns (test on self.columns, #21309)
            if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
                # GH#26490 using data[key] can cause RecursionError
                data = data._get_item_cache(key)

        return data

    def _getitem_bool_array(self, key):
        # also raises Exception if object array with NA values
        # warning here just in case -- previously __setitem__ was
        # reindexing but __getitem__ was not; it seems more reasonable to
        # go with the __setitem__ behavior since that is more consistent
        # with all other indexing behavior
        if isinstance(key, Series) and not key.index.equals(self.index):
            warnings.warn(
                "Boolean Series key will be reindexed to match DataFrame index.",
                UserWarning,
                stacklevel=3,
            )
        elif len(key) != len(self.index):
            raise ValueError(
                f"Item wrong length {len(key)} instead of {len(self.index)}."
            )

        # check_bool_indexer will throw exception if Series key cannot
        # be reindexed to match DataFrame rows
        key = check_bool_indexer(self.index, key)
        indexer = key.nonzero()[0]
        return self._take_with_is_copy(indexer, axis=0)

    def _getitem_multilevel(self, key):
        # self.columns is a MultiIndex
        loc = self.columns.get_loc(key)
        if isinstance(loc, (slice, np.ndarray)):
            new_columns = self.columns[loc]
            result_columns = maybe_droplevels(new_columns, key)
            if self._is_mixed_type:
                result = self.reindex(columns=new_columns)
                result.columns = result_columns
            else:
                new_values = self.values[:, loc]
                result = self._constructor(
                    new_values, index=self.index, columns=result_columns
                )
                result = result.__finalize__(self)

            # If there is only one column being returned, and its name is
            # either an empty string, or a tuple with an empty string as its
            # first element, then treat the empty string as a placeholder
            # and return the column as if the user had provided that empty
            # string in the key. If the result is a Series, exclude the
            # implied empty string from its name.
            if len(result.columns) == 1:
                top = result.columns[0]
                if isinstance(top, tuple):
                    top = top[0]
                if top == "":
                    result = result[""]
                    if isinstance(result, Series):
                        result = self._constructor_sliced(
                            result, index=self.index, name=key
                        )

            result._set_is_copy(self)
            return result
        else:
            # loc is neither a slice nor ndarray, so must be an int
            return self._ixs(loc, axis=1)

    def _get_value(self, index, col, takeable: bool = False):
        """
        Quickly retrieve single value at passed column and index.

        Parameters
        ----------
        index : row label
        col : column label
        takeable : bool, default False
            Interpret the index/col as positional indexers.

        Returns
        -------
        scalar
        """
        if takeable:
            series = self._ixs(col, axis=1)
            return series._values[index]

        series = self._get_item_cache(col)
        engine = self.index._engine

        try:
            loc = engine.get_loc(index)
            return series._values[loc]
        except KeyError:
            # GH 20629
            if self.index.nlevels > 1:
                # partial indexing forbidden
                raise

        # we cannot handle direct indexing
        # use positional
        col = self.columns.get_loc(col)
        index = self.index.get_loc(index)
        return self._get_value(index, col, takeable=True)

    def __setitem__(self, key, value):
        key = com.apply_if_callable(key, self)

        # see if we can slice the rows
        indexer = convert_to_index_sliceable(self, key)
        if indexer is not None:
            # either we have a slice or we have a string that can be converted
            # to a slice for partial-string date indexing
            return self._setitem_slice(indexer, value)

        if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
            self._setitem_frame(key, value)
        elif isinstance(key, (Series, np.ndarray, list, Index)):
            self._setitem_array(key, value)
        else:
            # set column
            self._set_item(key, value)

    def _setitem_slice(self, key: slice, value):
        # NB: we can't just use self.loc[key] = value because that
        # operates on labels and we need to operate positional for
        # backwards-compat, xref GH#31469
        self._check_setitem_copy()
        self.iloc[key] = value

    def _setitem_array(self, key, value):
        # also raises Exception if object array with NA values
        if com.is_bool_indexer(key):
            if len(key) != len(self.index):
                raise ValueError(
                    f"Item wrong length {len(key)} instead of {len(self.index)}!"
                )
            key = check_bool_indexer(self.index, key)
            indexer = key.nonzero()[0]
            self._check_setitem_copy()
            if isinstance(value, DataFrame):
                # GH#39931 reindex since iloc does not align
                value = value.reindex(self.index.take(indexer))
            self.iloc[indexer] = value
        else:
            if isinstance(value, DataFrame):
                if len(value.columns) != len(key):
                    raise ValueError("Columns must be same length as key")
                for k1, k2 in zip(key, value.columns):
                    self[k1] = value[k2]
            else:
                self.loc._ensure_listlike_indexer(key, axis=1, value=value)
                indexer = self.loc._get_listlike_indexer(
                    key, axis=1, raise_missing=False
                )[1]
                self._check_setitem_copy()
                self.iloc[:, indexer] = value

    def _setitem_frame(self, key, value):
        # support boolean setting with DataFrame input, e.g.
        # df[df > df2] = 0
        if isinstance(key, np.ndarray):
            if key.shape != self.shape:
                raise ValueError("Array conditional must be same shape as self")
            key = self._constructor(key, **self._construct_axes_dict())

        if key.size and not is_bool_dtype(key.values):
            raise TypeError(
                "Must pass DataFrame or 2-d ndarray with boolean values only"
            )

        self._check_inplace_setting(value)
        self._check_setitem_copy()
        self._where(-key, value, inplace=True)

    def _iset_item(self, loc: int, value):
        self._ensure_valid_index(value)

        # technically _sanitize_column expects a label, not a position,
        # but the behavior is the same as long as we pass broadcast=False
        value = self._sanitize_column(loc, value, broadcast=False)
        NDFrame._iset_item(self, loc, value)

        # check if we are modifying a copy
        # try to set first as we want an invalid
        # value exception to occur first
        if len(self):
            self._check_setitem_copy()

    def _set_item(self, key, value):
        """
        Add series to DataFrame in specified column.

        If series is a numpy-array (not a Series/TimeSeries), it must be the
        same length as the DataFrame's index or an error will be thrown.

        Series/TimeSeries will be conformed to the DataFrame's index to
        ensure homogeneity.
        """
        self._ensure_valid_index(value)
        value = self._sanitize_column(key, value)
        NDFrame._set_item(self, key, value)

        # check if we are modifying a copy
        # try to set first as we want an invalid
        # value exception to occur first
        if len(self):
            self._check_setitem_copy()

    def _set_value(self, index, col, value, takeable: bool = False):
        """
        Put single value at passed column and index.

        Parameters
        ----------
        index : row label
        col : column label
        value : scalar
        takeable : bool, default False
            Interpret the index/col as positional indexers.
        """
        try:
            if takeable is True:
                series = self._ixs(col, axis=1)
                series._set_value(index, value, takeable=True)
                return

            series = self._get_item_cache(col)
            engine = self.index._engine
            loc = engine.get_loc(index)
            validate_numeric_casting(series.dtype, value)

            series._values[loc] = value
            # Note: trying to use series._set_value breaks tests in
            # tests.frame.indexing.test_indexing and tests.indexing.test_partial
        except (KeyError, TypeError):
            # set using a non-recursive method & reset the cache
            if takeable:
                self.iloc[index, col] = value
            else:
                self.loc[index, col] = value
            self._item_cache.pop(col, None)

    def _ensure_valid_index(self, value):
        """
        Ensure that if we don't have an index, that we can create one from the
        passed value.
        """
        # GH5632, make sure that we are a Series convertible
        if not len(self.index) and is_list_like(value) and len(value):
            try:
                value = Series(value)
            except (ValueError, NotImplementedError, TypeError) as err:
                raise ValueError(
                    "Cannot set a frame with no defined index "
                    "and a value that cannot be converted to a Series"
                ) from err

            # GH31368 preserve name of index
            index_copy = value.index.copy()
            if self.index.name is not None:
                index_copy.name = self.index.name

            self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)

    def _box_col_values(self, values, loc: int) -> Series:
        """
        Provide boxed values for a column.
        """
        # Lookup in columns so that if e.g. a str datetime was passed
        # we attach the Timestamp object as the name.
        name = self.columns[loc]
        klass = self._constructor_sliced
        return klass(values, index=self.index, name=name, fastpath=True)

    # ----------------------------------------------------------------------
    # Unsorted

    def query(self, expr, inplace=False, **kwargs):
        """
        Query the columns of a DataFrame with a boolean expression.

        Parameters
        ----------
        expr : str
            The query string to evaluate.

            You can refer to variables
            in the environment by prefixing them with an '@' character like
            ``@a + b``.

            You can refer to column names that are not valid Python variable names
            by surrounding them in backticks. Thus, column names containing spaces
            or punctuations (besides underscores) or starting with digits must be
            surrounded by backticks. (For example, a column named "Area (cm^2)"
            would be referenced as ```Area (cm^2)```). Column names which are
            Python keywords (like "list", "for", "import", etc.) cannot be used.

            For example, if one of your columns is called ``a a`` and you want
            to sum it with ``b``, your query should be ```a a` + b``.

            .. versionadded:: 0.25.0
                Backtick quoting introduced.

            .. versionadded:: 1.0.0
                Expanding functionality of backtick quoting for more than only spaces.

        inplace : bool
            Whether the query should modify the data in place or return
            a modified copy.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by :meth:`DataFrame.query`.

        Returns
        -------
        DataFrame or None
            DataFrame resulting from the provided query expression or
            None if ``inplace=True``.

        See Also
        --------
        eval : Evaluate a string describing operations on
            DataFrame columns.
        DataFrame.eval : Evaluate a string describing operations on
            DataFrame columns.

        Notes
        -----
        The result of the evaluation of this expression is first passed to
        :attr:`DataFrame.loc` and if that fails because of a
        multidimensional key (e.g., a DataFrame) then the result will be passed
        to :meth:`DataFrame.__getitem__`.

        This method uses the top-level :func:`eval` function to
        evaluate the passed query.

        The :meth:`~pandas.DataFrame.query` method uses a slightly
        modified Python syntax by default. For example, the ``&`` and ``|``
        (bitwise) operators have the precedence of their boolean cousins,
        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
        however the semantics are different.

        You can change the semantics of the expression by passing the keyword
        argument ``parser='python'``. This enforces the same semantics as
        evaluation in Python space. Likewise, you can pass ``engine='python'``
        to evaluate an expression using Python itself as a backend. This is not
        recommended as it is inefficient compared to using ``numexpr`` as the
        engine.

        The :attr:`DataFrame.index` and
        :attr:`DataFrame.columns` attributes of the
        :class:`~pandas.DataFrame` instance are placed in the query namespace
        by default, which allows you to treat both the index and columns of the
        frame as a column in the frame.
        The identifier ``index`` is used for the frame index; you can also
        use the name of the index to identify it in a query. Please note that
        Python keywords may not be used as identifiers.

        For further details and examples see the ``query`` documentation in
        :ref:`indexing <indexing.query>`.

        *Backtick quoted variables*

        Backtick quoted variables are parsed as literal Python code and
        are converted internally to a valid Python identifier.
        This can lead to the following problems.

        During parsing a number of disallowed characters inside the backtick
        quoted string are replaced by strings that are allowed as a Python identifier.
        These characters include all operators in Python, the space character, the
        question mark, the exclamation mark, the dollar sign, and the euro sign.
        For other characters that fall outside the ASCII range (U+0001..U+007F)
        and those that are not further specified in PEP 3131,
        the query parser will raise an error.
        This excludes whitespace other than the space character,
        but also the hashtag (as it is used for comments) and the backtick
        itself (the backtick also cannot be escaped).

        In a special case, quotes that make a pair around a backtick can
        confuse the parser.
        For example, ```it's` > `that's``` will raise an error,
        as it forms a quoted string (``'s > `that'``) with a backtick inside.

        See also the Python documentation about lexical analysis
        (https://docs.python.org/3/reference/lexical_analysis.html)
        in combination with the source code in :mod:`pandas.core.computation.parsing`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6),
        ...                    'B': range(10, 0, -2),
        ...                    'C C': range(10, 5, -1)})
        >>> df
           A   B  C C
        0  1  10   10
        1  2   8    9
        2  3   6    8
        3  4   4    7
        4  5   2    6
        >>> df.query('A > B')
           A  B  C C
        4  5  2    6

        The previous expression is equivalent to

        >>> df[df.A > df.B]
           A  B  C C
        4  5  2    6

        For columns with spaces in their name, you can use backtick quoting.

        >>> df.query('B == `C C`')
           A   B  C C
        0  1  10   10

        The previous expression is equivalent to

        >>> df[df.B == df['C C']]
           A   B  C C
        0  1  10   10
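
        The frame's index can be referenced with the ``index`` identifier
        described in the Notes above:

        >>> df.query('index < 2')
           A   B  C C
        0  1  10   10
        1  2   8    9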
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        if not isinstance(expr, str):
            msg = f"expr must be a string to be evaluated, {type(expr)} given"
            raise ValueError(msg)
        kwargs["level"] = kwargs.pop("level", 0) + 1
        kwargs["target"] = None
        res = self.eval(expr, **kwargs)

        try:
            result = self.loc[res]
        except ValueError:
            # when res is multi-dimensional loc raises, but this is sometimes a
            # valid query
            result = self[res]

        if inplace:
            self._update_inplace(result)
        else:
            return result

    def eval(self, expr, inplace=False, **kwargs):
        """
        Evaluate a string describing operations on DataFrame columns.

        Operates on columns only, not specific rows or elements. This allows
        `eval` to run arbitrary code, which can make you vulnerable to code
        injection if you pass user input to this function.

        Parameters
        ----------
        expr : str
            The expression string to evaluate.
        inplace : bool, default False
            If the expression contains an assignment, whether to perform the
            operation inplace and mutate the existing DataFrame. Otherwise,
            a new DataFrame is returned.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by
            :meth:`~pandas.DataFrame.query`.

        Returns
        -------
        ndarray, scalar, pandas object, or None
            The result of the evaluation or None if ``inplace=True``.

        See Also
        --------
        DataFrame.query : Evaluates a boolean expression to query the columns
            of a frame.
        DataFrame.assign : Can evaluate an expression or function to create new
            values for a column.
        eval : Evaluate a Python expression as a string using various
            backends.

        Notes
        -----
        For more details see the API documentation for :func:`~eval`.
        For detailed examples see :ref:`enhancing performance with eval
        <enhancingperf.eval>`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
        >>> df
           A   B
        0  1  10
        1  2   8
        2  3   6
        3  4   4
        4  5   2
        >>> df.eval('A + B')
        0    11
        1    10
        2     9
        3     8
        4     7
        dtype: int64

        Assignment is allowed though by default the original DataFrame is not
        modified.

        >>> df.eval('C = A + B')
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7
        >>> df
           A   B
        0  1  10
        1  2   8
        2  3   6
        3  4   4
        4  5   2

        Use ``inplace=True`` to modify the original DataFrame.

        >>> df.eval('C = A + B', inplace=True)
        >>> df
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7

        Multiple columns can be assigned to using multi-line expressions:

        >>> df.eval(
        ...     '''
        ... C = A + B
        ... D = A - B
        ... '''
        ... )
           A   B   C  D
        0  1  10  11 -9
        1  2   8  10 -6
        2  3   6   9 -3
        3  4   4   8  0
        4  5   2   7  3
        """
|
||
from pandas.core.computation.eval import eval as _eval
|
||
|
||
inplace = validate_bool_kwarg(inplace, "inplace")
|
||
resolvers = kwargs.pop("resolvers", None)
|
||
kwargs["level"] = kwargs.pop("level", 0) + 1
|
||
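        # By default, expose the frame's index names and the (cleaned) column
        # names to the expression engine as resolvers.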
        if resolvers is None:
            index_resolvers = self._get_index_resolvers()
            column_resolvers = self._get_cleaned_column_resolvers()
            resolvers = column_resolvers, index_resolvers
        if "target" not in kwargs:
            kwargs["target"] = self
        kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)

        return _eval(expr, inplace=inplace, **kwargs)

    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
        """
        Return a subset of the DataFrame's columns based on the column dtypes.

        Parameters
        ----------
        include, exclude : scalar or list-like
            A selection of dtypes or strings to be included/excluded. At least
            one of these parameters must be supplied.

        Returns
        -------
        DataFrame
            The subset of the frame including the dtypes in ``include`` and
            excluding the dtypes in ``exclude``.

        Raises
        ------
        ValueError
            * If both of ``include`` and ``exclude`` are empty
            * If ``include`` and ``exclude`` have overlapping elements
            * If any kind of string dtype is passed in.

        See Also
        --------
        DataFrame.dtypes: Return Series with the data type of each column.

        Notes
        -----
        * To select all *numeric* types, use ``np.number`` or ``'number'``
        * To select strings you must use the ``object`` dtype, but note that
          this will return *all* object dtype columns
        * See the `numpy dtype hierarchy
          <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
        * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
          ``'datetime64'``
        * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
          ``'timedelta64'``
        * To select Pandas categorical dtypes, use ``'category'``
        * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
          0.20.0) or ``'datetime64[ns, tz]'``

        Examples
        --------
        >>> df = pd.DataFrame({'a': [1, 2] * 3,
        ...                    'b': [True, False] * 3,
        ...                    'c': [1.0, 2.0] * 3})
        >>> df
           a      b    c
        0  1   True  1.0
        1  2  False  2.0
        2  1   True  1.0
        3  2  False  2.0
        4  1   True  1.0
        5  2  False  2.0

        >>> df.select_dtypes(include='bool')
               b
        0   True
        1  False
        2   True
        3  False
        4   True
        5  False

        >>> df.select_dtypes(include=['float64'])
             c
        0  1.0
        1  2.0
        2  1.0
        3  2.0
        4  1.0
        5  2.0

        >>> df.select_dtypes(exclude=['int64'])
               b    c
        0   True  1.0
        1  False  2.0
        2   True  1.0
        3  False  2.0
        4   True  1.0
        5  False  2.0
        """
        if not is_list_like(include):
            include = (include,) if include is not None else ()
        if not is_list_like(exclude):
            exclude = (exclude,) if exclude is not None else ()

        selection = (frozenset(include), frozenset(exclude))

        if not any(selection):
            raise ValueError("at least one of include or exclude must be nonempty")

        # convert the myriad valid dtypes object to a single representation
        include = frozenset(infer_dtype_from_object(x) for x in include)
        exclude = frozenset(infer_dtype_from_object(x) for x in exclude)
        for dtypes in (include, exclude):
            invalidate_string_dtypes(dtypes)

        # can't both include AND exclude!
        if not include.isdisjoint(exclude):
            raise ValueError(f"include and exclude overlap on {(include & exclude)}")

        # We raise when both include and exclude are empty
        # Hence, we can just shrink the columns we want to keep
        keep_these = np.full(self.shape[1], True)

        def extract_unique_dtypes_from_dtypes_set(
            dtypes_set: FrozenSet[Dtype], unique_dtypes: np.ndarray
        ) -> List[Dtype]:
            extracted_dtypes = [
                unique_dtype
                for unique_dtype in unique_dtypes
                # error: Argument 1 to "tuple" has incompatible type
                # "FrozenSet[Union[ExtensionDtype, str, Any, Type[str],
                # Type[float], Type[int], Type[complex], Type[bool]]]";
                # expected "Iterable[Union[type, Tuple[Any, ...]]]"
                if issubclass(
                    unique_dtype.type, tuple(dtypes_set)  # type: ignore[arg-type]
                )
            ]
            return extracted_dtypes

        unique_dtypes = self.dtypes.unique()

        if include:
            included_dtypes = extract_unique_dtypes_from_dtypes_set(
                include, unique_dtypes
            )
            keep_these &= self.dtypes.isin(included_dtypes)

        if exclude:
            excluded_dtypes = extract_unique_dtypes_from_dtypes_set(
                exclude, unique_dtypes
            )
            keep_these &= ~self.dtypes.isin(excluded_dtypes)

        return self.iloc[:, keep_these.values]

    def insert(self, loc, column, value, allow_duplicates=False) -> None:
        """
        Insert column into DataFrame at specified location.

        Raises a ValueError if `column` is already contained in the DataFrame,
        unless `allow_duplicates` is set to True.

        Parameters
        ----------
        loc : int
            Insertion index. Must satisfy 0 <= loc <= len(columns).
        column : str, number, or hashable object
            Label of the inserted column.
        value : int, Series, or array-like
        allow_duplicates : bool, optional
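
        Examples
        --------
        A minimal usage sketch; the frame and the ``newcol`` label below are
        illustrative only.

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.insert(1, "newcol", [99, 99])
        >>> df
           col1  newcol  col2
        0     1      99     3
        1     2      99     4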
        """
        if allow_duplicates and not self.flags.allows_duplicate_labels:
            raise ValueError(
                "Cannot specify 'allow_duplicates=True' when "
                "'self.flags.allows_duplicate_labels' is False."
            )
        self._ensure_valid_index(value)
        value = self._sanitize_column(column, value, broadcast=False)
        self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates)

    def assign(self, **kwargs) -> DataFrame:
        r"""
        Assign new columns to a DataFrame.

        Returns a new object with all original columns in addition to new ones.
        Existing columns that are re-assigned will be overwritten.

        Parameters
        ----------
        **kwargs : dict of {str: callable or Series}
            The column names are keywords. If the values are
            callable, they are computed on the DataFrame and
            assigned to the new columns. The callable must not
            change the input DataFrame (though pandas doesn't check it).
            If the values are not callable (e.g. a Series, scalar, or array),
            they are simply assigned.

        Returns
        -------
        DataFrame
            A new DataFrame with the new columns in addition to
            all the existing columns.

        Notes
        -----
        Assigning multiple columns within the same ``assign`` is possible.
        Later items in '\*\*kwargs' may refer to newly created or modified
        columns in 'df'; items are computed and assigned into 'df' in order.

        Examples
        --------
        >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
        ...                   index=['Portland', 'Berkeley'])
        >>> df
                  temp_c
        Portland    17.0
        Berkeley    25.0

        Where the value is a callable, evaluated on `df`:

        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        Alternatively, the same behavior can be achieved by directly
        referencing an existing Series or sequence:

        >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        You can create multiple columns within the same assign where one
        of the columns depends on another one defined within the same assign:

        >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
        ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
                  temp_c  temp_f  temp_k
        Portland    17.0    62.6  290.15
        Berkeley    25.0    77.0  298.15
        """
        data = self.copy()

        for k, v in kwargs.items():
            data[k] = com.apply_if_callable(v, data)
        return data

    def _sanitize_column(self, key, value, broadcast=True):
        """
        Ensures new columns (which go into the BlockManager as new blocks) are
        always copied and converted into an array.

        Parameters
        ----------
        key : object
        value : scalar, Series, or array-like
        broadcast : bool, default True
            If ``key`` matches multiple duplicate column names in the
            DataFrame, this parameter indicates whether ``value`` should be
            tiled so that the returned array contains a (duplicated) column for
            each occurrence of the key. If False, ``value`` will not be tiled.

        Returns
        -------
        numpy.ndarray
        """

        def reindexer(value):
            # reindex if necessary

            if value.index.equals(self.index) or not len(self.index):
                value = value._values.copy()
            else:

                # GH 4107
                try:
                    value = value.reindex(self.index)._values
                except ValueError as err:
                    # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
                    if not value.index.is_unique:
                        # duplicate axis
                        raise err

                    # other
                    raise TypeError(
                        "incompatible index of inserted column with frame index"
                    ) from err
            return value

        if isinstance(value, Series):
            value = reindexer(value)

        elif isinstance(value, DataFrame):
            # align right-hand-side columns if self.columns
            # is multi-index and self[key] is a sub-frame
            if isinstance(self.columns, MultiIndex) and key in self.columns:
                loc = self.columns.get_loc(key)
                if isinstance(loc, (slice, Series, np.ndarray, Index)):
                    cols = maybe_droplevels(self.columns[loc], key)
                    if len(cols) and not cols.equals(value.columns):
                        value = value.reindex(cols, axis=1)
            # now align rows
            value = reindexer(value).T

        elif isinstance(value, ExtensionArray):
            # Explicitly copy here, instead of in sanitize_index,
            # as sanitize_index won't copy an EA, even with copy=True
            value = value.copy()
            value = sanitize_index(value, self.index)

        elif isinstance(value, Index) or is_sequence(value):

            # turn me into an ndarray
            value = sanitize_index(value, self.index)
            if not isinstance(value, (np.ndarray, Index)):
                if isinstance(value, list) and len(value) > 0:
                    value = maybe_convert_platform(value)
                else:
                    value = com.asarray_tuplesafe(value)
            elif value.ndim == 2:
                value = value.copy().T
            elif isinstance(value, Index):
                value = value.copy(deep=True)
            else:
                value = value.copy()

            # possibly infer to datetimelike
            if is_object_dtype(value.dtype):
                value = maybe_infer_to_datetimelike(value)

        else:
            # cast ignores pandas dtypes. so save the dtype first
            infer_dtype, fill_value = infer_dtype_from_scalar(value, pandas_dtype=True)

            value = construct_1d_arraylike_from_scalar(
                fill_value, len(self), infer_dtype
            )

            value = maybe_cast_to_datetime(value, infer_dtype)

        # return internal types directly
        if is_extension_array_dtype(value):
            return value

        # broadcast across multiple columns if necessary
        if broadcast and key in self.columns and value.ndim == 1:
            if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
                existing_piece = self[key]
                if isinstance(existing_piece, DataFrame):
                    value = np.tile(value, (len(existing_piece.columns), 1))

        return np.atleast_2d(np.asarray(value))

    @property
    def _series(self):
        return {
            item: Series(
                self._mgr.iget(idx), index=self.index, name=item, fastpath=True
            )
            for idx, item in enumerate(self.columns)
        }

    def lookup(self, row_labels, col_labels) -> np.ndarray:
        """
        Label-based "fancy indexing" function for DataFrame.
        Given equal-length arrays of row and column labels, return an
        array of the values corresponding to each (row, col) pair.

        .. deprecated:: 1.2.0
            DataFrame.lookup is deprecated,
            use DataFrame.melt and DataFrame.loc instead.
            For further details see
            :ref:`Looking up values by index/column labels <indexing.lookup>`.

        Parameters
        ----------
        row_labels : sequence
            The row labels to use for lookup.
        col_labels : sequence
            The column labels to use for lookup.

        Returns
        -------
        numpy.ndarray
            The found values.
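
        Examples
        --------
        A sketch of the recommended substitute using ``loc`` (the frame and
        labels here are illustrative only):

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['x', 'y'])
        >>> rows, cols = ['x', 'y'], ['B', 'A']
        >>> np.array([df.loc[r, c] for r, c in zip(rows, cols)])
        array([3, 2])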
        """
        msg = (
            "The 'lookup' method is deprecated and will be "
            "removed in a future version. "
            "You can use DataFrame.melt and DataFrame.loc "
            "as a substitute."
        )
        warnings.warn(msg, FutureWarning, stacklevel=2)

        n = len(row_labels)
        if n != len(col_labels):
            raise ValueError("Row labels must have same size as column labels")
        if not (self.index.is_unique and self.columns.is_unique):
            # GH#33041
            raise ValueError("DataFrame.lookup requires unique index and columns")

        thresh = 1000
        if not self._is_mixed_type or n > thresh:
            values = self.values
            ridx = self.index.get_indexer(row_labels)
            cidx = self.columns.get_indexer(col_labels)
            if (ridx == -1).any():
                raise KeyError("One or more row labels was not found")
            if (cidx == -1).any():
                raise KeyError("One or more column labels was not found")
            flat_index = ridx * len(self.columns) + cidx
            result = values.flat[flat_index]
        else:
            result = np.empty(n, dtype="O")
            for i, (r, c) in enumerate(zip(row_labels, col_labels)):
                result[i] = self._get_value(r, c)

        if is_object_dtype(result):
            result = lib.maybe_convert_objects(result)

        return result

    # ----------------------------------------------------------------------
    # Reindexing and alignment

    def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
        frame = self

        columns = axes["columns"]
        if columns is not None:
            frame = frame._reindex_columns(
                columns, method, copy, level, fill_value, limit, tolerance
            )

        index = axes["index"]
        if index is not None:
            frame = frame._reindex_index(
                index, method, copy, level, fill_value, limit, tolerance
            )

        return frame

    def _reindex_index(
        self,
        new_index,
        method,
        copy,
        level,
        fill_value=np.nan,
        limit=None,
        tolerance=None,
    ):
        new_index, indexer = self.index.reindex(
            new_index, method=method, level=level, limit=limit, tolerance=tolerance
        )
        return self._reindex_with_indexers(
            {0: [new_index, indexer]},
            copy=copy,
            fill_value=fill_value,
            allow_dups=False,
        )

    def _reindex_columns(
        self,
        new_columns,
        method,
        copy,
        level,
        fill_value=None,
        limit=None,
        tolerance=None,
    ):
        new_columns, indexer = self.columns.reindex(
            new_columns, method=method, level=level, limit=limit, tolerance=tolerance
        )
        return self._reindex_with_indexers(
            {1: [new_columns, indexer]},
            copy=copy,
            fill_value=fill_value,
            allow_dups=False,
        )

    def _reindex_multi(self, axes, copy, fill_value) -> DataFrame:
        """
        We are guaranteed non-Nones in the axes.
        """
        new_index, row_indexer = self.index.reindex(axes["index"])
        new_columns, col_indexer = self.columns.reindex(axes["columns"])

        if row_indexer is not None and col_indexer is not None:
            indexer = row_indexer, col_indexer
            new_values = algorithms.take_2d_multi(
                self.values, indexer, fill_value=fill_value
            )
            return self._constructor(new_values, index=new_index, columns=new_columns)
        else:
            return self._reindex_with_indexers(
                {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
                copy=copy,
                fill_value=fill_value,
            )

    @doc(NDFrame.align, **_shared_doc_kwargs)
    def align(
        self,
        other,
        join="outer",
        axis=None,
        level=None,
        copy=True,
        fill_value=None,
        method=None,
        limit=None,
        fill_axis=0,
        broadcast_axis=None,
    ) -> DataFrame:
        return super().align(
            other,
            join=join,
            axis=axis,
            level=level,
            copy=copy,
            fill_value=fill_value,
            method=method,
            limit=limit,
            fill_axis=fill_axis,
            broadcast_axis=broadcast_axis,
        )

    @Appender(
        """
        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        Change the row labels.

        >>> df.set_axis(['a', 'b', 'c'], axis='index')
           A  B
        a  1  4
        b  2  5
        c  3  6

        Change the column labels.

        >>> df.set_axis(['I', 'II'], axis='columns')
           I  II
        0  1   4
        1  2   5
        2  3   6

        Now, update the labels inplace.

        >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True)
        >>> df
           i  ii
        0  1   4
        1  2   5
        2  3   6
        """
    )
    @Substitution(
        **_shared_doc_kwargs,
        extended_summary_sub=" column or",
        axis_description_sub=", and 1 identifies the columns",
        see_also_sub=" or columns",
    )
    @Appender(NDFrame.set_axis.__doc__)
    def set_axis(self, labels, axis: Axis = 0, inplace: bool = False):
        return super().set_axis(labels, axis=axis, inplace=inplace)

    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.reindex.__doc__)
    @rewrite_axis_style_signature(
        "labels",
        [
            ("method", None),
            ("copy", True),
            ("level", None),
            ("fill_value", np.nan),
            ("limit", None),
            ("tolerance", None),
        ],
    )
    def reindex(self, *args, **kwargs) -> DataFrame:
        axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex")
        kwargs.update(axes)
        # Pop these, since the values are in `kwargs` under different names
        kwargs.pop("axis", None)
        kwargs.pop("labels", None)
        return super().reindex(**kwargs)

    def drop(
        self,
        labels=None,
        axis=0,
        index=None,
        columns=None,
        level=None,
        inplace=False,
        errors="raise",
    ):
        """
        Drop specified labels from rows or columns.

        Remove rows or columns by specifying label names and corresponding
        axis, or by specifying directly index or column names. When using a
        multi-index, labels on different levels can be removed by specifying
        the level.

        Parameters
        ----------
        labels : single label or list-like
            Index or column labels to drop.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Whether to drop labels from the index (0 or 'index') or
            columns (1 or 'columns').
        index : single label or list-like
            Alternative to specifying axis (``labels, axis=0``
            is equivalent to ``index=labels``).
        columns : single label or list-like
            Alternative to specifying axis (``labels, axis=1``
            is equivalent to ``columns=labels``).
        level : int or level name, optional
            For MultiIndex, level from which the labels will be removed.
        inplace : bool, default False
            If False, return a copy. Otherwise, do operation
            inplace and return None.
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and only existing labels are
            dropped.

        Returns
        -------
        DataFrame or None
            DataFrame without the removed index or column labels or
            None if ``inplace=True``.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis.

        See Also
        --------
        DataFrame.loc : Label-location based indexer for selection by label.
        DataFrame.dropna : Return DataFrame with labels on given axis omitted
            where (all or any) data are missing.
        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
            removed, optionally only considering certain columns.
        Series.drop : Return Series with specified index labels removed.

        Examples
        --------
        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
        ...                   columns=['A', 'B', 'C', 'D'])
        >>> df
           A  B   C   D
        0  0  1   2   3
        1  4  5   6   7
        2  8  9  10  11

        Drop columns

        >>> df.drop(['B', 'C'], axis=1)
           A   D
        0  0   3
        1  4   7
        2  8  11

        >>> df.drop(columns=['B', 'C'])
           A   D
        0  0   3
        1  4   7
        2  8  11

        Drop a row by index

        >>> df.drop([0, 1])
           A  B   C   D
        2  8  9  10  11

        Drop columns and/or rows of MultiIndex DataFrame

        >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
        ...                              ['speed', 'weight', 'length']],
        ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
        >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
        ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
        ...                         [250, 150], [1.5, 0.8], [320, 250],
        ...                         [1, 0.8], [0.3, 0.2]])
        >>> df
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
                length  1.5     1.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
                length  1.5     0.8
        falcon  speed   320.0   250.0
                weight  1.0     0.8
                length  0.3     0.2

        >>> df.drop(index='cow', columns='small')
                        big
        lama    speed   45.0
                weight  200.0
                length  1.5
        falcon  speed   320.0
                weight  1.0
                length  0.3

        >>> df.drop(index='length', level=1)
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
        falcon  speed   320.0   250.0
                weight  1.0     0.8
        """
        return super().drop(
            labels=labels,
            axis=axis,
            index=index,
            columns=columns,
            level=level,
            inplace=inplace,
            errors=errors,
        )

    @rewrite_axis_style_signature(
        "mapper",
        [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")],
    )
    def rename(
        self,
        mapper: Optional[Renamer] = None,
        *,
        index: Optional[Renamer] = None,
        columns: Optional[Renamer] = None,
        axis: Optional[Axis] = None,
        copy: bool = True,
        inplace: bool = False,
        level: Optional[Level] = None,
        errors: str = "ignore",
    ) -> Optional[DataFrame]:
        """
        Alter axes labels.

        Function / dict values must be unique (1-to-1). Labels not contained in
        a dict / Series will be left as-is. Extra labels listed don't throw an
        error.

        See the :ref:`user guide <basics.rename>` for more.

        Parameters
        ----------
        mapper : dict-like or function
            Dict-like or function transformations to apply to
            that axis' values. Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index`` and
            ``columns``.
        index : dict-like or function
            Alternative to specifying axis (``mapper, axis=0``
            is equivalent to ``index=mapper``).
        columns : dict-like or function
            Alternative to specifying axis (``mapper, axis=1``
            is equivalent to ``columns=mapper``).
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis to target with ``mapper``. Can be either the axis name
            ('index', 'columns') or number (0, 1). The default is 'index'.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Whether to return a new DataFrame. If True then value of copy is
            ignored.
        level : int or level name, default None
            In case of a MultiIndex, only rename labels in the specified
            level.
        errors : {'ignore', 'raise'}, default 'ignore'
            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
            or `columns` contains labels that are not present in the Index
            being transformed.
            If 'ignore', existing keys will be renamed and extra keys will be
            ignored.

        Returns
        -------
        DataFrame or None
            DataFrame with the renamed axis labels or None if ``inplace=True``.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis and
            "errors='raise'".

        See Also
        --------
        DataFrame.rename_axis : Set the name of the axis.

        Examples
        --------
        ``DataFrame.rename`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Rename columns using a mapping:

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        >>> df.rename(columns={"A": "a", "B": "c"})
           a  c
        0  1  4
        1  2  5
        2  3  6

        Rename index using a mapping:

        >>> df.rename(index={0: "x", 1: "y", 2: "z"})
           A  B
        x  1  4
        y  2  5
        z  3  6

        Cast index labels to a different type:

        >>> df.index
        RangeIndex(start=0, stop=3, step=1)
        >>> df.rename(index=str).index
        Index(['0', '1', '2'], dtype='object')

        >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
        Traceback (most recent call last):
        KeyError: ['C'] not found in axis

        Using axis-style parameters:

        >>> df.rename(str.lower, axis='columns')
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename({1: 2, 2: 4}, axis='index')
           A  B
        0  1  4
        2  2  5
        4  3  6
        """
        return super().rename(
            mapper=mapper,
            index=index,
            columns=columns,
            axis=axis,
            copy=copy,
            inplace=inplace,
            level=level,
            errors=errors,
        )

    @doc(NDFrame.fillna, **_shared_doc_kwargs)
    def fillna(
        self,
        value=None,
        method=None,
        axis=None,
        inplace=False,
        limit=None,
        downcast=None,
    ) -> Optional[DataFrame]:
        return super().fillna(
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )

    def pop(self, item: Label) -> Series:
        """
        Return item and drop from frame. Raise KeyError if not found.

        Parameters
        ----------
        item : label
            Label of column to be popped.

        Returns
        -------
        Series

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan)],
        ...                   columns=('name', 'class', 'max_speed'))
        >>> df
             name   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        >>> df.pop('class')
        0      bird
        1      bird
        2    mammal
        3    mammal
        Name: class, dtype: object

        >>> df
             name  max_speed
        0  falcon      389.0
        1  parrot       24.0
        2    lion       80.5
        3  monkey        NaN
        """
        return super().pop(item=item)

    @doc(NDFrame.replace, **_shared_doc_kwargs)
    def replace(
        self,
        to_replace=None,
        value=None,
        inplace=False,
        limit=None,
        regex=False,
        method="pad",
    ):
        return super().replace(
            to_replace=to_replace,
            value=value,
            inplace=inplace,
            limit=limit,
            regex=regex,
            method=method,
        )

    def _replace_columnwise(
        self, mapping: Dict[Label, Tuple[Any, Any]], inplace: bool, regex
    ):
        """
        Dispatch to Series.replace column-wise.

        Parameters
        ----------
        mapping : dict
            of the form {col: (target, value)}
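            For example (illustrative only), ``{"col1": ("a", "b")}`` replaces
            occurrences of ``"a"`` with ``"b"`` in column ``"col1"``.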
        inplace : bool
        regex : bool or same types as `to_replace` in DataFrame.replace

        Returns
        -------
        DataFrame or None
        """
        # Operate column-wise
        res = self if inplace else self.copy()
        ax = self.columns

        for i in range(len(ax)):
            if ax[i] in mapping:
                ser = self.iloc[:, i]

                target, value = mapping[ax[i]]
                newobj = ser.replace(target, value, regex=regex)

                res.iloc[:, i] = newobj

        if inplace:
            return
        return res.__finalize__(self)

    @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
    def shift(
        self, periods=1, freq=None, axis=0, fill_value=lib.no_default
    ) -> DataFrame:
        axis = self._get_axis_number(axis)

        ncols = len(self.columns)
        if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0:
            # We will infer fill_value to match the closest column

            # Use a column that we know is valid for our column's dtype GH#38434
            label = self.columns[0]

            if periods > 0:
                result = self.iloc[:, :-periods]
                for col in range(min(ncols, abs(periods))):
                    # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
                    # Define filler inside loop so we get a copy
                    filler = self.iloc[:, 0].shift(len(self))
                    result.insert(0, label, filler, allow_duplicates=True)
            else:
                result = self.iloc[:, -periods:]
                for col in range(min(ncols, abs(periods))):
                    # Define filler inside loop so we get a copy
                    filler = self.iloc[:, -1].shift(len(self))
                    result.insert(
                        len(result.columns), label, filler, allow_duplicates=True
                    )
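
            # Each filler column above was inserted under the placeholder
            # ``label``; restore the original labels so every column keeps
            # its own name.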
            result.columns = self.columns.copy()
            return result

        return super().shift(
            periods=periods, freq=freq, axis=axis, fill_value=fill_value
        )

    def set_index(
        self, keys, drop=True, append=False, inplace=False, verify_integrity=False
    ):
        """
        Set the DataFrame index using existing columns.

        Set the DataFrame index (row labels) using one or more existing
        columns or arrays (of the correct length). The index can replace the
        existing index or expand on it.

        Parameters
        ----------
        keys : label or array-like or list of labels/arrays
            This parameter can be either a single column key, a single array of
            the same length as the calling DataFrame, or a list containing an
            arbitrary combination of column keys and arrays. Here, "array"
            encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
            instances of :class:`~collections.abc.Iterator`.
        drop : bool, default True
            Delete columns to be used as the new index.
        append : bool, default False
            Whether to append columns to existing index.
        inplace : bool, default False
            If True, modifies the DataFrame in place (do not create a new object).
        verify_integrity : bool, default False
            Check the new index for duplicates. Otherwise defer the check until
            necessary. Setting to False will improve the performance of this
            method.

        Returns
        -------
        DataFrame or None
            Changed row labels or None if ``inplace=True``.

        See Also
        --------
        DataFrame.reset_index : Opposite of set_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
        ...                    'year': [2012, 2014, 2013, 2014],
        ...                    'sale': [55, 40, 84, 31]})
        >>> df
           month  year  sale
        0      1  2012    55
        1      4  2014    40
        2      7  2013    84
        3     10  2014    31

        Set the index to become the 'month' column:

        >>> df.set_index('month')
               year  sale
        month
        1      2012    55
        4      2014    40
        7      2013    84
        10     2014    31

        Create a MultiIndex using columns 'year' and 'month':

        >>> df.set_index(['year', 'month'])
                    sale
        year  month
        2012  1     55
        2014  4     40
        2013  7     84
        2014  10    31

        Create a MultiIndex using an Index and a column:

        >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
                 month  sale
           year
        1  2012  1      55
        2  2014  4      40
        3  2013  7      84
        4  2014  10     31

        Create a MultiIndex using two Series:

        >>> s = pd.Series([1, 2, 3, 4])
        >>> df.set_index([s, s**2])
              month  year  sale
        1 1       1  2012    55
        2 4       4  2014    40
        3 9       7  2013    84
        4 16     10  2014    31
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        self._check_inplace_and_allows_duplicate_labels(inplace)
        if not isinstance(keys, list):
            keys = [keys]

        err_msg = (
            'The parameter "keys" may be a column key, one-dimensional '
            "array, or a list containing only valid column keys and "
            "one-dimensional arrays."
        )

        missing: List[Label] = []
        for col in keys:
            if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
                # arrays are fine as long as they are one-dimensional
                # iterators get converted to list below
                if getattr(col, "ndim", 1) != 1:
                    raise ValueError(err_msg)
            else:
                # everything else gets tried as a key; see GH 24969
                try:
                    found = col in self.columns
                except TypeError as err:
                    raise TypeError(
                        f"{err_msg}. Received column of type {type(col)}"
                    ) from err
                else:
                    if not found:
                        missing.append(col)

        if missing:
            raise KeyError(f"None of {missing} are in the columns")

        if inplace:
            frame = self
        else:
            frame = self.copy()

        arrays = []
        names: List[Label] = []
        if append:
            names = list(self.index.names)
            if isinstance(self.index, MultiIndex):
                for i in range(self.index.nlevels):
                    arrays.append(self.index._get_level_values(i))
            else:
                arrays.append(self.index)

        to_remove: List[Label] = []
        for col in keys:
            if isinstance(col, MultiIndex):
                for n in range(col.nlevels):
                    arrays.append(col._get_level_values(n))
                names.extend(col.names)
            elif isinstance(col, (Index, Series)):
                # if Index then not MultiIndex (treated above)
                arrays.append(col)
                names.append(col.name)
            elif isinstance(col, (list, np.ndarray)):
                arrays.append(col)
                names.append(None)
            elif isinstance(col, abc.Iterator):
                arrays.append(list(col))
                names.append(None)
            # from here, col can only be a column label
            else:
                arrays.append(frame[col]._values)
                names.append(col)
                if drop:
                    to_remove.append(col)

            if len(arrays[-1]) != len(self):
                # check newest element against length of calling frame, since
                # ensure_index_from_sequences would not raise for append=False.
                raise ValueError(
                    f"Length mismatch: Expected {len(self)} rows, "
                    f"received array of length {len(arrays[-1])}"
                )

        index = ensure_index_from_sequences(arrays, names)

        if verify_integrity and not index.is_unique:
            duplicates = index[index.duplicated()].unique()
            raise ValueError(f"Index has duplicate keys: {duplicates}")

        # use set to handle duplicate column names gracefully in case of drop
        for c in set(to_remove):
            del frame[c]

        # clear up memory usage
        index._cleanup()

        frame.index = index

        if not inplace:
            return frame

    @overload
    # https://github.com/python/mypy/issues/6580
    # Overloaded function signatures 1 and 2 overlap with incompatible return types
    def reset_index(  # type: ignore[misc]
        self,
        level: Optional[Union[Hashable, Sequence[Hashable]]] = ...,
        drop: bool = ...,
        inplace: Literal[False] = ...,
        col_level: Hashable = ...,
        col_fill: Label = ...,
    ) -> DataFrame:
        ...

    @overload
    def reset_index(
        self,
        level: Optional[Union[Hashable, Sequence[Hashable]]] = ...,
        drop: bool = ...,
        inplace: Literal[True] = ...,
        col_level: Hashable = ...,
        col_fill: Label = ...,
    ) -> None:
        ...

    def reset_index(
        self,
        level: Optional[Union[Hashable, Sequence[Hashable]]] = None,
        drop: bool = False,
        inplace: bool = False,
        col_level: Hashable = 0,
        col_fill: Label = "",
    ) -> Optional[DataFrame]:
        """
        Reset the index, or a level of it.

        Reset the index of the DataFrame, and use the default one instead.
        If the DataFrame has a MultiIndex, this method can remove one or more
        levels.

        Parameters
        ----------
        level : int, str, tuple, or list, default None
            Only remove the given levels from the index. Removes all levels by
            default.
        drop : bool, default False
            Do not try to insert index into dataframe columns. This resets
            the index to the default integer index.
        inplace : bool, default False
            Modify the DataFrame in place (do not create a new object).
        col_level : int or str, default 0
            If the columns have multiple levels, determines which level the
            labels are inserted into. By default it is inserted into the first
            level.
        col_fill : object, default ''
            If the columns have multiple levels, determines how the other
            levels are named. If None then the index name is repeated.

        Returns
        -------
        DataFrame or None
            DataFrame with the new index or None if ``inplace=True``.

        See Also
        --------
        DataFrame.set_index : Opposite of reset_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 389.0),
        ...                    ('bird', 24.0),
        ...                    ('mammal', 80.5),
        ...                    ('mammal', np.nan)],
        ...                   index=['falcon', 'parrot', 'lion', 'monkey'],
        ...                   columns=('class', 'max_speed'))
        >>> df
                 class  max_speed
        falcon    bird      389.0
        parrot    bird       24.0
        lion    mammal       80.5
        monkey  mammal        NaN

        When we reset the index, the old index is added as a column, and a
        new sequential index is used:

        >>> df.reset_index()
            index   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        We can use the `drop` parameter to avoid the old index being added as
        a column:

        >>> df.reset_index(drop=True)
            class  max_speed
        0    bird      389.0
        1    bird       24.0
        2  mammal       80.5
        3  mammal        NaN

        You can also use `reset_index` with `MultiIndex`.

        >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
        ...                                    ('bird', 'parrot'),
        ...                                    ('mammal', 'lion'),
        ...                                    ('mammal', 'monkey')],
        ...                                   names=['class', 'name'])
        >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
        ...                                      ('species', 'type')])
        >>> df = pd.DataFrame([(389.0, 'fly'),
        ...                    ( 24.0, 'fly'),
        ...                    ( 80.5, 'run'),
        ...                    (np.nan, 'jump')],
        ...                   index=index,
        ...                   columns=columns)
        >>> df
                       speed species
                         max    type
        class  name
        bird   falcon  389.0     fly
               parrot   24.0     fly
        mammal lion     80.5     run
               monkey    NaN    jump

        If the index has multiple levels, we can reset a subset of them:

        >>> df.reset_index(level='class')
                 class  speed species
                          max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        If we are not dropping the index, by default, it is placed in the top
        level. We can place it in another level:

        >>> df.reset_index(level='class', col_level=1)
                        speed species
                 class    max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        When the index is inserted under another level, we can specify under
        which one with the parameter `col_fill`:

        >>> df.reset_index(level='class', col_level=1, col_fill='species')
                      species  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump

        If we specify a nonexistent level for `col_fill`, it is created:

        >>> df.reset_index(level='class', col_level=1, col_fill='genus')
                        genus  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        self._check_inplace_and_allows_duplicate_labels(inplace)
        if inplace:
            new_obj = self
        else:
            new_obj = self.copy()

        new_index = ibase.default_index(len(new_obj))
        if level is not None:
            if not isinstance(level, (tuple, list)):
                level = [level]
            level = [self.index._get_level_number(lev) for lev in level]
            if len(level) < self.index.nlevels:
                new_index = self.index.droplevel(level)

        if not drop:
            to_insert: Iterable[Tuple[Any, Optional[Any]]]
            if isinstance(self.index, MultiIndex):
                names = [
                    (n if n is not None else f"level_{i}")
                    for i, n in enumerate(self.index.names)
                ]
                to_insert = zip(self.index.levels, self.index.codes)
            else:
                default = "index" if "index" not in self else "level_0"
                names = [default] if self.index.name is None else [self.index.name]
                to_insert = ((self.index, None),)

            multi_col = isinstance(self.columns, MultiIndex)
            for i, (lev, lab) in reversed(list(enumerate(to_insert))):
                if not (level is None or i in level):
                    continue
                name = names[i]
                if multi_col:
                    col_name = list(name) if isinstance(name, tuple) else [name]
                    if col_fill is None:
                        if len(col_name) not in (1, self.columns.nlevels):
                            raise ValueError(
                                "col_fill=None is incompatible "
                                f"with incomplete column name {name}"
                            )
                        col_fill = col_name[0]

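                    # Pad the inserted column's name with ``col_fill`` so the
                    # resulting tuple spans every level of the MultiIndex
                    # columns, placing the label at ``col_level``.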
                    lev_num = self.columns._get_level_number(col_level)
                    name_lst = [col_fill] * lev_num + col_name
                    missing = self.columns.nlevels - len(name_lst)
                    name_lst += [col_fill] * missing
                    name = tuple(name_lst)
                # to ndarray and maybe infer different dtype
                level_values = maybe_casted_values(lev, lab)
                new_obj.insert(0, name, level_values)

        new_obj.index = new_index
        if not inplace:
            return new_obj

        return None

    # ----------------------------------------------------------------------
    # Reindex-based selection methods

    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
    def isna(self) -> DataFrame:
        result = self._constructor(self._mgr.isna(func=isna))
        return result.__finalize__(self, method="isna")

    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
    def isnull(self) -> DataFrame:
        return self.isna()

    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
    def notna(self) -> DataFrame:
        return ~self.isna()

    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
    def notnull(self) -> DataFrame:
        return ~self.isna()

    def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
        """
        Remove missing values.

        See the :ref:`User Guide <missing_data>` for more on which values are
        considered missing, and how to work with missing data.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine if rows or columns which contain missing values are
            removed.

            * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing values.

            .. versionchanged:: 1.0.0

               Pass tuple or list to drop on multiple axes.
               Only a single axis is allowed.

        how : {'any', 'all'}, default 'any'
            Determine if row or column is removed from DataFrame, when we have
            at least one NA or all NA.

            * 'any' : If any NA values are present, drop that row or column.
            * 'all' : If all values are NA, drop that row or column.

        thresh : int, optional
            Require that many non-NA values.
        subset : array-like, optional
            Labels along other axis to consider, e.g. if you are dropping rows
            these would be a list of columns to include.
        inplace : bool, default False
            If True, do operation inplace and return None.

        Returns
        -------
        DataFrame or None
            DataFrame with NA entries dropped from it or None if ``inplace=True``.

        See Also
        --------
        DataFrame.isna: Indicate missing values.
        DataFrame.notna : Indicate existing (non-missing) values.
        DataFrame.fillna : Replace missing values.
        Series.dropna : Drop missing values.
        Index.dropna : Drop missing indices.

        Examples
        --------
        >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
        ...                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
        ...                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
        ...                             pd.NaT]})
        >>> df
               name        toy       born
        0    Alfred        NaN        NaT
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Drop the rows where at least one element is missing.

        >>> df.dropna()
             name        toy       born
        1  Batman  Batmobile 1940-04-25

        Drop the columns where at least one element is missing.

        >>> df.dropna(axis='columns')
               name
        0    Alfred
        1    Batman
        2  Catwoman

        Drop the rows where all elements are missing.

        >>> df.dropna(how='all')
               name        toy       born
        0    Alfred        NaN        NaT
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Keep only the rows with at least 2 non-NA values.

        >>> df.dropna(thresh=2)
               name        toy       born
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Define in which columns to look for missing values.

        >>> df.dropna(subset=['name', 'toy'])
               name        toy       born
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Keep the DataFrame with valid entries in the same variable.

        >>> df.dropna(inplace=True)
        >>> df
             name        toy       born
        1  Batman  Batmobile 1940-04-25
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        if isinstance(axis, (tuple, list)):
            # GH20987
            raise TypeError("supplying multiple axes to axis is no longer supported.")

        axis = self._get_axis_number(axis)
        agg_axis = 1 - axis

        agg_obj = self
        if subset is not None:
            ax = self._get_axis(agg_axis)
            indices = ax.get_indexer_for(subset)
            check = indices == -1
            if check.any():
                raise KeyError(list(np.compress(check, subset)))
            agg_obj = self.take(indices, axis=agg_axis)

        count = agg_obj.count(axis=agg_axis)

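        # ``count`` holds the number of non-NA entries along the aggregation
        # axis; derive a boolean mask of the labels to keep from it.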
        if thresh is not None:
            mask = count >= thresh
        elif how == "any":
            mask = count == len(agg_obj._get_axis(agg_axis))
        elif how == "all":
            mask = count > 0
        else:
            if how is not None:
                raise ValueError(f"invalid how option: {how}")
            else:
                raise TypeError("must specify how or thresh")

        result = self.loc(axis=axis)[mask]

        if inplace:
            self._update_inplace(result)
        else:
            return result

    def drop_duplicates(
        self,
        subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
        keep: Union[str, bool] = "first",
        inplace: bool = False,
        ignore_index: bool = False,
    ) -> Optional[DataFrame]:
        """
        Return DataFrame with duplicate rows removed.

        Considering certain columns is optional. Indexes, including time
        indexes, are ignored.

        Parameters
        ----------
        subset : column label or sequence of labels, optional
            Only consider certain columns for identifying duplicates, by
            default use all of the columns.
        keep : {'first', 'last', False}, default 'first'
            Determines which duplicates (if any) to keep.
            - ``first`` : Drop duplicates except for the first occurrence.
            - ``last`` : Drop duplicates except for the last occurrence.
            - False : Drop all duplicates.
        inplace : bool, default False
            Whether to drop duplicates in place or to return a copy.
        ignore_index : bool, default False
            If True, the resulting axis will be labeled 0, 1, …, n - 1.

            .. versionadded:: 1.0.0

        Returns
        -------
        DataFrame or None
            DataFrame with duplicates removed or None if ``inplace=True``.

        See Also
        --------
        DataFrame.value_counts: Count unique combinations of columns.

        Examples
        --------
        Consider dataset containing ramen rating.

        >>> df = pd.DataFrame({
        ...     'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
        ...     'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
        ...     'rating': [4, 4, 3.5, 15, 5]
        ... })
        >>> df
             brand style  rating
        0  Yum Yum   cup     4.0
        1  Yum Yum   cup     4.0
        2  Indomie   cup     3.5
        3  Indomie  pack    15.0
        4  Indomie  pack     5.0

        By default, it removes duplicate rows based on all columns.

        >>> df.drop_duplicates()
             brand style  rating
        0  Yum Yum   cup     4.0
        2  Indomie   cup     3.5
        3  Indomie  pack    15.0
        4  Indomie  pack     5.0

        To remove duplicates on specific column(s), use ``subset``.

        >>> df.drop_duplicates(subset=['brand'])
             brand style  rating
        0  Yum Yum   cup     4.0
        2  Indomie   cup     3.5

        To remove duplicates and keep last occurrences, use ``keep``.

        >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
             brand style  rating
        1  Yum Yum   cup     4.0
        2  Indomie   cup     3.5
        4  Indomie  pack     5.0
        """
        if self.empty:
            return self.copy()

        inplace = validate_bool_kwarg(inplace, "inplace")
        ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
        duplicated = self.duplicated(subset, keep=keep)

        result = self[~duplicated]
        if ignore_index:
            result.index = ibase.default_index(len(result))

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result

    def duplicated(
        self,
        subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
        keep: Union[str, bool] = "first",
    ) -> Series:
        """
        Return boolean Series denoting duplicate rows.

        Considering certain columns is optional.

        Parameters
        ----------
        subset : column label or sequence of labels, optional
            Only consider certain columns for identifying duplicates, by
            default use all of the columns.
        keep : {'first', 'last', False}, default 'first'
            Determines which duplicates (if any) to mark.

            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
            - False : Mark all duplicates as ``True``.

        Returns
        -------
        Series
            Boolean series indicating duplicated rows.

        See Also
        --------
        Index.duplicated : Equivalent method on index.
        Series.duplicated : Equivalent method on Series.
        Series.drop_duplicates : Remove duplicate values from Series.
        DataFrame.drop_duplicates : Remove duplicate values from DataFrame.

        Examples
        --------
        Consider dataset containing ramen rating.

        >>> df = pd.DataFrame({
        ...     'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
        ...     'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
        ...     'rating': [4, 4, 3.5, 15, 5]
        ... })
        >>> df
             brand style  rating
        0  Yum Yum   cup     4.0
        1  Yum Yum   cup     4.0
        2  Indomie   cup     3.5
        3  Indomie  pack    15.0
        4  Indomie  pack     5.0

        By default, for each set of duplicated values, the first occurrence
        is set to False and all others to True.

        >>> df.duplicated()
        0    False
        1     True
        2    False
        3    False
        4    False
        dtype: bool

        By using 'last', the last occurrence of each set of duplicated values
        is set to False and all others to True.

        >>> df.duplicated(keep='last')
        0     True
        1    False
        2    False
        3    False
        4    False
        dtype: bool

        By setting ``keep`` to False, all duplicates are marked True.

        >>> df.duplicated(keep=False)
        0     True
        1     True
        2    False
        3    False
        4    False
        dtype: bool

        To find duplicates on specific column(s), use ``subset``.

        >>> df.duplicated(subset=['brand'])
        0    False
        1     True
        2    False
        3     True
        4     True
        dtype: bool
        """
        from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64

        if self.empty:
            return self._constructor_sliced(dtype=bool)

        def f(vals):
            labels, shape = algorithms.factorize(
                vals, size_hint=min(len(self), SIZE_HINT_LIMIT)
            )
            return labels.astype("i8", copy=False), len(shape)

        if subset is None:
            subset = self.columns
        elif (
            not np.iterable(subset)
            or isinstance(subset, str)
            or isinstance(subset, tuple)
            and subset in self.columns
        ):
            subset = (subset,)

        # needed for mypy since can't narrow types using np.iterable
        subset = cast(Iterable, subset)

        # Verify all columns in subset exist in the queried dataframe
        # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
        # key that doesn't exist.
        diff = Index(subset).difference(self.columns)
        if not diff.empty:
            raise KeyError(diff)

        vals = (col.values for name, col in self.items() if name in subset)
        labels, shape = map(list, zip(*map(f, vals)))

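        # Combine the per-column factorized codes into a single integer id per
        # row; rows sharing an id agree on every subset column and are
        # therefore duplicates of one another.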
        ids = get_group_index(labels, shape, sort=False, xnull=False)
        result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index)
        return result.__finalize__(self, method="duplicated")

    # ----------------------------------------------------------------------
    # Sorting
    # TODO: Just move the sort_values doc here.
    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.sort_values.__doc__)
    # error: Signature of "sort_values" incompatible with supertype "NDFrame"
    def sort_values(  # type: ignore[override]
        self,
        by,
        axis=0,
        ascending=True,
        inplace=False,
        kind="quicksort",
        na_position="last",
        ignore_index=False,
        key: ValueKeyFunc = None,
    ):
        inplace = validate_bool_kwarg(inplace, "inplace")
        axis = self._get_axis_number(axis)

        if not isinstance(by, list):
            by = [by]
        if is_sequence(ascending) and len(by) != len(ascending):
            raise ValueError(
                f"Length of ascending ({len(ascending)}) != length of by ({len(by)})"
            )
        if len(by) > 1:

            keys = [self._get_label_or_level_values(x, axis=axis) for x in by]

            # need to rewrap columns in Series to apply key function
            if key is not None:
                keys = [Series(k, name=name) for (k, name) in zip(keys, by)]

            indexer = lexsort_indexer(
                keys, orders=ascending, na_position=na_position, key=key
            )
            indexer = ensure_platform_int(indexer)
        else:

            by = by[0]
            k = self._get_label_or_level_values(by, axis=axis)

            # need to rewrap column in Series to apply key function
            if key is not None:
                k = Series(k, name=by)

            if isinstance(ascending, (tuple, list)):
                ascending = ascending[0]

            indexer = nargsort(
                k, kind=kind, ascending=ascending, na_position=na_position, key=key
            )

        new_data = self._mgr.take(
            indexer, axis=self._get_block_manager_axis(axis), verify=False
        )

        if ignore_index:
            new_data.axes[1] = ibase.default_index(len(indexer))

        result = self._constructor(new_data)
        if inplace:
            return self._update_inplace(result)
        else:
            return result.__finalize__(self, method="sort_values")

    def sort_index(
        self,
        axis=0,
        level=None,
        ascending: Union[Union[bool, int], Sequence[Union[bool, int]]] = True,
        inplace: bool = False,
        kind: str = "quicksort",
        na_position: str = "last",
        sort_remaining: bool = True,
        ignore_index: bool = False,
        key: IndexKeyFunc = None,
    ):
        """
        Sort object by labels (along an axis).

        Returns a new DataFrame sorted by label if `inplace` argument is
        ``False``, otherwise updates the original DataFrame and returns None.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis along which to sort. The value 0 identifies the rows,
            and 1 identifies the columns.
        level : int or level name or list of ints or list of level names
            If not None, sort on values in specified index level(s).
        ascending : bool or list-like of bools, default True
            Sort ascending vs. descending. When the index is a MultiIndex the
            sort direction can be controlled for each level individually.
        inplace : bool, default False
            If True, perform operation in-place.
        kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
            Choice of sorting algorithm. See also :func:`numpy.sort` for more
            information. `mergesort` is the only stable algorithm. For
            DataFrames, this option is only applied when sorting on a single
            column or label.
        na_position : {'first', 'last'}, default 'last'
            Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
            Not implemented for MultiIndex.
        sort_remaining : bool, default True
            If True and sorting by level and index is multilevel, sort by other
            levels too (in order) after sorting by specified level.
        ignore_index : bool, default False
            If True, the resulting axis will be labeled 0, 1, …, n - 1.

            .. versionadded:: 1.0.0

        key : callable, optional
            If not None, apply the key function to the index values
            before sorting. This is similar to the `key` argument in the
            builtin :meth:`sorted` function, with the notable difference that
            this `key` function should be *vectorized*. It should expect an
            ``Index`` and return an ``Index`` of the same shape. For MultiIndex
            inputs, the key is applied *per level*.

            .. versionadded:: 1.1.0

        Returns
        -------
        DataFrame or None
            The original DataFrame sorted by the labels or None if ``inplace=True``.

        See Also
        --------
        Series.sort_index : Sort Series by the index.
        DataFrame.sort_values : Sort DataFrame by the value.
        Series.sort_values : Sort Series by the value.

        Examples
        --------
        >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
        ...                   columns=['A'])
        >>> df.sort_index()
             A
        1    4
        29   2
        100  1
        150  5
        234  3

        By default, it sorts in ascending order; to sort in descending order,
        use ``ascending=False``.

        >>> df.sort_index(ascending=False)
             A
        234  3
        150  5
        100  1
        29   2
        1    4

        A key function can be specified which is applied to the index before
        sorting. For a ``MultiIndex`` this is applied to each level separately.

        >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
        >>> df.sort_index(key=lambda x: x.str.lower())
           a
        A  1
        b  2
        C  3
        d  4
        """
        return super().sort_index(
            axis,
            level,
            ascending,
            inplace,
            kind,
            na_position,
            sort_remaining,
            ignore_index,
            key,
        )
|
||
|
||
    def value_counts(
        self,
        subset: Optional[Sequence[Label]] = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
    ):
        """
        Return a Series containing counts of unique rows in the DataFrame.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.

        Returns
        -------
        Series

        See Also
        --------
        Series.value_counts : Equivalent method on Series.

        Notes
        -----
        The returned Series will have a MultiIndex with one level per input
        column. By default, rows that contain any NA values are omitted from
        the result. By default, the resulting Series will be in descending
        order so that the first element is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
        ...                    'num_wings': [2, 0, 0, 0]},
        ...                   index=['falcon', 'dog', 'cat', 'ant'])
        >>> df
                num_legs  num_wings
        falcon         2          2
        dog            4          0
        cat            4          0
        ant            6          0

        >>> df.value_counts()
        num_legs  num_wings
        4         0            2
        2         2            1
        6         0            1
        dtype: int64

        >>> df.value_counts(sort=False)
        num_legs  num_wings
        2         2            1
        4         0            2
        6         0            1
        dtype: int64

        >>> df.value_counts(ascending=True)
        num_legs  num_wings
        2         2            1
        6         0            1
        4         0            2
        dtype: int64

        >>> df.value_counts(normalize=True)
        num_legs  num_wings
        4         0            0.50
        2         2            0.25
        6         0            0.25
        dtype: float64
        """
        if subset is None:
            subset = self.columns.tolist()

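        # groupby drops rows containing NA in any of the key columns by
        # default, which is why NA rows are omitted from the result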
        counts = self.groupby(subset).grouper.size()

        if sort:
            counts = counts.sort_values(ascending=ascending)
        if normalize:
            counts /= counts.sum()

        # Force MultiIndex for single column
        if len(subset) == 1:
            counts.index = MultiIndex.from_arrays(
                [counts.index], names=[counts.index.name]
            )

        return counts

    def nlargest(self, n, columns, keep="first") -> DataFrame:
        """
        Return the first `n` rows ordered by `columns` in descending order.

        Return the first `n` rows with the largest values in `columns`, in
        descending order. The columns that are not specified are returned as
        well, but not used for ordering.

        This method is equivalent to
        ``df.sort_values(columns, ascending=False).head(n)``, but more
        performant.

        Parameters
        ----------
        n : int
            Number of rows to return.
        columns : label or list of labels
            Column label(s) to order by.
        keep : {'first', 'last', 'all'}, default 'first'
            Where there are duplicate values:

            - `first` : prioritize the first occurrence(s)
            - `last` : prioritize the last occurrence(s)
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.

            .. versionadded:: 0.24.0

        Returns
        -------
        DataFrame
            The first `n` rows ordered by the given columns in descending
            order.

        See Also
        --------
        DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
            ascending order.
        DataFrame.sort_values : Sort DataFrame by the values.
        DataFrame.head : Return the first `n` rows without re-ordering.

        Notes
        -----
        This function cannot be used with all column types. For example, when
        specifying columns with `object` or `category` dtypes, ``TypeError`` is
        raised.

        Examples
        --------
        >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
        ...                                   434000, 434000, 337000, 11300,
        ...                                   11300, 11300],
        ...                    'GDP': [1937894, 2583560, 12011, 4520, 12128,
        ...                            17036, 182, 38, 311],
        ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
        ...                                "IS", "NR", "TV", "AI"]},
        ...                   index=["Italy", "France", "Malta",
        ...                          "Maldives", "Brunei", "Iceland",
        ...                          "Nauru", "Tuvalu", "Anguilla"])
        >>> df
                  population      GDP alpha-2
        Italy       59000000  1937894      IT
        France      65000000  2583560      FR
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN
        Iceland       337000    17036      IS
        Nauru          11300      182      NR
        Tuvalu         11300       38      TV
        Anguilla       11300      311      AI

        In the following example, we will use ``nlargest`` to select the three
        rows having the largest values in column "population".

        >>> df.nlargest(3, 'population')
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Malta       434000    12011      MT

        When using ``keep='last'``, ties are resolved in reverse order:

        >>> df.nlargest(3, 'population', keep='last')
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Brunei      434000    12128      BN

        When using ``keep='all'``, all duplicate items are maintained:

        >>> df.nlargest(3, 'population', keep='all')
                  population      GDP alpha-2
        France      65000000  2583560      FR
        Italy       59000000  1937894      IT
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN

        To order by the largest values in column "population" and then "GDP",
        we can specify multiple columns like in the next example.

        >>> df.nlargest(3, ['population', 'GDP'])
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Brunei      434000    12128      BN
        """
        return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()

    def nsmallest(self, n, columns, keep="first") -> DataFrame:
        """
        Return the first `n` rows ordered by `columns` in ascending order.

        Return the first `n` rows with the smallest values in `columns`, in
        ascending order. The columns that are not specified are returned as
        well, but not used for ordering.

        This method is equivalent to
        ``df.sort_values(columns, ascending=True).head(n)``, but more
        performant.

        Parameters
        ----------
        n : int
            Number of items to retrieve.
        columns : list or str
            Column name or names to order by.
        keep : {'first', 'last', 'all'}, default 'first'
            Where there are duplicate values:

            - ``first`` : take the first occurrence.
            - ``last`` : take the last occurrence.
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.

            .. versionadded:: 0.24.0

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
            descending order.
        DataFrame.sort_values : Sort DataFrame by the values.
        DataFrame.head : Return the first `n` rows without re-ordering.

        Examples
        --------
        >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
        ...                                   434000, 434000, 337000, 337000,
        ...                                   11300, 11300],
        ...                    'GDP': [1937894, 2583560, 12011, 4520, 12128,
        ...                            17036, 182, 38, 311],
        ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
        ...                                "IS", "NR", "TV", "AI"]},
        ...                   index=["Italy", "France", "Malta",
        ...                          "Maldives", "Brunei", "Iceland",
        ...                          "Nauru", "Tuvalu", "Anguilla"])
        >>> df
                  population      GDP alpha-2
        Italy       59000000  1937894      IT
        France      65000000  2583560      FR
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN
        Iceland       337000    17036      IS
        Nauru         337000      182      NR
        Tuvalu         11300       38      TV
        Anguilla       11300      311      AI

        In the following example, we will use ``nsmallest`` to select the
        three rows having the smallest values in column "population".

        >>> df.nsmallest(3, 'population')
                  population    GDP alpha-2
        Tuvalu         11300     38      TV
        Anguilla       11300    311      AI
        Iceland       337000  17036      IS

        When using ``keep='last'``, ties are resolved in reverse order:

        >>> df.nsmallest(3, 'population', keep='last')
                  population    GDP alpha-2
        Anguilla       11300    311      AI
        Tuvalu         11300     38      TV
        Nauru         337000    182      NR

        When using ``keep='all'``, all duplicate items are maintained:

        >>> df.nsmallest(3, 'population', keep='all')
                  population    GDP alpha-2
        Tuvalu         11300     38      TV
        Anguilla       11300    311      AI
        Iceland       337000  17036      IS
        Nauru         337000    182      NR

        To order by the smallest values in column "population" and then "GDP", we can
        specify multiple columns like in the next example.

        >>> df.nsmallest(3, ['population', 'GDP'])
                  population    GDP alpha-2
        Tuvalu         11300     38      TV
        Anguilla       11300    311      AI
        Nauru         337000    182      NR
        """
        return algorithms.SelectNFrame(
            self, n=n, keep=keep, columns=columns
        ).nsmallest()

    def swaplevel(self, i=-2, j=-1, axis=0) -> DataFrame:
        """
        Swap levels i and j in a MultiIndex on a particular axis.

        Parameters
        ----------
        i, j : int or str
            Levels of the indices to be swapped. Can pass level name as string.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to swap levels on. 0 or 'index' for row-wise, 1 or
            'columns' for column-wise.

        Returns
        -------
        DataFrame
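
        Examples
        --------
        A small illustrative example; swapping levels reorders the index
        levels but leaves the rows themselves unchanged.

        >>> mi = pd.MultiIndex.from_arrays(
        ...     [["a", "a", "b"], [1, 2, 1]], names=["outer", "inner"]
        ... )
        >>> df = pd.DataFrame({"val": [10, 20, 30]}, index=mi)
        >>> df.swaplevel("outer", "inner").index.names
        FrozenList(['inner', 'outer'])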
"""
|
||
result = self.copy()
|
||
|
||
axis = self._get_axis_number(axis)
|
||
|
||
if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover
|
||
raise TypeError("Can only swap levels on a hierarchical axis.")
|
||
|
||
if axis == 0:
|
||
assert isinstance(result.index, MultiIndex)
|
||
result.index = result.index.swaplevel(i, j)
|
||
else:
|
||
assert isinstance(result.columns, MultiIndex)
|
||
result.columns = result.columns.swaplevel(i, j)
|
||
return result
|
||
|
||
    def reorder_levels(self, order, axis=0) -> DataFrame:
        """
        Rearrange index levels using input order. May not drop or duplicate levels.

        Parameters
        ----------
        order : list of int or list of str
            List representing new level order. Reference level by number
            (position) or by key (label).
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Where to reorder levels.

        Returns
        -------
        DataFrame
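
        Examples
        --------
        A small illustrative example using a two-level index:

        >>> mi = pd.MultiIndex.from_arrays(
        ...     [["a", "a", "b"], [1, 2, 1]], names=["outer", "inner"]
        ... )
        >>> df = pd.DataFrame({"val": [10, 20, 30]}, index=mi)
        >>> df.reorder_levels(["inner", "outer"]).index.names
        FrozenList(['inner', 'outer'])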
"""
|
||
axis = self._get_axis_number(axis)
|
||
if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
|
||
raise TypeError("Can only reorder levels on a hierarchical axis.")
|
||
|
||
result = self.copy()
|
||
|
||
if axis == 0:
|
||
assert isinstance(result.index, MultiIndex)
|
||
result.index = result.index.reorder_levels(order)
|
||
else:
|
||
assert isinstance(result.columns, MultiIndex)
|
||
result.columns = result.columns.reorder_levels(order)
|
||
return result
|
||
|
||
    # ----------------------------------------------------------------------
    # Arithmetic Methods

    def _cmp_method(self, other, op):
        axis = 1  # only relevant for Series other case

        self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)

        # See GH#4537 for discussion of scalar op behavior
        new_data = self._dispatch_frame_op(other, op, axis=axis)
        return self._construct_result(new_data)

    def _arith_method(self, other, op):
        if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None):
            return ops.frame_arith_method_with_reindex(self, other, op)

        axis = 1  # only relevant for Series other case

        self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)

        new_data = self._dispatch_frame_op(other, op, axis=axis)
        return self._construct_result(new_data)

    _logical_method = _arith_method

    def _dispatch_frame_op(self, right, func, axis: Optional[int] = None):
        """
        Evaluate the frame operation func(left, right) by evaluating
        column-by-column, dispatching to the Series implementation.

        Parameters
        ----------
        right : scalar, Series, or DataFrame
        func : arithmetic or comparison operator
        axis : {None, 0, 1}

        Returns
        -------
        DataFrame
        """
        # Get the appropriate array-op to apply to each column/block's values.
        array_op = ops.get_array_op(func)

        right = lib.item_from_zerodim(right)
        if not is_list_like(right):
            # i.e. scalar, faster than checking np.ndim(right) == 0
            bm = self._mgr.apply(array_op, right=right)
            return type(self)(bm)

        elif isinstance(right, DataFrame):
            assert self.index.equals(right.index)
            assert self.columns.equals(right.columns)
            # TODO: The previous assertion `assert right._indexed_same(self)`
            #  fails in cases with empty columns reached via
            #  _frame_arith_method_with_reindex

            bm = self._mgr.operate_blockwise(right._mgr, array_op)
            return type(self)(bm)

        elif isinstance(right, Series) and axis == 1:
            # axis=1 means we want to operate row-by-row
            assert right.index.equals(self.columns)

            right = right._values
            # maybe_align_as_frame ensures we do not have an ndarray here
            assert not isinstance(right, np.ndarray)

            arrays = [
                array_op(_left, _right)
                for _left, _right in zip(self._iter_column_arrays(), right)
            ]

        elif isinstance(right, Series):
            assert right.index.equals(self.index)  # Handle other cases later
            right = right._values

            arrays = [array_op(left, right) for left in self._iter_column_arrays()]

        else:
            # Remaining cases have less-obvious dispatch rules
            raise NotImplementedError(right)

        return type(self)._from_arrays(
            arrays, self.columns, self.index, verify_integrity=False
        )

    def _combine_frame(self, other: DataFrame, func, fill_value=None):
        # at this point we have `self._indexed_same(other)`

        if fill_value is None:
            # since _arith_op may be called in a loop, avoid function call
            # overhead if possible by doing this check once
            _arith_op = func

        else:

            def _arith_op(left, right):
                # for the mixed_type case where we iterate over columns,
                # _arith_op(left, right) is equivalent to
                # left._binop(right, func, fill_value=fill_value)
                left, right = ops.fill_binop(left, right, fill_value)
                return func(left, right)

        new_data = self._dispatch_frame_op(other, _arith_op)
        return new_data

    def _construct_result(self, result) -> DataFrame:
        """
        Wrap the result of an arithmetic, comparison, or logical operation.

        Parameters
        ----------
        result : DataFrame

        Returns
        -------
        DataFrame
        """
        out = self._constructor(result, copy=False)
        # Pin columns instead of passing to constructor for compat with
        # non-unique columns case
        out.columns = self.columns
        out.index = self.index
        return out

    def __divmod__(self, other) -> Tuple[DataFrame, DataFrame]:
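        """
        Return a tuple ``(self // other, self % other)``.

        Element-wise floor division and modulo, defined so that for every
        cell ``div * other + mod`` reconstructs the original value.
        """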
        # Naive implementation, room for optimization
        div = self // other
        mod = self - div * other
        return div, mod

    def __rdivmod__(self, other) -> Tuple[DataFrame, DataFrame]:
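        """
        Return a tuple ``(other // self, other % self)``.

        Reflected version of :meth:`DataFrame.__divmod__`.
        """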
        # Naive implementation, room for optimization
        div = other // self
        mod = other - div * self
        return div, mod

    # ----------------------------------------------------------------------
    # Combination-Related

    @doc(
        _shared_docs["compare"],
        """
        Returns
        -------
        DataFrame
            DataFrame that shows the differences stacked side by side.

            The resulting index will be a MultiIndex with 'self' and 'other'
            stacked alternately at the inner level.

        Raises
        ------
        ValueError
            When the two DataFrames don't have identical labels or shape.

        See Also
        --------
        Series.compare : Compare with another Series and show differences.
        DataFrame.equals : Test whether two objects contain the same elements.

        Notes
        -----
        Matching NaNs will not appear as a difference.

        Can only compare identically-labeled
        (i.e. same shape, identical row and column labels) DataFrames.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {{
        ...         "col1": ["a", "a", "b", "b", "a"],
        ...         "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
        ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
        ...     }},
        ...     columns=["col1", "col2", "col3"],
        ... )
        >>> df
          col1  col2  col3
        0    a   1.0   1.0
        1    a   2.0   2.0
        2    b   3.0   3.0
        3    b   NaN   4.0
        4    a   5.0   5.0

        >>> df2 = df.copy()
        >>> df2.loc[0, 'col1'] = 'c'
        >>> df2.loc[2, 'col3'] = 4.0
        >>> df2
          col1  col2  col3
        0    c   1.0   1.0
        1    a   2.0   2.0
        2    b   3.0   4.0
        3    b   NaN   4.0
        4    a   5.0   5.0

        Align the differences on columns

        >>> df.compare(df2)
          col1       col3
          self other self other
        0    a     c  NaN   NaN
        2  NaN   NaN  3.0   4.0

        Stack the differences on rows

        >>> df.compare(df2, align_axis=0)
                col1  col3
        0 self     a   NaN
          other    c   NaN
        2 self   NaN   3.0
          other  NaN   4.0

        Keep the equal values

        >>> df.compare(df2, keep_equal=True)
          col1       col3
          self other self other
        0    a     c  1.0   1.0
        2    b     b  3.0   4.0

        Keep all original rows and columns

        >>> df.compare(df2, keep_shape=True)
          col1       col2       col3
          self other self other self other
        0    a     c  NaN   NaN  NaN   NaN
        1  NaN   NaN  NaN   NaN  NaN   NaN
        2  NaN   NaN  NaN   NaN  3.0   4.0
        3  NaN   NaN  NaN   NaN  NaN   NaN
        4  NaN   NaN  NaN   NaN  NaN   NaN

        Keep all original rows and columns and also all original values

        >>> df.compare(df2, keep_shape=True, keep_equal=True)
          col1       col2       col3
          self other self other self other
        0    a     c  1.0   1.0  1.0   1.0
        1    a     a  2.0   2.0  2.0   2.0
        2    b     b  3.0   3.0  3.0   4.0
        3    b     b  NaN   NaN  4.0   4.0
        4    a     a  5.0   5.0  5.0   5.0
        """,
        klass=_shared_doc_kwargs["klass"],
    )
    def compare(
        self,
        other: DataFrame,
        align_axis: Axis = 1,
        keep_shape: bool = False,
        keep_equal: bool = False,
    ) -> DataFrame:
        return super().compare(
            other=other,
            align_axis=align_axis,
            keep_shape=keep_shape,
            keep_equal=keep_equal,
        )

    def combine(
        self, other: DataFrame, func, fill_value=None, overwrite=True
    ) -> DataFrame:
        """
        Perform column-wise combine with another DataFrame.

        Combines a DataFrame with `other` DataFrame using `func`
        to element-wise combine columns. The row and column indexes of the
        resulting DataFrame will be the union of the two.

        Parameters
        ----------
        other : DataFrame
            The DataFrame to merge column-wise.
        func : function
            Function that takes two series as inputs and returns a Series or a
            scalar. Used to merge the two dataframes column by column.
        fill_value : scalar value, default None
            The value to fill NaNs with prior to passing any column to the
            merge func.
        overwrite : bool, default True
            If True, columns in `self` that do not exist in `other` will be
            overwritten with NaNs.

        Returns
        -------
        DataFrame
            Combination of the provided DataFrames.

        See Also
        --------
        DataFrame.combine_first : Combine two DataFrame objects and default to
            non-null values in frame calling the method.

        Examples
        --------
        Combine using a simple function that chooses the smaller column.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
        >>> df1.combine(df2, take_smaller)
           A  B
        0  0  3
        1  0  3

        Example using a true element-wise combine function.

        >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine(df2, np.minimum)
           A  B
        0  1  2
        1  0  3

        Using `fill_value` fills Nones prior to passing the column to the
        merge function.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine(df2, take_smaller, fill_value=-5)
           A    B
        0  0 -5.0
        1  0  4.0

        However, if the same element in both dataframes is None, that None
        is preserved.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
        >>> df1.combine(df2, take_smaller, fill_value=-5)
           A    B
        0  0 -5.0
        1  0  3.0

        Example that demonstrates the use of `overwrite` and behavior when
        the axes differ between the dataframes.

        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1]}, index=[1, 2])
        >>> df1.combine(df2, take_smaller)
             A    B     C
        0  NaN  NaN   NaN
        1  NaN  3.0 -10.0
        2  NaN  3.0   1.0

        >>> df1.combine(df2, take_smaller, overwrite=False)
             A    B     C
        0  0.0  NaN   NaN
        1  0.0  3.0 -10.0
        2  NaN  3.0   1.0

        Demonstrating the preference of the passed-in dataframe.

        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
        >>> df2.combine(df1, take_smaller)
             A    B   C
        0  0.0  NaN NaN
        1  0.0  3.0 NaN
        2  NaN  3.0 NaN

        >>> df2.combine(df1, take_smaller, overwrite=False)
             A    B    C
        0  0.0  NaN  NaN
        1  0.0  3.0  1.0
        2  NaN  3.0  1.0
        """
        other_idxlen = len(other.index)  # save for compare

        this, other = self.align(other, copy=False)
        new_index = this.index

        if other.empty and len(new_index) == len(self.index):
            return self.copy()

        if self.empty and len(other) == other_idxlen:
            return other.copy()

        # sorts if possible
        new_columns = this.columns.union(other.columns)
        do_fill = fill_value is not None
        result = {}
        for col in new_columns:
            series = this[col]
            otherSeries = other[col]

            this_dtype = series.dtype
            other_dtype = otherSeries.dtype

            this_mask = isna(series)
            other_mask = isna(otherSeries)

            # don't overwrite columns unnecessarily
            # DO propagate if this column is not in the intersection
            if not overwrite and other_mask.all():
                result[col] = this[col].copy()
                continue

            if do_fill:
                series = series.copy()
                otherSeries = otherSeries.copy()
                series[this_mask] = fill_value
                otherSeries[other_mask] = fill_value

            if col not in self.columns:
                # If self DataFrame does not have col in other DataFrame,
                # try to promote series, which is all NaN, as other_dtype.
                new_dtype = other_dtype
                try:
                    series = series.astype(new_dtype, copy=False)
                except ValueError:
                    # e.g. new_dtype is integer types
                    pass
            else:
                # if we have different dtypes, possibly promote
                new_dtype = find_common_type([this_dtype, other_dtype])
                if not is_dtype_equal(this_dtype, new_dtype):
                    series = series.astype(new_dtype)
                if not is_dtype_equal(other_dtype, new_dtype):
                    otherSeries = otherSeries.astype(new_dtype)

            arr = func(series, otherSeries)
            arr = maybe_downcast_to_dtype(arr, new_dtype)

            result[col] = arr

        # convert_objects just in case
        return self._constructor(result, index=new_index, columns=new_columns)

    def combine_first(self, other: DataFrame) -> DataFrame:
        """
        Update null elements with value in the same location in `other`.

        Combine two DataFrame objects by filling null values in one DataFrame
        with non-null values from other DataFrame. The row and column indexes
        of the resulting DataFrame will be the union of the two.

        Parameters
        ----------
        other : DataFrame
            Provided DataFrame to use to fill null values.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.combine : Perform series-wise operation on two DataFrames
            using a given function.

        Examples
        --------
        >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
        >>> df1.combine_first(df2)
             A    B
        0  1.0  3.0
        1  0.0  4.0

        Null values still persist if the location of that null value
        does not exist in `other`.

        >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
        >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
        >>> df1.combine_first(df2)
             A    B    C
        0  NaN  4.0  NaN
        1  0.0  3.0  1.0
        2  NaN  3.0  1.0
        """
        import pandas.core.computation.expressions as expressions

        def combiner(x, y):
            mask = extract_array(isna(x))

            x_values = extract_array(x, extract_numpy=True)
            y_values = extract_array(y, extract_numpy=True)

            # If the column y in other DataFrame is not in first DataFrame,
            # just return y_values.
            if y.name not in self.columns:
                return y_values

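            # Elsewhere, prefer x (self) and fall back to y (other) only
            # where x is NA: ``where(mask, y, x)`` picks y where mask is True.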
            return expressions.where(mask, y_values, x_values)

        return self.combine(other, combiner, overwrite=False)

    def update(
        self, other, join="left", overwrite=True, filter_func=None, errors="ignore"
    ) -> None:
        """
        Modify in place using non-NA values from another DataFrame.

        Aligns on indices. There is no return value.

        Parameters
        ----------
        other : DataFrame, or object coercible into a DataFrame
            Should have at least one matching index/column label
            with the original DataFrame. If a Series is passed,
            its name attribute must be set, and that will be
            used as the column name to align with the original DataFrame.
        join : {'left'}, default 'left'
            Only left join is implemented, keeping the index and columns of the
            original object.
        overwrite : bool, default True
            How to handle non-NA values for overlapping keys:

            * True: overwrite original DataFrame's values
              with values from `other`.
            * False: only update values that are NA in
              the original DataFrame.

        filter_func : callable(1d-array) -> bool 1d-array, optional
            Can choose to replace values other than NA. Return True for values
            that should be updated.
        errors : {'raise', 'ignore'}, default 'ignore'
            If 'raise', will raise a ValueError if the DataFrame and `other`
            both contain non-NA data in the same place.

            .. versionchanged:: 0.24.0
               Changed from `raise_conflict=False|True`
               to `errors='ignore'|'raise'`.

        Returns
        -------
        None : method directly changes calling object

        Raises
        ------
        ValueError
            * When `errors='raise'` and there's overlapping non-NA data.
            * When `errors` is not either `'ignore'` or `'raise'`.
        NotImplementedError
            * If `join != 'left'`

        See Also
        --------
        dict.update : Similar method for dictionaries.
        DataFrame.merge : For column(s)-on-column(s) operations.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2, 3],
        ...                    'B': [400, 500, 600]})
        >>> new_df = pd.DataFrame({'B': [4, 5, 6],
        ...                        'C': [7, 8, 9]})
        >>> df.update(new_df)
        >>> df
           A  B
        0  1  4
        1  2  5
        2  3  6

        The DataFrame's length does not increase as a result of the update,
        only values at matching index/column labels are updated.

        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
        >>> df.update(new_df)
        >>> df
           A  B
        0  a  d
        1  b  e
        2  c  f

        For Series, its name attribute must be set.

        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
        >>> df.update(new_column)
        >>> df
           A  B
        0  a  d
        1  b  y
        2  c  e
        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
        ...                    'B': ['x', 'y', 'z']})
        >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
        >>> df.update(new_df)
        >>> df
           A  B
        0  a  x
        1  b  d
        2  c  e

        If `other` contains NaNs the corresponding values are not updated
        in the original dataframe.

        >>> df = pd.DataFrame({'A': [1, 2, 3],
        ...                    'B': [400, 500, 600]})
        >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
        >>> df.update(new_df)
        >>> df
           A      B
        0  1    4.0
        1  2  500.0
        2  3    6.0
        """
        import pandas.core.computation.expressions as expressions

        # TODO: Support other joins
        if join != "left":  # pragma: no cover
            raise NotImplementedError("Only left join is supported")
        if errors not in ["ignore", "raise"]:
            raise ValueError("The parameter errors must be either 'ignore' or 'raise'")

        if not isinstance(other, DataFrame):
            other = DataFrame(other)

        other = other.reindex_like(self)

        for col in self.columns:
            this = self[col]._values
            that = other[col]._values
            if filter_func is not None:
                with np.errstate(all="ignore"):
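                    # keep the original value where filter_func is False or
                    # where the incoming value is NA; only the rest is updated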
                    mask = ~filter_func(this) | isna(that)
            else:
                if errors == "raise":
                    mask_this = notna(that)
                    mask_that = notna(this)
                    if any(mask_this & mask_that):
                        raise ValueError("Data overlaps.")

                if overwrite:
                    mask = isna(that)
                else:
                    mask = notna(this)

            # don't overwrite columns unnecessarily
            if mask.all():
                continue

            self[col] = expressions.where(mask, this, that)

    # ----------------------------------------------------------------------
    # Data reshaping
    @Appender(
        """
        Examples
        --------
        >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
        ...                               'Parrot', 'Parrot'],
        ...                    'Max Speed': [380., 370., 24., 26.]})
        >>> df
           Animal  Max Speed
        0  Falcon      380.0
        1  Falcon      370.0
        2  Parrot       24.0
        3  Parrot       26.0
        >>> df.groupby(['Animal']).mean()
                Max Speed
        Animal
        Falcon      375.0
        Parrot       25.0

        **Hierarchical Indexes**

        We can groupby different levels of a hierarchical index
        using the `level` parameter:

        >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
        ...           ['Captive', 'Wild', 'Captive', 'Wild']]
        >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
        >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
        ...                   index=index)
        >>> df
                        Max Speed
        Animal Type
        Falcon Captive      390.0
               Wild         350.0
        Parrot Captive       30.0
               Wild          20.0
        >>> df.groupby(level=0).mean()
                Max Speed
        Animal
        Falcon      370.0
        Parrot       25.0
        >>> df.groupby(level="Type").mean()
                 Max Speed
        Type
        Captive      210.0
        Wild         185.0

        We can also choose to include NA in group keys or not by setting the
        `dropna` parameter; the default setting is `True`:

        >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
        >>> df = pd.DataFrame(l, columns=["a", "b", "c"])

        >>> df.groupby(by=["b"]).sum()
            a   c
        b
        1.0 2   3
        2.0 2   5

        >>> df.groupby(by=["b"], dropna=False).sum()
            a   c
        b
        1.0 2   3
        2.0 2   5
        NaN 1   4

        >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
        >>> df = pd.DataFrame(l, columns=["a", "b", "c"])

        >>> df.groupby(by="a").sum()
            b     c
        a
        a   13.0   13.0
        b   12.3  123.0

        >>> df.groupby(by="a", dropna=False).sum()
            b     c
        a
        a   13.0   13.0
        b   12.3  123.0
        NaN 12.3   33.0
        """
    )
    @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
    def groupby(
        self,
        by=None,
        axis=0,
        level=None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        squeeze: bool = no_default,
        observed: bool = False,
        dropna: bool = True,
    ) -> DataFrameGroupBy:
        from pandas.core.groupby.generic import DataFrameGroupBy

        if squeeze is not no_default:
            warnings.warn(
                (
                    "The `squeeze` parameter is deprecated and "
                    "will be removed in a future version."
                ),
                FutureWarning,
                stacklevel=2,
            )
        else:
            squeeze = False

        if level is None and by is None:
            raise TypeError("You have to supply one of 'by' and 'level'")
        axis = self._get_axis_number(axis)

        return DataFrameGroupBy(
            obj=self,
            keys=by,
            axis=axis,
            level=level,
            as_index=as_index,
            sort=sort,
            group_keys=group_keys,
            squeeze=squeeze,
            observed=observed,
            dropna=dropna,
        )

    _shared_docs[
        "pivot"
    ] = """
        Return reshaped DataFrame organized by given index / column values.

        Reshape data (produce a "pivot" table) based on column values. Uses
        unique values from specified `index` / `columns` to form axes of the
        resulting DataFrame. This function does not support data
        aggregation; multiple values will result in a MultiIndex in the
        columns. See the :ref:`User Guide <reshaping>` for more on reshaping.

        Parameters
        ----------%s
        index : str or object or a list of str, optional
            Column to use to make new frame's index. If None, uses
            existing index.

            .. versionchanged:: 1.1.0
               Also accept list of index names.

        columns : str or object or a list of str
            Column to use to make new frame's columns.

            .. versionchanged:: 1.1.0
               Also accept list of columns names.

        values : str, object or a list of the previous, optional
            Column(s) to use for populating new frame's values. If not
            specified, all remaining columns will be used and the result will
            have hierarchically indexed columns.

        Returns
        -------
        DataFrame
            Returns reshaped DataFrame.

        Raises
        ------
        ValueError:
            When there are any `index`, `columns` combinations with multiple
            values. Use `DataFrame.pivot_table` when you need to aggregate.

        See Also
        --------
        DataFrame.pivot_table : Generalization of pivot that can handle
            duplicate values for one index/column pair.
        DataFrame.unstack : Pivot based on the index values instead of a
            column.
        wide_to_long : Wide panel to long format. Less flexible but more
            user-friendly than melt.

        Notes
        -----
        For finer-tuned control, see hierarchical indexing documentation along
        with the related stack/unstack methods.

        Examples
        --------
        >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
        ...                            'two'],
        ...                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
        ...                    'baz': [1, 2, 3, 4, 5, 6],
        ...                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
        >>> df
            foo   bar  baz  zoo
        0   one   A    1    x
        1   one   B    2    y
        2   one   C    3    z
        3   two   A    4    q
        4   two   B    5    w
        5   two   C    6    t

        >>> df.pivot(index='foo', columns='bar', values='baz')
        bar  A   B   C
        foo
        one  1   2   3
        two  4   5   6

        >>> df.pivot(index='foo', columns='bar')['baz']
        bar  A   B   C
        foo
        one  1   2   3
        two  4   5   6

        >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
              baz       zoo
        bar   A  B  C   A  B  C
        foo
        one   1  2  3   x  y  z
        two   4  5  6   q  w  t

        You could also assign a list of column names or a list of index names.

        >>> df = pd.DataFrame({
        ...        "lev1": [1, 1, 1, 2, 2, 2],
        ...        "lev2": [1, 1, 2, 1, 1, 2],
        ...        "lev3": [1, 2, 1, 2, 1, 2],
        ...        "lev4": [1, 2, 3, 4, 5, 6],
        ...        "values": [0, 1, 2, 3, 4, 5]})
        >>> df
           lev1 lev2 lev3 lev4 values
        0     1    1    1    1      0
        1     1    1    2    2      1
        2     1    2    1    3      2
        3     2    1    2    4      3
        4     2    1    1    5      4
        5     2    2    2    6      5

        >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
        lev2    1         2
        lev3    1    2    1    2
        lev1
        1     0.0  1.0  2.0  NaN
        2     4.0  3.0  NaN  5.0

        >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
              lev3    1    2
        lev1  lev2
           1     1  0.0  1.0
                 2  2.0  NaN
           2     1  4.0  3.0
                 2  NaN  5.0

        A ValueError is raised if there are any duplicates.

        >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
        ...                    "bar": ['A', 'A', 'B', 'C'],
        ...                    "baz": [1, 2, 3, 4]})
        >>> df
           foo bar  baz
        0  one   A    1
        1  one   A    2
        2  two   B    3
        3  two   C    4

        Notice that the first two rows are the same for our `index`
        and `columns` arguments.

        >>> df.pivot(index='foo', columns='bar', values='baz')
        Traceback (most recent call last):
           ...
        ValueError: Index contains duplicate entries, cannot reshape
        """

    @Substitution("")
    @Appender(_shared_docs["pivot"])
    def pivot(self, index=None, columns=None, values=None) -> DataFrame:
        from pandas.core.reshape.pivot import pivot

        return pivot(self, index=index, columns=columns, values=values)

    _shared_docs[
        "pivot_table"
    ] = """
        Create a spreadsheet-style pivot table as a DataFrame.

        The levels in the pivot table will be stored in MultiIndex objects
        (hierarchical indexes) on the index and columns of the result DataFrame.

        Parameters
        ----------%s
        values : column to aggregate, optional
        index : column, Grouper, array, or list of the previous
            If an array is passed, it must be the same length as the data. The
            list can contain any of the other types (except list).
            Keys to group by on the pivot table index. If an array is passed,
            it is used in the same manner as column values.
        columns : column, Grouper, array, or list of the previous
            If an array is passed, it must be the same length as the data. The
            list can contain any of the other types (except list).
            Keys to group by on the pivot table column. If an array is passed,
            it is used in the same manner as column values.
        aggfunc : function, list of functions, dict, default numpy.mean
            If list of functions passed, the resulting pivot table will have
            hierarchical columns whose top level are the function names
            (inferred from the function objects themselves).
            If dict is passed, the key is column to aggregate and value
            is function or list of functions.
        fill_value : scalar, default None
            Value to replace missing values with (in the resulting pivot table,
            after aggregation).
        margins : bool, default False
            Add all row / column margins (e.g. for subtotal / grand totals).
        dropna : bool, default True
            Do not include columns whose entries are all NaN.
        margins_name : str, default 'All'
            Name of the row / column that will contain the totals
            when margins is True.
        observed : bool, default False
            This only applies if any of the groupers are Categoricals.
            If True: only show observed values for categorical groupers.
            If False: show all values for categorical groupers.

            .. versionchanged:: 0.25.0

        Returns
        -------
        DataFrame
            An Excel style pivot table.

        See Also
        --------
        DataFrame.pivot : Pivot without aggregation that can handle
            non-numeric data.
        DataFrame.melt : Unpivot a DataFrame from wide to long format,
            optionally leaving identifiers set.
        wide_to_long : Wide panel to long format. Less flexible but more
            user-friendly than melt.

        Examples
        --------
        >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
        ...                          "bar", "bar", "bar", "bar"],
        ...                    "B": ["one", "one", "one", "two", "two",
        ...                          "one", "one", "two", "two"],
        ...                    "C": ["small", "large", "large", "small",
        ...                          "small", "large", "small", "small",
        ...                          "large"],
        ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
        ...                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
        >>> df
             A    B      C  D  E
        0  foo  one  small  1  2
        1  foo  one  large  2  4
        2  foo  one  large  2  5
        3  foo  two  small  3  5
        4  foo  two  small  3  6
        5  bar  one  large  4  6
        6  bar  one  small  5  8
        7  bar  two  small  6  9
        8  bar  two  large  7  9

        This first example aggregates values by taking the sum.

        >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
        ...                        columns=['C'], aggfunc=np.sum)
        >>> table
        C        large  small
        A   B
        bar one    4.0    5.0
            two    7.0    6.0
        foo one    4.0    1.0
            two    NaN    6.0

        We can also fill missing values using the `fill_value` parameter.

        >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
        ...                        columns=['C'], aggfunc=np.sum, fill_value=0)
        >>> table
        C        large  small
        A   B
        bar one      4      5
            two      7      6
        foo one      4      1
            two      0      6

        The next example aggregates by taking the mean across multiple columns.

        >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
        ...                        aggfunc={'D': np.mean,
        ...                                 'E': np.mean})
        >>> table
                        D         E
        A   C
        bar large  5.500000  7.500000
            small  5.500000  8.500000
        foo large  2.000000  4.500000
            small  2.333333  4.333333

        We can also calculate multiple types of aggregations for any given
        value column.

        >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
        ...                        aggfunc={'D': np.mean,
        ...                                 'E': [min, max, np.mean]})
        >>> table
                          D    E
                       mean  max      mean  min
        A   C
        bar large  5.500000  9.0  7.500000  6.0
            small  5.500000  9.0  8.500000  8.0
        foo large  2.000000  5.0  4.500000  4.0
            small  2.333333  6.0  4.333333  2.0
        """

    @Substitution("")
    @Appender(_shared_docs["pivot_table"])
    def pivot_table(
        self,
        values=None,
        index=None,
        columns=None,
        aggfunc="mean",
        fill_value=None,
        margins=False,
        dropna=True,
        margins_name="All",
        observed=False,
    ) -> DataFrame:
        from pandas.core.reshape.pivot import pivot_table

        return pivot_table(
            self,
            values=values,
            index=index,
            columns=columns,
            aggfunc=aggfunc,
            fill_value=fill_value,
            margins=margins,
            dropna=dropna,
            margins_name=margins_name,
            observed=observed,
        )

    def stack(self, level=-1, dropna=True):
        """
        Stack the prescribed level(s) from columns to index.

        Return a reshaped DataFrame or Series having a multi-level
        index with one or more new inner-most levels compared to the current
        DataFrame. The new inner-most levels are created by pivoting the
        columns of the current dataframe:

          - if the columns have a single level, the output is a Series;
          - if the columns have multiple levels, the new index
            level(s) is (are) taken from the prescribed level(s) and
            the output is a DataFrame.

        Parameters
        ----------
        level : int, str, list, default -1
            Level(s) to stack from the column axis onto the index
            axis, defined as one index or label, or a list of indices
            or labels.
        dropna : bool, default True
            Whether to drop rows in the resulting Frame/Series with
            missing values. Stacking a column level onto the index
            axis can create combinations of index and column values
            that are missing from the original dataframe. See Examples
            section.

        Returns
        -------
        DataFrame or Series
            Stacked dataframe or series.

        See Also
        --------
        DataFrame.unstack : Unstack prescribed level(s) from index axis
            onto column axis.
        DataFrame.pivot : Reshape dataframe from long format to wide
            format.
        DataFrame.pivot_table : Create a spreadsheet-style pivot table
            as a DataFrame.

        Notes
        -----
        The function is named by analogy with a collection of books
        being reorganized from being side by side on a horizontal
        position (the columns of the dataframe) to being stacked
        vertically on top of each other (in the index of the
        dataframe).

        Examples
        --------
        **Single level columns**

        >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=['weight', 'height'])

        Stacking a dataframe with a single level column axis returns a Series:

        >>> df_single_level_cols
             weight height
        cat       0      1
        dog       2      3
        >>> df_single_level_cols.stack()
        cat  weight    0
             height    1
        dog  weight    2
             height    3
        dtype: int64

        **Multi level columns: simple case**

        >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
        ...                                        ('weight', 'pounds')])
        >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol1)

        Stacking a dataframe with a multi-level column axis:

        >>> df_multi_level_cols1
             weight
                 kg    pounds
        cat       1        2
        dog       2        4
        >>> df_multi_level_cols1.stack()
                    weight
        cat kg           1
            pounds       2
        dog kg           2
            pounds       4

        **Missing values**

        >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
        ...                                        ('height', 'm')])
        >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol2)

        It is common to have missing values when stacking a dataframe
        with multi-level columns, as the stacked dataframe typically
        has more values than the original dataframe. Missing values
        are filled with NaNs:

        >>> df_multi_level_cols2
            weight height
                kg      m
        cat    1.0    2.0
        dog    3.0    4.0
        >>> df_multi_level_cols2.stack()
                height  weight
        cat kg     NaN     1.0
            m      2.0     NaN
        dog kg     NaN     3.0
            m      4.0     NaN

        **Prescribing the level(s) to be stacked**

        The first parameter controls which level or levels are stacked:

        >>> df_multi_level_cols2.stack(0)
                     kg    m
        cat height  NaN  2.0
            weight  1.0  NaN
        dog height  NaN  4.0
            weight  3.0  NaN
        >>> df_multi_level_cols2.stack([0, 1])
        cat  height  m     2.0
             weight  kg    1.0
        dog  height  m     4.0
             weight  kg    3.0
        dtype: float64

        **Dropping missing values**

        >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol2)

        Note that rows where all values are missing are dropped by
        default but this behaviour can be controlled via the dropna
        keyword parameter:

        >>> df_multi_level_cols3
            weight height
                kg      m
        cat    NaN    1.0
        dog    2.0    3.0
        >>> df_multi_level_cols3.stack(dropna=False)
                height  weight
        cat kg     NaN     NaN
            m      1.0     NaN
        dog kg     NaN     2.0
            m      3.0     NaN
        >>> df_multi_level_cols3.stack(dropna=True)
                height  weight
        cat m      1.0     NaN
        dog kg     NaN     2.0
            m      3.0     NaN
        """
        from pandas.core.reshape.reshape import stack, stack_multiple

        if isinstance(level, (tuple, list)):
            result = stack_multiple(self, level, dropna=dropna)
        else:
            result = stack(self, level, dropna=dropna)

        return result.__finalize__(self, method="stack")

    def explode(
        self, column: Union[str, Tuple], ignore_index: bool = False
    ) -> DataFrame:
        """
        Transform each element of a list-like to a row, replicating index values.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        column : str or tuple
            Column to explode.
        ignore_index : bool, default False
            If True, the resulting index will be labeled 0, 1, …, n - 1.

            .. versionadded:: 1.1.0

        Returns
        -------
        DataFrame
            Exploded lists to rows of the subset columns;
            index will be duplicated for these rows.

        Raises
        ------
        ValueError :
            if columns of the frame are not unique.

        See Also
        --------
        DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
            index labels.
        DataFrame.melt : Unpivot a DataFrame from wide format to long format.
        Series.explode : Explode a Series from list-like entries to long format.

        Notes
        -----
        This routine will explode list-likes including lists, tuples, sets,
        Series, and np.ndarray. The result dtype of the subset rows will
        be object. Scalars will be returned unchanged, and empty list-likes will
        result in an ``np.nan`` for that row. In addition, the ordering of rows in
        the output will be non-deterministic when exploding sets.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
        >>> df
                   A  B
        0  [1, 2, 3]  1
        1        foo  1
        2         []  1
        3     [3, 4]  1

        >>> df.explode('A')
             A  B
        0    1  1
        0    2  1
        0    3  1
        1  foo  1
        2  NaN  1
        3    3  1
        3    4  1
        """
        if not (is_scalar(column) or isinstance(column, tuple)):
            raise ValueError("column must be a scalar")
        if not self.columns.is_unique:
            raise ValueError("columns must be unique")

        df = self.reset_index(drop=True)
        result = df[column].explode()
        result = df.drop([column], axis=1).join(result)
        if ignore_index:
            result.index = ibase.default_index(len(result))
        else:
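            # df was built from reset_index(drop=True), so result.index holds
            # positional codes into the original index; take() restores the
            # original (now repeated) labels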
            result.index = self.index.take(result.index)
        result = result.reindex(columns=self.columns, copy=False)

        return result

    def unstack(self, level=-1, fill_value=None):
        """
        Pivot a level of the (necessarily hierarchical) index labels.

        Returns a DataFrame having a new level of column labels whose inner-most level
        consists of the pivoted index labels.

        If the index is not a MultiIndex, the output will be a Series
        (the analogue of stack when the columns are not a MultiIndex).

        Parameters
        ----------
        level : int, str, or list of these, default -1 (last level)
            Level(s) of index to unstack, can pass level name.
        fill_value : int, str or dict
            Replace NaN with this value if the unstack produces missing values.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        DataFrame.pivot : Pivot a table based on column values.
        DataFrame.stack : Pivot a level of the column labels (inverse operation
            from `unstack`).

        Examples
        --------
        >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
        ...                                    ('two', 'a'), ('two', 'b')])
        >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
        >>> s
        one  a   1.0
             b   2.0
        two  a   3.0
             b   4.0
        dtype: float64

        >>> s.unstack(level=-1)
             a   b
        one  1.0  2.0
        two  3.0  4.0

        >>> s.unstack(level=0)
           one  two
        a  1.0   3.0
        b  2.0   4.0

        >>> df = s.unstack(level=0)
        >>> df.unstack()
        one  a  1.0
             b  2.0
        two  a  3.0
             b  4.0
        dtype: float64
        """
        from pandas.core.reshape.reshape import unstack

        result = unstack(self, level, fill_value)

        return result.__finalize__(self, method="unstack")

    @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
    def melt(
        self,
        id_vars=None,
        value_vars=None,
        var_name=None,
        value_name="value",
        col_level=None,
        ignore_index=True,
    ) -> DataFrame:

        return melt(
            self,
            id_vars=id_vars,
            value_vars=value_vars,
            var_name=var_name,
            value_name=value_name,
            col_level=col_level,
            ignore_index=ignore_index,
        )

    # ----------------------------------------------------------------------
    # Time series-related

    @doc(
        Series.diff,
        klass="DataFrame",
        extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n    "
        "Take difference over rows (0) or columns (1).\n",
        other_klass="Series",
        examples=dedent(
            """
        Difference with previous row

        >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
        ...                    'b': [1, 1, 2, 3, 5, 8],
        ...                    'c': [1, 4, 9, 16, 25, 36]})
        >>> df
           a  b   c
        0  1  1   1
        1  2  1   4
        2  3  2   9
        3  4  3  16
        4  5  5  25
        5  6  8  36

        >>> df.diff()
             a    b     c
        0  NaN  NaN   NaN
        1  1.0  0.0   3.0
        2  1.0  1.0   5.0
        3  1.0  1.0   7.0
        4  1.0  2.0   9.0
        5  1.0  3.0  11.0

        Difference with previous column

        >>> df.diff(axis=1)
            a  b   c
        0 NaN  0   0
        1 NaN -1   3
        2 NaN -1   7
        3 NaN -1  13
        4 NaN  0  20
        5 NaN  2  28

        Difference with 3rd previous row

        >>> df.diff(periods=3)
             a    b     c
        0  NaN  NaN   NaN
        1  NaN  NaN   NaN
        2  NaN  NaN   NaN
        3  3.0  2.0  15.0
        4  3.0  4.0  21.0
        5  3.0  6.0  27.0

        Difference with following row

        >>> df.diff(periods=-1)
             a    b     c
        0 -1.0  0.0  -3.0
        1 -1.0 -1.0  -5.0
        2 -1.0 -1.0  -7.0
        3 -1.0 -2.0  -9.0
        4 -1.0 -3.0 -11.0
        5  NaN  NaN   NaN

        Overflow in input dtype

        >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
        >>> df.diff()
               a
        0    NaN
        1  255.0"""
        ),
    )
    def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
        if not isinstance(periods, int):
            if not (is_float(periods) and periods.is_integer()):
                raise ValueError("periods must be an integer")
            periods = int(periods)

        bm_axis = self._get_block_manager_axis(axis)

        if bm_axis == 0 and periods != 0:
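            # bm_axis == 0 corresponds to axis=1 (column-wise diff), which is
            # computed as an aligned subtraction against the shifted frame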
            return self - self.shift(periods, axis=axis)

        new_data = self._mgr.diff(n=periods, axis=bm_axis)
        return self._constructor(new_data).__finalize__(self, "diff")

    # ----------------------------------------------------------------------
    # Function application

    def _gotitem(
        self,
        key: Union[Label, List[Label]],
        ndim: int,
        subset: Optional[FrameOrSeriesUnion] = None,
    ) -> FrameOrSeriesUnion:
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : 1, 2
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        if subset is None:
            subset = self
        elif subset.ndim == 1:  # is Series
            return subset

        # TODO: _shallow_copy(subset)?
        return subset[key]

_agg_summary_and_see_also_doc = dedent(
|
||
"""
|
||
The aggregation operations are always performed over an axis, either the
|
||
index (default) or the column axis. This behavior is different from
|
||
`numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
|
||
`var`), where the default is to compute the aggregation of the flattened
|
||
array, e.g., ``numpy.mean(arr_2d)`` as opposed to
|
||
``numpy.mean(arr_2d, axis=0)``.
|
||
|
||
`agg` is an alias for `aggregate`. Use the alias.
|
||
|
||
See Also
|
||
--------
|
||
DataFrame.apply : Perform any type of operations.
|
||
DataFrame.transform : Perform transformation type operations.
|
||
core.groupby.GroupBy : Perform operations over groups.
|
||
core.resample.Resampler : Perform operations over resampled bins.
|
||
core.window.Rolling : Perform operations over rolling window.
|
||
core.window.Expanding : Perform operations over expanding window.
|
||
core.window.ExponentialMovingWindow : Perform operation over exponential weighted
|
||
window.
|
||
"""
|
||
)
|
||
|
||
_agg_examples_doc = dedent(
|
||
"""
|
||
Examples
|
||
--------
|
||
>>> df = pd.DataFrame([[1, 2, 3],
|
||
... [4, 5, 6],
|
||
... [7, 8, 9],
|
||
... [np.nan, np.nan, np.nan]],
|
||
... columns=['A', 'B', 'C'])
|
||
|
||
Aggregate these functions over the rows.
|
||
|
||
>>> df.agg(['sum', 'min'])
|
||
A B C
|
||
sum 12.0 15.0 18.0
|
||
min 1.0 2.0 3.0
|
||
|
||
Different aggregations per column.
|
||
|
||
>>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
|
||
A B
|
||
sum 12.0 NaN
|
||
min 1.0 2.0
|
||
max NaN 8.0
|
||
|
||
Aggregate different functions over the columns and rename the index of the resulting
|
||
DataFrame.
|
||
|
||
>>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
|
||
A B C
|
||
x 7.0 NaN NaN
|
||
y NaN 2.0 NaN
|
||
z NaN NaN 6.0
|
||
|
||
Aggregate over the columns.
|
||
|
||
>>> df.agg("mean", axis="columns")
|
||
0 2.0
|
||
1 5.0
|
||
2 8.0
|
||
3 NaN
|
||
dtype: float64
|
||
"""
|
||
)

    @doc(
        _shared_docs["aggregate"],
        klass=_shared_doc_kwargs["klass"],
        axis=_shared_doc_kwargs["axis"],
        see_also=_agg_summary_and_see_also_doc,
        examples=_agg_examples_doc,
    )
    def aggregate(self, func=None, axis=0, *args, **kwargs):
        axis = self._get_axis_number(axis)

        relabeling, func, columns, order = reconstruct_func(func, **kwargs)

        result = None
        try:
            result, how = self._aggregate(func, axis, *args, **kwargs)
        except TypeError as err:
            exc = TypeError(
                "DataFrame constructor called with "
                f"incompatible data and dtype: {err}"
            )
            raise exc from err
        if result is None:
            return self.apply(func, axis=axis, args=args, **kwargs)

        if relabeling:
            # This keeps the order of the existing columns unchanged, and also
            # keeps the order in which the new columns occur unchanged.

            # For the return values of reconstruct_func, if relabeling is
            # False, columns and order will be None.
            assert columns is not None
            assert order is not None

            result_in_dict = relabel_result(result, func, columns, order)
            result = DataFrame(result_in_dict, index=columns)

        return result

    def _aggregate(self, arg, axis=0, *args, **kwargs):
        if axis == 1:
            # NDFrame.aggregate returns a tuple, and we need to transpose
            # only the result
            result, how = aggregate(self.T, arg, *args, **kwargs)
            result = result.T if result is not None else result
            return result, how
        return aggregate(self, arg, *args, **kwargs)

    agg = aggregate

    @doc(
        _shared_docs["transform"],
        klass=_shared_doc_kwargs["klass"],
        axis=_shared_doc_kwargs["axis"],
    )
    def transform(
        self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
    ) -> DataFrame:
        result = transform(self, func, axis, *args, **kwargs)
        assert isinstance(result, DataFrame)
        return result

    def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
        """
        Apply a function along an axis of the DataFrame.

        Objects passed to the function are Series objects whose index is
        either the DataFrame's index (``axis=0``) or the DataFrame's columns
        (``axis=1``). By default (``result_type=None``), the final return type
        is inferred from the return type of the applied function. Otherwise,
        it depends on the `result_type` argument.

        Parameters
        ----------
        func : function
            Function to apply to each column or row.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis along which the function is applied:

            * 0 or 'index': apply function to each column.
            * 1 or 'columns': apply function to each row.

        raw : bool, default False
            Determines if row or column is passed as a Series or ndarray object:

            * ``False`` : passes each row or column as a Series to the
              function.
            * ``True`` : the passed function will receive ndarray objects
              instead.
              If you are just applying a NumPy reduction function this will
              achieve much better performance.

        result_type : {'expand', 'reduce', 'broadcast', None}, default None
            These only act when ``axis=1`` (columns):

            * 'expand' : list-like results will be turned into columns.
            * 'reduce' : returns a Series if possible rather than expanding
              list-like results. This is the opposite of 'expand'.
            * 'broadcast' : results will be broadcast to the original shape
              of the DataFrame, the original index and columns will be
              retained.

            The default behaviour (None) depends on the return value of the
            applied function: list-like results will be returned as a Series
            of those. However if the apply function returns a Series these
            are expanded to columns.
        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwds
            Additional keyword arguments to pass as keyword arguments to
            `func`.

        Returns
        -------
        Series or DataFrame
            Result of applying ``func`` along the given axis of the
            DataFrame.

        See Also
        --------
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.

        Examples
        --------
        >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
        >>> df
           A  B
        0  4  9
        1  4  9
        2  4  9

        Using a numpy universal function (in this case the same as
        ``np.sqrt(df)``):

        >>> df.apply(np.sqrt)
             A    B
        0  2.0  3.0
        1  2.0  3.0
        2  2.0  3.0

        Using a reducing function on either axis

        >>> df.apply(np.sum, axis=0)
        A    12
        B    27
        dtype: int64

        >>> df.apply(np.sum, axis=1)
        0    13
        1    13
        2    13
        dtype: int64

        Returning a list-like will result in a Series

        >>> df.apply(lambda x: [1, 2], axis=1)
        0    [1, 2]
        1    [1, 2]
        2    [1, 2]
        dtype: object

        Passing ``result_type='expand'`` will expand list-like results
        to columns of a DataFrame

        >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
           0  1
        0  1  2
        1  1  2
        2  1  2

        Returning a Series inside the function is similar to passing
        ``result_type='expand'``. The resulting column names
        will be the Series index.

        >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
           foo  bar
        0    1    2
        1    1    2
        2    1    2

        Passing ``result_type='broadcast'`` will ensure the same shape
        result, whether list-like or scalar is returned by the function,
        and broadcast it along the axis. The resulting column names will
        be the originals.

        >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
           A  B
        0  1  2
        1  1  2
        2  1  2
        """
        from pandas.core.apply import frame_apply

        op = frame_apply(
            self,
            func=func,
            axis=axis,
            raw=raw,
            result_type=result_type,
            args=args,
            kwds=kwds,
        )
        return op.get_result()

    def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
        """
        Apply a function to a DataFrame elementwise.

        This method applies a function that accepts and returns a scalar
        to every element of a DataFrame.

        Parameters
        ----------
        func : callable
            Python function, returns a single value from a single value.
        na_action : {None, 'ignore'}, default None
            If 'ignore', propagate NaN values, without passing them to func.

            .. versionadded:: 1.2

        Returns
        -------
        DataFrame
            Transformed DataFrame.

        See Also
        --------
        DataFrame.apply : Apply a function along input axis of DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
        >>> df
               0      1
        0  1.000  2.120
        1  3.356  4.567

        >>> df.applymap(lambda x: len(str(x)))
           0  1
        0  3  4
        1  5  5

        Like Series.map, NA values can be ignored:

        >>> df_copy = df.copy()
        >>> df_copy.iloc[0, 0] = pd.NA
        >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
              0  1
        0  <NA>  4
        1     5  5

        Note that a vectorized version of `func` often exists, which will
        be much faster. You could square each number elementwise.

        >>> df.applymap(lambda x: x**2)
                   0          1
        0   1.000000   4.494400
        1  11.262736  20.857489

        But it's better to avoid applymap in that case.

        >>> df ** 2
                   0          1
        0   1.000000   4.494400
        1  11.262736  20.857489
        """
        if na_action not in {"ignore", None}:
            raise ValueError(
                f"na_action must be 'ignore' or None. Got {repr(na_action)}"
            )
        ignore_na = na_action == "ignore"

        # if we have a dtype == 'M8[ns]', provide boxed values
        def infer(x):
            if x.empty:
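                # an empty column has nothing to map; skip the object cast below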
                return lib.map_infer(x, func, ignore_na=ignore_na)
            return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)

        return self.apply(infer).__finalize__(self, "applymap")

    # ----------------------------------------------------------------------
    # Merging / joining methods

    def append(
        self, other, ignore_index=False, verify_integrity=False, sort=False
    ) -> DataFrame:
        """
        Append rows of `other` to the end of caller, returning a new object.

        Columns in `other` that are not in the caller are added as new columns.

        Parameters
        ----------
        other : DataFrame or Series/dict-like object, or list of these
            The data to append.
        ignore_index : bool, default False
            If True, the resulting axis will be labeled 0, 1, …, n - 1.
        verify_integrity : bool, default False
            If True, raise ValueError on creating index with duplicates.
        sort : bool, default False
            Sort columns if the columns of `self` and `other` are not aligned.

            .. versionchanged:: 1.0.0

                Changed to not sort by default.

        Returns
        -------
        DataFrame

        See Also
        --------
        concat : General function to concatenate DataFrame or Series objects.

        Notes
        -----
        If a list of dict/series is passed and the keys are all contained in
        the DataFrame's index, the order of the columns in the resulting
        DataFrame will be unchanged.

        Iteratively appending rows to a DataFrame can be more computationally
        intensive than a single concatenate. A better solution is to append
        those rows to a list and then concatenate the list with the original
        DataFrame all at once.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
        >>> df
           A  B
        0  1  2
        1  3  4
        >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
        >>> df.append(df2)
           A  B
        0  1  2
        1  3  4
        0  5  6
        1  7  8

        With `ignore_index` set to True:

        >>> df.append(df2, ignore_index=True)
           A  B
        0  1  2
        1  3  4
        2  5  6
        3  7  8

        The following, while not recommended methods for generating DataFrames,
        show two ways to generate a DataFrame from multiple data sources.

        Less efficient:

        >>> df = pd.DataFrame(columns=['A'])
        >>> for i in range(5):
        ...     df = df.append({'A': i}, ignore_index=True)
        >>> df
           A
        0  0
        1  1
        2  2
        3  3
        4  4

        More efficient:

        >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
        ...           ignore_index=True)
           A
        0  0
        1  1
        2  2
        3  3
        4  4
        """
        if isinstance(other, (Series, dict)):
            if isinstance(other, dict):
                if not ignore_index:
                    raise TypeError("Can only append a dict if ignore_index=True")
                other = Series(other)
            if other.name is None and not ignore_index:
                raise TypeError(
                    "Can only append a Series if ignore_index=True "
                    "or if the Series has a name"
                )

            index = Index([other.name], name=self.index.name)
            idx_diff = other.index.difference(self.columns)
            try:
                combined_columns = self.columns.append(idx_diff)
            except TypeError:
                combined_columns = self.columns.astype(object).append(idx_diff)
            other = (
                other.reindex(combined_columns, copy=False)
                .to_frame()
                .T.infer_objects()
                .rename_axis(index.names, copy=False)
            )
            if not self.columns.equals(combined_columns):
                self = self.reindex(columns=combined_columns)
        elif isinstance(other, list):
            if not other:
                pass
            elif not isinstance(other[0], DataFrame):
                other = DataFrame(other)
                if (self.columns.get_indexer(other.columns) >= 0).all():
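                    # every constructed column already exists in self;
                    # reorder `other` to match self's column order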
                    other = other.reindex(columns=self.columns)

        from pandas.core.reshape.concat import concat

        if isinstance(other, (list, tuple)):
            to_concat = [self, *other]
        else:
            to_concat = [self, other]
        return (
            concat(
                to_concat,
                ignore_index=ignore_index,
                verify_integrity=verify_integrity,
                sort=sort,
            )
        ).__finalize__(self, method="append")

    def join(
        self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
    ) -> DataFrame:
        """
        Join columns of another DataFrame.

        Join columns with `other` DataFrame either on index or on a key
        column. Efficiently join multiple DataFrame objects by index at once by
        passing a list.

        Parameters
        ----------
        other : DataFrame, Series, or list of DataFrame
            Index should be similar to one of the columns in this one. If a
            Series is passed, its name attribute must be set, and that will be
            used as the column name in the resulting joined DataFrame.
        on : str, list of str, or array-like, optional
            Column or index level name(s) in the caller to join on the index
            in `other`, otherwise joins index-on-index. If multiple
            values given, the `other` DataFrame must have a MultiIndex. Can
            pass an array as the join key if it is not already contained in
            the calling DataFrame. Like an Excel VLOOKUP operation.
        how : {'left', 'right', 'outer', 'inner'}, default 'left'
            How to handle the operation of the two objects.

            * left: use calling frame's index (or column if on is specified)
            * right: use `other`'s index.
            * outer: form union of calling frame's index (or column if on is
              specified) with `other`'s index, and sort it lexicographically.
            * inner: form intersection of calling frame's index (or column if
              on is specified) with `other`'s index, preserving the order
              of the calling's one.
        lsuffix : str, default ''
            Suffix to use from left frame's overlapping columns.
        rsuffix : str, default ''
            Suffix to use from right frame's overlapping columns.
        sort : bool, default False
            Order result DataFrame lexicographically by the join key. If False,
            the order of the join key depends on the join type (how keyword).

        Returns
        -------
        DataFrame
            A DataFrame containing columns from both the caller and `other`.

        See Also
        --------
        DataFrame.merge : For column(s)-on-column(s) operations.

        Notes
        -----
        Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
        passing a list of `DataFrame` objects.

        Support for specifying index levels as the `on` parameter was added
        in version 0.23.0.

        Examples
        --------
        >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
        ...                    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})

        >>> df
          key   A
        0  K0  A0
        1  K1  A1
        2  K2  A2
        3  K3  A3
        4  K4  A4
        5  K5  A5

        >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
        ...                       'B': ['B0', 'B1', 'B2']})

        >>> other
          key   B
        0  K0  B0
        1  K1  B1
        2  K2  B2

        Join DataFrames using their indexes.

        >>> df.join(other, lsuffix='_caller', rsuffix='_other')
          key_caller   A key_other    B
        0         K0  A0        K0   B0
        1         K1  A1        K1   B1
        2         K2  A2        K2   B2
        3         K3  A3       NaN  NaN
        4         K4  A4       NaN  NaN
        5         K5  A5       NaN  NaN

        If we want to join using the key columns, we need to set key to be
        the index in both `df` and `other`. The joined DataFrame will have
        key as its index.

        >>> df.set_index('key').join(other.set_index('key'))
              A    B
        key
        K0   A0   B0
        K1   A1   B1
        K2   A2   B2
        K3   A3  NaN
        K4   A4  NaN
        K5   A5  NaN

        Another option to join using the key columns is to use the `on`
        parameter. DataFrame.join always uses `other`'s index but we can use
        any column in `df`. This method preserves the original DataFrame's
        index in the result.

        >>> df.join(other.set_index('key'), on='key')
          key   A    B
        0  K0  A0   B0
        1  K1  A1   B1
        2  K2  A2   B2
        3  K3  A3  NaN
        4  K4  A4  NaN
        5  K5  A5  NaN
        """
        return self._join_compat(
            other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort
        )

    def _join_compat(
        self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
    ):
        from pandas.core.reshape.concat import concat
        from pandas.core.reshape.merge import merge

        if isinstance(other, Series):
            if other.name is None:
                raise ValueError("Other Series must have a name")
            other = DataFrame({other.name: other})

        if isinstance(other, DataFrame):
            if how == "cross":
                return merge(
                    self,
                    other,
                    how=how,
                    on=on,
                    suffixes=(lsuffix, rsuffix),
                    sort=sort,
                )
            return merge(
                self,
                other,
                left_on=on,
                how=how,
                left_index=on is None,
                right_index=True,
                suffixes=(lsuffix, rsuffix),
                sort=sort,
            )
        else:
            if on is not None:
                raise ValueError(
                    "Joining multiple DataFrames only supported for joining on index"
                )

            frames = [self] + list(other)

            can_concat = all(df.index.is_unique for df in frames)

            # join indexes only using concat
            if can_concat:
                if how == "left":
                    res = concat(
                        frames, axis=1, join="outer", verify_integrity=True, sort=sort
                    )
                    return res.reindex(self.index, copy=False)
                else:
                    return concat(
                        frames, axis=1, join=how, verify_integrity=True, sort=sort
                    )

            joined = frames[0]

            for frame in frames[1:]:
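                # indexes are not all unique here, so concat with
                # verify_integrity=True is not an option; merge pairwise on index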
                joined = merge(
                    joined, frame, how=how, left_index=True, right_index=True
                )

            return joined

    @Substitution("")
    @Appender(_merge_doc, indents=2)
    def merge(
        self,
        right,
        how="inner",
        on=None,
        left_on=None,
        right_on=None,
        left_index=False,
        right_index=False,
        sort=False,
        suffixes=("_x", "_y"),
        copy=True,
        indicator=False,
        validate=None,
    ) -> DataFrame:
        from pandas.core.reshape.merge import merge

        return merge(
            self,
            right,
            how=how,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            copy=copy,
            indicator=indicator,
            validate=validate,
        )

    def round(self, decimals=0, *args, **kwargs) -> DataFrame:
        """
        Round a DataFrame to a variable number of decimal places.

        Parameters
        ----------
        decimals : int, dict, Series
            Number of decimal places to round each column to. If an int is
            given, round each column to the same number of places.
            Otherwise dict and Series round to variable numbers of places.
            Column names should be in the keys if `decimals` is a
            dict-like, or in the index if `decimals` is a Series. Any
            columns not included in `decimals` will be left as is. Elements
            of `decimals` which are not columns of the input will be
            ignored.
        *args
            Additional keywords have no effect but might be accepted for
            compatibility with numpy.
        **kwargs
            Additional keywords have no effect but might be accepted for
            compatibility with numpy.

        Returns
        -------
        DataFrame
            A DataFrame with the affected columns rounded to the specified
            number of decimal places.

        See Also
        --------
        numpy.around : Round a numpy array to the given number of decimals.
        Series.round : Round a Series to the given number of decimals.

        Examples
        --------
        >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
        ...                   columns=['dogs', 'cats'])
        >>> df
           dogs  cats
        0  0.21  0.32
        1  0.01  0.67
        2  0.66  0.03
        3  0.21  0.18

        By providing an integer each column is rounded to the same number
        of decimal places

        >>> df.round(1)
           dogs  cats
        0   0.2   0.3
        1   0.0   0.7
        2   0.7   0.0
        3   0.2   0.2

        With a dict, the number of places for specific columns can be
        specified with the column names as key and the number of decimal
        places as value

        >>> df.round({'dogs': 1, 'cats': 0})
           dogs  cats
        0   0.2   0.0
        1   0.0   1.0
        2   0.7   0.0
        3   0.2   0.0

        Using a Series, the number of places for specific columns can be
        specified with the column names as index and the number of
        decimal places as value

        >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
        >>> df.round(decimals)
           dogs  cats
        0   0.2   0.0
        1   0.0   1.0
        2   0.7   0.0
        3   0.2   0.0
        """
        from pandas.core.reshape.concat import concat

        def _dict_round(df, decimals):
            for col, vals in df.items():
                try:
                    yield _series_round(vals, decimals[col])
                except KeyError:
                    yield vals

        def _series_round(s, decimals):
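            # round only numeric columns; other dtypes pass through unchanged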
            if is_integer_dtype(s) or is_float_dtype(s):
                return s.round(decimals)
            return s

        nv.validate_round(args, kwargs)

        if isinstance(decimals, (dict, Series)):
            if isinstance(decimals, Series):
                if not decimals.index.is_unique:
                    raise ValueError("Index of decimals must be unique")
            new_cols = list(_dict_round(self, decimals))
        elif is_integer(decimals):
            # Dispatch to Series.round
            new_cols = [_series_round(v, decimals) for _, v in self.items()]
        else:
            raise TypeError("decimals must be an integer, a dict-like or a Series")

        if len(new_cols) > 0:
            return self._constructor(
                concat(new_cols, axis=1), index=self.index, columns=self.columns
            )
        else:
            return self

    # ----------------------------------------------------------------------
    # Statistical methods, etc.

    def corr(self, method="pearson", min_periods=1) -> DataFrame:
        """
        Compute pairwise correlation of columns, excluding NA/null values.

        Parameters
        ----------
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
              and returning a float. Note that the returned matrix from corr
              will have 1 along the diagonals and will be symmetric
              regardless of the callable's behavior.

              .. versionadded:: 0.24.0

        min_periods : int, optional
            Minimum number of observations required per pair of columns
            to have a valid result. Currently only available for Pearson
            and Spearman correlation.

        Returns
        -------
        DataFrame
            Correlation matrix.

        See Also
        --------
        DataFrame.corrwith : Compute pairwise correlation with another
            DataFrame or Series.
        Series.corr : Compute the correlation between two Series.

        Examples
        --------
        >>> def histogram_intersection(a, b):
        ...     v = np.minimum(a, b).sum().round(decimals=1)
        ...     return v
        >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr(method=histogram_intersection)
              dogs  cats
        dogs   1.0   0.3
        cats   0.3   1.0
        """
        numeric_df = self._get_numeric_data()
        cols = numeric_df.columns
        idx = cols.copy()
        mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)

        if method == "pearson":
            correl = libalgos.nancorr(mat, minp=min_periods)
        elif method == "spearman":
            correl = libalgos.nancorr_spearman(mat, minp=min_periods)
        elif method == "kendall" or callable(method):
            if min_periods is None:
                min_periods = 1
            mat = mat.T
            corrf = nanops.get_corr_func(method)
            K = len(cols)
            correl = np.empty((K, K), dtype=float)
            mask = np.isfinite(mat)
            for i, ac in enumerate(mat):
                for j, bc in enumerate(mat):
                    if i > j:
                        continue

                    valid = mask[i] & mask[j]
                    if valid.sum() < min_periods:
                        c = np.nan
                    elif i == j:
                        c = 1.0
                    elif not valid.all():
                        c = corrf(ac[valid], bc[valid])
                    else:
                        c = corrf(ac, bc)
                    correl[i, j] = c
                    correl[j, i] = c
        else:
            raise ValueError(
                "method must be either 'pearson', "
                "'spearman', 'kendall', or a callable, "
                f"'{method}' was supplied"
            )

        return self._constructor(correl, index=idx, columns=cols)

    def cov(
        self, min_periods: Optional[int] = None, ddof: Optional[int] = 1
    ) -> DataFrame:
        """
        Compute pairwise covariance of columns, excluding NA/null values.

        Compute the pairwise covariance among the series of a DataFrame.
        The returned data frame is the `covariance matrix
        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
        of the DataFrame.

        Both NA and null values are automatically excluded from the
        calculation. (See the note below about bias from missing values.)
        A threshold can be set for the minimum number of
        observations for each value created. Comparisons with observations
        below this threshold will be returned as ``NaN``.

        This method is generally used for the analysis of time series data to
        understand the relationship between different measures
        across time.

        Parameters
        ----------
        min_periods : int, optional
            Minimum number of observations required per pair of columns
            to have a valid result.

        ddof : int, default 1
            Delta degrees of freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.

            .. versionadded:: 1.1.0

        Returns
        -------
        DataFrame
            The covariance matrix of the series of the DataFrame.

        See Also
        --------
        Series.cov : Compute covariance with another Series.
        core.window.ExponentialMovingWindow.cov: Exponential weighted sample covariance.
        core.window.Expanding.cov : Expanding sample covariance.
        core.window.Rolling.cov : Rolling sample covariance.

        Notes
        -----
        Returns the covariance matrix of the DataFrame's time series.
        The covariance is normalized by N-ddof.

        For DataFrames that have Series that are missing data (assuming that
        data is `missing at random
        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
        the returned covariance matrix will be an unbiased estimate
        of the variance and covariance between the member Series.

        However, for many applications this estimate may not be acceptable
        because the estimate covariance matrix is not guaranteed to be positive
        semi-definite. This could lead to estimate correlations having
        absolute values which are greater than one, and/or a non-invertible
        covariance matrix. See `Estimation of covariance matrices
        <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
        matrices>`__ for more details.

        Examples
        --------
        >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.cov()
                  dogs      cats
        dogs  0.666667 -1.000000
        cats -1.000000  1.666667

        >>> np.random.seed(42)
        >>> df = pd.DataFrame(np.random.randn(1000, 5),
        ...                   columns=['a', 'b', 'c', 'd', 'e'])
        >>> df.cov()
                  a         b         c         d         e
        a  0.998438 -0.020161  0.059277 -0.008943  0.014144
        b -0.020161  1.059352 -0.008543 -0.024738  0.009826
        c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
        d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
        e  0.014144  0.009826 -0.000271 -0.013692  0.977795

        **Minimum number of periods**

        This method also supports an optional ``min_periods`` keyword
        that specifies the required minimum number of non-NA observations for
        each column pair in order to have a valid result:

        >>> np.random.seed(42)
        >>> df = pd.DataFrame(np.random.randn(20, 3),
        ...                   columns=['a', 'b', 'c'])
        >>> df.loc[df.index[:5], 'a'] = np.nan
        >>> df.loc[df.index[5:10], 'b'] = np.nan
        >>> df.cov(min_periods=12)
                  a         b         c
        a  0.316741       NaN -0.150812
        b       NaN  1.248003  0.191417
        c -0.150812  0.191417  0.895202
        """
        numeric_df = self._get_numeric_data()
        cols = numeric_df.columns
        idx = cols.copy()
        mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False)

        if notna(mat).all():
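            # no missing values, so np.cov on the full matrix is the fast path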
            if min_periods is not None and min_periods > len(mat):
                base_cov = np.empty((mat.shape[1], mat.shape[1]))
                base_cov.fill(np.nan)
            else:
                base_cov = np.cov(mat.T, ddof=ddof)
            base_cov = base_cov.reshape((len(cols), len(cols)))
        else:
            base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)

        return self._constructor(base_cov, index=idx, columns=cols)

    def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series:
        """
        Compute pairwise correlation.

        Pairwise correlation is computed between rows or columns of
        DataFrame with rows or columns of Series or DataFrame. DataFrames
        are first aligned along both axes before computing the
        correlations.

        Parameters
        ----------
        other : DataFrame, Series
            Object with which to compute correlations.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for
            row-wise.
        drop : bool, default False
            Drop missing indices from result.
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
              and returning a float.

            .. versionadded:: 0.24.0

        Returns
        -------
        Series
            Pairwise correlations.

        See Also
        --------
        DataFrame.corr : Compute pairwise correlation of columns.
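
        Examples
        --------
        A small synthetic illustration: every column in both frames below is
        a linear sequence, so each pairwise Pearson correlation is exactly 1.0.

        >>> index = ["a", "b", "c", "d", "e"]
        >>> columns = ["one", "two", "three", "four"]
        >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4),
        ...                    index=index, columns=columns)
        >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4),
        ...                    index=index[:4], columns=columns)
        >>> df1.corrwith(df2)
        one      1.0
        two      1.0
        three    1.0
        four     1.0
        dtype: float64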
        """
        axis = self._get_axis_number(axis)
        this = self._get_numeric_data()

        if isinstance(other, Series):
            return this.apply(lambda x: other.corr(x, method=method), axis=axis)

        other = other._get_numeric_data()
        left, right = this.align(other, join="inner", copy=False)

        if axis == 1:
            left = left.T
            right = right.T

        if method == "pearson":
            # mask missing values
            left = left + right * 0
            right = right + left * 0

            # demeaned data
            ldem = left - left.mean()
            rdem = right - right.mean()

            num = (ldem * rdem).sum()
            dom = (left.count() - 1) * left.std() * right.std()

            correl = num / dom

        elif method in ["kendall", "spearman"] or callable(method):

            def c(x):
                return nanops.nancorr(x[0], x[1], method=method)

            correl = self._constructor_sliced(
                map(c, zip(left.values.T, right.values.T)), index=left.columns
            )

        else:
            raise ValueError(
                f"Invalid method {method} was passed, "
                "valid methods are: 'pearson', 'kendall', "
                "'spearman', or callable"
            )

        if not drop:
            # Find non-matching labels along the given axis
            # and append missing correlations (GH 22375)
            raxis = 1 if axis == 0 else 0
            result_index = this._get_axis(raxis).union(other._get_axis(raxis))
            idx_diff = result_index.difference(correl.index)

            if len(idx_diff) > 0:
                correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff))

        return correl

    # ----------------------------------------------------------------------
    # ndarray-like stats methods

    def count(self, axis=0, level=None, numeric_only=False):
        """
        Count non-NA cells for each column or row.

        The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
        on `pandas.options.mode.use_inf_as_na`) are considered NA.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            If 0 or 'index' counts are generated for each column.
            If 1 or 'columns' counts are generated for each row.
        level : int or str, optional
            If the axis is a `MultiIndex` (hierarchical), count along a
            particular `level`, collapsing into a `DataFrame`.
            A `str` specifies the level name.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

        Returns
        -------
        Series or DataFrame
            For each column/row the number of non-NA/null entries.
            If `level` is specified returns a `DataFrame`.

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.value_counts: Count unique combinations of columns.
        DataFrame.shape: Number of DataFrame rows and columns (including NA
            elements).
        DataFrame.isna: Boolean same-sized DataFrame showing places of NA
            elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = pd.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]})
        >>> df
           Person   Age  Single
        0    John  24.0   False
        1    Myla   NaN    True
        2   Lewis  21.0    True
        3    John  33.0    True
        4    Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64

        Counts for each **row**:

        >>> df.count(axis='columns')
        0    3
        1    2
        2    3
        3    3
        4    3
        dtype: int64

        Counts for one level of a `MultiIndex`:

        >>> df.set_index(["Person", "Single"]).count(level="Person")
                Age
        Person
        John      2
        Lewis     1
        Myla      1
        """
        axis = self._get_axis_number(axis)
        if level is not None:
            return self._count_level(level, axis=axis, numeric_only=numeric_only)

        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        # GH #423
        if len(frame._get_axis(axis)) == 0:
            result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
        else:
            if frame._is_mixed_type or frame._mgr.any_extension_types:
                # the or any_extension_types is really only hit for single-
                # column frames with an extension array
                result = notna(frame).sum(axis=axis)
            else:
                # GH13407
                series_counts = notna(frame).sum(axis=axis)
                counts = series_counts.values
                result = self._constructor_sliced(
                    counts, index=frame._get_agg_axis(axis)
                )

        return result.astype("int64")

    def _count_level(self, level, axis=0, numeric_only=False):
        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        count_axis = frame._get_axis(axis)
        agg_axis = frame._get_agg_axis(axis)

        if not isinstance(count_axis, MultiIndex):
            raise TypeError(
                f"Can only count levels on hierarchical {self._get_axis_name(axis)}."
            )

        # Mask NaNs: Mask rows or columns where the index level is NaN, and all
        # values in the DataFrame that are NaN
        if frame._is_mixed_type:
            # Since we have mixed types, calling notna(frame.values) might
            # upcast everything to object
            values_mask = notna(frame).values
        else:
            # But use the speedup when we have homogeneous dtypes
            values_mask = notna(frame.values)

        index_mask = notna(count_axis.get_level_values(level=level))
        if axis == 1:
            mask = index_mask & values_mask
        else:
            mask = index_mask.reshape(-1, 1) & values_mask

        if isinstance(level, str):
            level = count_axis._get_level_number(level)

        level_name = count_axis._names[level]
        level_index = count_axis.levels[level]._shallow_copy(name=level_name)
        level_codes = ensure_int64(count_axis.codes[level])
        counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis)

        if axis == 1:
            result = self._constructor(counts, index=agg_axis, columns=level_index)
        else:
            result = self._constructor(counts, index=level_index, columns=agg_axis)

        return result

    def _reduce(
        self,
        op,
        name: str,
        *,
        axis=0,
        skipna=True,
        numeric_only=None,
        filter_type=None,
        **kwds,
    ):

        min_count = kwds.get("min_count", 0)
        assert filter_type is None or filter_type == "bool", filter_type
        out_dtype = "bool" if filter_type == "bool" else None

        own_dtypes = [arr.dtype for arr in self._iter_column_arrays()]

        dtype_is_dt = np.array(
            [is_datetime64_any_dtype(dtype) for dtype in own_dtypes],
            dtype=bool,
        )
        if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any():
            warnings.warn(
                "DataFrame.mean and DataFrame.median with numeric_only=None "
                "will include datetime64 and datetime64tz columns in a "
                "future version.",
                FutureWarning,
                stacklevel=5,
            )
            cols = self.columns[~dtype_is_dt]
            self = self[cols]

        # TODO: Make other agg func handle axis=None properly GH#21597
        axis = self._get_axis_number(axis)
        labels = self._get_agg_axis(axis)
        assert axis in [0, 1]

        def func(values: np.ndarray):
            # We only use this in the case that operates on self.values
            return op(values, axis=axis, skipna=skipna, **kwds)

        def blk_func(values):
            if isinstance(values, ExtensionArray):
                return values._reduce(name, skipna=skipna, **kwds)
            else:
                return op(values, axis=1, skipna=skipna, **kwds)

        def _get_data() -> DataFrame:
            if filter_type is None:
                data = self._get_numeric_data()
            else:
                # GH#25101, GH#24434
                assert filter_type == "bool"
                data = self._get_bool_data()
            return data

        if (numeric_only is not None or axis == 0) and min_count == 0:
            # For numeric_only non-None and axis non-None, we know
            # which blocks to use and no try/except is needed.
            # For numeric_only=None, only the unambiguous case (axis==0 with
            # no object dtypes) can be handled with BlockManager.reduce.
            # Case with EAs see GH#35881
            df = self
            if numeric_only is True:
                df = _get_data()
            if axis == 1:
                df = df.T
                axis = 0

            ignore_failures = numeric_only is None
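            # numeric_only=None keeps the legacy behavior: columns on which
            # the reduction raises are silently dropped from the result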

            # After possibly _get_data and transposing, we are now in the
            # simple case where we can use BlockManager.reduce
            res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures)
            out = df._constructor(res).iloc[0]
            if out_dtype is not None:
                out = out.astype(out_dtype)
            if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
                # Even if we are object dtype, follow numpy and return
                # float64, see test_apply_funcs_over_empty
                out = out.astype(np.float64)
            return out

        assert numeric_only is None

        data = self
        values = data.values

        try:
            result = func(values)

        except TypeError:
            # e.g. in nanops trying to convert strs to float

            data = _get_data()
            labels = data._get_agg_axis(axis)

            values = data.values
            with np.errstate(all="ignore"):
                result = func(values)

        if hasattr(result, "dtype"):
            if filter_type == "bool" and notna(result).all():
                result = result.astype(np.bool_)
            elif filter_type is None and is_object_dtype(result.dtype):
                try:
                    result = result.astype(np.float64)
                except (ValueError, TypeError):
                    # try to coerce to the original dtypes item by item if we can
                    pass

        result = self._constructor_sliced(result, index=labels)
        return result

    def nunique(self, axis=0, dropna=True) -> Series:
        """
        Count distinct observations over requested axis.

        Return Series with number of distinct observations. Can ignore NaN
        values.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise.
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        Series

        See Also
        --------
        Series.nunique: Method nunique for Series.
        DataFrame.count: Count non-NA cells for each column or row.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
        >>> df.nunique()
        A    3
        B    1
        dtype: int64

        >>> df.nunique(axis=1)
        0    1
        1    2
        2    2
        dtype: int64
        """
        return self.apply(Series.nunique, axis=axis, dropna=dropna)

    def idxmin(self, axis=0, skipna=True) -> Series:
        """
        Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.

        Returns
        -------
        Series
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin : Return index of the minimum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51         37.20
        Wheat Products       103.11         19.66
        Beef                  55.48       1712.00

        By default, it returns the index for the minimum value in each column.

        >>> df.idxmin()
        consumption                Pork
        co2_emissions    Wheat Products
        dtype: object

        To return the index for the minimum value in each row, use ``axis="columns"``.

        >>> df.idxmin(axis="columns")
        Pork                consumption
        Wheat Products    co2_emissions
        Beef                consumption
        dtype: object
        """
        axis = self._get_axis_number(axis)

        res = self._reduce(
            nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
        )
        indices = res._values

        # indices will always be np.ndarray since axis is not None and
        # values is a 2d array for DataFrame
        # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
        assert isinstance(indices, np.ndarray)  # for mypy

        index = self._get_axis(axis)
        result = [index[i] if i >= 0 else np.nan for i in indices]
        return self._constructor_sliced(result, index=self._get_agg_axis(axis))

    def idxmax(self, axis=0, skipna=True) -> Series:
        """
        Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.

        Returns
        -------
        Series
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax : Return index of the maximum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51         37.20
        Wheat Products       103.11         19.66
        Beef                  55.48       1712.00

        By default, it returns the index for the maximum value in each column.

        >>> df.idxmax()
        consumption     Wheat Products
        co2_emissions             Beef
        dtype: object

        To return the index for the maximum value in each row, use ``axis="columns"``.

        >>> df.idxmax(axis="columns")
        Pork              co2_emissions
        Wheat Products      consumption
        Beef              co2_emissions
        dtype: object
        """
        axis = self._get_axis_number(axis)

        res = self._reduce(
            nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
        )
        indices = res._values

        # indices will always be np.ndarray since axis is not None and
        # values is a 2d array for DataFrame
        # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
        assert isinstance(indices, np.ndarray)  # for mypy

        index = self._get_axis(axis)
        result = [index[i] if i >= 0 else np.nan for i in indices]
        return self._constructor_sliced(result, index=self._get_agg_axis(axis))

    def _get_agg_axis(self, axis_num: int) -> Index:
        """
        Let's be explicit about this.
        """
        if axis_num == 0:
            return self.columns
        elif axis_num == 1:
            return self.index
        else:
            raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")

    def mode(self, axis=0, numeric_only=False, dropna=True) -> DataFrame:
        """
        Get the mode(s) of each element along the selected axis.

        The mode of a set of values is the value that appears most often.
        It can be multiple values.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to iterate over while searching for the mode:

            * 0 or 'index' : get mode of each column.
            * 1 or 'columns' : get mode of each row.

        numeric_only : bool, default False
            If True, only apply to numeric columns.
        dropna : bool, default True
            Don't consider counts of NaN/NaT.

            .. versionadded:: 0.24.0

        Returns
        -------
        DataFrame
            The modes of each column or row.

        See Also
        --------
        Series.mode : Return the highest frequency value in a Series.
        Series.value_counts : Return the counts of values in a Series.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 2, 2),
        ...                    ('mammal', 4, np.nan),
        ...                    ('arthropod', 8, 0),
        ...                    ('bird', 2, np.nan)],
        ...                   index=('falcon', 'horse', 'spider', 'ostrich'),
        ...                   columns=('species', 'legs', 'wings'))
        >>> df
                   species  legs  wings
        falcon        bird     2    2.0
        horse       mammal     4    NaN
        spider   arthropod     8    0.0
        ostrich       bird     2    NaN

        By default, missing values are not considered, and the mode of wings
        are both 0 and 2. Because the resulting DataFrame has two rows,
        the second row of ``species`` and ``legs`` contains ``NaN``.

        >>> df.mode()
          species  legs  wings
        0    bird   2.0    0.0
        1     NaN   NaN    2.0

        Setting ``dropna=False``, ``NaN`` values are considered and they can be
        the mode (like for wings).

        >>> df.mode(dropna=False)
          species  legs  wings
        0    bird     2    NaN

        Setting ``numeric_only=True``, only the mode of numeric columns is
        computed, and columns of other types are ignored.

        >>> df.mode(numeric_only=True)
           legs  wings
        0   2.0    0.0
        1   NaN    2.0

        To compute the mode over columns and not rows, use the axis parameter:

        >>> df.mode(axis='columns', numeric_only=True)
                   0    1
        falcon   2.0  NaN
        horse    4.0  NaN
        spider   0.0  8.0
        ostrich  2.0  NaN
        """
        data = self if not numeric_only else self._get_numeric_data()

        def f(s):
            return s.mode(dropna=dropna)

        return data.apply(f, axis=axis)

    def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
        """
        Return values at the given quantile over requested axis.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value between 0 <= q <= 1, the quantile(s) to compute.
        axis : {0, 1, 'index', 'columns'}, default 0
            Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        numeric_only : bool, default True
            If False, the quantile of datetime and timedelta data will be
            computed as well.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

            * linear: `i + (j - i) * fraction`, where `fraction` is the
              fractional part of the index surrounded by `i` and `j`.
            * lower: `i`.
            * higher: `j`.
            * nearest: `i` or `j` whichever is nearest.
            * midpoint: (`i` + `j`) / 2.

        Returns
        -------
        Series or DataFrame

            If ``q`` is an array, a DataFrame will be returned where the
              index is ``q``, the columns are the columns of self, and the
              values are the quantiles.
            If ``q`` is a float, a Series will be returned where the
              index is the columns of self and the values are the quantiles.

        See Also
        --------
        core.window.Rolling.quantile: Rolling quantile.
        numpy.percentile: Numpy function to compute the percentile.

        Examples
        --------
        >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
        ...                   columns=['a', 'b'])
        >>> df.quantile(.1)
        a    1.3
        b    3.7
        Name: 0.1, dtype: float64
        >>> df.quantile([.1, .5])
               a     b
        0.1  1.3   3.7
        0.5  2.5  55.0

        Specifying `numeric_only=False` will also compute the quantile of
        datetime and timedelta data.

        >>> df = pd.DataFrame({'A': [1, 2],
        ...                    'B': [pd.Timestamp('2010'),
        ...                          pd.Timestamp('2011')],
        ...                    'C': [pd.Timedelta('1 days'),
        ...                          pd.Timedelta('2 days')]})
        >>> df.quantile(0.5, numeric_only=False)
        A                    1.5
        B    2010-07-02 12:00:00
        C        1 days 12:00:00
        Name: 0.5, dtype: object
        """
        validate_percentile(q)

        data = self._get_numeric_data() if numeric_only else self
        axis = self._get_axis_number(axis)
        is_transposed = axis == 1

        if is_transposed:
            data = data.T

        if len(data.columns) == 0:
            # GH#23925 _get_numeric_data may have dropped all columns
            cols = Index([], name=self.columns.name)
            if is_list_like(q):
                return self._constructor([], index=q, columns=cols)
            return self._constructor_sliced([], index=cols, name=q, dtype=np.float64)

        result = data._mgr.quantile(
            qs=q, axis=1, interpolation=interpolation, transposed=is_transposed
        )

        if result.ndim == 2:
            result = self._constructor(result)
        else:
            result = self._constructor_sliced(result, name=q)

        if is_transposed:
            result = result.T

        return result

    def to_timestamp(
        self, freq=None, how: str = "start", axis: Axis = 0, copy: bool = True
    ) -> DataFrame:
        """
        Cast to DatetimeIndex of timestamps, at *beginning* of period.

        Parameters
        ----------
        freq : str, default frequency of PeriodIndex
            Desired frequency.
        how : {'s', 'e', 'start', 'end'}
            Convention for converting period to timestamp; start of period
            vs. end.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default).
        copy : bool, default True
            If False then underlying input data is not copied.

        Returns
        -------
        DataFrame with DatetimeIndex
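
        Examples
        --------
        A minimal sketch using yearly periods; by default the timestamps
        land at the start of each period:

        >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
        >>> df = pd.DataFrame({'col': [1, 2]}, index=idx)
        >>> df.to_timestamp().index
        DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq='AS-JAN')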
        """
        new_obj = self.copy(deep=copy)

        axis_name = self._get_axis_name(axis)
        old_ax = getattr(self, axis_name)
        if not isinstance(old_ax, PeriodIndex):
            raise TypeError(f"unsupported Type {type(old_ax).__name__}")

        new_ax = old_ax.to_timestamp(freq=freq, how=how)

        setattr(new_obj, axis_name, new_ax)
        return new_obj

    def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> DataFrame:
        """
        Convert DataFrame from DatetimeIndex to PeriodIndex.

        Convert DataFrame from DatetimeIndex to PeriodIndex with desired
        frequency (inferred from index if not passed).

        Parameters
        ----------
        freq : str, optional
            Frequency of the PeriodIndex.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default).
        copy : bool, default True
            If False then underlying input data is not copied.

        Returns
        -------
        DataFrame with PeriodIndex
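
        Examples
        --------
        A small sketch; the monthly frequency is inferred from the month-end
        index below, so passing ``freq`` explicitly is optional:

        >>> idx = pd.date_range('2001-01-31', periods=3, freq='M')
        >>> df = pd.DataFrame({'y': [1, 2, 3]}, index=idx)
        >>> df.to_period().index
        PeriodIndex(['2001-01', '2001-02', '2001-03'], dtype='period[M]', freq='M')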
        """
        new_obj = self.copy(deep=copy)

        axis_name = self._get_axis_name(axis)
        old_ax = getattr(self, axis_name)
        if not isinstance(old_ax, DatetimeIndex):
            raise TypeError(f"unsupported Type {type(old_ax).__name__}")

        new_ax = old_ax.to_period(freq=freq)

        setattr(new_obj, axis_name, new_ax)
        return new_obj

    def isin(self, values) -> DataFrame:
        """
        Whether each element in the DataFrame is contained in values.

        Parameters
        ----------
        values : iterable, Series, DataFrame or dict
            The result will only be true at a location if all the
            labels match. If `values` is a Series, that's the index. If
            `values` is a dict, the keys must be the column names,
            which must match. If `values` is a DataFrame,
            then both the index and column labels must match.

        Returns
        -------
        DataFrame
            DataFrame of booleans showing whether each element in the DataFrame
            is contained in values.

        See Also
        --------
        DataFrame.eq: Equality test for DataFrame.
        Series.isin: Equivalent method on Series.
        Series.str.contains: Test if pattern or regex is contained within a
            string of a Series or Index.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
        ...                   index=['falcon', 'dog'])
        >>> df
                num_legs  num_wings
        falcon         2          2
        dog            4          0

        When ``values`` is a list check whether every value in the DataFrame
        is present in the list (which animals have 0 or 2 legs or wings)

        >>> df.isin([0, 2])
                num_legs  num_wings
        falcon      True       True
        dog        False       True

        When ``values`` is a dict, we can pass values to check for each
        column separately:

        >>> df.isin({'num_wings': [0, 3]})
                num_legs  num_wings
        falcon     False      False
        dog        False       True

        When ``values`` is a Series or DataFrame the index and column must
        match. Note that 'falcon' does not match based on the number of legs
        in df2.

        >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]},
        ...                      index=['spider', 'falcon'])
        >>> df.isin(other)
                num_legs  num_wings
        falcon      True       True
        dog        False      False
        """
        if isinstance(values, dict):
            from pandas.core.reshape.concat import concat

            values = collections.defaultdict(list, values)
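            # columns missing from `values` default to an empty list, so
            # every element in those columns compares False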
            return concat(
                (
                    self.iloc[:, [i]].isin(values[col])
                    for i, col in enumerate(self.columns)
                ),
                axis=1,
            )
        elif isinstance(values, Series):
            if not values.index.is_unique:
                raise ValueError("cannot compute isin with a duplicate axis.")
            return self.eq(values.reindex_like(self), axis="index")
        elif isinstance(values, DataFrame):
            if not (values.columns.is_unique and values.index.is_unique):
                raise ValueError("cannot compute isin with a duplicate axis.")
            return self.eq(values.reindex_like(self))
        else:
            if not is_list_like(values):
                raise TypeError(
                    "only list-like or dict-like objects are allowed "
                    "to be passed to DataFrame.isin(), "
                    f"you passed a '{type(values).__name__}'"
                )
            return self._constructor(
                algorithms.isin(self.values.ravel(), values).reshape(self.shape),
                self.index,
                self.columns,
            )

    # ----------------------------------------------------------------------
    # Add index and columns
    _AXIS_ORDERS = ["index", "columns"]
    _AXIS_TO_AXIS_NUMBER: Dict[Axis, int] = {
        **NDFrame._AXIS_TO_AXIS_NUMBER,
        1: 1,
        "columns": 1,
    }
    _AXIS_REVERSED = True
    _AXIS_LEN = len(_AXIS_ORDERS)
    _info_axis_number = 1
    _info_axis_name = "columns"

    index: Index = properties.AxisProperty(
        axis=1, doc="The index (row labels) of the DataFrame."
    )
    columns: Index = properties.AxisProperty(
        axis=0, doc="The column labels of the DataFrame."
    )

    @property
    def _AXIS_NUMBERS(self) -> Dict[str, int]:
        """.. deprecated:: 1.1.0"""
        super()._AXIS_NUMBERS
        return {"index": 0, "columns": 1}

    @property
    def _AXIS_NAMES(self) -> Dict[int, str]:
        """.. deprecated:: 1.1.0"""
        super()._AXIS_NAMES
        return {0: "index", 1: "columns"}

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
    hist = pandas.plotting.hist_frame
    boxplot = pandas.plotting.boxplot_frame
    sparse = CachedAccessor("sparse", SparseFrameAccessor)


DataFrame._add_numeric_operations()

ops.add_flex_arithmetic_methods(DataFrame)


def _from_nested_dict(data) -> collections.defaultdict:
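    # swap the nesting order: {row: {column: value}} -> {column: {row: value}}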
    new_data: collections.defaultdict = collections.defaultdict(dict)
    for index, s in data.items():
        for col, v in s.items():
            new_data[col][index] = v
    return new_data