from __future__ import annotations

from datetime import datetime
import functools
from itertools import zip_longest
import operator
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Hashable,
    Iterable,
    Literal,
    NoReturn,
    Sequence,
    TypeVar,
    cast,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    NaT,
    algos as libalgos,
    index as libindex,
    lib,
)
from pandas._libs.internals import BlockValuesRefs
import pandas._libs.join as libjoin
from pandas._libs.lib import (
    is_datetime_array,
    no_default,
)
from pandas._libs.missing import is_float_nan
from pandas._libs.tslibs import (
    IncompatibleFrequency,
    OutOfBoundsDatetime,
    Timestamp,
    tz_compare,
)
from pandas._typing import (
    AnyAll,
    ArrayLike,
    Axes,
    Axis,
    DropKeep,
    DtypeObj,
    F,
    IgnoreRaise,
    IndexLabel,
    JoinHow,
    Level,
    Shape,
    npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
    DuplicateLabelError,
    InvalidIndexError,
)
from pandas.util._decorators import (
    Appender,
    cache_readonly,
    doc,
)
from pandas.util._exceptions import (
    find_stack_level,
    rewrite_exception,
)

from pandas.core.dtypes.astype import (
    astype_array,
    astype_is_view,
)
from pandas.core.dtypes.cast import (
    LossySetitemError,
    can_hold_element,
    common_dtype_categorical_compat,
    find_result_type,
    infer_dtype_from,
    maybe_cast_pointwise_result,
    np_can_hold_element,
)
from pandas.core.dtypes.common import (
    ensure_int64,
    ensure_object,
    ensure_platform_int,
    is_any_real_numeric_dtype,
    is_bool_dtype,
    is_categorical_dtype,
    is_dtype_equal,
    is_ea_or_datetimelike_dtype,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_interval_dtype,
    is_iterator,
    is_list_like,
    is_numeric_dtype,
    is_object_dtype,
    is_scalar,
    is_signed_integer_dtype,
    is_string_dtype,
    needs_i8_conversion,
    pandas_dtype,
    validate_all_hashable,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    DatetimeTZDtype,
    ExtensionDtype,
    IntervalDtype,
    PeriodDtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCDatetimeIndex,
    ABCMultiIndex,
    ABCPeriodIndex,
    ABCSeries,
    ABCTimedeltaIndex,
)
from pandas.core.dtypes.inference import is_dict_like
from pandas.core.dtypes.missing import (
    array_equivalent,
    is_valid_na_for_dtype,
    isna,
)

from pandas.core import (
    arraylike,
    ops,
)
from pandas.core.accessor import CachedAccessor
import pandas.core.algorithms as algos
from pandas.core.array_algos.putmask import (
    setitem_datetimelike_compat,
    validate_putmask,
)
from pandas.core.arrays import (
    ArrowExtensionArray,
    BaseMaskedArray,
    Categorical,
    ExtensionArray,
)
from pandas.core.arrays.string_ import StringArray
from pandas.core.base import (
    IndexOpsMixin,
    PandasObject,
)
import pandas.core.common as com
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
    sanitize_array,
)
from pandas.core.indexers import disallow_ndim_indexing
from pandas.core.indexes.frozen import FrozenList
from pandas.core.missing import clean_reindex_fill_method
from pandas.core.ops import get_op_result_name
from pandas.core.ops.invalid import make_invalid_op
from pandas.core.sorting import (
    ensure_key_mapped,
    get_group_index_sorter,
    nargsort,
)
from pandas.core.strings.accessor import StringMethods

from pandas.io.formats.printing import (
    PrettyDict,
    default_pprint,
    format_object_summary,
    pprint_thing,
)

if TYPE_CHECKING:
    from pandas import (
        CategoricalIndex,
        DataFrame,
        MultiIndex,
        Series,
    )
    from pandas.core.arrays import PeriodArray


__all__ = ["Index"]

_unsortable_types = frozenset(("mixed", "mixed-integer"))

_index_doc_kwargs: dict[str, str] = {
    "klass": "Index",
    "inplace": "",
    "target_klass": "Index",
    "raises_section": "",
    "unique": "Index",
    "duplicated": "np.ndarray",
}
_index_shared_docs: dict[str, str] = {}
str_t = str


_dtype_obj = np.dtype("object")

_masked_engines = {
    "Complex128": libindex.MaskedComplex128Engine,
    "Complex64": libindex.MaskedComplex64Engine,
    "Float64": libindex.MaskedFloat64Engine,
    "Float32": libindex.MaskedFloat32Engine,
    "UInt64": libindex.MaskedUInt64Engine,
    "UInt32": libindex.MaskedUInt32Engine,
    "UInt16": libindex.MaskedUInt16Engine,
    "UInt8": libindex.MaskedUInt8Engine,
    "Int64": libindex.MaskedInt64Engine,
    "Int32": libindex.MaskedInt32Engine,
    "Int16": libindex.MaskedInt16Engine,
    "Int8": libindex.MaskedInt8Engine,
    "boolean": libindex.MaskedBoolEngine,
    "double[pyarrow]": libindex.MaskedFloat64Engine,
    "float64[pyarrow]": libindex.MaskedFloat64Engine,
    "float32[pyarrow]": libindex.MaskedFloat32Engine,
    "float[pyarrow]": libindex.MaskedFloat32Engine,
    "uint64[pyarrow]": libindex.MaskedUInt64Engine,
    "uint32[pyarrow]": libindex.MaskedUInt32Engine,
    "uint16[pyarrow]": libindex.MaskedUInt16Engine,
    "uint8[pyarrow]": libindex.MaskedUInt8Engine,
    "int64[pyarrow]": libindex.MaskedInt64Engine,
    "int32[pyarrow]": libindex.MaskedInt32Engine,
    "int16[pyarrow]": libindex.MaskedInt16Engine,
    "int8[pyarrow]": libindex.MaskedInt8Engine,
    "bool[pyarrow]": libindex.MaskedBoolEngine,
}
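
# Illustrative sketch (not part of the original module): the mapping above is
# keyed by the *name* of a masked or pyarrow-backed dtype, so engine selection
# for such an Index reduces to a dict lookup (used later by ``Index._engine``).
# Assuming masked ``values`` with a supported dtype:
#
#   engine_cls = _masked_engines[values.dtype.name]  # e.g. "Int64" -> MaskedInt64Engine
#   engine = engine_cls(values)
#
# A KeyError here (e.g. for a decimal dtype) falls back to the generic
# engine path instead.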


def _maybe_return_indexers(meth: F) -> F:
    """
    Decorator to simplify 'return_indexers' checks in Index.join.
    """

    @functools.wraps(meth)
    def join(
        self,
        other: Index,
        *,
        how: JoinHow = "left",
        level=None,
        return_indexers: bool = False,
        sort: bool = False,
    ):
        join_index, lidx, ridx = meth(self, other, how=how, level=level, sort=sort)
        if not return_indexers:
            return join_index

        if lidx is not None:
            lidx = ensure_platform_int(lidx)
        if ridx is not None:
            ridx = ensure_platform_int(ridx)
        return join_index, lidx, ridx

    return cast(F, join)
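
# Illustrative sketch (an assumption about call sites, not original code): a
# method wrapped with ``_maybe_return_indexers`` computes the three-tuple once,
# and callers opt in to the indexers:
#
#   left = pd.Index([1, 2, 3])
#   right = pd.Index([2, 3, 4])
#   left.join(right, how="inner")                        # -> joined Index only
#   left.join(right, how="inner", return_indexers=True)  # -> (Index, lidx, ridx)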


def _new_Index(cls, d):
    """
    This is called upon unpickling, rather than the default which doesn't
    have arguments and breaks __new__.
    """
    # required for backward compat, because PI can't be instantiated with
    # ordinals through __new__ GH #13277
    if issubclass(cls, ABCPeriodIndex):
        from pandas.core.indexes.period import _new_PeriodIndex

        return _new_PeriodIndex(cls, **d)

    if issubclass(cls, ABCMultiIndex):
        if "labels" in d and "codes" not in d:
            # GH#23752 "labels" kwarg has been replaced with "codes"
            d["codes"] = d.pop("labels")

        # Since this was a valid MultiIndex at pickle-time, we don't need to
        # check validity at un-pickle time.
        d["verify_integrity"] = False

    elif "dtype" not in d and "data" in d:
        # Prevent Index.__new__ from conducting inference;
        # "data" key not in RangeIndex
        d["dtype"] = d["data"].dtype
    return cls.__new__(cls, **d)
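
# Illustrative sketch (not original code): ``_new_Index`` exists so a pickle
# round-trip can reconstruct the right subclass with keyword arguments, which
# ``__new__`` alone could not do for e.g. PeriodIndex:
#
#   import pickle
#   idx = pd.period_range("2000", periods=3)
#   assert pickle.loads(pickle.dumps(idx)).equals(idx)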


_IndexT = TypeVar("_IndexT", bound="Index")


class Index(IndexOpsMixin, PandasObject):
    """
    Immutable sequence used for indexing and alignment.

    The basic object storing axis labels for all pandas objects.

    .. versionchanged:: 2.0.0

       Index can hold all numpy numeric dtypes (except float16). Previously only
       int64/uint64/float64 dtypes were accepted.

    Parameters
    ----------
    data : array-like (1-dimensional)
    dtype : NumPy dtype (default: object)
        If dtype is None, we find the dtype that best fits the data.
        If an actual dtype is provided, we coerce to that dtype if it's safe.
        Otherwise, an error will be raised.
    copy : bool
        Make a copy of input ndarray.
    name : object
        Name to be stored in the index.
    tupleize_cols : bool (default: True)
        When True, attempt to create a MultiIndex if possible.

    See Also
    --------
    RangeIndex : Index implementing a monotonic integer range.
    CategoricalIndex : Index of :class:`Categorical` s.
    MultiIndex : A multi-level, or hierarchical Index.
    IntervalIndex : An Index of :class:`Interval` s.
    DatetimeIndex : Index of datetime64 data.
    TimedeltaIndex : Index of timedelta64 data.
    PeriodIndex : Index of Period data.

    Notes
    -----
    An Index instance can **only** contain hashable objects.
    An Index instance *can not* hold numpy float16 dtype.

    Examples
    --------
    >>> pd.Index([1, 2, 3])
    Index([1, 2, 3], dtype='int64')

    >>> pd.Index(list('abc'))
    Index(['a', 'b', 'c'], dtype='object')

    >>> pd.Index([1, 2, 3], dtype="uint8")
    Index([1, 2, 3], dtype='uint8')
    """

    # To hand over control to subclasses
    _join_precedence = 1

    # Cython methods; see github.com/cython/cython/issues/2647
    #  for why we need to wrap these instead of making them class attributes
    # Moreover, cython will choose the appropriate-dtyped sub-function
    #  given the dtypes of the passed arguments

    @final
    def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]:
        # Caller is responsible for ensuring other.dtype == self.dtype
        sv = self._get_join_target()
        ov = other._get_join_target()
        # can_use_libjoin assures sv and ov are ndarrays
        sv = cast(np.ndarray, sv)
        ov = cast(np.ndarray, ov)
        # similar but not identical to ov.searchsorted(sv)
        return libjoin.left_join_indexer_unique(sv, ov)

    @final
    def _left_indexer(
        self: _IndexT, other: _IndexT
    ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        # Caller is responsible for ensuring other.dtype == self.dtype
        sv = self._get_join_target()
        ov = other._get_join_target()
        # can_use_libjoin assures sv and ov are ndarrays
        sv = cast(np.ndarray, sv)
        ov = cast(np.ndarray, ov)
        joined_ndarray, lidx, ridx = libjoin.left_join_indexer(sv, ov)
        joined = self._from_join_target(joined_ndarray)
        return joined, lidx, ridx

    @final
    def _inner_indexer(
        self: _IndexT, other: _IndexT
    ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        # Caller is responsible for ensuring other.dtype == self.dtype
        sv = self._get_join_target()
        ov = other._get_join_target()
        # can_use_libjoin assures sv and ov are ndarrays
        sv = cast(np.ndarray, sv)
        ov = cast(np.ndarray, ov)
        joined_ndarray, lidx, ridx = libjoin.inner_join_indexer(sv, ov)
        joined = self._from_join_target(joined_ndarray)
        return joined, lidx, ridx

    @final
    def _outer_indexer(
        self: _IndexT, other: _IndexT
    ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        # Caller is responsible for ensuring other.dtype == self.dtype
        sv = self._get_join_target()
        ov = other._get_join_target()
        # can_use_libjoin assures sv and ov are ndarrays
        sv = cast(np.ndarray, sv)
        ov = cast(np.ndarray, ov)
        joined_ndarray, lidx, ridx = libjoin.outer_join_indexer(sv, ov)
        joined = self._from_join_target(joined_ndarray)
        return joined, lidx, ridx
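
    # Illustrative sketch (not original code): the libjoin helpers above return
    # the joined values plus positional indexers into each side, with -1 marking
    # "no match". Conceptually, for a left join of sorted, unique values:
    #
    #   self  = [1, 3, 5]
    #   other = [3, 4, 5]
    #   joined -> [1, 3, 5]; lidx -> [0, 1, 2]; ridx -> [-1, 0, 2]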

    _typ: str = "index"
    _data: ExtensionArray | np.ndarray
    _data_cls: type[ExtensionArray] | tuple[type[np.ndarray], type[ExtensionArray]] = (
        np.ndarray,
        ExtensionArray,
    )
    _id: object | None = None
    _name: Hashable = None
    # MultiIndex.levels previously allowed setting the index name. We
    # don't allow this anymore, and raise if it happens rather than
    # failing silently.
    _no_setting_name: bool = False
    _comparables: list[str] = ["name"]
    _attributes: list[str] = ["name"]

    @cache_readonly
    def _can_hold_strings(self) -> bool:
        return not is_numeric_dtype(self)

    _engine_types: dict[np.dtype | ExtensionDtype, type[libindex.IndexEngine]] = {
        np.dtype(np.int8): libindex.Int8Engine,
        np.dtype(np.int16): libindex.Int16Engine,
        np.dtype(np.int32): libindex.Int32Engine,
        np.dtype(np.int64): libindex.Int64Engine,
        np.dtype(np.uint8): libindex.UInt8Engine,
        np.dtype(np.uint16): libindex.UInt16Engine,
        np.dtype(np.uint32): libindex.UInt32Engine,
        np.dtype(np.uint64): libindex.UInt64Engine,
        np.dtype(np.float32): libindex.Float32Engine,
        np.dtype(np.float64): libindex.Float64Engine,
        np.dtype(np.complex64): libindex.Complex64Engine,
        np.dtype(np.complex128): libindex.Complex128Engine,
    }

    @property
    def _engine_type(
        self,
    ) -> type[libindex.IndexEngine] | type[libindex.ExtensionEngine]:
        return self._engine_types.get(self.dtype, libindex.ObjectEngine)

    # whether we support partial string indexing. Overridden
    # in DatetimeIndex and PeriodIndex
    _supports_partial_string_indexing = False

    _accessors = {"str"}

    str = CachedAccessor("str", StringMethods)

    _references = None

    # --------------------------------------------------------------------
    # Constructors

    def __new__(
        cls,
        data=None,
        dtype=None,
        copy: bool = False,
        name=None,
        tupleize_cols: bool = True,
    ) -> Index:
        from pandas.core.indexes.range import RangeIndex

        name = maybe_extract_name(name, data, cls)

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        data_dtype = getattr(data, "dtype", None)

        refs = None
        if not copy and isinstance(data, (ABCSeries, Index)):
            refs = data._references

        # range
        if isinstance(data, (range, RangeIndex)):
            result = RangeIndex(start=data, copy=copy, name=name)
            if dtype is not None:
                return result.astype(dtype, copy=False)
            return result

        elif is_ea_or_datetimelike_dtype(dtype):
            # non-EA dtype indexes have special casting logic, so we punt here
            pass

        elif is_ea_or_datetimelike_dtype(data_dtype):
            pass

        elif isinstance(data, (np.ndarray, Index, ABCSeries)):
            if isinstance(data, ABCMultiIndex):
                data = data._values

            if data.dtype.kind not in ["i", "u", "f", "b", "c", "m", "M"]:
                # GH#11836 we need to avoid having numpy coerce
                # things that look like ints/floats to ints unless
                # they are actually ints, e.g. '0' and 0.0
                # should not be coerced
                data = com.asarray_tuplesafe(data, dtype=_dtype_obj)

        elif is_scalar(data):
            raise cls._raise_scalar_data_error(data)
        elif hasattr(data, "__array__"):
            return Index(np.asarray(data), dtype=dtype, copy=copy, name=name)
        elif not is_list_like(data) and not isinstance(data, memoryview):
            # 2022-11-16 the memoryview check is only necessary on some CI
            #  builds, not clear why
            raise cls._raise_scalar_data_error(data)

        else:
            if tupleize_cols:
                # GH21470: convert iterable to list before determining if empty
                if is_iterator(data):
                    data = list(data)

                if data and all(isinstance(e, tuple) for e in data):
                    # we must be all tuples, otherwise don't construct
                    # 10697
                    from pandas.core.indexes.multi import MultiIndex

                    return MultiIndex.from_tuples(data, names=name)
            # other iterable of some kind

            if not isinstance(data, (list, tuple)):
                # we allow set/frozenset, which Series/sanitize_array does not, so
                #  cast to list here
                data = list(data)
            if len(data) == 0:
                # unlike Series, we default to object dtype:
                data = np.array(data, dtype=object)

            if len(data) and isinstance(data[0], tuple):
                # Ensure we get 1-D array of tuples instead of 2D array.
                data = com.asarray_tuplesafe(data, dtype=_dtype_obj)

        try:
            arr = sanitize_array(data, None, dtype=dtype, copy=copy)
        except ValueError as err:
            if "index must be specified when data is not list-like" in str(err):
                raise cls._raise_scalar_data_error(data) from err
            if "Data must be 1-dimensional" in str(err):
                raise ValueError("Index data must be 1-dimensional") from err
            raise
        arr = ensure_wrapped_if_datetimelike(arr)

        klass = cls._dtype_to_subclass(arr.dtype)

        arr = klass._ensure_array(arr, arr.dtype, copy=False)
        return klass._simple_new(arr, name, refs=refs)
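
    # Illustrative sketch (not original code) of the dispatch performed by
    # ``__new__`` above -- the returned class depends on the inferred dtype
    # and shape of ``data``:
    #
    #   pd.Index(range(3))                                  # -> RangeIndex
    #   pd.Index([("a", 1), ("b", 2)])                      # -> MultiIndex (tupleize_cols)
    #   pd.Index(np.array(["2020-01-01"], dtype="M8[ns]"))  # -> DatetimeIndex
    #   pd.Index(["a", "b"])                                # -> Index, dtype=object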

    @classmethod
    def _ensure_array(cls, data, dtype, copy: bool):
        """
        Ensure we have a valid array to pass to _simple_new.
        """
        if data.ndim > 1:
            # GH#13601, GH#20285, GH#27125
            raise ValueError("Index data must be 1-dimensional")
        elif dtype == np.float16:
            # float16 not supported (no indexing engine)
            raise NotImplementedError("float16 indexes are not supported")

        if copy:
            # asarray_tuplesafe does not always copy underlying data,
            #  so need to make sure that this happens
            data = data.copy()
        return data

    @final
    @classmethod
    def _dtype_to_subclass(cls, dtype: DtypeObj):
        # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423

        if isinstance(dtype, ExtensionDtype):
            if isinstance(dtype, DatetimeTZDtype):
                from pandas import DatetimeIndex

                return DatetimeIndex
            elif isinstance(dtype, CategoricalDtype):
                from pandas import CategoricalIndex

                return CategoricalIndex
            elif isinstance(dtype, IntervalDtype):
                from pandas import IntervalIndex

                return IntervalIndex
            elif isinstance(dtype, PeriodDtype):
                from pandas import PeriodIndex

                return PeriodIndex

            return Index

        if dtype.kind == "M":
            from pandas import DatetimeIndex

            return DatetimeIndex

        elif dtype.kind == "m":
            from pandas import TimedeltaIndex

            return TimedeltaIndex

        elif dtype.kind == "O":
            # NB: assuming away MultiIndex
            return Index

        elif issubclass(dtype.type, str) or is_numeric_dtype(dtype):
            return Index

        raise NotImplementedError(dtype)
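
    # Illustrative sketch (not original code): the dtype -> subclass mapping
    # can be exercised directly, assuming these dtype spellings:
    #
    #   Index._dtype_to_subclass(np.dtype("M8[ns]"))     # -> DatetimeIndex
    #   Index._dtype_to_subclass(np.dtype("m8[ns]"))     # -> TimedeltaIndex
    #   Index._dtype_to_subclass(np.dtype("int64"))      # -> Index
    #   Index._dtype_to_subclass(pd.CategoricalDtype())  # -> CategoricalIndex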

    # NOTE for new Index creation:

    # - _simple_new: It returns a new Index with the same type as the caller.
    #   All metadata (such as name) must be provided by the caller.
    #   Using _shallow_copy is recommended because it fills this metadata
    #   when not otherwise specified.

    # - _shallow_copy: It returns a new Index with the same type (using
    #   _simple_new), but fills the caller's metadata when not otherwise
    #   specified. Passed kwargs will overwrite the corresponding metadata.

    # See each method's docstring.

    @classmethod
    def _simple_new(
        cls: type[_IndexT], values: ArrayLike, name: Hashable = None, refs=None
    ) -> _IndexT:
        """
        We require that we have a dtype compat for the values. If we are passed
        a non-dtype compat, then coerce using the constructor.

        Must be careful not to recurse.
        """
        assert isinstance(values, cls._data_cls), type(values)

        result = object.__new__(cls)
        result._data = values
        result._name = name
        result._cache = {}
        result._reset_identity()
        if refs is not None:
            result._references = refs
        else:
            result._references = BlockValuesRefs()
        result._references.add_index_reference(result)

        return result

    @classmethod
    def _with_infer(cls, *args, **kwargs):
        """
        Constructor that uses the 1.0.x behavior inferring numeric dtypes
        for ndarray[object] inputs.
        """
        result = cls(*args, **kwargs)

        if result.dtype == _dtype_obj and not result._is_multi:
            # error: Argument 1 to "maybe_convert_objects" has incompatible type
            # "Union[ExtensionArray, ndarray[Any, Any]]"; expected
            # "ndarray[Any, Any]"
            values = lib.maybe_convert_objects(result._values)  # type: ignore[arg-type]
            if values.dtype.kind in ["i", "u", "f", "b"]:
                return Index(values, name=result.name)

        return result

    @cache_readonly
    def _constructor(self: _IndexT) -> type[_IndexT]:
        return type(self)

    @final
    def _maybe_check_unique(self) -> None:
        """
        Check that an Index has no duplicates.

        This is typically only called via
        `NDFrame.flags.allows_duplicate_labels.setter` when it's set to
        True (duplicates aren't allowed).

        Raises
        ------
        DuplicateLabelError
            When the index is not unique.
        """
        if not self.is_unique:
            msg = """Index has duplicates."""
            duplicates = self._format_duplicate_message()
            msg += f"\n{duplicates}"

            raise DuplicateLabelError(msg)

    @final
    def _format_duplicate_message(self) -> DataFrame:
        """
        Construct the DataFrame for a DuplicateLabelError.

        This returns a DataFrame indicating the labels and positions
        of duplicates in an index. This should only be called when it's
        already known that duplicates are present.

        Examples
        --------
        >>> idx = pd.Index(['a', 'b', 'a'])
        >>> idx._format_duplicate_message()
            positions
        label
        a        [0, 2]
        """
        from pandas import Series

        duplicates = self[self.duplicated(keep="first")].unique()
        assert len(duplicates)

        out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates]
        if self._is_multi:
            # test_format_duplicate_labels_message_multi
            # error: "Type[Index]" has no attribute "from_tuples"  [attr-defined]
            out.index = type(self).from_tuples(out.index)  # type: ignore[attr-defined]

        if self.nlevels == 1:
            out = out.rename_axis("label")
        return out.to_frame(name="positions")

    # --------------------------------------------------------------------
    # Index Internals Methods

    def _shallow_copy(self: _IndexT, values, name: Hashable = no_default) -> _IndexT:
        """
        Create a new Index with the same class as the caller, without copying
        the data; the caller's attributes are reused, with any passed-in
        attributes taking precedence.

        *this is an internal non-public method*

        Parameters
        ----------
        values : the values to create the new Index, optional
        name : Label, defaults to self.name
        """
        name = self._name if name is no_default else name

        return self._simple_new(values, name=name, refs=self._references)

    def _view(self: _IndexT) -> _IndexT:
        """
        fastpath to make a shallow copy, i.e. new object with same data.
        """
        result = self._simple_new(self._values, name=self._name, refs=self._references)

        result._cache = self._cache
        return result

    @final
    def _rename(self: _IndexT, name: Hashable) -> _IndexT:
        """
        fastpath for rename if new name is already validated.
        """
        result = self._view()
        result._name = name
        return result

    @final
    def is_(self, other) -> bool:
        """
        More flexible, faster check like ``is`` but that works through views.

        Note: this is *not* the same as ``Index.identical()``, which checks
        that metadata is also the same.

        Parameters
        ----------
        other : object
            Other object to compare against.

        Returns
        -------
        bool
            True if both have the same underlying data, False otherwise.

        See Also
        --------
        Index.identical : Works like ``Index.is_`` but also checks metadata.
        """
        if self is other:
            return True
        elif not hasattr(other, "_id"):
            return False
        elif self._id is None or other._id is None:
            return False
        else:
            return self._id is other._id
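
    # Illustrative sketch (not original code) contrasting ``is_`` with
    # ``identical``:
    #
    #   idx = pd.Index(['1', '2', '3'])
    #   idx.is_(idx.view())        # True  -- a view shares the identity
    #   idx.is_(idx.copy())        # False -- a copy gets a fresh identity
    #   idx.identical(idx.copy())  # True  -- values and metadata still match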

    @final
    def _reset_identity(self) -> None:
        """
        Initializes or resets ``_id`` attribute with new object.
        """
        self._id = object()

    @final
    def _cleanup(self) -> None:
        self._engine.clear_mapping()

    @cache_readonly
    def _engine(
        self,
    ) -> libindex.IndexEngine | libindex.ExtensionEngine | libindex.MaskedIndexEngine:
        # For base class (object dtype) we get ObjectEngine
        target_values = self._get_engine_target()
        if isinstance(target_values, ExtensionArray):
            if isinstance(target_values, (BaseMaskedArray, ArrowExtensionArray)):
                try:
                    return _masked_engines[target_values.dtype.name](target_values)
                except KeyError:
                    # Not supported yet e.g. decimal
                    pass
            elif self._engine_type is libindex.ObjectEngine:
                return libindex.ExtensionEngine(target_values)

        target_values = cast(np.ndarray, target_values)
        # to avoid a reference cycle, bind `target_values` to a local variable, so
        # `self` is not passed into the lambda.
        if target_values.dtype == bool:
            return libindex.BoolEngine(target_values)
        elif target_values.dtype == np.complex64:
            return libindex.Complex64Engine(target_values)
        elif target_values.dtype == np.complex128:
            return libindex.Complex128Engine(target_values)
        elif needs_i8_conversion(self.dtype):
            # We need to keep M8/m8 dtype when initializing the Engine,
            #  but don't want to change _get_engine_target bc it is used
            #  elsewhere
            # error: Item "ExtensionArray" of "Union[ExtensionArray,
            # ndarray[Any, Any]]" has no attribute "_ndarray"  [union-attr]
            target_values = self._data._ndarray  # type: ignore[union-attr]

        # error: Argument 1 to "ExtensionEngine" has incompatible type
        # "ndarray[Any, Any]"; expected "ExtensionArray"
        return self._engine_type(target_values)  # type: ignore[arg-type]

    @final
    @cache_readonly
    def _dir_additions_for_owner(self) -> set[str_t]:
        """
        Add the string-like labels to the owner dataframe/series dir output.

        If this is a MultiIndex, its first-level values are used.
        """
        return {
            c
            for c in self.unique(level=0)[: get_option("display.max_dir_items")]
            if isinstance(c, str) and c.isidentifier()
        }

    # --------------------------------------------------------------------
    # Array-Like Methods

    # ndarray compat
    def __len__(self) -> int:
        """
        Return the length of the Index.
        """
        return len(self._data)

    def __array__(self, dtype=None) -> np.ndarray:
        """
        The array interface, return my values.
        """
        return np.asarray(self._data, dtype=dtype)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs):
        if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs):
            return NotImplemented

        result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. test_dti_isub_tdi
            return arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                return result

        new_inputs = [x if x is not self else x._values for x in inputs]
        result = getattr(ufunc, method)(*new_inputs, **kwargs)
        if ufunc.nout == 2:
            # i.e. np.divmod, np.modf, np.frexp
            return tuple(self.__array_wrap__(x) for x in result)

        if result.dtype == np.float16:
            result = result.astype(np.float32)

        return self.__array_wrap__(result)

    def __array_wrap__(self, result, context=None):
        """
        Gets called after a ufunc and other functions e.g. np.split.
        """
        result = lib.item_from_zerodim(result)
        if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1:
            return result

        return Index(result, name=self.name)

    @cache_readonly
    def dtype(self) -> DtypeObj:
        """
        Return the dtype object of the underlying data.
        """
        return self._data.dtype

    @final
    def ravel(self, order: str_t = "C") -> Index:
        """
        Return a view on self.

        Returns
        -------
        Index

        See Also
        --------
        numpy.ndarray.ravel : Return a flattened array.
        """
        return self[:]

    def view(self, cls=None):
        # we need to see if we are subclassing an
        # index type here
        if cls is not None and not hasattr(cls, "_typ"):
            dtype = cls
            if isinstance(cls, str):
                dtype = pandas_dtype(cls)

            if isinstance(dtype, (np.dtype, ExtensionDtype)) and needs_i8_conversion(
                dtype
            ):
                if dtype.kind == "m" and dtype != "m8[ns]":
                    # e.g. m8[s]
                    return self._data.view(cls)

                idx_cls = self._dtype_to_subclass(dtype)
                # NB: we only get here for subclasses that override
                #  _data_cls such that it is a type and not a tuple
                #  of types.
                arr_cls = idx_cls._data_cls
                arr = arr_cls(self._data.view("i8"), dtype=dtype)
                return idx_cls._simple_new(arr, name=self.name, refs=self._references)

            result = self._data.view(cls)
        else:
            result = self._view()
        if isinstance(result, Index):
            result._id = self._id
        return result

    def astype(self, dtype, copy: bool = True):
        """
        Create an Index with values cast to dtypes.

        The class of a new Index is determined by dtype. When conversion is
        impossible, a TypeError exception is raised.

        Parameters
        ----------
        dtype : numpy dtype or pandas type
            Note that any signed integer `dtype` is treated as ``'int64'``,
            and any unsigned integer `dtype` is treated as ``'uint64'``,
            regardless of the size.
        copy : bool, default True
            By default, astype always returns a newly allocated object.
            If copy is set to False and internal requirements on dtype are
            satisfied, the original data is used to create a new Index
            or the original Index is returned.

        Returns
        -------
        Index
            Index with values cast to specified dtype.
        """
        if dtype is not None:
            dtype = pandas_dtype(dtype)

        if is_dtype_equal(self.dtype, dtype):
            # Ensure that self.astype(self.dtype) is self
            return self.copy() if copy else self

        values = self._data
        if isinstance(values, ExtensionArray):
            with rewrite_exception(type(values).__name__, type(self).__name__):
                new_values = values.astype(dtype, copy=copy)

        elif isinstance(dtype, ExtensionDtype):
            cls = dtype.construct_array_type()
            # Note: for RangeIndex and CategoricalDtype self vs self._values
            #  behaves differently here.
            new_values = cls._from_sequence(self, dtype=dtype, copy=copy)

        else:
            # GH#13149 specifically use astype_array instead of astype
            new_values = astype_array(values, dtype=dtype, copy=copy)

        # pass copy=False because any copying will be done in the astype above
        result = Index(new_values, name=self.name, dtype=new_values.dtype, copy=False)
        if (
            not copy
            and self._references is not None
            and astype_is_view(self.dtype, dtype)
        ):
            result._references = self._references
            result._references.add_index_reference(result)
        return result
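
    # Illustrative sketch (not original code) of the ``copy`` keyword semantics
    # described above:
    #
    #   idx = pd.Index([1, 2, 3], dtype="int64")
    #   idx.astype("int64", copy=False) is idx   # True: no-op cast returns self
    #   idx.astype("float64")                    # new Index([1.0, 2.0, 3.0])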

    _index_shared_docs[
        "take"
    ] = """
    Return a new %(klass)s of the values selected by the indices.

    For internal compatibility with numpy arrays.

    Parameters
    ----------
    indices : array-like
        Indices to be taken.
    axis : int, optional
        The axis over which to select values, always 0.
    allow_fill : bool, default True
    fill_value : scalar, default None
        If allow_fill=True and fill_value is not None, indices specified by
        -1 are regarded as NA. If Index doesn't hold NA, raise ValueError.

    Returns
    -------
    Index
        An index formed of elements at the given indices. Will be the same
        type as self, except for RangeIndex.

    See Also
    --------
    numpy.ndarray.take: Return an array formed from the
        elements of a at the given indices.
    """

    @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
    def take(
        self,
        indices,
        axis: Axis = 0,
        allow_fill: bool = True,
        fill_value=None,
        **kwargs,
    ):
        if kwargs:
            nv.validate_take((), kwargs)
        if is_scalar(indices):
            raise TypeError("Expected indices to be array-like")
        indices = ensure_platform_int(indices)
        allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices)

        # Note: we discard fill_value and use self._na_value, only relevant
        #  in the case where allow_fill is True and fill_value is not None
        values = self._values
        if isinstance(values, np.ndarray):
            taken = algos.take(
                values, indices, allow_fill=allow_fill, fill_value=self._na_value
            )
        else:
            # algos.take passes 'axis' keyword which not all EAs accept
            taken = values.take(
                indices, allow_fill=allow_fill, fill_value=self._na_value
            )
        # _constructor so RangeIndex-> Index with an int64 dtype
        return self._constructor._simple_new(taken, name=self.name)
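
    # Illustrative sketch (not original code) of ``take`` with the fill
    # behavior described above:
    #
    #   idx = pd.Index([10.0, 20.0, 30.0])
    #   idx.take([0, 2])                      # Index([10.0, 30.0])
    #   idx.take([0, -1])                     # -1 means "last": Index([10.0, 30.0])
    #   idx.take([0, -1], fill_value=np.nan)  # -1 means NA: Index([10.0, nan])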

    @final
    def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool:
        """
        We only use pandas-style take when allow_fill is True _and_
        fill_value is not None.
        """
        if allow_fill and fill_value is not None:
            # only fill if we are passing a non-None fill_value
            if self._can_hold_na:
                if (indices < -1).any():
                    raise ValueError(
                        "When allow_fill=True and fill_value is not None, "
                        "all indices must be >= -1"
                    )
            else:
                cls_name = type(self).__name__
                raise ValueError(
                    f"Unable to fill values because {cls_name} cannot contain NA"
                )
        else:
            allow_fill = False
        return allow_fill

    _index_shared_docs[
        "repeat"
    ] = """
    Repeat elements of a %(klass)s.

    Returns a new %(klass)s where each element of the current %(klass)s
    is repeated consecutively a given number of times.

    Parameters
    ----------
    repeats : int or array of ints
        The number of repetitions for each element. This should be a
        non-negative integer. Repeating 0 times will return an empty
        %(klass)s.
    axis : None
        Must be ``None``. Has no effect but is accepted for compatibility
        with numpy.

    Returns
    -------
    %(klass)s
        Newly created %(klass)s with repeated elements.

    See Also
    --------
    Series.repeat : Equivalent function for Series.
    numpy.repeat : Similar method for :class:`numpy.ndarray`.

    Examples
    --------
    >>> idx = pd.Index(['a', 'b', 'c'])
    >>> idx
    Index(['a', 'b', 'c'], dtype='object')
    >>> idx.repeat(2)
    Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object')
    >>> idx.repeat([1, 2, 3])
    Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object')
    """

    @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
    def repeat(self, repeats, axis=None):
        repeats = ensure_platform_int(repeats)
        nv.validate_repeat((), {"axis": axis})
        res_values = self._values.repeat(repeats)

        # _constructor so RangeIndex-> Index with an int64 dtype
        return self._constructor._simple_new(res_values, name=self.name)

    # --------------------------------------------------------------------
    # Copying Methods

    def copy(
        self: _IndexT,
        name: Hashable | None = None,
        deep: bool = False,
    ) -> _IndexT:
        """
        Make a copy of this object.

        Name is set on the new object.

        Parameters
        ----------
        name : Label, optional
            Set name for new object.
        deep : bool, default False

        Returns
        -------
        Index
            Index refers to new object which is a copy of this object.

        Notes
        -----
        In most cases, there should be no functional difference from using
        ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
        """

        name = self._validate_names(name=name, deep=deep)[0]
        if deep:
            new_data = self._data.copy()
            new_index = type(self)._simple_new(new_data, name=name)
        else:
            new_index = self._rename(name=name)
        return new_index
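
    # Illustrative sketch (not original code): with ``deep=False`` (the
    # default) the copy shares the underlying array; ``deep=True`` copies it:
    #
    #   idx = pd.Index([1, 2, 3], name="n")
    #   shallow = idx.copy()          # new object, same data buffer
    #   renamed = idx.copy(name="m")  # same data, new name
    #   deep = idx.copy(deep=True)    # data buffer is copied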

    @final
    def __copy__(self: _IndexT, **kwargs) -> _IndexT:
        return self.copy(**kwargs)

    @final
    def __deepcopy__(self: _IndexT, memo=None) -> _IndexT:
        """
        Parameters
        ----------
        memo, default None
            Standard signature. Unused
        """
        return self.copy(deep=True)

    # --------------------------------------------------------------------
    # Rendering Methods

    @final
    def __repr__(self) -> str_t:
        """
        Return a string representation for this object.
        """
        klass_name = type(self).__name__
        data = self._format_data()
        attrs = self._format_attrs()
        space = self._format_space()
        attrs_str = [f"{k}={v}" for k, v in attrs]
        prepr = f",{space}".join(attrs_str)

        # no data provided, just attributes
        if data is None:
            data = ""

        return f"{klass_name}({data}{prepr})"

    def _format_space(self) -> str_t:
        # using space here controls if the attributes
        # are line separated or not (the default)

        # max_seq_items = get_option('display.max_seq_items')
        # if len(self) > max_seq_items:
        #    space = "\n%s" % (' ' * (len(klass) + 1))
        return " "

    @property
    def _formatter_func(self):
        """
        Return the formatter function.
        """
        return default_pprint

    def _format_data(self, name=None) -> str_t:
        """
        Return the formatted data as a unicode string.
        """
        # do we want to justify (only do so for non-objects)
        is_justify = True

        if self.inferred_type == "string":
            is_justify = False
        elif self.inferred_type == "categorical":
            self = cast("CategoricalIndex", self)
            if is_object_dtype(self.categories):
                is_justify = False

        return format_object_summary(
            self,
            self._formatter_func,
            is_justify=is_justify,
            name=name,
            line_break_each_value=self._is_multi,
        )

    def _format_attrs(self) -> list[tuple[str_t, str_t | int | bool | None]]:
        """
        Return a list of tuples of the (attr, formatted_value).
        """
        attrs: list[tuple[str_t, str_t | int | bool | None]] = []

        if not self._is_multi:
            attrs.append(("dtype", f"'{self.dtype}'"))

        if self.name is not None:
            attrs.append(("name", default_pprint(self.name)))
        elif self._is_multi and any(x is not None for x in self.names):
            attrs.append(("names", default_pprint(self.names)))

        max_seq_items = get_option("display.max_seq_items") or len(self)
        if len(self) > max_seq_items:
            attrs.append(("length", len(self)))
        return attrs

    @final
    def _get_level_names(self) -> Hashable | Sequence[Hashable]:
        """
        Return a name or list of names with None replaced by the level number.
        """
        if self._is_multi:
            return [
                level if name is None else name
                for level, name in enumerate(self.names)
            ]
        else:
            return 0 if self.name is None else self.name

    @final
    def _mpl_repr(self) -> np.ndarray:
        # how to represent ourselves to matplotlib
        if isinstance(self.dtype, np.dtype) and self.dtype.kind != "M":
            return cast(np.ndarray, self.values)
        return self.astype(object, copy=False)._values

    def format(
        self,
        name: bool = False,
        formatter: Callable | None = None,
        na_rep: str_t = "NaN",
    ) -> list[str_t]:
        """
        Render a string representation of the Index.
        """
        header = []
        if name:
            header.append(
                pprint_thing(self.name, escape_chars=("\t", "\r", "\n"))
                if self.name is not None
                else ""
            )

        if formatter is not None:
            return header + list(self.map(formatter))

        return self._format_with_header(header, na_rep=na_rep)

    def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t]:
        from pandas.io.formats.format import format_array

        values = self._values

        if is_object_dtype(values.dtype):
            values = cast(np.ndarray, values)
            values = lib.maybe_convert_objects(values, safe=True)

            result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values]

            # could have nans
            mask = is_float_nan(values)
            if mask.any():
                result_arr = np.array(result)
                result_arr[mask] = na_rep
                result = result_arr.tolist()
        else:
            result = trim_front(format_array(values, None, justify="left"))
        return header + result

    def _format_native_types(
        self,
        *,
        na_rep: str_t = "",
        decimal: str_t = ".",
        float_format=None,
        date_format=None,
        quoting=None,
    ) -> npt.NDArray[np.object_]:
        """
        Actually format specific types of the index.
        """
        from pandas.io.formats.format import FloatArrayFormatter

        if is_float_dtype(self.dtype) and not is_extension_array_dtype(self.dtype):
            formatter = FloatArrayFormatter(
                self._values,
                na_rep=na_rep,
                float_format=float_format,
                decimal=decimal,
                quoting=quoting,
                fixed_width=False,
            )
            return formatter.get_result_as_array()

        mask = isna(self)
        if not is_object_dtype(self) and not quoting:
            values = np.asarray(self).astype(str)
        else:
            values = np.array(self, dtype=object, copy=True)

        values[mask] = na_rep
        return values

    def _summary(self, name=None) -> str_t:
        """
        Return a summarized representation.

        Parameters
        ----------
        name : str
            name to use in the summary representation

        Returns
        -------
        String with a summarized representation of the index
        """
        if len(self) > 0:
            head = self[0]
            if hasattr(head, "format") and not isinstance(head, str):
                head = head.format()
            elif needs_i8_conversion(self.dtype):
                # e.g. Timedelta, display as values, not quoted
                head = self._formatter_func(head).replace("'", "")
            tail = self[-1]
            if hasattr(tail, "format") and not isinstance(tail, str):
                tail = tail.format()
            elif needs_i8_conversion(self.dtype):
                # e.g. Timedelta, display as values, not quoted
                tail = self._formatter_func(tail).replace("'", "")

            index_summary = f", {head} to {tail}"
        else:
            index_summary = ""

        if name is None:
            name = type(self).__name__
        return f"{name}: {len(self)} entries{index_summary}"

    # --------------------------------------------------------------------
    # Conversion Methods

    def to_flat_index(self: _IndexT) -> _IndexT:
        """
        Identity method.

        This is implemented for compatibility with subclass implementations
        when chaining.

        Returns
        -------
        pd.Index
            Caller.

        See Also
        --------
        MultiIndex.to_flat_index : Subclass implementation.
        """
        return self

    @final
    def to_series(self, index=None, name: Hashable = None) -> Series:
        """
        Create a Series with both index and values equal to the index keys.

        Useful with map for returning an indexer based on an index.

        Parameters
        ----------
        index : Index, optional
            Index of resulting Series. If None, defaults to original index.
        name : str, optional
            Name of resulting Series. If None, defaults to name of original
            index.

        Returns
        -------
        Series
            The dtype will be based on the type of the Index values.

        See Also
        --------
        Index.to_frame : Convert an Index to a DataFrame.
        Series.to_frame : Convert Series to DataFrame.

        Examples
        --------
        >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal')

        By default, the original Index and original name is reused.

        >>> idx.to_series()
        animal
        Ant      Ant
        Bear    Bear
        Cow      Cow
        Name: animal, dtype: object

        To enforce a new Index, specify new labels to ``index``:

        >>> idx.to_series(index=[0, 1, 2])
        0     Ant
        1    Bear
        2     Cow
        Name: animal, dtype: object

        To override the name of the resulting column, specify `name`:

        >>> idx.to_series(name='zoo')
        animal
        Ant      Ant
        Bear    Bear
        Cow      Cow
        Name: zoo, dtype: object
        """
        from pandas import Series

        if index is None:
            index = self._view()
        if name is None:
            name = self.name

        return Series(self._values.copy(), index=index, name=name)

    def to_frame(
        self, index: bool = True, name: Hashable = lib.no_default
    ) -> DataFrame:
        """
        Create a DataFrame with a column containing the Index.

        Parameters
        ----------
        index : bool, default True
            Set the index of the returned DataFrame as the original Index.

        name : object, defaults to index.name
            The passed name should substitute for the index name (if it has
            one).

        Returns
        -------
        DataFrame
            DataFrame containing the original Index data.

        See Also
        --------
        Index.to_series : Convert an Index to a Series.
        Series.to_frame : Convert Series to DataFrame.

        Examples
        --------
        >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal')
        >>> idx.to_frame()
               animal
        animal
        Ant       Ant
        Bear     Bear
        Cow       Cow

        By default, the original Index is reused. To enforce a new Index:

        >>> idx.to_frame(index=False)
            animal
        0      Ant
        1     Bear
        2      Cow

        To override the name of the resulting column, specify `name`:

        >>> idx.to_frame(index=False, name='zoo')
            zoo
        0   Ant
        1  Bear
        2   Cow
        """
        from pandas import DataFrame

        if name is lib.no_default:
            name = self._get_level_names()
        result = DataFrame({name: self._values.copy()})

        if index:
            result.index = self
        return result

    # --------------------------------------------------------------------
    # Name-Centric Methods

    @property
    def name(self) -> Hashable:
        """
        Return Index or MultiIndex name.
        """
        return self._name

    @name.setter
    def name(self, value: Hashable) -> None:
        if self._no_setting_name:
            # Used in MultiIndex.levels to avoid silently ignoring name updates.
            raise RuntimeError(
                "Cannot set name on a level of a MultiIndex. Use "
                "'MultiIndex.set_names' instead."
            )
        maybe_extract_name(value, None, type(self))
        self._name = value

    @final
    def _validate_names(
        self, name=None, names=None, deep: bool = False
    ) -> list[Hashable]:
        """
        Handles the quirks of having a singular 'name' parameter for general
        Index and plural 'names' parameter for MultiIndex.
        """
        from copy import deepcopy

        if names is not None and name is not None:
            raise TypeError("Can only provide one of `names` and `name`")
        if names is None and name is None:
            new_names = deepcopy(self.names) if deep else self.names
        elif names is not None:
            if not is_list_like(names):
                raise TypeError("Must pass list-like as `names`.")
            new_names = names
        elif not is_list_like(name):
            new_names = [name]
        else:
            new_names = name

        if len(new_names) != len(self.names):
            raise ValueError(
                f"Length of new names must be {len(self.names)}, got {len(new_names)}"
            )

        # All items in 'new_names' need to be hashable
        validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name")

        return new_names

    def _get_default_index_names(
        self, names: Hashable | Sequence[Hashable] | None = None, default=None
    ) -> list[Hashable]:
        """
        Get names of index.

        Parameters
        ----------
        names : int, str or 1-dimensional list, default None
            Index names to set.
        default : str
            Default name of index.

        Raises
        ------
        ValueError
            if names is not a str or 1-dimensional list
        """
        from pandas.core.indexes.multi import MultiIndex

        if names is not None:
            if isinstance(names, (int, str)):
                names = [names]

        if not isinstance(names, list) and names is not None:
            raise ValueError("Index names must be str or 1-dimensional list")

        if not names:
            if isinstance(self, MultiIndex):
                names = com.fill_missing_names(self.names)
            else:
                names = [default] if self.name is None else [self.name]

        return names

    def _get_names(self) -> FrozenList:
        return FrozenList((self.name,))

    def _set_names(self, values, *, level=None) -> None:
        """
        Set new names on index. Each name has to be a hashable type.

        Parameters
        ----------
        values : str or sequence
            name(s) to set
        level : int, level name, or sequence of int/level names (default None)
            If the index is a MultiIndex (hierarchical), level(s) to set (None
            for all levels). Otherwise level must be None
Raises
        ------
        TypeError if each name is not hashable.
        """
        if not is_list_like(values):
            raise ValueError("Names must be a list-like")
        if len(values) != 1:
            raise ValueError(f"Length of new names must be 1, got {len(values)}")

        # GH 20527
        # All items in 'name' need to be hashable:
        validate_all_hashable(*values, error_name=f"{type(self).__name__}.name")

        self._name = values[0]

    names = property(fset=_set_names, fget=_get_names)

    @overload
    def set_names(
        self: _IndexT, names, *, level=..., inplace: Literal[False] = ...
    ) -> _IndexT:
        ...

    @overload
    def set_names(self, names, *, level=..., inplace: Literal[True]) -> None:
        ...

    @overload
    def set_names(
        self: _IndexT, names, *, level=..., inplace: bool = ...
    ) -> _IndexT | None:
        ...

    def set_names(
        self: _IndexT, names, *, level=None, inplace: bool = False
    ) -> _IndexT | None:
        """
        Set Index or MultiIndex name.

        Able to set new names partially and by level.

        Parameters
        ----------

        names : label or list of label or dict-like for MultiIndex
            Name(s) to set.

            .. versionchanged:: 1.3.0

        level : int, label or list of int or label, optional
            If the index is a MultiIndex and names is not dict-like, level(s) to set
            (None for all levels). Otherwise level must be None.

            .. versionchanged:: 1.3.0

        inplace : bool, default False
            Modifies the object directly, instead of creating a new Index or
            MultiIndex.

        Returns
        -------
        Index or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Index.rename : Able to set new names without level.

        Examples
        --------
        >>> idx = pd.Index([1, 2, 3, 4])
        >>> idx
        Index([1, 2, 3, 4], dtype='int64')
        >>> idx.set_names('quarter')
        Index([1, 2, 3, 4], dtype='int64', name='quarter')

        >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
        ...                                   [2018, 2019]])
        >>> idx
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   )
        >>> idx = idx.set_names(['kind', 'year'])
        >>> idx.set_names('species', level=0)
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   names=['species', 'year'])

        When renaming levels with a dict, levels can not be passed.

        >>> idx.set_names({'kind': 'snake'})
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   names=['snake', 'year'])
        """
        if level is not None and not isinstance(self, ABCMultiIndex):
            raise ValueError("Level must be None for non-MultiIndex")

        if level is not None and not is_list_like(level) and is_list_like(names):
            raise TypeError("Names must be a string when a single level is provided.")

        if not is_list_like(names) and level is None and self.nlevels > 1:
            raise TypeError("Must pass list-like as `names`.")

        if is_dict_like(names) and not isinstance(self, ABCMultiIndex):
            raise TypeError("Can only pass dict-like as `names` for MultiIndex.")

        if is_dict_like(names) and level is not None:
            raise TypeError("Can not pass level for dictlike `names`.")

        if isinstance(self, ABCMultiIndex) and is_dict_like(names) and level is None:
            # Transform dict to list of new names and corresponding levels
            level, names_adjusted = [], []
            for i, name in enumerate(self.names):
                if name in names.keys():
                    level.append(i)
                    names_adjusted.append(names[name])
            names = names_adjusted

        if not is_list_like(names):
            names = [names]
        if level is not None and not is_list_like(level):
            level = [level]

        if inplace:
            idx = self
        else:
            idx = self._view()

        idx._set_names(names, level=level)
        if not inplace:
            return idx
        return None

    def rename(self, name, inplace: bool = False):
        """
        Alter Index or MultiIndex name.

        Able to set new names without level. Defaults to returning new index.
        Length of names must match number of levels in MultiIndex.

        Parameters
        ----------
        name : label or list of labels
            Name(s) to set.
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Index or
            MultiIndex.

        Returns
        -------
        Index or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Index.set_names : Able to set new names partially and by level.

        Examples
        --------
        >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score')
        >>> idx.rename('grade')
        Index(['A', 'C', 'A', 'B'], dtype='object', name='grade')

        >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
        ...                                   [2018, 2019]],
        ...                                   names=['kind', 'year'])
        >>> idx
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   names=['kind', 'year'])
        >>> idx.rename(['species', 'year'])
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   names=['species', 'year'])
        >>> idx.rename('species')
        Traceback (most recent call last):
        TypeError: Must pass list-like as `names`.
        """
        return self.set_names([name], inplace=inplace)

    # --------------------------------------------------------------------
    # Level-Centric Methods

    @property
    def nlevels(self) -> int:
        """
        Number of levels.
        """
        return 1

    def _sort_levels_monotonic(self: _IndexT) -> _IndexT:
        """
        Compat with MultiIndex.
        """
        return self

    @final
    def _validate_index_level(self, level) -> None:
        """
        Validate index level.

        For single-level Index getting level number is a no-op, but some
        verification must be done like in MultiIndex.

        """
        if isinstance(level, int):
            if level < 0 and level != -1:
                raise IndexError(
                    "Too many levels: Index has only 1 level, "
                    f"{level} is not a valid level number"
                )
            if level > 0:
                raise IndexError(
                    f"Too many levels: Index has only 1 level, not {level + 1}"
                )
        elif level != self.name:
            raise KeyError(
                f"Requested level ({level}) does not match index name ({self.name})"
            )

    def _get_level_number(self, level) -> int:
        self._validate_index_level(level)
        return 0

    def sortlevel(
        self, level=None, ascending: bool | list[bool] = True, sort_remaining=None
    ):
        """
        For internal compatibility with the Index API.

        Sort the Index. This is for compat with MultiIndex

        Parameters
        ----------
        ascending : bool, default True
            False to sort in descending order

        level, sort_remaining are compat parameters

        Returns
        -------
        Index
        """
        if not isinstance(ascending, (list, bool)):
            raise TypeError(
                "ascending must be a single bool value or "
                "a list of bool values of length 1"
            )

        if isinstance(ascending, list):
            if len(ascending) != 1:
                raise TypeError("ascending must be a list of bool values of length 1")
            ascending = ascending[0]

        if not isinstance(ascending, bool):
            raise TypeError("ascending must be a bool value")

        return self.sort_values(return_indexer=True, ascending=ascending)
|
|
|
|
    def _get_level_values(self, level) -> Index:
        """
        Return an Index of values for requested level.

        This is primarily useful to get an individual level of values from a
        MultiIndex, but is provided on Index as well for compatibility.

        Parameters
        ----------
        level : int or str
            It is either the integer position or the name of the level.

        Returns
        -------
        Index
            Calling object, as there is only one level in the Index.

        See Also
        --------
        MultiIndex.get_level_values : Get values for a level of a MultiIndex.

        Notes
        -----
        For Index, level should be 0, since there are no multiple levels.

        Examples
        --------
        >>> idx = pd.Index(list('abc'))
        >>> idx
        Index(['a', 'b', 'c'], dtype='object')

        Get level values by supplying `level` as integer:

        >>> idx.get_level_values(0)
        Index(['a', 'b', 'c'], dtype='object')
        """
        self._validate_index_level(level)
        return self

    get_level_values = _get_level_values

    @final
    def droplevel(self, level: IndexLabel = 0):
        """
        Return index with requested level(s) removed.

        If resulting index has only 1 level left, the result will be
        of Index type, not MultiIndex. The original index is not modified inplace.

        Parameters
        ----------
        level : int, str, or list-like, default 0
            If a string is given, it must be the name of a level.
            If list-like, elements must be names or indexes of levels.

        Returns
        -------
        Index or MultiIndex

        Examples
        --------
        >>> mi = pd.MultiIndex.from_arrays(
        ...     [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z'])
        >>> mi
        MultiIndex([(1, 3, 5),
                    (2, 4, 6)],
                   names=['x', 'y', 'z'])

        >>> mi.droplevel()
        MultiIndex([(3, 5),
                    (4, 6)],
                   names=['y', 'z'])

        >>> mi.droplevel(2)
        MultiIndex([(1, 3),
                    (2, 4)],
                   names=['x', 'y'])

        >>> mi.droplevel('z')
        MultiIndex([(1, 3),
                    (2, 4)],
                   names=['x', 'y'])

        >>> mi.droplevel(['x', 'y'])
        Index([5, 6], dtype='int64', name='z')
        """
        if not isinstance(level, (tuple, list)):
            level = [level]

        levnums = sorted(self._get_level_number(lev) for lev in level)[::-1]

        return self._drop_level_numbers(levnums)

    @final
    def _drop_level_numbers(self, levnums: list[int]):
        """
        Drop MultiIndex levels by level _number_, not name.
        """

        if not levnums and not isinstance(self, ABCMultiIndex):
            return self
        if len(levnums) >= self.nlevels:
            raise ValueError(
                f"Cannot remove {len(levnums)} levels from an index with "
                f"{self.nlevels} levels: at least one level must be left."
            )
        # The two checks above guarantee that here self is a MultiIndex
        self = cast("MultiIndex", self)

        new_levels = list(self.levels)
        new_codes = list(self.codes)
        new_names = list(self.names)

        for i in levnums:
            new_levels.pop(i)
            new_codes.pop(i)
            new_names.pop(i)

        if len(new_levels) == 1:
            lev = new_levels[0]

            if len(lev) == 0:
                # If lev is empty, lev.take will fail GH#42055
                if len(new_codes[0]) == 0:
                    # GH#45230 preserve RangeIndex here
                    #  see test_reset_index_empty_rangeindex
                    result = lev[:0]
                else:
                    res_values = algos.take(lev._values, new_codes[0], allow_fill=True)
                    # _constructor instead of type(lev) for RangeIndex compat GH#35230
                    result = lev._constructor._simple_new(res_values, name=new_names[0])
            else:
                # set nan if needed
                mask = new_codes[0] == -1
                result = new_levels[0].take(new_codes[0])
                if mask.any():
                    result = result.putmask(mask, np.nan)

            result._name = new_names[0]

            return result
        else:
            from pandas.core.indexes.multi import MultiIndex

            return MultiIndex(
                levels=new_levels,
                codes=new_codes,
                names=new_names,
                verify_integrity=False,
            )

    # --------------------------------------------------------------------
    # Introspection Methods

    @cache_readonly
    @final
    def _can_hold_na(self) -> bool:
        if isinstance(self.dtype, ExtensionDtype):
            if isinstance(self.dtype, IntervalDtype):
                # FIXME(GH#45720): this is inaccurate for integer-backed
                #  IntervalArray, but without it other.categories.take raises
                #  in IntervalArray._cmp_method
                return True
            return self.dtype._can_hold_na
        if self.dtype.kind in ["i", "u", "b"]:
            return False
        return True

    @property
    def is_monotonic_increasing(self) -> bool:
        """
        Return a boolean indicating whether the values are equal or increasing.

        Returns
        -------
        bool

        See Also
        --------
        Index.is_monotonic_decreasing : Check if the values are equal or decreasing.

        Examples
        --------
        >>> pd.Index([1, 2, 3]).is_monotonic_increasing
        True
        >>> pd.Index([1, 2, 2]).is_monotonic_increasing
        True
        >>> pd.Index([1, 3, 2]).is_monotonic_increasing
        False
        """
        return self._engine.is_monotonic_increasing

    @property
    def is_monotonic_decreasing(self) -> bool:
        """
        Return a boolean indicating whether the values are equal or decreasing.

        Returns
        -------
        bool

        See Also
        --------
        Index.is_monotonic_increasing : Check if the values are equal or increasing.

        Examples
        --------
        >>> pd.Index([3, 2, 1]).is_monotonic_decreasing
        True
        >>> pd.Index([3, 2, 2]).is_monotonic_decreasing
        True
        >>> pd.Index([3, 1, 2]).is_monotonic_decreasing
        False
        """
        return self._engine.is_monotonic_decreasing

    @final
    @property
    def _is_strictly_monotonic_increasing(self) -> bool:
        """
        Return whether the index holds strictly monotonic increasing
        (i.e. only increasing) values.

        Examples
        --------
        >>> Index([1, 2, 3])._is_strictly_monotonic_increasing
        True
        >>> Index([1, 2, 2])._is_strictly_monotonic_increasing
        False
        >>> Index([1, 3, 2])._is_strictly_monotonic_increasing
        False
        """
        return self.is_unique and self.is_monotonic_increasing

    @final
    @property
    def _is_strictly_monotonic_decreasing(self) -> bool:
        """
        Return whether the index holds strictly monotonic decreasing
        (i.e. only decreasing) values.

        Examples
        --------
        >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing
        True
        >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing
        False
        >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing
        False
        """
        return self.is_unique and self.is_monotonic_decreasing

    @cache_readonly
    def is_unique(self) -> bool:
        """
        Return if the index has unique values.

        Returns
        -------
        bool

        See Also
        --------
        Index.has_duplicates : Inverse method that checks if it has duplicate values.

        Examples
        --------
        >>> idx = pd.Index([1, 5, 7, 7])
        >>> idx.is_unique
        False

        >>> idx = pd.Index([1, 5, 7])
        >>> idx.is_unique
        True

        >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.is_unique
        False

        >>> idx = pd.Index(["Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.is_unique
        True
        """
        return self._engine.is_unique

    @final
    @property
    def has_duplicates(self) -> bool:
        """
        Check if the Index has duplicate values.

        Returns
        -------
        bool
            Whether or not the Index has duplicate values.

        See Also
        --------
        Index.is_unique : Inverse method that checks if it has unique values.

        Examples
        --------
        >>> idx = pd.Index([1, 5, 7, 7])
        >>> idx.has_duplicates
        True

        >>> idx = pd.Index([1, 5, 7])
        >>> idx.has_duplicates
        False

        >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.has_duplicates
        True

        >>> idx = pd.Index(["Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.has_duplicates
        False
        """
        return not self.is_unique

    @final
    def is_boolean(self) -> bool:
        """
        Check if the Index only consists of booleans.

        .. deprecated:: 2.0.0
            Use `pandas.api.types.is_bool_dtype` instead.

        Returns
        -------
        bool
            Whether or not the Index only consists of booleans.

        See Also
        --------
        is_integer : Check if the Index only consists of integers (deprecated).
        is_floating : Check if the Index is a floating type (deprecated).
        is_numeric : Check if the Index only consists of numeric data (deprecated).
        is_object : Check if the Index is of the object dtype (deprecated).
        is_categorical : Check if the Index holds categorical data (deprecated).
        is_interval : Check if the Index holds Interval objects (deprecated).

        Examples
        --------
        >>> idx = pd.Index([True, False, True])
        >>> idx.is_boolean()  # doctest: +SKIP
        True

        >>> idx = pd.Index(["True", "False", "True"])
        >>> idx.is_boolean()  # doctest: +SKIP
        False

        >>> idx = pd.Index([True, False, "True"])
        >>> idx.is_boolean()  # doctest: +SKIP
        False
        """
        warnings.warn(
            f"{type(self).__name__}.is_boolean is deprecated. "
            "Use pandas.api.types.is_bool_dtype instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self.inferred_type in ["boolean"]

    @final
    def is_integer(self) -> bool:
        """
        Check if the Index only consists of integers.

        .. deprecated:: 2.0.0
            Use `pandas.api.types.is_integer_dtype` instead.

        Returns
        -------
        bool
            Whether or not the Index only consists of integers.

        See Also
        --------
        is_boolean : Check if the Index only consists of booleans (deprecated).
        is_floating : Check if the Index is a floating type (deprecated).
        is_numeric : Check if the Index only consists of numeric data (deprecated).
        is_object : Check if the Index is of the object dtype (deprecated).
        is_categorical : Check if the Index holds categorical data (deprecated).
        is_interval : Check if the Index holds Interval objects (deprecated).

        Examples
        --------
        >>> idx = pd.Index([1, 2, 3, 4])
        >>> idx.is_integer()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
        >>> idx.is_integer()  # doctest: +SKIP
        False

        >>> idx = pd.Index(["Apple", "Mango", "Watermelon"])
        >>> idx.is_integer()  # doctest: +SKIP
        False
        """
        warnings.warn(
            f"{type(self).__name__}.is_integer is deprecated. "
            "Use pandas.api.types.is_integer_dtype instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self.inferred_type in ["integer"]

    @final
    def is_floating(self) -> bool:
        """
        Check if the Index is a floating type.

        .. deprecated:: 2.0.0
            Use `pandas.api.types.is_float_dtype` instead.

        The Index may consist of only floats, NaNs, or a mix of floats,
        integers, or NaNs.

        Returns
        -------
        bool
            Whether or not the Index only consists of floats, NaNs, or
            a mix of floats, integers, or NaNs.

        See Also
        --------
        is_boolean : Check if the Index only consists of booleans (deprecated).
        is_integer : Check if the Index only consists of integers (deprecated).
        is_numeric : Check if the Index only consists of numeric data (deprecated).
        is_object : Check if the Index is of the object dtype (deprecated).
        is_categorical : Check if the Index holds categorical data (deprecated).
        is_interval : Check if the Index holds Interval objects (deprecated).

        Examples
        --------
        >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
        >>> idx.is_floating()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1.0, 2.0, np.nan, 4.0])
        >>> idx.is_floating()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1, 2, 3, 4, np.nan])
        >>> idx.is_floating()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1, 2, 3, 4])
        >>> idx.is_floating()  # doctest: +SKIP
        False
        """
        warnings.warn(
            f"{type(self).__name__}.is_floating is deprecated. "
            "Use pandas.api.types.is_float_dtype instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"]

    @final
    def is_numeric(self) -> bool:
        """
        Check if the Index only consists of numeric data.

        .. deprecated:: 2.0.0
            Use `pandas.api.types.is_any_real_numeric_dtype` instead.

        Returns
        -------
        bool
            Whether or not the Index only consists of numeric data.

        See Also
        --------
        is_boolean : Check if the Index only consists of booleans (deprecated).
        is_integer : Check if the Index only consists of integers (deprecated).
        is_floating : Check if the Index is a floating type (deprecated).
        is_object : Check if the Index is of the object dtype (deprecated).
        is_categorical : Check if the Index holds categorical data (deprecated).
        is_interval : Check if the Index holds Interval objects (deprecated).

        Examples
        --------
        >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
        >>> idx.is_numeric()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1, 2, 3, 4.0])
        >>> idx.is_numeric()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1, 2, 3, 4])
        >>> idx.is_numeric()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1, 2, 3, 4.0, np.nan])
        >>> idx.is_numeric()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1, 2, 3, 4.0, np.nan, "Apple"])
        >>> idx.is_numeric()  # doctest: +SKIP
        False
        """
        warnings.warn(
            f"{type(self).__name__}.is_numeric is deprecated. "
            "Use pandas.api.types.is_any_real_numeric_dtype instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self.inferred_type in ["integer", "floating"]

    @final
    def is_object(self) -> bool:
        """
        Check if the Index is of the object dtype.

        .. deprecated:: 2.0.0
            Use `pandas.api.types.is_object_dtype` instead.

        Returns
        -------
        bool
            Whether or not the Index is of the object dtype.

        See Also
        --------
        is_boolean : Check if the Index only consists of booleans (deprecated).
        is_integer : Check if the Index only consists of integers (deprecated).
        is_floating : Check if the Index is a floating type (deprecated).
        is_numeric : Check if the Index only consists of numeric data (deprecated).
        is_categorical : Check if the Index holds categorical data (deprecated).
        is_interval : Check if the Index holds Interval objects (deprecated).

        Examples
        --------
        >>> idx = pd.Index(["Apple", "Mango", "Watermelon"])
        >>> idx.is_object()  # doctest: +SKIP
        True

        >>> idx = pd.Index(["Apple", "Mango", 2.0])
        >>> idx.is_object()  # doctest: +SKIP
        True

        >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.is_object()  # doctest: +SKIP
        False

        >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
        >>> idx.is_object()  # doctest: +SKIP
        False
        """
        warnings.warn(
            f"{type(self).__name__}.is_object is deprecated. "
            "Use pandas.api.types.is_object_dtype instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return is_object_dtype(self.dtype)

    @final
    def is_categorical(self) -> bool:
        """
        Check if the Index holds categorical data.

        .. deprecated:: 2.0.0
            Use `isinstance(index.dtype, pd.CategoricalDtype)` instead.

        Returns
        -------
        bool
            True if the Index is categorical.

        See Also
        --------
        CategoricalIndex : Index for categorical data.
        is_boolean : Check if the Index only consists of booleans (deprecated).
        is_integer : Check if the Index only consists of integers (deprecated).
        is_floating : Check if the Index is a floating type (deprecated).
        is_numeric : Check if the Index only consists of numeric data (deprecated).
        is_object : Check if the Index is of the object dtype (deprecated).
        is_interval : Check if the Index holds Interval objects (deprecated).

        Examples
        --------
        >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.is_categorical()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1, 3, 5, 7])
        >>> idx.is_categorical()  # doctest: +SKIP
        False

        >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"])
        >>> s
        0        Peter
        1       Victor
        2    Elisabeth
        3          Mar
        dtype: object
        >>> s.index.is_categorical()  # doctest: +SKIP
        False
        """
        warnings.warn(
            f"{type(self).__name__}.is_categorical is deprecated. "
            "Use pandas.api.types.is_categorical_dtype instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        return self.inferred_type in ["categorical"]

    @final
    def is_interval(self) -> bool:
        """
        Check if the Index holds Interval objects.

        .. deprecated:: 2.0.0
            Use `isinstance(index.dtype, pd.IntervalDtype)` instead.

        Returns
        -------
        bool
            Whether or not the Index holds Interval objects.

        See Also
        --------
        IntervalIndex : Index for Interval objects.
        is_boolean : Check if the Index only consists of booleans (deprecated).
        is_integer : Check if the Index only consists of integers (deprecated).
        is_floating : Check if the Index is a floating type (deprecated).
        is_numeric : Check if the Index only consists of numeric data (deprecated).
        is_object : Check if the Index is of the object dtype (deprecated).
        is_categorical : Check if the Index holds categorical data (deprecated).

        Examples
        --------
        >>> idx = pd.Index([pd.Interval(left=0, right=5),
        ...                 pd.Interval(left=5, right=10)])
        >>> idx.is_interval()  # doctest: +SKIP
        True

        >>> idx = pd.Index([1, 3, 5, 7])
        >>> idx.is_interval()  # doctest: +SKIP
        False
        """
        warnings.warn(
            f"{type(self).__name__}.is_interval is deprecated. "
            "Use pandas.api.types.is_interval_dtype instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self.inferred_type in ["interval"]

    @final
    def _holds_integer(self) -> bool:
        """
        Whether the type is an integer type.
        """
        return self.inferred_type in ["integer", "mixed-integer"]

    @final
    def holds_integer(self) -> bool:
        """
        Whether the type is an integer type.

        .. deprecated:: 2.0.0
            Use `pandas.api.types.infer_dtype` instead.
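
        Examples
        --------
        A sketch of the suggested replacement:

        >>> from pandas.api.types import infer_dtype
        >>> infer_dtype(pd.Index([1, 2, 3]))
        'integer'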
        """
        warnings.warn(
            f"{type(self).__name__}.holds_integer is deprecated. "
            "Use pandas.api.types.infer_dtype instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._holds_integer()

    @cache_readonly
    def inferred_type(self) -> str_t:
        """
        Return a string of the type inferred from the values.
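
        Examples
        --------
        >>> pd.Index([1, 2, 3]).inferred_type
        'integer'
        >>> pd.Index([1.0, 2.0, 3.0]).inferred_type
        'floating'
        >>> pd.Index(['a', 'b']).inferred_type
        'string'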
        """
        return lib.infer_dtype(self._values, skipna=False)

    @cache_readonly
    @final
    def _is_all_dates(self) -> bool:
        """
        Whether or not the index values only consist of dates.
        """
        if needs_i8_conversion(self.dtype):
            return True
        elif self.dtype != _dtype_obj:
            # TODO(ExtensionIndex): 3rd party EA might override?
            # Note: this includes IntervalIndex, even when the left/right
            #  contain datetime-like objects.
            return False
        elif self._is_multi:
            return False
        return is_datetime_array(ensure_object(self._values))

    @final
    @cache_readonly
    def _is_multi(self) -> bool:
        """
        Cached check equivalent to isinstance(self, MultiIndex)
        """
        return isinstance(self, ABCMultiIndex)

    # --------------------------------------------------------------------
    # Pickle Methods

    def __reduce__(self):
        d = {"data": self._data, "name": self.name}
        return _new_Index, (type(self), d), None
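
    # A round-trip sketch (illustrative): unpickling funnels through
    # ``_new_Index`` with the data/name dict captured above.
    #
    #   >>> import pickle
    #   >>> idx = pd.Index([1, 2, 3], name="x")
    #   >>> pickle.loads(pickle.dumps(idx)).equals(idx)
    #   True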

    # --------------------------------------------------------------------
    # Null Handling Methods

    @cache_readonly
    def _na_value(self):
        """The expected NA value to use with this index."""
        dtype = self.dtype
        if isinstance(dtype, np.dtype):
            if dtype.kind in ["m", "M"]:
                return NaT
            return np.nan
        return dtype.na_value

    @cache_readonly
    def _isnan(self) -> npt.NDArray[np.bool_]:
        """
        Return if each value is NaN.
        """
        if self._can_hold_na:
            return isna(self)
        else:
            # shouldn't reach this condition by checking hasnans beforehand
            values = np.empty(len(self), dtype=np.bool_)
            values.fill(False)
            return values

    @cache_readonly
    def hasnans(self) -> bool:
        """
        Return True if there are any NaNs.

        Enables various performance speedups.

        Returns
        -------
        bool
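
        Examples
        --------
        A small example; the values below are coerced to float64 so that
        ``np.nan`` can be held:

        >>> pd.Index([1, 2, 3, np.nan]).hasnans
        True
        >>> pd.Index([1, 2, 3]).hasnans
        False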
        """
        if self._can_hold_na:
            return bool(self._isnan.any())
        else:
            return False

    @final
    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Detect missing values.

        Return a boolean same-sized object indicating if the values are NA.
        NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get
        mapped to ``True`` values.
        Everything else gets mapped to ``False`` values. Characters such as
        empty strings `''` or :attr:`numpy.inf` are not considered NA values
        (unless you set ``pandas.options.mode.use_inf_as_na = True``).

        Returns
        -------
        numpy.ndarray[bool]
            A boolean array of whether my values are NA.

        See Also
        --------
        Index.notna : Boolean inverse of isna.
        Index.dropna : Omit entries with missing values.
        isna : Top-level isna.
        Series.isna : Detect missing values in Series object.

        Examples
        --------
        Show which entries in a pandas.Index are NA. The result is an
        array.

        >>> idx = pd.Index([5.2, 6.0, np.NaN])
        >>> idx
        Index([5.2, 6.0, nan], dtype='float64')
        >>> idx.isna()
        array([False, False,  True])

        Empty strings are not considered NA values. None is considered an NA
        value.

        >>> idx = pd.Index(['black', '', 'red', None])
        >>> idx
        Index(['black', '', 'red', None], dtype='object')
        >>> idx.isna()
        array([False, False, False,  True])

        For datetimes, `NaT` (Not a Time) is considered as an NA value.

        >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'),
        ...                         pd.Timestamp(''), None, pd.NaT])
        >>> idx
        DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'],
                      dtype='datetime64[ns]', freq=None)
        >>> idx.isna()
        array([False,  True,  True,  True])
        """
        return self._isnan

    isnull = isna

    @final
    def notna(self) -> npt.NDArray[np.bool_]:
        """
        Detect existing (non-missing) values.

        Return a boolean same-sized object indicating if the values are not NA.
        Non-missing values get mapped to ``True``. Characters such as empty
        strings ``''`` or :attr:`numpy.inf` are not considered NA values
        (unless you set ``pandas.options.mode.use_inf_as_na = True``).
        NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False``
        values.

        Returns
        -------
        numpy.ndarray[bool]
            Boolean array to indicate which entries are not NA.

        See Also
        --------
        Index.notnull : Alias of notna.
        Index.isna : Inverse of notna.
        notna : Top-level notna.

        Examples
        --------
        Show which entries in an Index are not NA. The result is an
        array.

        >>> idx = pd.Index([5.2, 6.0, np.NaN])
        >>> idx
        Index([5.2, 6.0, nan], dtype='float64')
        >>> idx.notna()
        array([ True,  True, False])

        Empty strings are not considered NA values. None is considered an NA
        value.

        >>> idx = pd.Index(['black', '', 'red', None])
        >>> idx
        Index(['black', '', 'red', None], dtype='object')
        >>> idx.notna()
        array([ True,  True,  True, False])
        """
        return ~self.isna()

    notnull = notna

    def fillna(self, value=None, downcast=None):
        """
        Fill NA/NaN values with the specified value.

        Parameters
        ----------
        value : scalar
            Scalar value to use to fill holes (e.g. 0).
            This value cannot be a list-like.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Index

        See Also
        --------
        DataFrame.fillna : Fill NaN values of a DataFrame.
        Series.fillna : Fill NaN values of a Series.
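
        Examples
        --------
        A minimal sketch with a float64 Index:

        >>> idx = pd.Index([np.nan, 1.0, 3.0])
        >>> idx.fillna(2.0)
        Index([2.0, 1.0, 3.0], dtype='float64')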
        """

        value = self._require_scalar(value)
        if self.hasnans:
            result = self.putmask(self._isnan, value)
            if downcast is None:
                # no need to care about metadata other than name
                #  because it can't have freq if it has NaTs
                # _with_infer needed for test_fillna_categorical
                return Index._with_infer(result, name=self.name)
            raise NotImplementedError(
                f"{type(self).__name__}.fillna does not support 'downcast' "
                "argument values other than 'None'."
            )
        return self._view()

    def dropna(self: _IndexT, how: AnyAll = "any") -> _IndexT:
        """
        Return Index without NA/NaN values.

        Parameters
        ----------
        how : {'any', 'all'}, default 'any'
            If the Index is a MultiIndex, drop the value when any or all levels
            are NaN.

        Returns
        -------
        Index
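
        Examples
        --------
        >>> idx = pd.Index([1.0, np.nan, 3.0])
        >>> idx.dropna()
        Index([1.0, 3.0], dtype='float64')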
        """
        if how not in ("any", "all"):
            raise ValueError(f"invalid how option: {how}")

        if self.hasnans:
            res_values = self._values[~self._isnan]
            return type(self)._simple_new(res_values, name=self.name)
        return self._view()

    # --------------------------------------------------------------------
    # Uniqueness Methods

    def unique(self: _IndexT, level: Hashable | None = None) -> _IndexT:
        """
        Return unique values in the index.

        Unique values are returned in order of appearance; this does NOT sort.

        Parameters
        ----------
        level : int or hashable, optional
            Only return values from specified level (for MultiIndex).
            If int, gets the level by integer position, else by level name.

        Returns
        -------
        Index

        See Also
        --------
        unique : NumPy array of unique values in that column.
        Series.unique : Return unique values of Series object.
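
        Examples
        --------
        Order of first appearance is preserved:

        >>> idx = pd.Index([1, 5, 1, 2, 5])
        >>> idx.unique()
        Index([1, 5, 2], dtype='int64')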
        """
        if level is not None:
            self._validate_index_level(level)

        if self.is_unique:
            return self._view()

        result = super().unique()
        return self._shallow_copy(result)

    def drop_duplicates(self: _IndexT, *, keep: DropKeep = "first") -> _IndexT:
        """
        Return Index with duplicate values removed.

        Parameters
        ----------
        keep : {'first', 'last', ``False``}, default 'first'
            - 'first' : Drop duplicates except for the first occurrence.
            - 'last' : Drop duplicates except for the last occurrence.
            - ``False`` : Drop all duplicates.

        Returns
        -------
        Index

        See Also
        --------
        Series.drop_duplicates : Equivalent method on Series.
        DataFrame.drop_duplicates : Equivalent method on DataFrame.
        Index.duplicated : Related method on Index, indicating duplicate
            Index values.

        Examples
        --------
        Generate a pandas.Index with duplicate values.

        >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])

        The `keep` parameter controls which duplicate values are removed.
        The value 'first' keeps the first occurrence for each
        set of duplicated entries. The default value of keep is 'first'.

        >>> idx.drop_duplicates(keep='first')
        Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')

        The value 'last' keeps the last occurrence for each set of duplicated
        entries.

        >>> idx.drop_duplicates(keep='last')
        Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object')

        The value ``False`` discards all sets of duplicated entries.

        >>> idx.drop_duplicates(keep=False)
        Index(['cow', 'beetle', 'hippo'], dtype='object')
        """
        if self.is_unique:
            return self._view()

        return super().drop_duplicates(keep=keep)

    def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
        """
        Indicate duplicate index values.

        Duplicated values are indicated as ``True`` values in the resulting
        array. Either all duplicates, all except the first, or all except the
        last occurrence of duplicates can be indicated.

        Parameters
        ----------
        keep : {'first', 'last', False}, default 'first'
            The value or values in a set of duplicates to mark as missing.

            - 'first' : Mark duplicates as ``True`` except for the first
              occurrence.
            - 'last' : Mark duplicates as ``True`` except for the last
              occurrence.
            - ``False`` : Mark all duplicates as ``True``.

        Returns
        -------
        np.ndarray[bool]

        See Also
        --------
        Series.duplicated : Equivalent method on pandas.Series.
        DataFrame.duplicated : Equivalent method on pandas.DataFrame.
        Index.drop_duplicates : Remove duplicate values from Index.

        Examples
        --------
        By default, for each set of duplicated values, the first occurrence is
        set to False and all others to True:

        >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama'])
        >>> idx.duplicated()
        array([False, False,  True, False,  True])

        which is equivalent to

        >>> idx.duplicated(keep='first')
        array([False, False,  True, False,  True])

        By using 'last', the last occurrence of each set of duplicated values
        is set to False and all others to True:

        >>> idx.duplicated(keep='last')
        array([ True, False,  True, False, False])

        By setting keep to ``False``, all duplicates are True:

        >>> idx.duplicated(keep=False)
        array([ True, False,  True, False,  True])
        """
        if self.is_unique:
            # fastpath available bc we are immutable
            return np.zeros(len(self), dtype=bool)
        return self._duplicated(keep=keep)

    # --------------------------------------------------------------------
    # Arithmetic & Logical Methods

    def __iadd__(self, other):
        # alias for __add__
        return self + other

    @final
    def __nonzero__(self) -> NoReturn:
        raise ValueError(
            f"The truth value of a {type(self).__name__} is ambiguous. "
            "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
        )

    __bool__ = __nonzero__
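
    # Consequently, truth-testing an Index raises rather than guessing, e.g.
    # ``bool(pd.Index([1]))`` raises the ValueError above; use ``.empty``,
    # ``.any()`` or ``.all()`` to ask a well-defined question.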

    # --------------------------------------------------------------------
    # Set Operation Methods

    def _get_reconciled_name_object(self, other):
        """
        If the result of a set operation will be self,
        return self, unless the name changes, in which
        case make a shallow copy of self.
        """
        name = get_op_result_name(self, other)
        if self.name is not name:
            return self.rename(name)
        return self

    @final
    def _validate_sort_keyword(self, sort):
        if sort not in [None, False, True]:
            raise ValueError(
                "The 'sort' keyword only takes the values of "
                f"None, True, or False; {sort} was passed."
            )

    @final
    def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index]:
        """
        With mismatched timezones, cast both to UTC.
        """
        # Caller is responsible for checking
        #  `not is_dtype_equal(self.dtype, other.dtype)`
        if (
            isinstance(self, ABCDatetimeIndex)
            and isinstance(other, ABCDatetimeIndex)
            and self.tz is not None
            and other.tz is not None
        ):
            # GH#39328, GH#45357
            left = self.tz_convert("UTC")
            right = other.tz_convert("UTC")
            return left, right
        return self, other
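
    # Sketch of the effect (illustrative; any two distinct zones work):
    #
    #   >>> left = pd.date_range("2020-01-01", periods=2, tz="US/Eastern")
    #   >>> right = pd.date_range("2020-01-01", periods=2, tz="UTC")
    #   >>> str(left.union(right).dtype)
    #   'datetime64[ns, UTC]'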

    @final
    def union(self, other, sort=None):
        """
        Form the union of two Index objects.

        If the Index objects are incompatible, both Index objects will be
        cast to dtype('object') first.

        Parameters
        ----------
        other : Index or array-like
        sort : bool or None, default None
            Whether to sort the resulting Index.

            * None : Sort the result, except when

              1. `self` and `other` are equal.
              2. `self` or `other` has length 0.
              3. Some values in `self` or `other` cannot be compared.
                 A RuntimeWarning is issued in this case.

            * False : do not sort the result.
            * True : Sort the result (which may raise TypeError).

        Returns
        -------
        Index

        Examples
        --------
        Union matching dtypes

        >>> idx1 = pd.Index([1, 2, 3, 4])
        >>> idx2 = pd.Index([3, 4, 5, 6])
        >>> idx1.union(idx2)
        Index([1, 2, 3, 4, 5, 6], dtype='int64')

        Union mismatched dtypes

        >>> idx1 = pd.Index(['a', 'b', 'c', 'd'])
        >>> idx2 = pd.Index([1, 2, 3, 4])
        >>> idx1.union(idx2)
        Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object')

        MultiIndex case

        >>> idx1 = pd.MultiIndex.from_arrays(
        ...     [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]]
        ... )
        >>> idx1
        MultiIndex([(1,  'Red'),
                    (1, 'Blue'),
                    (2,  'Red'),
                    (2, 'Blue')],
                   )
        >>> idx2 = pd.MultiIndex.from_arrays(
        ...     [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]]
        ... )
        >>> idx2
        MultiIndex([(3,   'Red'),
                    (3, 'Green'),
                    (2,   'Red'),
                    (2, 'Green')],
                   )
        >>> idx1.union(idx2)
        MultiIndex([(1,  'Blue'),
                    (1,   'Red'),
                    (2,  'Blue'),
                    (2, 'Green'),
                    (2,   'Red'),
                    (3, 'Green'),
                    (3,   'Red')],
                   )
        >>> idx1.union(idx2, sort=False)
        MultiIndex([(1,   'Red'),
                    (1,  'Blue'),
                    (2,   'Red'),
                    (2,  'Blue'),
                    (3,   'Red'),
                    (3, 'Green'),
                    (2, 'Green')],
                   )
        """
        self._validate_sort_keyword(sort)
        self._assert_can_do_setop(other)
        other, result_name = self._convert_can_do_setop(other)

        if not is_dtype_equal(self.dtype, other.dtype):
            if (
                isinstance(self, ABCMultiIndex)
                and not is_object_dtype(_unpack_nested_dtype(other))
                and len(other) > 0
            ):
                raise NotImplementedError(
                    "Can only union MultiIndex with MultiIndex or Index of tuples, "
                    "try mi.to_flat_index().union(other) instead."
                )
            self, other = self._dti_setop_align_tzs(other, "union")

            dtype = self._find_common_type_compat(other)
            left = self.astype(dtype, copy=False)
            right = other.astype(dtype, copy=False)
            return left.union(right, sort=sort)

        elif not len(other) or self.equals(other):
            # NB: whether this (and the `if not len(self)` check below) come before
            #  or after the is_dtype_equal check above affects the returned dtype
            result = self._get_reconciled_name_object(other)
            if sort is True:
                return result.sort_values()
            return result

        elif not len(self):
            result = other._get_reconciled_name_object(self)
            if sort is True:
                return result.sort_values()
            return result

        result = self._union(other, sort=sort)

        return self._wrap_setop_result(other, result)

    def _union(self, other: Index, sort):
        """
        Specific union logic should go here. In subclasses, union behavior
        should be overwritten here rather than in `self.union`.

        Parameters
        ----------
        other : Index or array-like
        sort : False or None, default False
            Whether to sort the resulting index.

            * False : do not sort the result.
            * None : sort the result, except when `self` and `other` are equal
              or when the values cannot be compared.

        Returns
        -------
        Index
        """
        lvals = self._values
        rvals = other._values

        if (
            sort is None
            and self.is_monotonic_increasing
            and other.is_monotonic_increasing
            and not (self.has_duplicates and other.has_duplicates)
            and self._can_use_libjoin
        ):
            # Both are monotonic and at least one is unique, so can use outer join
            #  (actually don't need either unique, but without this restriction
            #  test_union_same_value_duplicated_in_both fails)
            try:
                return self._outer_indexer(other)[0]
            except (TypeError, IncompatibleFrequency):
                # incomparable objects; should only be for object dtype
                value_list = list(lvals)

                # worth making this faster? a very unusual case
                value_set = set(lvals)
                value_list.extend([x for x in rvals if x not in value_set])
                # If objects are unorderable, we must have object dtype.
                return np.array(value_list, dtype=object)

        elif not other.is_unique:
            # other has duplicates
            result_dups = algos.union_with_duplicates(self, other)
            return _maybe_try_sort(result_dups, sort)

        # The rest of this method is analogous to Index._intersection_via_get_indexer

        # Self may have duplicates; other already checked as unique
        # find indexes of things in "other" that are not in "self"
        if self._index_as_unique:
            indexer = self.get_indexer(other)
            missing = (indexer == -1).nonzero()[0]
        else:
            missing = algos.unique1d(self.get_indexer_non_unique(other)[1])

        result: Index | MultiIndex | ArrayLike
        if self._is_multi:
            # Preserve MultiIndex to avoid losing dtypes
            result = self.append(other.take(missing))

        else:
            if len(missing) > 0:
                other_diff = rvals.take(missing)
                result = concat_compat((lvals, other_diff))
            else:
                result = lvals

        if not self.is_monotonic_increasing or not other.is_monotonic_increasing:
            # if both are monotonic then result should already be sorted
            result = _maybe_try_sort(result, sort)

        return result

    @final
    def _wrap_setop_result(self, other: Index, result) -> Index:
        name = get_op_result_name(self, other)
        if isinstance(result, Index):
            if result.name != name:
                result = result.rename(name)
        else:
            result = self._shallow_copy(result, name=name)
        return result

    @final
    def intersection(self, other, sort: bool = False):
        """
        Form the intersection of two Index objects.

        This returns a new Index with elements common to the index and `other`.

        Parameters
        ----------
        other : Index or array-like
        sort : True, False or None, default False
            Whether to sort the resulting index.

            * None : sort the result, except when `self` and `other` are equal
              or when the values cannot be compared.
            * False : do not sort the result.
            * True : Sort the result (which may raise TypeError).

        Returns
        -------
        Index

        Examples
        --------
        >>> idx1 = pd.Index([1, 2, 3, 4])
        >>> idx2 = pd.Index([3, 4, 5, 6])
        >>> idx1.intersection(idx2)
        Index([3, 4], dtype='int64')
        """
        self._validate_sort_keyword(sort)
        self._assert_can_do_setop(other)
        other, result_name = self._convert_can_do_setop(other)

        if not is_dtype_equal(self.dtype, other.dtype):
            self, other = self._dti_setop_align_tzs(other, "intersection")

        if self.equals(other):
            if self.has_duplicates:
                result = self.unique()._get_reconciled_name_object(other)
            else:
                result = self._get_reconciled_name_object(other)
            if sort is True:
                result = result.sort_values()
            return result

        if len(self) == 0 or len(other) == 0:
            # fastpath; we need to be careful about having commutativity

            if self._is_multi or other._is_multi:
                # _convert_can_do_setop ensures that we have both or neither
                # We retain self.levels
                return self[:0].rename(result_name)

            dtype = self._find_common_type_compat(other)
            if is_dtype_equal(self.dtype, dtype):
                # Slicing allows us to retain DTI/TDI.freq, RangeIndex

                # Note: self[:0] vs other[:0] affects
                #  1) which index's `freq` we get in DTI/TDI cases
                #     This may be a historical artifact, i.e. no documented
                #     reason for this choice.
                #  2) The `step` we get in RangeIndex cases
                if len(self) == 0:
                    return self[:0].rename(result_name)
                else:
                    return other[:0].rename(result_name)

            return Index([], dtype=dtype, name=result_name)

        elif not self._should_compare(other):
            # We can infer that the intersection is empty.
            if isinstance(self, ABCMultiIndex):
                return self[:0].rename(result_name)
            return Index([], name=result_name)

        elif not is_dtype_equal(self.dtype, other.dtype):
            dtype = self._find_common_type_compat(other)
            this = self.astype(dtype, copy=False)
            other = other.astype(dtype, copy=False)
            return this.intersection(other, sort=sort)

        result = self._intersection(other, sort=sort)
        return self._wrap_intersection_result(other, result)

    def _intersection(self, other: Index, sort: bool = False):
        """
        intersection specialized to the case with matching dtypes.
        """
        if (
            self.is_monotonic_increasing
            and other.is_monotonic_increasing
            and self._can_use_libjoin
            and not isinstance(self, ABCMultiIndex)
        ):
            try:
                res_indexer, indexer, _ = self._inner_indexer(other)
            except TypeError:
                # non-comparable; should only be for object dtype
                pass
            else:
                # TODO: algos.unique1d should preserve DTA/TDA
                if is_numeric_dtype(self):
                    # This is faster, because Index.unique() checks for uniqueness
                    # before calculating the unique values.
                    res = algos.unique1d(res_indexer)
                else:
                    result = self.take(indexer)
                    res = result.drop_duplicates()
                return ensure_wrapped_if_datetimelike(res)

        res_values = self._intersection_via_get_indexer(other, sort=sort)
        res_values = _maybe_try_sort(res_values, sort)
        return res_values

    def _wrap_intersection_result(self, other, result):
        # We will override for MultiIndex to handle empty results
        return self._wrap_setop_result(other, result)

    @final
    def _intersection_via_get_indexer(
        self, other: Index | MultiIndex, sort
    ) -> ArrayLike | MultiIndex:
        """
        Find the intersection of two Indexes using get_indexer.

        Returns
        -------
        np.ndarray or ExtensionArray
            The returned array will be unique.
        """
        left_unique = self.unique()
        right_unique = other.unique()

        # even though we are unique, we need get_indexer_for for IntervalIndex
        indexer = left_unique.get_indexer_for(right_unique)

        mask = indexer != -1

        taker = indexer.take(mask.nonzero()[0])
        if sort is False:
            # sort bc we want the elements in the same order they are in self
            # unnecessary in the case with sort=None bc we will sort later
            taker = np.sort(taker)

        if isinstance(left_unique, ABCMultiIndex):
            result = left_unique.take(taker)
        else:
            result = left_unique.take(taker)._values
        return result

    @final
    def difference(self, other, sort=None):
        """
        Return a new Index with elements of index not in `other`.

        This is the set difference of two Index objects.

        Parameters
        ----------
        other : Index or array-like
        sort : bool or None, default None
            Whether to sort the resulting index. By default, the
            values are attempted to be sorted, but any TypeError from
            incomparable elements is caught by pandas.

            * None : Attempt to sort the result, but catch any TypeErrors
              from comparing incomparable elements.
            * False : Do not sort the result.
            * True : Sort the result (which may raise TypeError).

        Returns
        -------
        Index

        Examples
        --------
        >>> idx1 = pd.Index([2, 1, 3, 4])
        >>> idx2 = pd.Index([3, 4, 5, 6])
        >>> idx1.difference(idx2)
        Index([1, 2], dtype='int64')
        >>> idx1.difference(idx2, sort=False)
        Index([2, 1], dtype='int64')
        """
        self._validate_sort_keyword(sort)
        self._assert_can_do_setop(other)
        other, result_name = self._convert_can_do_setop(other)

        # Note: we do NOT call _dti_setop_align_tzs here, as there
        #  is no requirement that .difference be commutative, so it does
        #  not cast to object.

        if self.equals(other):
            # Note: we do not (yet) sort even if sort=None GH#24959
            return self[:0].rename(result_name)

        if len(other) == 0:
            # Note: we do not (yet) sort even if sort=None GH#24959
            result = self.rename(result_name)
            if sort is True:
                return result.sort_values()
            return result

        if not self._should_compare(other):
            # Nothing matches -> difference is everything
            result = self.rename(result_name)
            if sort is True:
                return result.sort_values()
            return result

        result = self._difference(other, sort=sort)
        return self._wrap_difference_result(other, result)

    def _difference(self, other, sort):
        # overridden by RangeIndex

        this = self.unique()

        indexer = this.get_indexer_for(other)
        indexer = indexer.take((indexer != -1).nonzero()[0])

        label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True)

        the_diff: MultiIndex | ArrayLike
        if isinstance(this, ABCMultiIndex):
            the_diff = this.take(label_diff)
        else:
            the_diff = this._values.take(label_diff)
        the_diff = _maybe_try_sort(the_diff, sort)

        return the_diff

    def _wrap_difference_result(self, other, result):
        # We will override for MultiIndex to handle empty results
        return self._wrap_setop_result(other, result)

    def symmetric_difference(self, other, result_name=None, sort=None):
        """
        Compute the symmetric difference of two Index objects.

        Parameters
        ----------
        other : Index or array-like
        result_name : str
        sort : bool or None, default None
            Whether to sort the resulting index. By default, the
            values are attempted to be sorted, but any TypeError from
            incomparable elements is caught by pandas.

            * None : Attempt to sort the result, but catch any TypeErrors
              from comparing incomparable elements.
            * False : Do not sort the result.
            * True : Sort the result (which may raise TypeError).

        Returns
        -------
        Index

        Notes
        -----
        ``symmetric_difference`` contains elements that appear in either
        ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
        ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
        dropped.

        Examples
        --------
        >>> idx1 = pd.Index([1, 2, 3, 4])
        >>> idx2 = pd.Index([2, 3, 4, 5])
        >>> idx1.symmetric_difference(idx2)
        Index([1, 5], dtype='int64')
        """
        self._validate_sort_keyword(sort)
        self._assert_can_do_setop(other)
        other, result_name_update = self._convert_can_do_setop(other)
        if result_name is None:
            result_name = result_name_update

        if not is_dtype_equal(self.dtype, other.dtype):
            self, other = self._dti_setop_align_tzs(other, "symmetric_difference")

        if not self._should_compare(other):
            return self.union(other, sort=sort).rename(result_name)

        elif not is_dtype_equal(self.dtype, other.dtype):
            dtype = self._find_common_type_compat(other)
            this = self.astype(dtype, copy=False)
            that = other.astype(dtype, copy=False)
            return this.symmetric_difference(that, sort=sort).rename(result_name)

        this = self.unique()
        other = other.unique()
        indexer = this.get_indexer_for(other)

        # {this} minus {other}
        common_indexer = indexer.take((indexer != -1).nonzero()[0])
        left_indexer = np.setdiff1d(
            np.arange(this.size), common_indexer, assume_unique=True
        )
        left_diff = this.take(left_indexer)

        # {other} minus {this}
        right_indexer = (indexer == -1).nonzero()[0]
        right_diff = other.take(right_indexer)

        res_values = left_diff.append(right_diff)
        result = _maybe_try_sort(res_values, sort)

        if not self._is_multi:
            return Index(result, name=result_name, dtype=res_values.dtype)
        else:
            left_diff = cast("MultiIndex", left_diff)
            if len(result) == 0:
                # result might be an Index, if other was an Index
                return left_diff.remove_unused_levels().set_names(result_name)
            return result.set_names(result_name)

    @final
    def _assert_can_do_setop(self, other) -> bool:
        if not is_list_like(other):
            raise TypeError("Input must be Index or array-like")
        return True

    def _convert_can_do_setop(self, other) -> tuple[Index, Hashable]:
        if not isinstance(other, Index):
            other = Index(other, name=self.name)
            result_name = self.name
        else:
            result_name = get_op_result_name(self, other)
        return other, result_name

    # --------------------------------------------------------------------
    # Indexing Methods

    def get_loc(self, key):
        """
        Get integer location, slice or boolean mask for requested label.

        Parameters
        ----------
        key : label

        Returns
        -------
        int if unique index, slice if monotonic index, else mask

        Examples
        --------
        >>> unique_index = pd.Index(list('abc'))
        >>> unique_index.get_loc('b')
        1

        >>> monotonic_index = pd.Index(list('abbc'))
        >>> monotonic_index.get_loc('b')
        slice(1, 3, None)

        >>> non_monotonic_index = pd.Index(list('abcb'))
        >>> non_monotonic_index.get_loc('b')
        array([False,  True, False,  True])
        """
        casted_key = self._maybe_cast_indexer(key)
        try:
            return self._engine.get_loc(casted_key)
        except KeyError as err:
            raise KeyError(key) from err
        except TypeError:
            # If we have a listlike key, _check_indexing_error will raise
            #  InvalidIndexError. Otherwise we fall through and re-raise
            #  the TypeError.
            self._check_indexing_error(key)
            raise

    _index_shared_docs[
        "get_indexer"
    ] = """
        Compute indexer and mask for new index given the current index.

        The indexer should then be used as an input to ndarray.take to align the
        current data to the new index.

        Parameters
        ----------
        target : %(target_klass)s
        method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
            * default: exact matches only.
            * pad / ffill: find the PREVIOUS index value if no exact match.
            * backfill / bfill: use NEXT index value if no exact match.
            * nearest: use the NEAREST index value if no exact match. Tied
              distances are broken by preferring the larger index value.
        limit : int, optional
            Maximum number of consecutive labels in ``target`` to match for
            inexact matches.
        tolerance : optional
            Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations must
            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

            Tolerance may be a scalar value, which applies the same tolerance
            to all values, or list-like, which applies variable tolerance per
            element. List-like includes list, tuple, array, Series, and must be
            the same size as the index and its dtype must exactly match the
            index's type.

        Returns
        -------
        np.ndarray[np.intp]
            Integers from 0 to n - 1 indicating that the index at these
            positions matches the corresponding target values. Missing values
            in the target are marked by -1.
        %(raises_section)s
        Notes
        -----
        Returns -1 for unmatched values, for further explanation see the
        example below.

        Examples
        --------
        >>> index = pd.Index(['c', 'a', 'b'])
        >>> index.get_indexer(['a', 'b', 'x'])
        array([ 1,  2, -1])

        Notice that the return value is an array of locations in ``index``
        and ``x`` is marked by -1, as it is not in ``index``.
        """

    @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
    @final
    def get_indexer(
        self,
        target,
        method: str_t | None = None,
        limit: int | None = None,
        tolerance=None,
    ) -> npt.NDArray[np.intp]:
        method = clean_reindex_fill_method(method)
        orig_target = target
        target = self._maybe_cast_listlike_indexer(target)

        self._check_indexing_method(method, limit, tolerance)

        if not self._index_as_unique:
            raise InvalidIndexError(self._requires_unique_msg)

        if len(target) == 0:
            return np.array([], dtype=np.intp)

        if not self._should_compare(target) and not self._should_partial_index(target):
            # IntervalIndex get special treatment bc numeric scalars can be
            #  matched to Interval scalars
            return self._get_indexer_non_comparable(target, method=method, unique=True)

        if is_categorical_dtype(self.dtype):
            # _maybe_cast_listlike_indexer ensures target has our dtype
            #  (could improve perf by doing _should_compare check earlier?)
            assert is_dtype_equal(self.dtype, target.dtype)

            indexer = self._engine.get_indexer(target.codes)
            if self.hasnans and target.hasnans:
                # After _maybe_cast_listlike_indexer, target elements which do not
                # belong to some category are changed to NaNs
                # Mask to track actual NaN values compared to inserted NaN values
                # GH#45361
                target_nans = isna(orig_target)
                loc = self.get_loc(np.nan)
                mask = target.isna()
                indexer[target_nans] = loc
                indexer[mask & ~target_nans] = -1
            return indexer

        if is_categorical_dtype(target.dtype):
            # potential fastpath
            #  get an indexer for unique categories then propagate to codes via take_nd
            # get_indexer instead of _get_indexer needed for MultiIndex cases
            #  e.g. test_append_different_columns_types
            categories_indexer = self.get_indexer(target.categories)

            indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1)

            if (not self._is_multi and self.hasnans) and target.hasnans:
                # Exclude MultiIndex because hasnans raises NotImplementedError
                # we should only get here if we are unique, so loc is an integer
                # GH#41934
                loc = self.get_loc(np.nan)
                mask = target.isna()
                indexer[mask] = loc

            return ensure_platform_int(indexer)

        pself, ptarget = self._maybe_promote(target)
        if pself is not self or ptarget is not target:
            return pself.get_indexer(
                ptarget, method=method, limit=limit, tolerance=tolerance
            )

        if is_dtype_equal(self.dtype, target.dtype) and self.equals(target):
            # Only call equals if we have same dtype to avoid inference/casting
            return np.arange(len(target), dtype=np.intp)

        if not is_dtype_equal(
            self.dtype, target.dtype
        ) and not self._should_partial_index(target):
            # _should_partial_index e.g. IntervalIndex with numeric scalars
            #  that can be matched to Interval scalars.
            dtype = self._find_common_type_compat(target)

            this = self.astype(dtype, copy=False)
            target = target.astype(dtype, copy=False)
            return this._get_indexer(
                target, method=method, limit=limit, tolerance=tolerance
            )

        return self._get_indexer(target, method, limit, tolerance)

    def _get_indexer(
        self,
        target: Index,
        method: str_t | None = None,
        limit: int | None = None,
        tolerance=None,
    ) -> npt.NDArray[np.intp]:
        if tolerance is not None:
            tolerance = self._convert_tolerance(tolerance, target)

        if method in ["pad", "backfill"]:
            indexer = self._get_fill_indexer(target, method, limit, tolerance)
        elif method == "nearest":
            indexer = self._get_nearest_indexer(target, limit, tolerance)
        else:
            if target._is_multi and self._is_multi:
                engine = self._engine
                # error: Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]"
                # has no attribute "_extract_level_codes"
                tgt_values = engine._extract_level_codes(  # type: ignore[union-attr]
                    target
                )
            else:
                tgt_values = target._get_engine_target()

            indexer = self._engine.get_indexer(tgt_values)

        return ensure_platform_int(indexer)

    @final
    def _should_partial_index(self, target: Index) -> bool:
        """
        Should we attempt partial-matching indexing?
        """
        if is_interval_dtype(self.dtype):
            if is_interval_dtype(target.dtype):
                return False
            # See https://github.com/pandas-dev/pandas/issues/47772 the commented
            #  out code can be restored (instead of hardcoding `return True`)
            #  once that issue is fixed
            # "Index" has no attribute "left"
            # return self.left._should_compare(target)  # type: ignore[attr-defined]
            return True
        return False

    @final
    def _check_indexing_method(
        self,
        method: str_t | None,
        limit: int | None = None,
        tolerance=None,
    ) -> None:
        """
        Raise if we have a get_indexer `method` that is not supported or valid.
        """
        if method not in [None, "bfill", "backfill", "pad", "ffill", "nearest"]:
            # in practice the clean_reindex_fill_method call would raise
            #  before we get here
            raise ValueError("Invalid fill method")  # pragma: no cover

        if self._is_multi:
            if method == "nearest":
                raise NotImplementedError(
                    "method='nearest' not implemented yet "
                    "for MultiIndex; see GitHub issue 9365"
                )
            if method in ("pad", "backfill"):
                if tolerance is not None:
                    raise NotImplementedError(
                        "tolerance not implemented yet for MultiIndex"
                    )

        if is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype):
            # GH#37871 for now this is only for IntervalIndex and CategoricalIndex
            if method is not None:
                raise NotImplementedError(
                    f"method {method} not yet implemented for {type(self).__name__}"
                )

        if method is None:
            if tolerance is not None:
                raise ValueError(
                    "tolerance argument only valid if doing pad, "
                    "backfill or nearest reindexing"
                )
            if limit is not None:
                raise ValueError(
                    "limit argument only valid if doing pad, "
                    "backfill or nearest reindexing"
                )

def _convert_tolerance(self, tolerance, target: np.ndarray | Index) -> np.ndarray:
|
|
# override this method on subclasses
|
|
tolerance = np.asarray(tolerance)
|
|
if target.size != tolerance.size and tolerance.size > 1:
|
|
raise ValueError("list-like tolerance size must match target index size")
|
|
elif is_numeric_dtype(self) and not np.issubdtype(tolerance.dtype, np.number):
|
|
if tolerance.ndim > 0:
|
|
raise ValueError(
|
|
f"tolerance argument for {type(self).__name__} with dtype "
|
|
f"{self.dtype} must contain numeric elements if it is list type"
|
|
)
|
|
|
|
raise ValueError(
|
|
f"tolerance argument for {type(self).__name__} with dtype {self.dtype} "
|
|
f"must be numeric if it is a scalar: {repr(tolerance)}"
|
|
)
|
|
return tolerance
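
    # An illustrative sketch of how a validated tolerance behaves, assuming a
    # numeric index: a scalar tolerance is broadcast to every target label,
    # while a list-like tolerance must match the target's size.
    #
    #   >>> idx = pd.Index([10, 20, 30])
    #   >>> idx.get_indexer([11, 21], method="nearest", tolerance=2)
    #   array([0, 1])
    #   >>> idx.get_indexer([11, 28], method="nearest", tolerance=[2, 1])
    #   array([ 0, -1])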

    @final
    def _get_fill_indexer(
        self, target: Index, method: str_t, limit: int | None = None, tolerance=None
    ) -> npt.NDArray[np.intp]:
        if self._is_multi:
            # TODO: get_indexer_with_fill docstring says values must be _sorted_
            # but that doesn't appear to be enforced
            # error: "IndexEngine" has no attribute "get_indexer_with_fill"
            engine = self._engine
            with warnings.catch_warnings():
                # TODO: We need to fix this. Casting to int64 in cython
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                return engine.get_indexer_with_fill(  # type: ignore[union-attr]
                    target=target._values,
                    values=self._values,
                    method=method,
                    limit=limit,
                )

        if self.is_monotonic_increasing and target.is_monotonic_increasing:
            target_values = target._get_engine_target()
            own_values = self._get_engine_target()
            if not isinstance(target_values, np.ndarray) or not isinstance(
                own_values, np.ndarray
            ):
                raise NotImplementedError

            if method == "pad":
                indexer = libalgos.pad(own_values, target_values, limit=limit)
            else:
                # i.e. "backfill"
                indexer = libalgos.backfill(own_values, target_values, limit=limit)
        else:
            indexer = self._get_fill_indexer_searchsorted(target, method, limit)
        if tolerance is not None and len(self):
            indexer = self._filter_indexer_tolerance(target, indexer, tolerance)
        return indexer

    @final
    def _get_fill_indexer_searchsorted(
        self, target: Index, method: str_t, limit: int | None = None
    ) -> npt.NDArray[np.intp]:
        """
        Fallback pad/backfill get_indexer that works for monotonic decreasing
        indexes and non-monotonic targets.
        """
        if limit is not None:
            raise ValueError(
                f"limit argument for {repr(method)} method only well-defined "
                "if index and target are monotonic"
            )

        side: Literal["left", "right"] = "left" if method == "pad" else "right"

        # find exact matches first (this simplifies the algorithm)
        indexer = self.get_indexer(target)
        nonexact = indexer == -1
        indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side)
        if side == "left":
            # searchsorted returns "indices into a sorted array such that,
            # if the corresponding elements in v were inserted before the
            # indices, the order of a would be preserved".
            # Thus, we need to subtract 1 to find values to the left.
            indexer[nonexact] -= 1
            # This also mapped not found values (values of 0 from
            # np.searchsorted) to -1, which conveniently is also our
            # sentinel for missing values
        else:
            # Mark indices to the right of the largest value as not found
            indexer[indexer == len(self)] = -1
        return indexer
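
    # An illustrative sketch of the searchsorted fallback above: with a
    # non-monotonic target, exact matches are resolved first and the remaining
    # labels are placed via searchsorted, so labels before the first index
    # value map to the -1 sentinel.
    #
    #   >>> pd.Index([10, 20, 30]).get_indexer([25, 5], method="pad")
    #   array([ 1, -1])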

    @final
    def _get_nearest_indexer(
        self, target: Index, limit: int | None, tolerance
    ) -> npt.NDArray[np.intp]:
        """
        Get the indexer for the nearest index labels; requires an index with
        values that can be subtracted from each other (e.g., not strings or
        tuples).
        """
        if not len(self):
            return self._get_fill_indexer(target, "pad")

        left_indexer = self.get_indexer(target, "pad", limit=limit)
        right_indexer = self.get_indexer(target, "backfill", limit=limit)

        left_distances = self._difference_compat(target, left_indexer)
        right_distances = self._difference_compat(target, right_indexer)

        op = operator.lt if self.is_monotonic_increasing else operator.le
        indexer = np.where(
            # error: Argument 1&2 has incompatible type "Union[ExtensionArray,
            # ndarray[Any, Any]]"; expected "Union[SupportsDunderLE,
            # SupportsDunderGE, SupportsDunderGT, SupportsDunderLT]"
            op(left_distances, right_distances)  # type: ignore[arg-type]
            | (right_indexer == -1),
            left_indexer,
            right_indexer,
        )
        if tolerance is not None:
            indexer = self._filter_indexer_tolerance(target, indexer, tolerance)
        return indexer
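
    # An illustrative sketch of the tie-breaking above, assuming a monotonic
    # increasing index: operator.lt prefers the right-hand label on ties, so
    # equidistant targets resolve to the larger index value.
    #
    #   >>> pd.Index([10, 20]).get_indexer([15], method="nearest")
    #   array([1])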

    @final
    def _filter_indexer_tolerance(
        self,
        target: Index,
        indexer: npt.NDArray[np.intp],
        tolerance,
    ) -> npt.NDArray[np.intp]:
        distance = self._difference_compat(target, indexer)

        return np.where(distance <= tolerance, indexer, -1)

    @final
    def _difference_compat(
        self, target: Index, indexer: npt.NDArray[np.intp]
    ) -> ArrayLike:
        # Compatibility for PeriodArray, for which __sub__ returns an ndarray[object]
        # of DateOffset objects, which do not support __abs__ (and would be slow
        # if they did)

        if isinstance(self.dtype, PeriodDtype):
            # Note: we only get here with matching dtypes
            own_values = cast("PeriodArray", self._data)._ndarray
            target_values = cast("PeriodArray", target._data)._ndarray
            diff = own_values[indexer] - target_values
        else:
            # error: Unsupported left operand type for - ("ExtensionArray")
            diff = self._values[indexer] - target._values  # type: ignore[operator]
        return abs(diff)

    # --------------------------------------------------------------------
    # Indexer Conversion Methods

    @final
    def _validate_positional_slice(self, key: slice) -> None:
        """
        For positional indexing, a slice must have either int or None
        for each of start, stop, and step.
        """
        self._validate_indexer("positional", key.start, "iloc")
        self._validate_indexer("positional", key.stop, "iloc")
        self._validate_indexer("positional", key.step, "iloc")

    def _convert_slice_indexer(self, key: slice, kind: str_t):
        """
        Convert a slice indexer.

        By definition, these are labels unless 'iloc' is passed in.
        Floats are not allowed as the start, step, or stop of the slice.

        Parameters
        ----------
        key : label of the slice bound
        kind : {'loc', 'getitem'}
        """
        assert kind in ["loc", "getitem"], kind

        # potentially cast the bounds to integers
        start, stop, step = key.start, key.stop, key.step

        # TODO(GH#50617): once Series.__[gs]etitem__ is removed we should be able
        # to simplify this.
        if isinstance(self.dtype, np.dtype) and is_float_dtype(self.dtype):
            # We always treat __getitem__ slicing as label-based
            # translate to locations
            return self.slice_indexer(start, stop, step)

        # figure out if this is a positional indexer
        def is_int(v):
            return v is None or is_integer(v)

        is_index_slice = is_int(start) and is_int(stop) and is_int(step)

        # special case for interval_dtype bc we do not do partial-indexing
        # on integer Intervals when slicing
        # TODO: write this in terms of e.g. should_partial_index?
        ints_are_positional = self._should_fallback_to_positional or is_interval_dtype(
            self.dtype
        )
        is_positional = is_index_slice and ints_are_positional

        if kind == "getitem":
            # called from the getitem slicers, validate that we are in fact integers
            if is_integer_dtype(self.dtype) or is_index_slice:
                # Note: these checks are redundant if we know is_index_slice
                self._validate_indexer("slice", key.start, "getitem")
                self._validate_indexer("slice", key.stop, "getitem")
                self._validate_indexer("slice", key.step, "getitem")
                return key

        # convert the slice to an indexer here

        # if we are mixed and have integers
        if is_positional:
            try:
                # Validate start & stop
                if start is not None:
                    self.get_loc(start)
                if stop is not None:
                    self.get_loc(stop)
                is_positional = False
            except KeyError:
                pass

        if com.is_null_slice(key):
            # It doesn't matter if we are positional or label based
            indexer = key
        elif is_positional:
            if kind == "loc":
                # GH#16121, GH#24612, GH#31810
                raise TypeError(
                    "Slicing a positional slice with .loc is not allowed. "
                    "Use .loc with labels or .iloc with positions instead.",
                )
            indexer = key
        else:
            indexer = self.slice_indexer(start, stop, step)

        return indexer
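
    # An illustrative sketch of the label/positional distinction above: on an
    # object-dtype index, an all-integer slice is treated positionally for
    # __getitem__, while label bounds go through slice_indexer (which is
    # inclusive of the stop label).
    #
    #   >>> idx = pd.Index(["a", "b", "c"])
    #   >>> idx[1:3]
    #   Index(['b', 'c'], dtype='object')
    #   >>> idx.slice_indexer("a", "b")
    #   slice(0, 2, None)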

    @final
    def _raise_invalid_indexer(
        self,
        form: str_t,
        key,
        reraise: lib.NoDefault | None | Exception = lib.no_default,
    ) -> None:
        """
        Raise a consistent invalid indexer message.
        """
        msg = (
            f"cannot do {form} indexing on {type(self).__name__} with these "
            f"indexers [{key}] of type {type(key).__name__}"
        )
        if reraise is not lib.no_default:
            raise TypeError(msg) from reraise
        raise TypeError(msg)

    # --------------------------------------------------------------------
    # Reindex Methods

    @final
    def _validate_can_reindex(self, indexer: np.ndarray) -> None:
        """
        Check if we are allowing reindexing with this particular indexer.

        Parameters
        ----------
        indexer : an integer ndarray

        Raises
        ------
        ValueError
            If it is a duplicate axis.
        """
        # trying to reindex on an axis with duplicates
        if not self._index_as_unique and len(indexer):
            raise ValueError("cannot reindex on an axis with duplicate labels")

    def reindex(
        self, target, method=None, level=None, limit=None, tolerance=None
    ) -> tuple[Index, npt.NDArray[np.intp] | None]:
        """
        Create index with target's values.

        Parameters
        ----------
        target : an iterable
        method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
            * default: exact matches only.
            * pad / ffill: find the PREVIOUS index value if no exact match.
            * backfill / bfill: use NEXT index value if no exact match.
            * nearest: use the NEAREST index value if no exact match. Tied
              distances are broken by preferring the larger index value.
        level : int, optional
            Level of multiindex.
        limit : int, optional
            Maximum number of consecutive labels in ``target`` to match for
            inexact matches.
        tolerance : int or float, optional
            Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations must
            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

            Tolerance may be a scalar value, which applies the same tolerance
            to all values, or list-like, which applies variable tolerance per
            element. List-like includes list, tuple, array, Series, and must be
            the same size as the index and its dtype must exactly match the
            index's type.

        Returns
        -------
        new_index : pd.Index
            Resulting index.
        indexer : np.ndarray[np.intp] or None
            Indices of output values in original index.

        Raises
        ------
        TypeError
            If ``method`` is passed along with ``level``.
        ValueError
            If non-unique multi-index.
        ValueError
            If non-unique index and ``method`` or ``limit`` passed.

        See Also
        --------
        Series.reindex : Conform Series to new index with optional filling logic.
        DataFrame.reindex : Conform DataFrame to new index with optional filling logic.

        Examples
        --------
        >>> idx = pd.Index(['car', 'bike', 'train', 'tractor'])
        >>> idx
        Index(['car', 'bike', 'train', 'tractor'], dtype='object')
        >>> idx.reindex(['car', 'bike'])
        (Index(['car', 'bike'], dtype='object'), array([0, 1]))
        """
        # GH6552: preserve names when reindexing to non-named target
        # (i.e. neither Index nor Series).
        preserve_names = not hasattr(target, "name")

        # GH7774: preserve dtype/tz if target is empty and not an Index.
        target = ensure_has_len(target)  # target may be an iterator

        if not isinstance(target, Index) and len(target) == 0:
            if level is not None and self._is_multi:
                # "Index" has no attribute "levels"; maybe "nlevels"?
                idx = self.levels[level]  # type: ignore[attr-defined]
            else:
                idx = self
            target = idx[:0]
        else:
            target = ensure_index(target)

        if level is not None and (
            isinstance(self, ABCMultiIndex) or isinstance(target, ABCMultiIndex)
        ):
            if method is not None:
                raise TypeError("Fill method not supported if level passed")

            # TODO: tests where passing `keep_order=not self._is_multi`
            # makes a difference for non-MultiIndex case
            target, indexer, _ = self._join_level(
                target, level, how="right", keep_order=not self._is_multi
            )

        else:
            if self.equals(target):
                indexer = None
            else:
                if self._index_as_unique:
                    indexer = self.get_indexer(
                        target, method=method, limit=limit, tolerance=tolerance
                    )
                elif self._is_multi:
                    raise ValueError("cannot handle a non-unique multi-index!")
                elif not self.is_unique:
                    # GH#42568
                    raise ValueError("cannot reindex on an axis with duplicate labels")
                else:
                    indexer, _ = self.get_indexer_non_unique(target)

        target = self._wrap_reindex_result(target, indexer, preserve_names)
        return target, indexer
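
    # An illustrative sketch of inexact reindexing, assuming a numeric index:
    # each target label is matched to the nearest index label within tolerance.
    #
    #   >>> pd.Index([10, 20, 30]).reindex([21], method="nearest", tolerance=2)
    #   (Index([21], dtype='int64'), array([1]))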

    def _wrap_reindex_result(self, target, indexer, preserve_names: bool):
        target = self._maybe_preserve_names(target, preserve_names)
        return target

    def _maybe_preserve_names(self, target: Index, preserve_names: bool):
        if preserve_names and target.nlevels == 1 and target.name != self.name:
            target = target.copy(deep=False)
            target.name = self.name
        return target

    @final
    def _reindex_non_unique(
        self, target: Index
    ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp] | None]:
        """
        Create a new index with target's values (move/add/delete values as
        necessary); for use with a non-unique Index and a possibly non-unique
        target.

        Parameters
        ----------
        target : an iterable

        Returns
        -------
        new_index : pd.Index
            Resulting index.
        indexer : np.ndarray[np.intp]
            Indices of output values in original index.
        new_indexer : np.ndarray[np.intp] or None

        """
        target = ensure_index(target)
        if len(target) == 0:
            # GH#13691
            return self[:0], np.array([], dtype=np.intp), None

        indexer, missing = self.get_indexer_non_unique(target)
        check = indexer != -1
        new_labels = self.take(indexer[check])
        new_indexer = None

        if len(missing):
            length = np.arange(len(indexer), dtype=np.intp)

            missing = ensure_platform_int(missing)
            missing_labels = target.take(missing)
            missing_indexer = length[~check]
            cur_labels = self.take(indexer[check]).values
            cur_indexer = length[check]

            # Index constructor below will do inference
            new_labels = np.empty((len(indexer),), dtype=object)
            new_labels[cur_indexer] = cur_labels
            new_labels[missing_indexer] = missing_labels

            # GH#38906
            if not len(self):
                new_indexer = np.arange(0, dtype=np.intp)

            # a unique indexer
            elif target.is_unique:
                # see GH5553, make sure we use the right indexer
                new_indexer = np.arange(len(indexer), dtype=np.intp)
                new_indexer[cur_indexer] = np.arange(len(cur_labels))
                new_indexer[missing_indexer] = -1

            # we have a non_unique selector, need to use the original
            # indexer here
            else:
                # need to retake to have the same size as the indexer
                indexer[~check] = -1

                # reset the new indexer to account for the new size
                new_indexer = np.arange(len(self.take(indexer)), dtype=np.intp)
                new_indexer[~check] = -1

        if not isinstance(self, ABCMultiIndex):
            new_index = Index(new_labels, name=self.name)
        else:
            new_index = type(self).from_tuples(new_labels, names=self.names)
        return new_index, indexer, new_indexer
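
    # An illustrative sketch of the triple returned above, assuming a
    # non-unique index: matched labels keep their positions, and missing
    # target labels are appended with -1 markers in both indexers.
    #
    #   >>> pd.Index(["a", "a"])._reindex_non_unique(pd.Index(["a", "b"]))
    #   (Index(['a', 'a', 'b'], dtype='object'),
    #    array([ 0,  1, -1]),
    #    array([ 0,  1, -1]))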

    # --------------------------------------------------------------------
    # Join Methods

    @overload
    def join(
        self,
        other: Index,
        *,
        how: JoinHow = ...,
        level: Level = ...,
        return_indexers: Literal[True],
        sort: bool = ...,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        ...

    @overload
    def join(
        self,
        other: Index,
        *,
        how: JoinHow = ...,
        level: Level = ...,
        return_indexers: Literal[False] = ...,
        sort: bool = ...,
    ) -> Index:
        ...

    @overload
    def join(
        self,
        other: Index,
        *,
        how: JoinHow = ...,
        level: Level = ...,
        return_indexers: bool = ...,
        sort: bool = ...,
    ) -> Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        ...

    @final
    @_maybe_return_indexers
    def join(
        self,
        other: Index,
        *,
        how: JoinHow = "left",
        level: Level = None,
        return_indexers: bool = False,
        sort: bool = False,
    ) -> Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        """
        Compute join_index and indexers to conform data structures to the new index.

        Parameters
        ----------
        other : Index
        how : {'left', 'right', 'inner', 'outer'}
        level : int or level name, default None
        return_indexers : bool, default False
        sort : bool, default False
            Sort the join keys lexicographically in the result Index. If False,
            the order of the join keys depends on the join type (how keyword).

        Returns
        -------
        join_index, (left_indexer, right_indexer)
        """
        other = ensure_index(other)

        if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex):
            if (self.tz is None) ^ (other.tz is None):
                # Raise instead of casting to object below.
                raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")

        if not self._is_multi and not other._is_multi:
            # We have specific handling for MultiIndex below
            pself, pother = self._maybe_promote(other)
            if pself is not self or pother is not other:
                return pself.join(
                    pother, how=how, level=level, return_indexers=True, sort=sort
                )

        lindexer: np.ndarray | None
        rindexer: np.ndarray | None

        # try to figure out the join level
        # GH3662
        if level is None and (self._is_multi or other._is_multi):
            # have the same levels/names so a simple join
            if self.names == other.names:
                pass
            else:
                return self._join_multi(other, how=how)

        # join on the level
        if level is not None and (self._is_multi or other._is_multi):
            return self._join_level(other, level, how=how)

        if len(other) == 0:
            if how in ("left", "outer"):
                join_index = self._view()
                rindexer = np.broadcast_to(np.intp(-1), len(join_index))
                return join_index, None, rindexer
            elif how in ("right", "inner", "cross"):
                join_index = other._view()
                lindexer = np.array([])
                return join_index, lindexer, None

        if len(self) == 0:
            if how in ("right", "outer"):
                join_index = other._view()
                lindexer = np.broadcast_to(np.intp(-1), len(join_index))
                return join_index, lindexer, None
            elif how in ("left", "inner", "cross"):
                join_index = self._view()
                rindexer = np.array([])
                return join_index, None, rindexer

        if self._join_precedence < other._join_precedence:
            flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"}
            how = flip.get(how, how)
            join_index, lidx, ridx = other.join(
                self, how=how, level=level, return_indexers=True
            )
            lidx, ridx = ridx, lidx
            return join_index, lidx, ridx

        if not is_dtype_equal(self.dtype, other.dtype):
            dtype = self._find_common_type_compat(other)
            this = self.astype(dtype, copy=False)
            other = other.astype(dtype, copy=False)
            return this.join(other, how=how, return_indexers=True)

        _validate_join_method(how)

        if not self.is_unique and not other.is_unique:
            return self._join_non_unique(other, how=how)
        elif not self.is_unique or not other.is_unique:
            if self.is_monotonic_increasing and other.is_monotonic_increasing:
                if not is_interval_dtype(self.dtype):
                    # otherwise we will fall through to _join_via_get_indexer
                    # GH#39133
                    # go through object dtype for EA until the engine is
                    # supported properly
                    return self._join_monotonic(other, how=how)
            else:
                return self._join_non_unique(other, how=how)
        elif (
            # GH48504: exclude MultiIndex to avoid going through MultiIndex._values
            self.is_monotonic_increasing
            and other.is_monotonic_increasing
            and self._can_use_libjoin
            and not isinstance(self, ABCMultiIndex)
            and not is_categorical_dtype(self.dtype)
        ):
            # Categorical is monotonic if data are ordered as categories, but join can
            # not handle this in case of not lexicographically monotonic GH#38502
            try:
                return self._join_monotonic(other, how=how)
            except TypeError:
                # object dtype; non-comparable objects
                pass

        return self._join_via_get_indexer(other, how, sort)
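
    # An illustrative sketch of join with indexers, assuming unique monotonic
    # integer indexes:
    #
    #   >>> left, right = pd.Index([1, 2, 3]), pd.Index([2, 3, 4])
    #   >>> left.join(right, how="inner", return_indexers=True)
    #   (Index([2, 3], dtype='int64'), array([1, 2]), array([0, 1]))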

    @final
    def _join_via_get_indexer(
        self, other: Index, how: JoinHow, sort: bool
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        # Fallback if we do not have any fastpaths available based on
        # uniqueness/monotonicity

        # Note: at this point we have checked matching dtypes

        if how == "left":
            join_index = self
        elif how == "right":
            join_index = other
        elif how == "inner":
            # TODO: sort=False here for backwards compat. It may
            # be better to use the sort parameter passed into join
            join_index = self.intersection(other, sort=False)
        elif how == "outer":
            # TODO: sort=True here for backwards compat. It may
            # be better to use the sort parameter passed into join
            join_index = self.union(other)

        if sort:
            join_index = join_index.sort_values()

        if join_index is self:
            lindexer = None
        else:
            lindexer = self.get_indexer_for(join_index)
        if join_index is other:
            rindexer = None
        else:
            rindexer = other.get_indexer_for(join_index)
        return join_index, lindexer, rindexer

    @final
    def _join_multi(self, other: Index, how: JoinHow):
        from pandas.core.indexes.multi import MultiIndex
        from pandas.core.reshape.merge import restore_dropped_levels_multijoin

        # figure out join names
        self_names_list = list(com.not_none(*self.names))
        other_names_list = list(com.not_none(*other.names))
        self_names_order = self_names_list.index
        other_names_order = other_names_list.index
        self_names = set(self_names_list)
        other_names = set(other_names_list)
        overlap = self_names & other_names

        # need at least 1 in common
        if not overlap:
            raise ValueError("cannot join with no overlapping index names")

        if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
            # Drop the non-matching levels from left and right respectively
            ldrop_names = sorted(self_names - overlap, key=self_names_order)
            rdrop_names = sorted(other_names - overlap, key=other_names_order)

            # if only the order differs
            if not len(ldrop_names + rdrop_names):
                self_jnlevels = self
                other_jnlevels = other.reorder_levels(self.names)
            else:
                self_jnlevels = self.droplevel(ldrop_names)
                other_jnlevels = other.droplevel(rdrop_names)

            # Join left and right
            # Join on same leveled multi-index frames is supported
            join_idx, lidx, ridx = self_jnlevels.join(
                other_jnlevels, how=how, return_indexers=True
            )

            # Restore the dropped levels
            # Returned index level order is
            # common levels, ldrop_names, rdrop_names
            dropped_names = ldrop_names + rdrop_names

            # error: Argument 5/6 to "restore_dropped_levels_multijoin" has
            # incompatible type "Optional[ndarray[Any, dtype[signedinteger[Any
            # ]]]]"; expected "ndarray[Any, dtype[signedinteger[Any]]]"
            levels, codes, names = restore_dropped_levels_multijoin(
                self,
                other,
                dropped_names,
                join_idx,
                lidx,  # type: ignore[arg-type]
                ridx,  # type: ignore[arg-type]
            )

            # Re-create the multi-index
            multi_join_idx = MultiIndex(
                levels=levels, codes=codes, names=names, verify_integrity=False
            )

            multi_join_idx = multi_join_idx.remove_unused_levels()

            return multi_join_idx, lidx, ridx

        jl = list(overlap)[0]

        # Case where only one index is multi
        # make the indices into mi's that match
        flip_order = False
        if isinstance(self, MultiIndex):
            self, other = other, self
            flip_order = True
            # flip if join method is right or left
            flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"}
            how = flip.get(how, how)

        level = other.names.index(jl)
        result = self._join_level(other, level, how=how)

        if flip_order:
            return result[0], result[2], result[1]
        return result

    @final
    def _join_non_unique(
        self, other: Index, how: JoinHow = "left"
    ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        from pandas.core.reshape.merge import get_join_indexers

        # We only get here if dtypes match
        assert self.dtype == other.dtype

        left_idx, right_idx = get_join_indexers(
            [self._values], [other._values], how=how, sort=True
        )
        mask = left_idx == -1

        join_idx = self.take(left_idx)
        right = other.take(right_idx)
        join_index = join_idx.putmask(mask, right)
        return join_index, left_idx, right_idx

    @final
    def _join_level(
        self, other: Index, level, how: JoinHow = "left", keep_order: bool = True
    ) -> tuple[MultiIndex, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        """
        The join method *only* affects the level of the resulting
        MultiIndex. Otherwise it just exactly aligns the Index data to the
        labels of the level in the MultiIndex.

        If ``keep_order == True``, the order of the data indexed by the
        MultiIndex will not be changed; otherwise, it will tie out
        with `other`.
        """
        from pandas.core.indexes.multi import MultiIndex

        def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]:
            """
            Returns sorter for the inner most level while preserving the
            order of higher levels.

            Parameters
            ----------
            labels : list[np.ndarray]
                Each ndarray has signed integer dtype, not necessarily identical.

            Returns
            -------
            np.ndarray[np.intp]
            """
            if labels[0].size == 0:
                return np.empty(0, dtype=np.intp)

            if len(labels) == 1:
                return get_group_index_sorter(ensure_platform_int(labels[0]))

            # find indexers of beginning of each set of
            # same-key labels w.r.t all but last level
            tic = labels[0][:-1] != labels[0][1:]
            for lab in labels[1:-1]:
                tic |= lab[:-1] != lab[1:]

            starts = np.hstack(([True], tic, [True])).nonzero()[0]
            lab = ensure_int64(labels[-1])
            return lib.get_level_sorter(lab, ensure_platform_int(starts))

        if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
            raise TypeError("Join on level between two MultiIndex objects is ambiguous")

        left, right = self, other

        flip_order = not isinstance(self, MultiIndex)
        if flip_order:
            left, right = right, left
            flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"}
            how = flip.get(how, how)

        assert isinstance(left, MultiIndex)

        level = left._get_level_number(level)
        old_level = left.levels[level]

        if not right.is_unique:
            raise NotImplementedError(
                "Index._join_level on non-unique index is not implemented"
            )

        new_level, left_lev_indexer, right_lev_indexer = old_level.join(
            right, how=how, return_indexers=True
        )

        if left_lev_indexer is None:
            if keep_order or len(left) == 0:
                left_indexer = None
                join_index = left
            else:  # sort the leaves
                left_indexer = _get_leaf_sorter(left.codes[: level + 1])
                join_index = left[left_indexer]

        else:
            left_lev_indexer = ensure_platform_int(left_lev_indexer)
            rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level))
            old_codes = left.codes[level]

            taker = old_codes[old_codes != -1]
            new_lev_codes = rev_indexer.take(taker)

            new_codes = list(left.codes)
            new_codes[level] = new_lev_codes

            new_levels = list(left.levels)
            new_levels[level] = new_level

            if keep_order:  # just drop missing values. o.w. keep order
                left_indexer = np.arange(len(left), dtype=np.intp)
                left_indexer = cast(np.ndarray, left_indexer)
                mask = new_lev_codes != -1
                if not mask.all():
                    new_codes = [lab[mask] for lab in new_codes]
                    left_indexer = left_indexer[mask]

            else:  # tie out the order with other
                if level == 0:  # outer most level, take the fast route
                    max_new_lev = 0 if len(new_lev_codes) == 0 else new_lev_codes.max()
                    ngroups = 1 + max_new_lev
                    left_indexer, counts = libalgos.groupsort_indexer(
                        new_lev_codes, ngroups
                    )

                    # missing values are placed first; drop them!
                    left_indexer = left_indexer[counts[0] :]
                    new_codes = [lab[left_indexer] for lab in new_codes]

                else:  # sort the leaves
                    mask = new_lev_codes != -1
                    mask_all = mask.all()
                    if not mask_all:
                        new_codes = [lab[mask] for lab in new_codes]

                    left_indexer = _get_leaf_sorter(new_codes[: level + 1])
                    new_codes = [lab[left_indexer] for lab in new_codes]

                    # left_indexers are w.r.t masked frame.
                    # reverse to original frame!
                    if not mask_all:
                        left_indexer = mask.nonzero()[0][left_indexer]

            join_index = MultiIndex(
                levels=new_levels,
                codes=new_codes,
                names=left.names,
                verify_integrity=False,
            )

        if right_lev_indexer is not None:
            right_indexer = right_lev_indexer.take(join_index.codes[level])
        else:
            right_indexer = join_index.codes[level]

        if flip_order:
            left_indexer, right_indexer = right_indexer, left_indexer

        left_indexer = (
            None if left_indexer is None else ensure_platform_int(left_indexer)
        )
        right_indexer = (
            None if right_indexer is None else ensure_platform_int(right_indexer)
        )
        return join_index, left_indexer, right_indexer

    @final
    def _join_monotonic(
        self, other: Index, how: JoinHow = "left"
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        # We only get here with matching dtypes and both monotonic increasing
        assert other.dtype == self.dtype

        if self.equals(other):
            # This is a convenient place for this check, but its correctness
            # does not depend on monotonicity, so it could go earlier
            # in the calling method.
            ret_index = other if how == "right" else self
            return ret_index, None, None

        ridx: npt.NDArray[np.intp] | None
        lidx: npt.NDArray[np.intp] | None

        if self.is_unique and other.is_unique:
            # We can perform much better than the general case
            if how == "left":
                join_index = self
                lidx = None
                ridx = self._left_indexer_unique(other)
            elif how == "right":
                join_index = other
                lidx = other._left_indexer_unique(self)
                ridx = None
            elif how == "inner":
                join_array, lidx, ridx = self._inner_indexer(other)
                join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
            elif how == "outer":
                join_array, lidx, ridx = self._outer_indexer(other)
                join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
        else:
            if how == "left":
                join_array, lidx, ridx = self._left_indexer(other)
            elif how == "right":
                join_array, ridx, lidx = other._left_indexer(self)
            elif how == "inner":
                join_array, lidx, ridx = self._inner_indexer(other)
            elif how == "outer":
                join_array, lidx, ridx = self._outer_indexer(other)

            assert lidx is not None
            assert ridx is not None

            join_index = self._wrap_joined_index(join_array, other, lidx, ridx)

        lidx = None if lidx is None else ensure_platform_int(lidx)
        ridx = None if ridx is None else ensure_platform_int(ridx)
        return join_index, lidx, ridx
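
    # An illustrative sketch of the unique fast path above: a left join of
    # unique, monotonic indexes keeps self as the join index and only needs
    # the right-side indexer (-1 marks labels missing from other).
    #
    #   >>> pd.Index([1, 2])._join_monotonic(pd.Index([2, 3]), how="left")
    #   (Index([1, 2], dtype='int64'), None, array([-1,  0]))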

    def _wrap_joined_index(
        self: _IndexT,
        joined: ArrayLike,
        other: _IndexT,
        lidx: npt.NDArray[np.intp],
        ridx: npt.NDArray[np.intp],
    ) -> _IndexT:
        assert other.dtype == self.dtype

        if isinstance(self, ABCMultiIndex):
            name = self.names if self.names == other.names else None
            # error: Incompatible return value type (got "MultiIndex",
            # expected "_IndexT")
            mask = lidx == -1
            join_idx = self.take(lidx)
            right = other.take(ridx)
            join_index = join_idx.putmask(mask, right)._sort_levels_monotonic()
            return join_index.set_names(name)  # type: ignore[return-value]
        else:
            name = get_op_result_name(self, other)
            return self._constructor._with_infer(joined, name=name, dtype=self.dtype)

    @cache_readonly
    def _can_use_libjoin(self) -> bool:
        """
        Whether we can use the fastpaths implemented in _libs.join.
        """
        if type(self) is Index:
            # excludes EAs, but includes masked arrays; we only get here with
            # monotonic values, meaning no NAs
            return (
                isinstance(self.dtype, np.dtype)
                or isinstance(self.values, BaseMaskedArray)
                or isinstance(self._values, ArrowExtensionArray)
            )
        return not is_interval_dtype(self.dtype)

    # --------------------------------------------------------------------
    # Uncategorized Methods

    @property
    def values(self) -> ArrayLike:
        """
        Return an array representing the data in the Index.

        .. warning::

           We recommend using :attr:`Index.array` or
           :meth:`Index.to_numpy`, depending on whether you need
           a reference to the underlying data or a NumPy array.

        Returns
        -------
        array : numpy.ndarray or ExtensionArray

        See Also
        --------
        Index.array : Reference to the underlying data.
        Index.to_numpy : A NumPy array representing the underlying data.
        """
        return self._data

    @cache_readonly
    @doc(IndexOpsMixin.array)
    def array(self) -> ExtensionArray:
        array = self._data
        if isinstance(array, np.ndarray):
            from pandas.core.arrays.numpy_ import PandasArray

            array = PandasArray(array)
        return array

    @property
    def _values(self) -> ExtensionArray | np.ndarray:
        """
        The best array representation.

        This is an ndarray or ExtensionArray.

        ``_values`` are consistent between ``Series`` and ``Index``.

        It may differ from the public '.values' method.

        index             | values          | _values       |
        ----------------- | --------------- | ------------- |
        Index             | ndarray         | ndarray       |
        CategoricalIndex  | Categorical     | Categorical   |
        DatetimeIndex     | ndarray[M8ns]   | DatetimeArray |
        DatetimeIndex[tz] | ndarray[M8ns]   | DatetimeArray |
        PeriodIndex       | ndarray[object] | PeriodArray   |
        IntervalIndex     | IntervalArray   | IntervalArray |

        See Also
        --------
        values : The public version of this attribute.
        """
        return self._data

    def _get_engine_target(self) -> ArrayLike:
        """
        Get the ndarray or ExtensionArray that we can pass to the IndexEngine
        constructor.
        """
        vals = self._values
        if isinstance(vals, StringArray):
            # GH#45652 much more performant than ExtensionEngine
            return vals._ndarray
        if (
            type(self) is Index
            and isinstance(self._values, ExtensionArray)
            and not isinstance(self._values, BaseMaskedArray)
            and not (
                isinstance(self._values, ArrowExtensionArray)
                and is_numeric_dtype(self.dtype)
                # Exclude decimal
                and self.dtype.kind != "O"
            )
        ):
            # TODO(ExtensionIndex): remove special-case, just use self._values
            return self._values.astype(object)
        return vals

    def _get_join_target(self) -> ArrayLike:
        """
        Get the ndarray or ExtensionArray that we can pass to the join
        functions.
        """
        if isinstance(self._values, BaseMaskedArray):
            # This is only used if our array is monotonic, so no NAs present
            return self._values._data
        elif isinstance(self._values, ArrowExtensionArray):
            # This is only used if our array is monotonic, so no missing values
            # present
            return self._values.to_numpy()
        return self._get_engine_target()

    def _from_join_target(self, result: np.ndarray) -> ArrayLike:
        """
        Cast the ndarray returned from one of the libjoin.foo_indexer functions
        back to type(self)._data.
        """
        if isinstance(self.values, BaseMaskedArray):
            return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
        elif isinstance(self.values, ArrowExtensionArray):
            return type(self.values)._from_sequence(result)
        return result

    @doc(IndexOpsMixin._memory_usage)
    def memory_usage(self, deep: bool = False) -> int:
        result = self._memory_usage(deep=deep)

        # include our engine hashtable
        result += self._engine.sizeof(deep=deep)
        return result

    @final
    def where(self, cond, other=None) -> Index:
        """
        Replace values where the condition is False.

        The replacement is taken from other.

        Parameters
        ----------
        cond : bool array-like with the same length as self
            Condition to select the values on.
        other : scalar, or array-like, default None
            Replacement if the condition is False.

        Returns
        -------
        pandas.Index
            A copy of self with values replaced from other
            where the condition is False.

        See Also
        --------
        Series.where : Same method for Series.
        DataFrame.where : Same method for DataFrame.

        Examples
        --------
        >>> idx = pd.Index(['car', 'bike', 'train', 'tractor'])
        >>> idx
        Index(['car', 'bike', 'train', 'tractor'], dtype='object')
        >>> idx.where(idx.isin(['car', 'train']), 'other')
        Index(['car', 'other', 'train', 'other'], dtype='object')
        """
        if isinstance(self, ABCMultiIndex):
            raise NotImplementedError(
                ".where is not supported for MultiIndex operations"
            )
        cond = np.asarray(cond, dtype=bool)
        return self.putmask(~cond, other)

    # construction helpers
    @final
    @classmethod
    def _raise_scalar_data_error(cls, data):
        # We return the TypeError so that we can raise it from the constructor
        # in order to keep mypy happy
        raise TypeError(
            f"{cls.__name__}(...) must be called with a collection of some "
            f"kind, {repr(data)} was passed"
        )

    def _validate_fill_value(self, value):
        """
        Check if the value can be inserted into our array without casting,
        and convert it to an appropriate native type if necessary.

        Raises
        ------
        TypeError
            If the value cannot be inserted into an array of this dtype.
        """
        dtype = self.dtype
        if isinstance(dtype, np.dtype) and dtype.kind not in ["m", "M"]:
            # return np_can_hold_element(dtype, value)
            try:
                return np_can_hold_element(dtype, value)
            except LossySetitemError as err:
                # re-raise as TypeError for consistency
                raise TypeError from err
        elif not can_hold_element(self._values, value):
            raise TypeError
        return value

    @final
    def _require_scalar(self, value):
        """
        Check that this is a scalar value that we can use for setitem-like
        operations without changing dtype.
        """
        if not is_scalar(value):
            raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")
        return value

    def _is_memory_usage_qualified(self) -> bool:
        """
        Return a boolean if we need a qualified .info display.
        """
        return is_object_dtype(self.dtype)

    def __contains__(self, key: Any) -> bool:
        """
        Return a boolean indicating whether the provided key is in the index.

        Parameters
        ----------
        key : label
            The key to check if it is present in the index.

        Returns
        -------
        bool
            Whether the key is in the index.

        Raises
        ------
        TypeError
            If the key is not hashable.

        See Also
        --------
        Index.isin : Returns an ndarray of boolean dtype indicating whether the
            list-like key is in the index.

        Examples
        --------
        >>> idx = pd.Index([1, 2, 3, 4])
        >>> idx
        Index([1, 2, 3, 4], dtype='int64')

        >>> 2 in idx
        True
        >>> 6 in idx
        False
        """
        hash(key)
        try:
            return key in self._engine
        except (OverflowError, TypeError, ValueError):
            return False

    # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
    # Incompatible types in assignment (expression has type "None", base class
    # "object" defined the type as "Callable[[object], int]")
    __hash__: ClassVar[None]  # type: ignore[assignment]

    @final
    def __setitem__(self, key, value):
        raise TypeError("Index does not support mutable operations")

    def __getitem__(self, key):
        """
        Override numpy.ndarray's __getitem__ method to work as desired.

        This function adds lists and Series as valid boolean indexers
        (np.ndarray only supports boolean ndarrays as indexers).

        If the resulting ndim != 1, a plain ndarray is returned instead of the
        corresponding `Index` subclass.
        """
        getitem = self._data.__getitem__

        if is_integer(key) or is_float(key):
            # GH#44051 exclude bool, which would return a 2d ndarray
            key = com.cast_scalar_indexer(key)
            return getitem(key)

        if isinstance(key, slice):
            # This case is separated from the conditional above to avoid
            # the pessimization of com.is_bool_indexer and ndim checks.
            result = getitem(key)
            # Going through simple_new for performance.
            return type(self)._simple_new(
                result, name=self._name, refs=self._references
            )

        if com.is_bool_indexer(key):
            # if we have list[bools, length=1e5] then doing this check+convert
            # takes 166 µs + 2.1 ms and cuts the ndarray.__getitem__
            # time below from 3.8 ms to 496 µs
            # if we already have ndarray[bool], the overhead is 1.4 µs or .25%
            if is_extension_array_dtype(getattr(key, "dtype", None)):
                key = key.to_numpy(dtype=bool, na_value=False)
            else:
                key = np.asarray(key, dtype=bool)

        result = getitem(key)
        # Because we ruled out integer above, we always get an arraylike here
        if result.ndim > 1:
            disallow_ndim_indexing(result)

        # NB: Using _constructor._simple_new would break if MultiIndex
        # didn't override __getitem__
        return self._constructor._simple_new(result, name=self._name)

    def _getitem_slice(self: _IndexT, slobj: slice) -> _IndexT:
        """
        Fastpath for __getitem__ when we know we have a slice.
        """
        res = self._data[slobj]
        return type(self)._simple_new(res, name=self._name, refs=self._references)

    @final
    def _can_hold_identifiers_and_holds_name(self, name) -> bool:
        """
        Faster check for ``name in self`` when we know `name` is a Python
        identifier (e.g. in NDFrame.__getattr__, which hits this to support
        . key lookup). For indexes that can't hold identifiers (everything
        but object, string & categorical) we just return False.

        https://github.com/pandas-dev/pandas/issues/19764
        """
        if (
            is_object_dtype(self.dtype)
            or is_string_dtype(self.dtype)
            or is_categorical_dtype(self.dtype)
        ):
            return name in self
        return False

    def append(self, other: Index | Sequence[Index]) -> Index:
        """
        Append a collection of Index objects together.

        Parameters
        ----------
        other : Index or list/tuple of indices

        Returns
        -------
        Index
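
        Examples
        --------
        >>> idx = pd.Index([1, 2, 3])
        >>> idx.append(pd.Index([4]))
        Index([1, 2, 3, 4], dtype='int64')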
        """
        to_concat = [self]

        if isinstance(other, (list, tuple)):
            to_concat += list(other)
        else:
            # error: Argument 1 to "append" of "list" has incompatible type
            # "Union[Index, Sequence[Index]]"; expected "Index"
            to_concat.append(other)  # type: ignore[arg-type]

        for obj in to_concat:
            if not isinstance(obj, Index):
                raise TypeError("all inputs must be Index")

        names = {obj.name for obj in to_concat}
        name = None if len(names) > 1 else self.name

        return self._concat(to_concat, name)

    def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
        """
        Concatenate multiple Index objects.
        """
        to_concat_vals = [x._values for x in to_concat]

        result = concat_compat(to_concat_vals)

        return Index._with_infer(result, name=name)

    def putmask(self, mask, value) -> Index:
        """
        Return a new Index of the values set with the mask.

        Returns
        -------
        Index

        See Also
        --------
        numpy.ndarray.putmask : Changes elements of an array
            based on conditional and input values.
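
        Examples
        --------
        >>> idx1 = pd.Index([1, 2, 3])
        >>> idx2 = pd.Index([5, 6, 7])
        >>> idx1.putmask([True, False, False], idx2)
        Index([5, 2, 3], dtype='int64')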
        """
        mask, noop = validate_putmask(self._values, mask)
        if noop:
            return self.copy()

        if self.dtype != object and is_valid_na_for_dtype(value, self.dtype):
            # e.g. None -> np.nan, see also Block._standardize_fill_value
            value = self._na_value

        try:
            converted = self._validate_fill_value(value)
        except (LossySetitemError, ValueError, TypeError) as err:
            if is_object_dtype(self):  # pragma: no cover
                raise err

            # See also: Block.coerce_to_target_dtype
            dtype = self._find_common_type_compat(value)
            return self.astype(dtype).putmask(mask, value)

        values = self._values.copy()

        if isinstance(values, np.ndarray):
            converted = setitem_datetimelike_compat(values, mask.sum(), converted)
            np.putmask(values, mask, converted)

        else:
            # Note: we use the original value here, not converted, as
            # _validate_fill_value is not idempotent
            values._putmask(mask, value)

        return self._shallow_copy(values)

    def equals(self, other: Any) -> bool:
        """
        Determine if two Index objects are equal.

        The things that are being compared are:

        * The elements inside the Index object.
        * The order of the elements inside the Index object.

        Parameters
        ----------
        other : Any
            The other object to compare against.

        Returns
        -------
        bool
            True if "other" is an Index and it has the same elements and order
            as the calling index; False otherwise.

        Examples
        --------
        >>> idx1 = pd.Index([1, 2, 3])
        >>> idx1
        Index([1, 2, 3], dtype='int64')
        >>> idx1.equals(pd.Index([1, 2, 3]))
        True

        The elements inside are compared

        >>> idx2 = pd.Index(["1", "2", "3"])
        >>> idx2
        Index(['1', '2', '3'], dtype='object')

        >>> idx1.equals(idx2)
        False

        The order is compared

        >>> ascending_idx = pd.Index([1, 2, 3])
        >>> ascending_idx
        Index([1, 2, 3], dtype='int64')
        >>> descending_idx = pd.Index([3, 2, 1])
        >>> descending_idx
        Index([3, 2, 1], dtype='int64')
        >>> ascending_idx.equals(descending_idx)
        False

        The dtype is *not* compared

        >>> int64_idx = pd.Index([1, 2, 3], dtype='int64')
        >>> int64_idx
        Index([1, 2, 3], dtype='int64')
        >>> uint64_idx = pd.Index([1, 2, 3], dtype='uint64')
        >>> uint64_idx
        Index([1, 2, 3], dtype='uint64')
        >>> int64_idx.equals(uint64_idx)
        True
        """
        if self.is_(other):
            return True

        if not isinstance(other, Index):
            return False

        if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype):
            # if other is not object, use other's logic for coercion
            return other.equals(self)

        if isinstance(other, ABCMultiIndex):
            # d-level MultiIndex can equal d-tuple Index
            return other.equals(self)

        if isinstance(self._values, ExtensionArray):
            # Dispatch to the ExtensionArray's .equals method.
            if not isinstance(other, type(self)):
                return False

            earr = cast(ExtensionArray, self._data)
            return earr.equals(other._data)

        if is_extension_array_dtype(other.dtype):
            # All EA-backed Index subclasses override equals
            return other.equals(self)

        return array_equivalent(self._values, other._values)

    @final
    def identical(self, other) -> bool:
        """
        Similar to equals, but checks that object attributes and types are also equal.

        Returns
        -------
        bool
            True if the two Index objects have equal elements and the same
            type, False otherwise.
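
        Examples
        --------
        >>> idx1 = pd.Index(['1', '2', '3'])
        >>> idx2 = pd.Index(['1', '2', '3'])
        >>> idx2.identical(idx1)
        True

        >>> idx1 = pd.Index(['1', '2', '3'], name="A")
        >>> idx2 = pd.Index(['1', '2', '3'], name="B")
        >>> idx2.identical(idx1)
        False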
        """
        return (
            self.equals(other)
            and all(
                getattr(self, c, None) == getattr(other, c, None)
                for c in self._comparables
            )
            and type(self) == type(other)
            and self.dtype == other.dtype
        )

    @final
    def asof(self, label):
        """
        Return the label from the index, or, if not present, the previous one.

        Assuming that the index is sorted, return the passed index label if it
        is in the index, or return the previous index label if the passed one
        is not in the index.

        Parameters
        ----------
        label : object
            The label up to which the method returns the latest index label.

        Returns
        -------
        object
            The passed label if it is in the index. The previous label if the
            passed label is not in the sorted index or `NaN` if there is no
            such label.

        See Also
        --------
        Series.asof : Return the latest value in a Series up to the
            passed index.
        merge_asof : Perform an asof merge (similar to left join but it
            matches on nearest key rather than equal key).
        Index.get_loc : An `asof` is a thin wrapper around `get_loc`
            with method='pad'.

        Examples
        --------
        `Index.asof` returns the latest index label up to the passed label.

        >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03'])
        >>> idx.asof('2014-01-01')
        '2013-12-31'

        If the label is in the index, the method returns the passed label.

        >>> idx.asof('2014-01-02')
        '2014-01-02'

        If all of the labels in the index are later than the passed label,
        NaN is returned.

        >>> idx.asof('1999-01-02')
        nan

        If the index is not sorted, an error is raised.

        >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02',
        ...                            '2014-01-03'])
        >>> idx_not_sorted.asof('2013-12-31')
        Traceback (most recent call last):
        ValueError: index must be monotonic increasing or decreasing
        """
        self._searchsorted_monotonic(label)  # validate sortedness
        try:
            loc = self.get_loc(label)
        except (KeyError, TypeError):
            # KeyError -> No exact match, try for padded
            # TypeError -> passed e.g. non-hashable, fall through to get
            # the tested exception message
            indexer = self.get_indexer([label], method="pad")
            if indexer.ndim > 1 or indexer.size > 1:
                raise TypeError("asof requires scalar valued input")
            loc = indexer.item()
            if loc == -1:
                return self._na_value
        else:
            if isinstance(loc, slice):
                loc = loc.indices(len(self))[-1]

        return self[loc]

    def asof_locs(
        self, where: Index, mask: npt.NDArray[np.bool_]
    ) -> npt.NDArray[np.intp]:
        """
        Return the locations (indices) of labels in the index.

        As in the `asof` function, if the label (a particular entry in
        `where`) is not in the index, the latest index label up to the
        passed label is chosen and its index returned.

        If all of the labels in the index are later than a label in `where`,
        -1 is returned.

        `mask` is used to ignore NA values in the index during calculation.

        Parameters
        ----------
        where : Index
            An Index consisting of an array of timestamps.
        mask : np.ndarray[bool]
            Array of booleans denoting where values in the original
            data are not NA.

        Returns
        -------
        np.ndarray[np.intp]
            An array of locations (indices) of the labels from the Index
            which correspond to the return values of the `asof` function
            for every element in `where`.
        """
        # error: No overload variant of "searchsorted" of "ndarray" matches argument
        # types "Union[ExtensionArray, ndarray[Any, Any]]", "str"
        # TODO: will be fixed when ExtensionArray.searchsorted() is fixed
        locs = self._values[mask].searchsorted(
            where._values, side="right"  # type: ignore[call-overload]
        )
        locs = np.where(locs > 0, locs - 1, 0)

        result = np.arange(len(self), dtype=np.intp)[mask].take(locs)

        first_value = self._values[mask.argmax()]
        result[(locs == 0) & (where._values < first_value)] = -1

        return result

    def sort_values(
        self,
        return_indexer: bool = False,
        ascending: bool = True,
        na_position: str_t = "last",
        key: Callable | None = None,
    ):
        """
        Return a sorted copy of the index.

        Return a sorted copy of the index, and optionally return the indices
        that sorted the index itself.

        Parameters
        ----------
        return_indexer : bool, default False
            Should the indices that would sort the index be returned.
        ascending : bool, default True
            Should the index values be sorted in an ascending order.
        na_position : {'first' or 'last'}, default 'last'
            Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
            the end.

            .. versionadded:: 1.2.0

        key : callable, optional
            If not None, apply the key function to the index values
            before sorting. This is similar to the `key` argument in the
            builtin :meth:`sorted` function, with the notable difference that
            this `key` function should be *vectorized*. It should expect an
            ``Index`` and return an ``Index`` of the same shape.

            .. versionadded:: 1.1.0

        Returns
        -------
        sorted_index : pandas.Index
            Sorted copy of the index.
        indexer : numpy.ndarray, optional
            The indices that the index itself was sorted by.

        See Also
        --------
        Series.sort_values : Sort values of a Series.
        DataFrame.sort_values : Sort values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([10, 100, 1, 1000])
        >>> idx
        Index([10, 100, 1, 1000], dtype='int64')

        Sort values in ascending order (default behavior).

        >>> idx.sort_values()
        Index([1, 10, 100, 1000], dtype='int64')

        Sort values in descending order, and also get the indices `idx` was
        sorted by.

        >>> idx.sort_values(ascending=False, return_indexer=True)
        (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2]))
        """
        idx = ensure_key_mapped(self, key)

        # GH 35584. Sort missing values according to na_position kwarg
        # ignore na_position for MultiIndex
        if not isinstance(self, ABCMultiIndex):
            _as = nargsort(
                items=idx, ascending=ascending, na_position=na_position, key=key
            )
        else:
            _as = idx.argsort()
            if not ascending:
                _as = _as[::-1]

        sorted_index = self.take(_as)

        if return_indexer:
            return sorted_index, _as
        else:
            return sorted_index

    @final
    def sort(self, *args, **kwargs):
        """
        Use sort_values instead.
        """
        raise TypeError("cannot sort an Index object in-place, use sort_values instead")

    def shift(self, periods: int = 1, freq=None):
        """
        Shift index by desired number of time frequency increments.

        This method is for shifting the values of datetime-like indexes
        by a specified time increment a given number of times.

        Parameters
        ----------
        periods : int, default 1
            Number of periods (or increments) to shift by,
            can be positive or negative.
        freq : pandas.DateOffset, pandas.Timedelta or str, optional
            Frequency increment to shift by.
            If None, the index is shifted by its own `freq` attribute.
            Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc.

        Returns
        -------
        pandas.Index
            Shifted index.

        See Also
        --------
        Series.shift : Shift values of Series.

        Notes
        -----
        This method is only implemented for datetime-like index classes,
        i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex.

        Examples
        --------
        Put the first 5 month starts of 2011 into an index.

        >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS')
        >>> month_starts
        DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01',
                       '2011-05-01'],
                      dtype='datetime64[ns]', freq='MS')

        Shift the index by 10 days.

        >>> month_starts.shift(10, freq='D')
        DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11',
                       '2011-05-11'],
                      dtype='datetime64[ns]', freq=None)

        The default value of `freq` is the `freq` attribute of the index,
        which is 'MS' (month start) in this example.

        >>> month_starts.shift(10)
        DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01',
                       '2012-03-01'],
                      dtype='datetime64[ns]', freq='MS')
        """
        raise NotImplementedError(
            f"This method is only implemented for DatetimeIndex, PeriodIndex and "
            f"TimedeltaIndex; Got type {type(self).__name__}"
        )

    def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
        """
        Return the integer indices that would sort the index.

        Parameters
        ----------
        *args
            Passed to `numpy.ndarray.argsort`.
        **kwargs
            Passed to `numpy.ndarray.argsort`.

        Returns
        -------
        np.ndarray[np.intp]
            Integer indices that would sort the index if used as
            an indexer.

        See Also
        --------
        numpy.argsort : Similar method for NumPy arrays.
        Index.sort_values : Return sorted copy of Index.

        Examples
        --------
        >>> idx = pd.Index(['b', 'a', 'd', 'c'])
        >>> idx
        Index(['b', 'a', 'd', 'c'], dtype='object')

        >>> order = idx.argsort()
        >>> order
        array([1, 0, 3, 2])

        >>> idx[order]
        Index(['a', 'b', 'c', 'd'], dtype='object')
        """
        # This works for either ndarray or EA, is overridden
        # by RangeIndex, MultiIndex
        return self._data.argsort(*args, **kwargs)
|
|
|
|
def _check_indexing_error(self, key):
|
|
if not is_scalar(key):
|
|
# if key is not a scalar, directly raise an error (the code below
|
|
# would convert to numpy arrays and raise later any way) - GH29926
|
|
raise InvalidIndexError(key)
|
|
|
|
@cache_readonly
|
|
def _should_fallback_to_positional(self) -> bool:
|
|
"""
|
|
Should an integer key be treated as positional?
|
|
"""
|
|
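        # e.g. for pd.Index(["a", "b"]) (inferred_type "string") an integer
        # key in ser[key] falls back to positional lookup, while for
        # pd.Index([10, 20]) (inferred_type "integer") it is kept as a label.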
        return self.inferred_type not in {
            "integer",
            "mixed-integer",
            "floating",
            "complex",
        }

    _index_shared_docs[
        "get_indexer_non_unique"
    ] = """
        Compute indexer and mask for new index given the current index.

        The indexer should be then used as an input to ndarray.take to align the
        current data to the new index.

        Parameters
        ----------
        target : %(target_klass)s

        Returns
        -------
        indexer : np.ndarray[np.intp]
            Integers from 0 to n - 1 indicating that the index at these
            positions matches the corresponding target values. Missing values
            in the target are marked by -1.
        missing : np.ndarray[np.intp]
            An indexer into the target of the values not found.
            These correspond to the -1 in the indexer array.

        Examples
        --------
        >>> index = pd.Index(['c', 'b', 'a', 'b', 'b'])
        >>> index.get_indexer_non_unique(['b', 'b'])
        (array([1, 3, 4, 1, 3, 4]), array([], dtype=int64))

        In the example below there are no matched values.

        >>> index = pd.Index(['c', 'b', 'a', 'b', 'b'])
        >>> index.get_indexer_non_unique(['q', 'r', 't'])
        (array([-1, -1, -1]), array([0, 1, 2]))

        Because there are no matches, the returned ``indexer`` contains only
        integers equal to -1, showing that none of the ``target`` values are
        found in the index. The mask [0, 1, 2] in the return value shows that
        the first, second, and third elements are missing.

        Notice that the return value is a tuple containing two items. In the
        example below the first item is an array of locations in ``index``.
        The second item is a mask showing that the first and third elements
        are missing.

        >>> index = pd.Index(['c', 'b', 'a', 'b', 'b'])
        >>> index.get_indexer_non_unique(['f', 'b', 's'])
        (array([-1,  1,  3,  4, -1]), array([0, 2]))
        """

    @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
    def get_indexer_non_unique(
        self, target
    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        target = ensure_index(target)
        target = self._maybe_cast_listlike_indexer(target)

        if not self._should_compare(target) and not self._should_partial_index(target):
            # _should_partial_index e.g. IntervalIndex with numeric scalars
            # that can be matched to Interval scalars.
            return self._get_indexer_non_comparable(target, method=None, unique=False)

        pself, ptarget = self._maybe_promote(target)
        if pself is not self or ptarget is not target:
            return pself.get_indexer_non_unique(ptarget)

        if not is_dtype_equal(self.dtype, target.dtype):
            # TODO: if object, could use infer_dtype to preempt costly
            #  conversion if still non-comparable?
            dtype = self._find_common_type_compat(target)

            this = self.astype(dtype, copy=False)
            that = target.astype(dtype, copy=False)
            return this.get_indexer_non_unique(that)

        # TODO: get_indexer has fastpaths for both Categorical-self and
        #  Categorical-target. Can we do something similar here?

        # Note: _maybe_promote ensures we never get here with MultiIndex
        #  self and non-Multi target
        tgt_values = target._get_engine_target()
        if self._is_multi and target._is_multi:
            engine = self._engine
            # Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]" has
            # no attribute "_extract_level_codes"
            tgt_values = engine._extract_level_codes(target)  # type: ignore[union-attr]

        indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
        return ensure_platform_int(indexer), ensure_platform_int(missing)

    @final
    def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
        """
        Guaranteed return of an indexer even when non-unique.

        This dispatches to get_indexer or get_indexer_non_unique
        as appropriate.

        Returns
        -------
        np.ndarray[np.intp]
            List of indices.

        Examples
        --------
        >>> idx = pd.Index([np.nan, 'var1', np.nan])
        >>> idx.get_indexer_for([np.nan])
        array([0, 2])
        """
        if self._index_as_unique:
            return self.get_indexer(target)
        indexer, _ = self.get_indexer_non_unique(target)
        return indexer

    def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]:
        """
        Analogue to get_indexer that raises if any elements are missing.
        """
        keyarr = key
        if not isinstance(keyarr, Index):
            keyarr = com.asarray_tuplesafe(keyarr)

        if self._index_as_unique:
            indexer = self.get_indexer_for(keyarr)
            keyarr = self.reindex(keyarr)[0]
        else:
            keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)

        self._raise_if_missing(keyarr, indexer, axis_name)

        keyarr = self.take(indexer)
        if isinstance(key, Index):
            # GH 42790 - Preserve name from an Index
            keyarr.name = key.name
        if keyarr.dtype.kind in ["m", "M"]:
            # DTI/TDI.take can infer a freq in some cases when we don't want one
            if isinstance(key, list) or (
                isinstance(key, type(self))
                # "Index" has no attribute "freq"
                and key.freq is None  # type: ignore[attr-defined]
            ):
                keyarr = keyarr._with_freq(None)

        return keyarr, indexer

    def _raise_if_missing(self, key, indexer, axis_name: str_t) -> None:
        """
        Check that indexer can be used to return a result.

        e.g. at least one element was found,
        unless the list of keys was actually empty.

        Parameters
        ----------
        key : list-like
            Targeted labels (only used to show correct error message).
        indexer : array-like of ints
            Indices corresponding to the key,
            (with -1 indicating not found).
        axis_name : str

        Raises
        ------
        KeyError
            If at least one key was requested but none was found.
        """
        if len(key) == 0:
            return

        # Count missing values
        missing_mask = indexer < 0
        nmissing = missing_mask.sum()

        if nmissing:
            # TODO: remove special-case; this is just to keep exception
            #  message tests from raising while debugging
            use_interval_msg = is_interval_dtype(self.dtype) or (
                is_categorical_dtype(self.dtype)
                # "Index" has no attribute "categories"  [attr-defined]
                and is_interval_dtype(
                    self.categories.dtype  # type: ignore[attr-defined]
                )
            )

            if nmissing == len(indexer):
                if use_interval_msg:
                    key = list(key)
                raise KeyError(f"None of [{key}] are in the [{axis_name}]")

            not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
            raise KeyError(f"{not_found} not in index")

    @overload
    def _get_indexer_non_comparable(
        self, target: Index, method, unique: Literal[True] = ...
    ) -> npt.NDArray[np.intp]:
        ...

    @overload
    def _get_indexer_non_comparable(
        self, target: Index, method, unique: Literal[False]
    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        ...

    @overload
    def _get_indexer_non_comparable(
        self, target: Index, method, unique: bool = True
    ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        ...

    @final
    def _get_indexer_non_comparable(
        self, target: Index, method, unique: bool = True
    ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """
        Called from get_indexer or get_indexer_non_unique when the target
        is of a non-comparable dtype.

        For get_indexer lookups with method=None, get_indexer is an _equality_
        check, so non-comparable dtypes mean we will always have no matches.

        For get_indexer lookups with a method, get_indexer is an _inequality_
        check, so non-comparable dtypes mean we will always raise TypeError.

        Parameters
        ----------
        target : Index
        method : str or None
        unique : bool, default True
            * True if called from get_indexer.
            * False if called from get_indexer_non_unique.

        Raises
        ------
        TypeError
            If doing an inequality check, i.e. method is not None.
        """
        if method is not None:
            other = _unpack_nested_dtype(target)
            raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")

        no_matches = -1 * np.ones(target.shape, dtype=np.intp)
        if unique:
            # This is for get_indexer
            return no_matches
        else:
            # This is for get_indexer_non_unique
            missing = np.arange(len(target), dtype=np.intp)
            return no_matches, missing

    @property
    def _index_as_unique(self) -> bool:
        """
        Whether we should treat this as unique for the sake of
        get_indexer vs get_indexer_non_unique.

        For IntervalIndex compat.
        """
        return self.is_unique

    _requires_unique_msg = "Reindexing only valid with uniquely valued Index objects"

    @final
    def _maybe_promote(self, other: Index) -> tuple[Index, Index]:
        """
        When dealing with an object-dtype Index and a non-object Index, see
        if we can upcast the object-dtype one to improve performance.
        """
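        # e.g. a uint64 Index can be compared against an int64 Index of
        # non-negative values by casting the int64 side to uint64 (GH#41873
        # below), and mismatched-tz DatetimeIndexes are standardized on UTC.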
        if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex):
            if (
                self.tz is not None
                and other.tz is not None
                and not tz_compare(self.tz, other.tz)
            ):
                # standardize on UTC
                return self.tz_convert("UTC"), other.tz_convert("UTC")

        elif self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex):
            try:
                return type(other)(self), other
            except OutOfBoundsDatetime:
                return self, other
        elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex):
            # TODO: we don't have tests that get here
            return type(other)(self), other

        elif self.dtype.kind == "u" and other.dtype.kind == "i":
            # GH#41873
            if other.min() >= 0:
                # lookup min as it may be cached
                # TODO: may need itemsize check if we have non-64-bit Indexes
                return self, other.astype(self.dtype)

        elif self._is_multi and not other._is_multi:
            try:
                # "Type[Index]" has no attribute "from_tuples"
                other = type(self).from_tuples(other)  # type: ignore[attr-defined]
            except (TypeError, ValueError):
                # let's instead try with a straight Index
                self = Index(self._values)

        if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype):
            # Reverse op so we don't need to re-implement on the subclasses
            other, self = other._maybe_promote(self)

        return self, other

    @final
    def _find_common_type_compat(self, target) -> DtypeObj:
        """
        Implementation of find_common_type that adjusts for Index-specific
        special cases.
        """
        target_dtype, _ = infer_dtype_from(target, pandas_dtype=True)

        # special case: if one dtype is uint64 and the other a signed int, return object
        #  See https://github.com/pandas-dev/pandas/issues/26778 for discussion
        #  Now it's:
        #  * float | [u]int -> float
        #  * uint64 | signed int -> object
        #  We may change union(float | [u]int) to go to object.
        if self.dtype == "uint64" or target_dtype == "uint64":
            if is_signed_integer_dtype(self.dtype) or is_signed_integer_dtype(
                target_dtype
            ):
                return _dtype_obj

        dtype = find_result_type(self._values, target)
        dtype = common_dtype_categorical_compat([self, target], dtype)
        return dtype

    @final
    def _should_compare(self, other: Index) -> bool:
        """
        Check if `self == other` can ever have non-False entries.
        """

        if (is_bool_dtype(other) and is_any_real_numeric_dtype(self)) or (
            is_bool_dtype(self) and is_any_real_numeric_dtype(other)
        ):
            # GH#16877 Treat boolean labels passed to a numeric index as not
            # found. Without this fix False and True would be treated as 0 and 1
            # respectively.
            return False

        other = _unpack_nested_dtype(other)
        dtype = other.dtype
        return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)

    def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
        """
        Can we compare values of the given dtype to our own?
        """
        if self.dtype.kind == "b":
            return dtype.kind == "b"
        elif is_numeric_dtype(self.dtype):
            return is_numeric_dtype(dtype)
        # TODO: this was written assuming we only get here with object-dtype,
        #  which is no longer correct. Can we specialize for EA?
        return True

    @final
    def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]:
        """
        Group the index labels by a given array of values.

        Parameters
        ----------
        values : array
            Values used to determine the groups.

        Returns
        -------
        dict
            {group name -> group labels}
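
        Examples
        --------
        >>> idx = pd.Index(['a', 'b', 'c'])
        >>> groups = idx.groupby(np.array(['x', 'x', 'y']))
        >>> groups['x']
        Index(['a', 'b'], dtype='object')
        >>> groups['y']
        Index(['c'], dtype='object')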
"""
|
|
# TODO: if we are a MultiIndex, we can do better
|
|
# that converting to tuples
|
|
if isinstance(values, ABCMultiIndex):
|
|
values = values._values
|
|
values = Categorical(values)
|
|
result = values._reverse_indexer()
|
|
|
|
# map to the label
|
|
result = {k: self.take(v) for k, v in result.items()}
|
|
|
|
return PrettyDict(result)
|
|
|
|

    def map(self, mapper, na_action=None):
        """
        Map values using an input mapping or function.

        Parameters
        ----------
        mapper : function, dict, or Series
            Mapping correspondence.
        na_action : {None, 'ignore'}
            If 'ignore', propagate NA values, without passing them to the
            mapping correspondence.

        Returns
        -------
        Union[Index, MultiIndex]
            The output of the mapping function applied to the index.
            If the function returns a tuple with more than one element
            a MultiIndex will be returned.
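
        Examples
        --------
        >>> idx = pd.Index([1, 2, 3])
        >>> idx.map({1: 'a', 2: 'b', 3: 'c'})
        Index(['a', 'b', 'c'], dtype='object')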
"""
|
|
from pandas.core.indexes.multi import MultiIndex
|
|
|
|
new_values = self._map_values(mapper, na_action=na_action)
|
|
|
|
# we can return a MultiIndex
|
|
if new_values.size and isinstance(new_values[0], tuple):
|
|
if isinstance(self, MultiIndex):
|
|
names = self.names
|
|
elif self.name:
|
|
names = [self.name] * len(new_values[0])
|
|
else:
|
|
names = None
|
|
return MultiIndex.from_tuples(new_values, names=names)
|
|
|
|
dtype = None
|
|
if not new_values.size:
|
|
# empty
|
|
dtype = self.dtype
|
|
|
|
# e.g. if we are floating and new_values is all ints, then we
|
|
# don't want to cast back to floating. But if we are UInt64
|
|
# and new_values is all ints, we want to try.
|
|
same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type
|
|
if same_dtype:
|
|
new_values = maybe_cast_pointwise_result(
|
|
new_values, self.dtype, same_dtype=same_dtype
|
|
)
|
|
|
|
return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
|
|
|
|

    # TODO: De-duplicate with map, xref GH#32349
    @final
    def _transform_index(self, func, *, level=None) -> Index:
        """
        Apply function to all values found in index.

        This includes transforming multiindex entries separately.
        Only apply function to one level of the MultiIndex if level is specified.
        """
        if isinstance(self, ABCMultiIndex):
            values = [
                self.get_level_values(i).map(func)
                if i == level or level is None
                else self.get_level_values(i)
                for i in range(self.nlevels)
            ]
            return type(self).from_arrays(values)
        else:
            items = [func(x) for x in self]
            return Index(items, name=self.name, tupleize_cols=False)

    def isin(self, values, level=None) -> npt.NDArray[np.bool_]:
        """
        Return a boolean array where the index values are in `values`.

        Compute boolean array of whether each index value is found in the
        passed set of values. The length of the returned boolean array matches
        the length of the index.

        Parameters
        ----------
        values : set or list-like
            Sought values.
        level : str or int, optional
            Name or position of the index level to use (if the index is a
            `MultiIndex`).

        Returns
        -------
        np.ndarray[bool]
            NumPy array of boolean values.

        See Also
        --------
        Series.isin : Same for Series.
        DataFrame.isin : Same method for DataFrames.

        Notes
        -----
        In the case of `MultiIndex` you must either specify `values` as a
        list-like object containing tuples that are the same length as the
        number of levels, or specify `level`. Otherwise it will raise a
        ``ValueError``.

        If `level` is specified:

        - if it is the name of one *and only one* index level, use that level;
        - otherwise it should be a number indicating level position.

        Examples
        --------
        >>> idx = pd.Index([1, 2, 3])
        >>> idx
        Index([1, 2, 3], dtype='int64')

        Check whether each index value is found in a list of values.

        >>> idx.isin([1, 4])
        array([ True, False, False])

        >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3],
        ...                                   ['red', 'blue', 'green']],
        ...                                  names=('number', 'color'))
        >>> midx
        MultiIndex([(1,   'red'),
                    (2,  'blue'),
                    (3, 'green')],
                   names=['number', 'color'])

        Check whether the strings in the 'color' level of the MultiIndex
        are in a list of colors.

        >>> midx.isin(['red', 'orange', 'yellow'], level='color')
        array([ True, False, False])

        To check across the levels of a MultiIndex, pass a list of tuples:

        >>> midx.isin([(1, 'red'), (3, 'red')])
        array([ True, False, False])

        For a DatetimeIndex, string values in `values` are converted to
        Timestamps.

        >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13']
        >>> dti = pd.to_datetime(dates)
        >>> dti
        DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'],
                      dtype='datetime64[ns]', freq=None)

        >>> dti.isin(['2000-03-11'])
        array([ True, False, False])
        """
        if level is not None:
            self._validate_index_level(level)
        return algos.isin(self._values, values)

    def _get_string_slice(self, key: str_t):
        # this is for partial string indexing,
        # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex
        raise NotImplementedError

    def slice_indexer(
        self,
        start: Hashable | None = None,
        end: Hashable | None = None,
        step: int | None = None,
    ) -> slice:
        """
        Compute the slice indexer for input labels and step.

        Index needs to be ordered and unique.

        Parameters
        ----------
        start : label, default None
            If None, defaults to the beginning.
        end : label, default None
            If None, defaults to the end.
        step : int, default None

        Returns
        -------
        slice

        Raises
        ------
        KeyError : If key does not exist, or key is not unique and index is
            not ordered.

        Notes
        -----
        This function assumes that the data is sorted, so use at your own peril.

        Examples
        --------
        This is a method on all index types. For example you can do:

        >>> idx = pd.Index(list('abcd'))
        >>> idx.slice_indexer(start='b', end='c')
        slice(1, 3, None)

        >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')])
        >>> idx.slice_indexer(start='b', end=('c', 'g'))
        slice(1, 3, None)
        """
        start_slice, end_slice = self.slice_locs(start, end, step=step)

        # return a slice
        if not is_scalar(start_slice):
            raise AssertionError("Start slice bound is non-scalar")
        if not is_scalar(end_slice):
            raise AssertionError("End slice bound is non-scalar")

        return slice(start_slice, end_slice, step)

    def _maybe_cast_indexer(self, key):
        """
        If we have a float key and are not a floating index, then try to cast
        to an int if equivalent.
        """
        return key

    def _maybe_cast_listlike_indexer(self, target) -> Index:
        """
        Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
        """
        return ensure_index(target)

    @final
    def _validate_indexer(self, form: str_t, key, kind: str_t) -> None:
        """
        If we are a positional indexer, validate that we have an
        appropriately typed bound, i.e. an integer (or None).
        """
        assert kind in ["getitem", "iloc"]

        if key is not None and not is_integer(key):
            self._raise_invalid_indexer(form, key)

    def _maybe_cast_slice_bound(self, label, side: str_t):
        """
        This function should be overloaded in subclasses that allow non-trivial
        casting on label-slice bounds, e.g. datetime-like indices allowing
        strings containing formatted datetimes.

        Parameters
        ----------
        label : object
        side : {'left', 'right'}

        Returns
        -------
        label : object

        Notes
        -----
        Value of `side` parameter should be validated in caller.
        """

        # We are a plain index here (subclasses override this method if they
        # wish to have special treatment for floats/ints, e.g. datetimelike Indexes)

        if is_numeric_dtype(self.dtype):
            return self._maybe_cast_indexer(label)

        # reject them, if index does not contain label
        if (is_float(label) or is_integer(label)) and label not in self:
            self._raise_invalid_indexer("slice", label)

        return label

    def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"):
        if self.is_monotonic_increasing:
            return self.searchsorted(label, side=side)
        elif self.is_monotonic_decreasing:
            # np.searchsorted expects ascending sort order, have to reverse
            # everything for it to work (element ordering, search side and
            # resulting value).
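            # e.g. self = Index([3, 2, 1]), label = 2, side = "left":
            # searching the reversed [1, 2, 3] with side="right" gives
            # pos = 2, and len(self) - pos = 1, the position of 2 in self.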
            pos = self[::-1].searchsorted(
                label, side="right" if side == "left" else "left"
            )
            return len(self) - pos

        raise ValueError("index must be monotonic increasing or decreasing")

    def get_slice_bound(self, label, side: Literal["left", "right"]) -> int:
        """
        Calculate slice bound that corresponds to given label.

        Returns leftmost (one-past-the-rightmost if ``side=='right'``) position
        of given label.

        Parameters
        ----------
        label : object
        side : {'left', 'right'}

        Returns
        -------
        int
            Index of label.
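
        Examples
        --------
        >>> idx = pd.Index(['a', 'b', 'c'])
        >>> idx.get_slice_bound('b', side='left')
        1
        >>> idx.get_slice_bound('b', side='right')
        2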
"""
|
|
|
|
if side not in ("left", "right"):
|
|
raise ValueError(
|
|
"Invalid value for side kwarg, must be either "
|
|
f"'left' or 'right': {side}"
|
|
)
|
|
|
|
original_label = label
|
|
|
|
# For datetime indices label may be a string that has to be converted
|
|
# to datetime boundary according to its resolution.
|
|
label = self._maybe_cast_slice_bound(label, side)
|
|
|
|
# we need to look up the label
|
|
try:
|
|
slc = self.get_loc(label)
|
|
except KeyError as err:
|
|
try:
|
|
return self._searchsorted_monotonic(label, side)
|
|
except ValueError:
|
|
# raise the original KeyError
|
|
raise err
|
|
|
|
if isinstance(slc, np.ndarray):
|
|
# get_loc may return a boolean array, which
|
|
# is OK as long as they are representable by a slice.
|
|
assert is_bool_dtype(slc.dtype)
|
|
slc = lib.maybe_booleans_to_slice(slc.view("u1"))
|
|
if isinstance(slc, np.ndarray):
|
|
raise KeyError(
|
|
f"Cannot get {side} slice bound for non-unique "
|
|
f"label: {repr(original_label)}"
|
|
)
|
|
|
|
if isinstance(slc, slice):
|
|
if side == "left":
|
|
return slc.start
|
|
else:
|
|
return slc.stop
|
|
else:
|
|
if side == "right":
|
|
return slc + 1
|
|
else:
|
|
return slc
|
|
|
|

    def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]:
        """
        Compute slice locations for input labels.

        Parameters
        ----------
        start : label, default None
            If None, defaults to the beginning.
        end : label, default None
            If None, defaults to the end.
        step : int, default None
            If None, defaults to 1.

        Returns
        -------
        tuple[int, int]

        See Also
        --------
        Index.get_loc : Get location for a single label.

        Notes
        -----
        This method only works if the index is monotonic or unique.

        Examples
        --------
        >>> idx = pd.Index(list('abcd'))
        >>> idx.slice_locs(start='b', end='c')
        (1, 3)
        """
        inc = step is None or step >= 0

        if not inc:
            # If it's a reverse slice, temporarily swap bounds.
            start, end = end, start

        # GH 16785: If start and end happen to be date strings with UTC offsets
        # attempt to parse and check that the offsets are the same
        if isinstance(start, (str, datetime)) and isinstance(end, (str, datetime)):
            try:
                ts_start = Timestamp(start)
                ts_end = Timestamp(end)
            except (ValueError, TypeError):
                pass
            else:
                if not tz_compare(ts_start.tzinfo, ts_end.tzinfo):
                    raise ValueError("Both dates must have the same UTC offset")

        start_slice = None
        if start is not None:
            start_slice = self.get_slice_bound(start, "left")
        if start_slice is None:
            start_slice = 0

        end_slice = None
        if end is not None:
            end_slice = self.get_slice_bound(end, "right")
        if end_slice is None:
            end_slice = len(self)

        if not inc:
            # Bounds at this moment are swapped, swap them back and shift by 1.
            #
            # slice_locs('B', 'A', step=-1): s='B', e='A'
            #
            #              s='A'                 e='B'
            # AFTER SWAP:    |                     |
            #                v ------------------> V
            #           -----------------------------------
            #           | | |A|A|A|A| | | | | |B|B| | | | |
            #           -----------------------------------
            #              ^ <------------------ ^
            # SHOULD BE:   |                     |
            #           end=s-1              start=e-1
            #
            end_slice, start_slice = start_slice - 1, end_slice - 1

        # i == -1 triggers ``len(self) + i`` selection that points to the
        # last element, not before-the-first one; subtracting len(self)
        # compensates for that.
        if end_slice == -1:
            end_slice -= len(self)
        if start_slice == -1:
            start_slice -= len(self)

        return start_slice, end_slice

    def delete(self: _IndexT, loc) -> _IndexT:
        """
        Make new Index with passed location(-s) deleted.

        Parameters
        ----------
        loc : int or list of int
            Location of item(-s) which will be deleted.
            Use a list of locations to delete more than one value at the same time.

        Returns
        -------
        Index
            Will be same type as self, except for RangeIndex.

        See Also
        --------
        numpy.delete : Delete any rows and column from NumPy array (ndarray).

        Examples
        --------
        >>> idx = pd.Index(['a', 'b', 'c'])
        >>> idx.delete(1)
        Index(['a', 'c'], dtype='object')

        >>> idx = pd.Index(['a', 'b', 'c'])
        >>> idx.delete([0, 2])
        Index(['b'], dtype='object')
        """
        values = self._values
        res_values: ArrayLike
        if isinstance(values, np.ndarray):
            # TODO(__array_function__): special casing will be unnecessary
            res_values = np.delete(values, loc)
        else:
            res_values = values.delete(loc)

        # _constructor so RangeIndex -> Index with an int64 dtype
        return self._constructor._simple_new(res_values, name=self.name)

    def insert(self, loc: int, item) -> Index:
        """
        Make new Index inserting new item at location.

        Follows ``numpy.insert`` semantics for negative values.

        Parameters
        ----------
        loc : int
        item : object

        Returns
        -------
        Index
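
        Examples
        --------
        >>> idx = pd.Index(['a', 'b', 'c'])
        >>> idx.insert(1, 'x')
        Index(['a', 'x', 'b', 'c'], dtype='object')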
"""
|
|
item = lib.item_from_zerodim(item)
|
|
if is_valid_na_for_dtype(item, self.dtype) and self.dtype != object:
|
|
item = self._na_value
|
|
|
|
arr = self._values
|
|
|
|
try:
|
|
if isinstance(arr, ExtensionArray):
|
|
res_values = arr.insert(loc, item)
|
|
return type(self)._simple_new(res_values, name=self.name)
|
|
else:
|
|
item = self._validate_fill_value(item)
|
|
except (TypeError, ValueError, LossySetitemError):
|
|
# e.g. trying to insert an integer into a DatetimeIndex
|
|
# We cannot keep the same dtype, so cast to the (often object)
|
|
# minimal shared dtype before doing the insert.
|
|
dtype = self._find_common_type_compat(item)
|
|
return self.astype(dtype).insert(loc, item)
|
|
|
|
if arr.dtype != object or not isinstance(
|
|
item, (tuple, np.datetime64, np.timedelta64)
|
|
):
|
|
# with object-dtype we need to worry about numpy incorrectly casting
|
|
# dt64/td64 to integer, also about treating tuples as sequences
|
|
# special-casing dt64/td64 https://github.com/numpy/numpy/issues/12550
|
|
casted = arr.dtype.type(item)
|
|
new_values = np.insert(arr, loc, casted)
|
|
|
|
else:
|
|
# error: No overload variant of "insert" matches argument types
|
|
# "ndarray[Any, Any]", "int", "None"
|
|
new_values = np.insert(arr, loc, None) # type: ignore[call-overload]
|
|
loc = loc if loc >= 0 else loc - 1
|
|
new_values[loc] = item
|
|
|
|
return Index._with_infer(new_values, name=self.name)
|
|
|
|

    def drop(
        self,
        labels: Index | np.ndarray | Iterable[Hashable],
        errors: IgnoreRaise = "raise",
    ) -> Index:
        """
        Make new Index with passed list of labels deleted.

        Parameters
        ----------
        labels : array-like or scalar
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and existing labels are dropped.

        Returns
        -------
        Index
            Will be same type as self, except for RangeIndex.

        Raises
        ------
        KeyError
            If not all of the labels are found in the selected axis.
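
        Examples
        --------
        >>> idx = pd.Index(['a', 'b', 'c'])
        >>> idx.drop(['a'])
        Index(['b', 'c'], dtype='object')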
"""
|
|
if not isinstance(labels, Index):
|
|
# avoid materializing e.g. RangeIndex
|
|
arr_dtype = "object" if self.dtype == "object" else None
|
|
labels = com.index_labels_to_array(labels, dtype=arr_dtype)
|
|
|
|
indexer = self.get_indexer_for(labels)
|
|
mask = indexer == -1
|
|
if mask.any():
|
|
if errors != "ignore":
|
|
raise KeyError(f"{list(labels[mask])} not found in axis")
|
|
indexer = indexer[~mask]
|
|
return self.delete(indexer)
|
|
|
|

    def infer_objects(self, copy: bool = True) -> Index:
        """
        If we have an object dtype, try to infer a non-object dtype.

        Parameters
        ----------
        copy : bool, default True
            Whether to make a copy in cases where no inference occurs.
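
        Examples
        --------
        >>> idx = pd.Index([1, 2, 3], dtype=object)
        >>> idx
        Index([1, 2, 3], dtype='object')
        >>> idx.infer_objects()
        Index([1, 2, 3], dtype='int64')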
"""
|
|
if self._is_multi:
|
|
raise NotImplementedError(
|
|
"infer_objects is not implemented for MultiIndex. "
|
|
"Use index.to_frame().infer_objects() instead."
|
|
)
|
|
if self.dtype != object:
|
|
return self.copy() if copy else self
|
|
|
|
values = self._values
|
|
values = cast("npt.NDArray[np.object_]", values)
|
|
res_values = lib.maybe_convert_objects(
|
|
values,
|
|
convert_datetime=True,
|
|
convert_timedelta=True,
|
|
convert_period=True,
|
|
convert_interval=True,
|
|
)
|
|
if copy and res_values is values:
|
|
return self.copy()
|
|
result = Index(res_values, name=self.name)
|
|
if not copy and res_values is values and self._references is not None:
|
|
result._references = self._references
|
|
result._references.add_index_reference(result)
|
|
return result
|
|
|
|

    # --------------------------------------------------------------------
    # Generated Arithmetic, Comparison, and Unary Methods

    def _cmp_method(self, other, op):
        """
        Wrapper used to dispatch comparison operations.
        """
        if self.is_(other):
            # fastpath
            if op in {operator.eq, operator.le, operator.ge}:
                arr = np.ones(len(self), dtype=bool)
                if self._can_hold_na and not isinstance(self, ABCMultiIndex):
                    # TODO: should set MultiIndex._can_hold_na = False?
                    arr[self.isna()] = False
                return arr
            elif op is operator.ne:
                arr = np.zeros(len(self), dtype=bool)
                if self._can_hold_na and not isinstance(self, ABCMultiIndex):
                    arr[self.isna()] = True
                return arr

        if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)) and len(
            self
        ) != len(other):
            raise ValueError("Lengths must match to compare")

        if not isinstance(other, ABCMultiIndex):
            other = extract_array(other, extract_numpy=True)
        else:
            other = np.asarray(other)

        if is_object_dtype(self.dtype) and isinstance(other, ExtensionArray):
            # e.g. PeriodArray, Categorical
            with np.errstate(all="ignore"):
                result = op(self._values, other)

        elif isinstance(self._values, ExtensionArray):
            result = op(self._values, other)

        elif is_object_dtype(self.dtype) and not isinstance(self, ABCMultiIndex):
            # don't pass MultiIndex
            with np.errstate(all="ignore"):
                result = ops.comp_method_OBJECT_ARRAY(op, self._values, other)

        else:
            with np.errstate(all="ignore"):
                result = ops.comparison_op(self._values, other, op)

        return result

    @final
    def _logical_method(self, other, op):
        res_name = ops.get_op_result_name(self, other)

        lvalues = self._values
        rvalues = extract_array(other, extract_numpy=True, extract_range=True)

        res_values = ops.logical_op(lvalues, rvalues, op)
        return self._construct_result(res_values, name=res_name)

    @final
    def _construct_result(self, result, name):
        if isinstance(result, tuple):
            return (
                Index(result[0], name=name, dtype=result[0].dtype),
                Index(result[1], name=name, dtype=result[1].dtype),
            )
        return Index(result, name=name, dtype=result.dtype)

    def _arith_method(self, other, op):
        if (
            isinstance(other, Index)
            and is_object_dtype(other.dtype)
            and type(other) is not Index
        ):
            # We return NotImplemented for object-dtype index *subclasses* so they have
            # a chance to implement ops before we unwrap them.
            # See https://github.com/pandas-dev/pandas/issues/31109
            return NotImplemented

        return super()._arith_method(other, op)

    @final
    def _unary_method(self, op):
        result = op(self._values)
        return Index(result, name=self.name)

    def __abs__(self) -> Index:
        return self._unary_method(operator.abs)

    def __neg__(self) -> Index:
        return self._unary_method(operator.neg)

    def __pos__(self) -> Index:
        return self._unary_method(operator.pos)

    def __invert__(self) -> Index:
        # GH#8875
        return self._unary_method(operator.inv)

    # --------------------------------------------------------------------
    # Reductions

    def any(self, *args, **kwargs):
        """
        Return whether any element is Truthy.

        Parameters
        ----------
        *args
            Required for compatibility with numpy.
        **kwargs
            Required for compatibility with numpy.

        Returns
        -------
        bool or array-like (if axis is specified)
            A single element array-like may be converted to bool.

        See Also
        --------
        Index.all : Return whether all elements are True.
        Series.all : Return whether all elements are True.

        Notes
        -----
        Not a Number (NaN), positive infinity and negative infinity
        evaluate to True because these are not equal to zero.

        Examples
        --------
        >>> index = pd.Index([0, 1, 2])
        >>> index.any()
        True

        >>> index = pd.Index([0, 0, 0])
        >>> index.any()
        False
        """
        nv.validate_any(args, kwargs)
        self._maybe_disable_logical_methods("any")
        # error: Argument 1 to "any" has incompatible type "ArrayLike"; expected
        # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int,
        # float, complex, str, bytes, generic]], Sequence[Sequence[Any]],
        # _SupportsArray]"
        return np.any(self.values)  # type: ignore[arg-type]

    def all(self, *args, **kwargs):
        """
        Return whether all elements are Truthy.

        Parameters
        ----------
        *args
            Required for compatibility with numpy.
        **kwargs
            Required for compatibility with numpy.

        Returns
        -------
        bool or array-like (if axis is specified)
            A single element array-like may be converted to bool.

        See Also
        --------
        Index.any : Return whether any element in an Index is True.
        Series.any : Return whether any element in a Series is True.
        Series.all : Return whether all elements in a Series are True.

        Notes
        -----
        Not a Number (NaN), positive infinity and negative infinity
        evaluate to True because these are not equal to zero.

        Examples
        --------
        True, because nonzero integers are considered True.

        >>> pd.Index([1, 2, 3]).all()
        True

        False, because ``0`` is considered False.

        >>> pd.Index([0, 1, 2]).all()
        False
        """
        nv.validate_all(args, kwargs)
        self._maybe_disable_logical_methods("all")
        # error: Argument 1 to "all" has incompatible type "ArrayLike"; expected
        # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int,
        # float, complex, str, bytes, generic]], Sequence[Sequence[Any]],
        # _SupportsArray]"
        return np.all(self.values)  # type: ignore[arg-type]

    @final
    def _maybe_disable_logical_methods(self, opname: str_t) -> None:
        """
        Raise if this Index subclass does not support any or all.
        """
        if (
            isinstance(self, ABCMultiIndex)
            or needs_i8_conversion(self.dtype)
            or is_interval_dtype(self.dtype)
            or is_categorical_dtype(self.dtype)
            or is_float_dtype(self.dtype)
        ):
            # This call will raise
            make_invalid_op(opname)(self)

    @Appender(IndexOpsMixin.argmin.__doc__)
    def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
        nv.validate_argmin(args, kwargs)
        nv.validate_minmax_axis(axis)

        if not self._is_multi and self.hasnans:
            # Take advantage of cache
            mask = self._isnan
            if not skipna or mask.all():
                return -1
        return super().argmin(skipna=skipna)

    @Appender(IndexOpsMixin.argmax.__doc__)
    def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
        nv.validate_argmax(args, kwargs)
        nv.validate_minmax_axis(axis)

        if not self._is_multi and self.hasnans:
            # Take advantage of cache
            mask = self._isnan
            if not skipna or mask.all():
                return -1
        return super().argmax(skipna=skipna)

    @doc(IndexOpsMixin.min)
    def min(self, axis=None, skipna: bool = True, *args, **kwargs):
        nv.validate_min(args, kwargs)
        nv.validate_minmax_axis(axis)

        if not len(self):
            return self._na_value

        if len(self) and self.is_monotonic_increasing:
            # quick check
            first = self[0]
            if not isna(first):
                return first

        if not self._is_multi and self.hasnans:
            # Take advantage of cache
            mask = self._isnan
            if not skipna or mask.all():
                return self._na_value

        if not self._is_multi and not isinstance(self._values, np.ndarray):
            return self._values._reduce(name="min", skipna=skipna)

        return super().min(skipna=skipna)

    @doc(IndexOpsMixin.max)
    def max(self, axis=None, skipna: bool = True, *args, **kwargs):
        nv.validate_max(args, kwargs)
        nv.validate_minmax_axis(axis)

        if not len(self):
            return self._na_value

        if len(self) and self.is_monotonic_increasing:
            # quick check
            last = self[-1]
            if not isna(last):
                return last

        if not self._is_multi and self.hasnans:
            # Take advantage of cache
            mask = self._isnan
            if not skipna or mask.all():
                return self._na_value

        if not self._is_multi and not isinstance(self._values, np.ndarray):
            return self._values._reduce(name="max", skipna=skipna)

        return super().max(skipna=skipna)

    # --------------------------------------------------------------------

    @final
    @property
    def shape(self) -> Shape:
        """
        Return a tuple of the shape of the underlying data.
        """
        # See GH#27775, GH#27384 for history/reasoning in how this is defined.
        return (len(self),)


def ensure_index_from_sequences(sequences, names=None) -> Index:
    """
    Construct an index from sequences of data.

    A single sequence returns an Index. Many sequences return a
    MultiIndex.

    Parameters
    ----------
    sequences : sequence of sequences
    names : sequence of str

    Returns
    -------
    index : Index or MultiIndex

    Examples
    --------
    >>> ensure_index_from_sequences([[1, 2, 3]], names=["name"])
    Index([1, 2, 3], dtype='int64', name='name')

    >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"])
    MultiIndex([('a', 'a'),
                ('a', 'b')],
               names=['L1', 'L2'])

    See Also
    --------
    ensure_index
    """
    from pandas.core.indexes.multi import MultiIndex

    if len(sequences) == 1:
        if names is not None:
            names = names[0]
        return Index(sequences[0], name=names)
    else:
        return MultiIndex.from_arrays(sequences, names=names)


def ensure_index(index_like: Axes, copy: bool = False) -> Index:
    """
    Ensure that we have an index from some index-like object.

    Parameters
    ----------
    index_like : sequence
        An Index or other sequence
    copy : bool, default False

    Returns
    -------
    index : Index or MultiIndex

    See Also
    --------
    ensure_index_from_sequences

    Examples
    --------
    >>> ensure_index(['a', 'b'])
    Index(['a', 'b'], dtype='object')

    >>> ensure_index([('a', 'a'), ('b', 'c')])
    Index([('a', 'a'), ('b', 'c')], dtype='object')

    >>> ensure_index([['a', 'a'], ['b', 'c']])
    MultiIndex([('a', 'b'),
                ('a', 'c')],
               )
    """
    if isinstance(index_like, Index):
        if copy:
            index_like = index_like.copy()
        return index_like

    if isinstance(index_like, ABCSeries):
        name = index_like.name
        return Index(index_like, name=name, copy=copy)

    if is_iterator(index_like):
        index_like = list(index_like)

    if isinstance(index_like, list):
        if type(index_like) is not list:
            # must check for exactly list here because of strict type
            # check in clean_index_list
            index_like = list(index_like)

        if len(index_like) and lib.is_all_arraylike(index_like):
            from pandas.core.indexes.multi import MultiIndex

            return MultiIndex.from_arrays(index_like)
        else:
            return Index(index_like, copy=copy, tupleize_cols=False)
    else:
        return Index(index_like, copy=copy)


def ensure_has_len(seq):
    """
    If seq is an iterator, put its values into a list.
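
    Examples
    --------
    >>> ensure_has_len(iter([1, 2]))
    [1, 2]
    >>> ensure_has_len([1, 2])
    [1, 2]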
"""
|
|
try:
|
|
len(seq)
|
|
except TypeError:
|
|
return list(seq)
|
|
else:
|
|
return seq
|
|
|
|
|
|


def trim_front(strings: list[str]) -> list[str]:
    """
    Trims leading spaces that are shared by all of the strings.

    Examples
    --------
    >>> trim_front([" a", " b"])
    ['a', 'b']

    >>> trim_front([" a", " "])
    ['a', '']
    """
    if not strings:
        return strings
    while all(strings) and all(x[0] == " " for x in strings):
        strings = [x[1:] for x in strings]
    return strings


def _validate_join_method(method: str) -> None:
    if method not in ["left", "right", "inner", "outer"]:
        raise ValueError(f"do not recognize join method {method}")


def maybe_extract_name(name, obj, cls) -> Hashable:
    """
    If no name is passed, then extract it from data, validating hashability.
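
    Examples
    --------
    >>> maybe_extract_name(None, pd.Series([1, 2], name="x"), Index)
    'x'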
"""
|
|
if name is None and isinstance(obj, (Index, ABCSeries)):
|
|
# Note we don't just check for "name" attribute since that would
|
|
# pick up e.g. dtype.name
|
|
name = obj.name
|
|
|
|
# GH#29069
|
|
if not is_hashable(name):
|
|
raise TypeError(f"{cls.__name__}.name must be a hashable type")
|
|
|
|
return name
|
|
|
|
|
|


def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
    """
    Return common name if all indices agree, otherwise None (level-by-level).

    Parameters
    ----------
    indexes : list of Index objects

    Returns
    -------
    tuple
        A tuple representing the unanimous 'names' found.
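
    Examples
    --------
    >>> get_unanimous_names(pd.Index([1], name="a"), pd.Index([2], name="a"))
    ('a',)
    >>> get_unanimous_names(pd.Index([1], name="a"), pd.Index([2], name="b"))
    (None,)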
"""
|
|
name_tups = [tuple(i.names) for i in indexes]
|
|
name_sets = [{*ns} for ns in zip_longest(*name_tups)]
|
|
names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
|
|
return names
|
|
|
|
|
|


def _unpack_nested_dtype(other: Index) -> Index:
    """
    When checking if our dtype is comparable with another, we need
    to unpack CategoricalDtype to look at its categories.dtype.

    Parameters
    ----------
    other : Index

    Returns
    -------
    Index
    """
    dtype = other.dtype
    if isinstance(dtype, CategoricalDtype):
        # If there is ever a SparseIndex, this could get dispatched
        #  here too.
        return dtype.categories
    return other


def _maybe_try_sort(result, sort):
    if sort is not False:
        try:
            result = algos.safe_sort(result)
        except TypeError as err:
            if sort is True:
                raise
            warnings.warn(
                f"{err}, sort order is undefined for incomparable objects.",
                RuntimeWarning,
                stacklevel=find_stack_level(),
            )
    return result