from copy import copy as copy_func
from datetime import datetime
from itertools import zip_longest
import operator
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    FrozenSet,
    Hashable,
    List,
    NewType,
    Optional,
    Sequence,
    Set,
    Tuple,
    TypeVar,
    Union,
    cast,
)
import warnings

import numpy as np

from pandas._libs import algos as libalgos, index as libindex, lib
import pandas._libs.join as libjoin
from pandas._libs.lib import is_datetime_array, no_default
from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime, Timestamp
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label, Shape, final
from pandas.compat.numpy import function as nv
from pandas.errors import DuplicateLabelError, InvalidIndexError
from pandas.util._decorators import Appender, cache_readonly, doc

from pandas.core.dtypes.cast import (
    find_common_type,
    maybe_cast_to_integer_array,
    validate_numeric_casting,
)
from pandas.core.dtypes.common import (
    ensure_int64,
    ensure_object,
    ensure_platform_int,
    is_bool_dtype,
    is_categorical_dtype,
    is_datetime64_any_dtype,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_interval_dtype,
    is_iterator,
    is_list_like,
    is_object_dtype,
    is_period_dtype,
    is_scalar,
    is_signed_integer_dtype,
    is_timedelta64_dtype,
    is_unsigned_integer_dtype,
    needs_i8_conversion,
    pandas_dtype,
    validate_all_hashable,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.generic import (
    ABCDatetimeIndex,
    ABCMultiIndex,
    ABCPandasArray,
    ABCPeriodIndex,
    ABCSeries,
    ABCTimedeltaIndex,
)
from pandas.core.dtypes.missing import array_equivalent, isna

from pandas.core import missing, ops
from pandas.core.accessor import CachedAccessor
import pandas.core.algorithms as algos
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype
from pandas.core.base import IndexOpsMixin, PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import deprecate_ndim_indexing
from pandas.core.indexes.frozen import FrozenList
from pandas.core.ops import get_op_result_name
from pandas.core.ops.invalid import make_invalid_op
from pandas.core.sorting import ensure_key_mapped, nargsort
from pandas.core.strings import StringMethods

from pandas.io.formats.printing import (
    PrettyDict,
    default_pprint,
    format_object_attrs,
    format_object_summary,
    pprint_thing,
)

if TYPE_CHECKING:
    from pandas import MultiIndex, RangeIndex, Series

__all__ = ["Index"]

_unsortable_types = frozenset(("mixed", "mixed-integer"))

_index_doc_kwargs = {
    "klass": "Index",
    "inplace": "",
    "target_klass": "Index",
    "raises_section": "",
    "unique": "Index",
    "duplicated": "np.ndarray",
}
_index_shared_docs = {}
str_t = str


_o_dtype = np.dtype(object)


_Identity = NewType("_Identity", object)


def _new_Index(cls, d):
    """
    This is called upon unpickling, rather than the default which doesn't
    have arguments and breaks __new__.
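
    Here ``d`` is the dict of data and attributes captured by
    ``Index.__reduce__``.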
""" # required for backward compat, because PI can't be instantiated with # ordinals through __new__ GH #13277 if issubclass(cls, ABCPeriodIndex): from pandas.core.indexes.period import _new_PeriodIndex return _new_PeriodIndex(cls, **d) if issubclass(cls, ABCMultiIndex): if "labels" in d and "codes" not in d: # GH#23752 "labels" kwarg has been replaced with "codes" d["codes"] = d.pop("labels") return cls.__new__(cls, **d) _IndexT = TypeVar("_IndexT", bound="Index") class Index(IndexOpsMixin, PandasObject): """ Immutable sequence used for indexing and alignment. The basic object storing axis labels for all pandas objects. Parameters ---------- data : array-like (1-dimensional) dtype : NumPy dtype (default: object) If dtype is None, we find the dtype that best fits the data. If an actual dtype is provided, we coerce to that dtype if it's safe. Otherwise, an error will be raised. copy : bool Make a copy of input ndarray. name : object Name to be stored in the index. tupleize_cols : bool (default: True) When True, attempt to create a MultiIndex if possible. See Also -------- RangeIndex : Index implementing a monotonic integer range. CategoricalIndex : Index of :class:`Categorical` s. MultiIndex : A multi-level, or hierarchical Index. IntervalIndex : An Index of :class:`Interval` s. DatetimeIndex : Index of datetime64 data. TimedeltaIndex : Index of timedelta64 data. PeriodIndex : Index of Period data. Int64Index : A special case of :class:`Index` with purely integer labels. UInt64Index : A special case of :class:`Index` with purely unsigned integer labels. Float64Index : A special case of :class:`Index` with purely float labels. Notes ----- An Index instance can **only** contain hashable objects Examples -------- >>> pd.Index([1, 2, 3]) Int64Index([1, 2, 3], dtype='int64') >>> pd.Index(list('abc')) Index(['a', 'b', 'c'], dtype='object') """ # tolist is not actually deprecated, just suppressed in the __dir__ _hidden_attrs: FrozenSet[str] = ( PandasObject._hidden_attrs | IndexOpsMixin._hidden_attrs | frozenset(["contains", "set_value"]) ) # To hand over control to subclasses _join_precedence = 1 # Cython methods; see github.com/cython/cython/issues/2647 # for why we need to wrap these instead of making them class attributes # Moreover, cython will choose the appropriate-dtyped sub-function # given the dtypes of the passed arguments def _left_indexer_unique(self, left, right): return libjoin.left_join_indexer_unique(left, right) def _left_indexer(self, left, right): return libjoin.left_join_indexer(left, right) def _inner_indexer(self, left, right): return libjoin.inner_join_indexer(left, right) def _outer_indexer(self, left, right): return libjoin.outer_join_indexer(left, right) _typ = "index" _data: Union[ExtensionArray, np.ndarray] _id: Optional[_Identity] = None _name: Label = None # MultiIndex.levels previously allowed setting the index name. We # don't allow this anymore, and raise if it happens rather than # failing silently. _no_setting_name: bool = False _comparables = ["name"] _attributes = ["name"] _is_numeric_dtype = False _can_hold_na = True _can_hold_strings = True # would we like our indexing holder to defer to us _defer_to_indexing = False _engine_type = libindex.ObjectEngine # whether we support partial string indexing. 
    # Overridden in DatetimeIndex and PeriodIndex
    _supports_partial_string_indexing = False

    _accessors = {"str"}

    str = CachedAccessor("str", StringMethods)

    # --------------------------------------------------------------------
    # Constructors

    def __new__(
        cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs
    ) -> "Index":

        from pandas.core.indexes.range import RangeIndex

        name = maybe_extract_name(name, data, cls)

        if dtype is not None:
            dtype = pandas_dtype(dtype)
        if "tz" in kwargs:
            tz = kwargs.pop("tz")
            validate_tz_from_dtype(dtype, tz)
            dtype = tz_to_dtype(tz)

        if isinstance(data, ABCPandasArray):
            # ensure users don't accidentally put a PandasArray in an index.
            data = data.to_numpy()

        data_dtype = getattr(data, "dtype", None)

        # range
        if isinstance(data, RangeIndex):
            return RangeIndex(start=data, copy=copy, dtype=dtype, name=name)
        elif isinstance(data, range):
            return RangeIndex.from_range(data, dtype=dtype, name=name)

        # categorical
        elif is_categorical_dtype(data_dtype) or is_categorical_dtype(dtype):
            # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
            from pandas.core.indexes.category import CategoricalIndex

            return _maybe_asobject(dtype, CategoricalIndex, data, copy, name, **kwargs)

        # interval
        elif is_interval_dtype(data_dtype) or is_interval_dtype(dtype):
            # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
            from pandas.core.indexes.interval import IntervalIndex

            return _maybe_asobject(dtype, IntervalIndex, data, copy, name, **kwargs)

        elif is_datetime64_any_dtype(data_dtype) or is_datetime64_any_dtype(dtype):
            # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
            from pandas import DatetimeIndex

            return _maybe_asobject(dtype, DatetimeIndex, data, copy, name, **kwargs)

        elif is_timedelta64_dtype(data_dtype) or is_timedelta64_dtype(dtype):
            # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
            from pandas import TimedeltaIndex

            return _maybe_asobject(dtype, TimedeltaIndex, data, copy, name, **kwargs)

        elif is_period_dtype(data_dtype) or is_period_dtype(dtype):
            # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
            from pandas import PeriodIndex

            return _maybe_asobject(dtype, PeriodIndex, data, copy, name, **kwargs)

        # extension dtype
        elif is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype):
            if not (dtype is None or is_object_dtype(dtype)):
                # coerce to the provided dtype
                ea_cls = dtype.construct_array_type()
                data = ea_cls._from_sequence(data, dtype=dtype, copy=False)
            else:
                data = np.asarray(data, dtype=object)

            # coerce to the object dtype
            data = data.astype(object)
            return Index(data, dtype=object, copy=copy, name=name, **kwargs)

        # index-like
        elif isinstance(data, (np.ndarray, Index, ABCSeries)):
            # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
            from pandas.core.indexes.numeric import (
                Float64Index,
                Int64Index,
                UInt64Index,
            )

            if dtype is not None:
                # we need to avoid having numpy coerce
                # things that look like ints/floats to ints unless
                # they are actually ints, e.g. '0' and 0.0
                # should not be coerced
                # GH 11836
                data = _maybe_cast_with_dtype(data, dtype, copy)
                dtype = data.dtype  # TODO: maybe not for object?
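
            # ``data`` is now ndarray-backed with any requested dtype applied;
            # the checks below dispatch to a numeric Index subclass based
            # solely on ``data.dtype``.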
            # maybe coerce to a sub-class
            if is_signed_integer_dtype(data.dtype):
                return Int64Index(data, copy=copy, dtype=dtype, name=name)
            elif is_unsigned_integer_dtype(data.dtype):
                return UInt64Index(data, copy=copy, dtype=dtype, name=name)
            elif is_float_dtype(data.dtype):
                return Float64Index(data, copy=copy, dtype=dtype, name=name)
            elif issubclass(data.dtype.type, bool) or is_bool_dtype(data):
                subarr = data.astype("object")
            else:
                subarr = com.asarray_tuplesafe(data, dtype=object)

            # asarray_tuplesafe does not always copy underlying data,
            # so need to make sure that this happens
            if copy:
                subarr = subarr.copy()

            if dtype is None:
                new_data, new_dtype = _maybe_cast_data_without_dtype(subarr)
                if new_dtype is not None:
                    return cls(
                        new_data, dtype=new_dtype, copy=False, name=name, **kwargs
                    )

            if kwargs:
                raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}")
            if subarr.ndim > 1:
                # GH#13601, GH#20285, GH#27125
                raise ValueError("Index data must be 1-dimensional")
            return cls._simple_new(subarr, name)

        elif data is None or is_scalar(data):
            raise cls._scalar_data_error(data)
        elif hasattr(data, "__array__"):
            return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
        else:

            if tupleize_cols and is_list_like(data):
                # GH21470: convert iterable to list before determining if empty
                if is_iterator(data):
                    data = list(data)

                if data and all(isinstance(e, tuple) for e in data):
                    # we must be all tuples, otherwise don't construct
                    # 10697
                    from pandas.core.indexes.multi import MultiIndex

                    return MultiIndex.from_tuples(
                        data, names=name or kwargs.get("names")
                    )
            # other iterable of some kind

            subarr = com.asarray_tuplesafe(data, dtype=object)
            return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)

    """
    NOTE for new Index creation:

    - _simple_new: It returns new Index with the same type as the caller.
      All metadata (such as name) must be provided by caller's responsibility.
      Using _shallow_copy is recommended because it fills these metadata
      otherwise specified.

    - _shallow_copy: It returns new Index with the same type (using
      _simple_new), but fills caller's metadata otherwise specified. Passed
      kwargs will overwrite corresponding metadata.

    See each method's docstring.
    """

    @property
    def asi8(self):
        """
        Integer representation of the values.

        Returns
        -------
        ndarray
            An ndarray with int64 dtype.
        """
        warnings.warn(
            "Index.asi8 is deprecated and will be removed in a future version",
            FutureWarning,
            stacklevel=2,
        )
        return None

    @classmethod
    def _simple_new(cls, values, name: Label = None):
        """
        We require that we have a dtype compat for the values. If we are
        passed a non-dtype compat, then coerce using the constructor.

        Must be careful not to recurse.
        """
        assert isinstance(values, np.ndarray), type(values)

        result = object.__new__(cls)
        result._data = values
        # _index_data is a (temporary?) fix to ensure that the direct data
        # manipulation we do in `_libs/reduction.pyx` continues to work.
        # We need access to the actual ndarray, since we're messing with
        # data buffers and strides.
        result._index_data = values
        result._name = name
        result._cache = {}
        result._reset_identity()

        return result

    @cache_readonly
    def _constructor(self):
        return type(self)

    @final
    def _maybe_check_unique(self):
        """
        Check that an Index has no duplicates.

        This is typically only called via
        `NDFrame.flags.allows_duplicate_labels.setter` when it's set to
        True (duplicates aren't allowed).

        Raises
        ------
        DuplicateLabelError
            When the index is not unique.
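
        Notes
        -----
        The error message embeds the output of ``_format_duplicate_message``,
        so the offending labels and their positions are visible to the user.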
""" if not self.is_unique: msg = """Index has duplicates.""" duplicates = self._format_duplicate_message() msg += f"\n{duplicates}" raise DuplicateLabelError(msg) @final def _format_duplicate_message(self): """ Construct the DataFrame for a DuplicateLabelError. This returns a DataFrame indicating the labels and positions of duplicates in an index. This should only be called when it's already known that duplicates are present. Examples -------- >>> idx = pd.Index(['a', 'b', 'a']) >>> idx._format_duplicate_message() positions label a [0, 2] """ from pandas import Series duplicates = self[self.duplicated(keep="first")].unique() assert len(duplicates) out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] if self.nlevels == 1: out = out.rename_axis("label") return out.to_frame(name="positions") # -------------------------------------------------------------------- # Index Internals Methods @final def _get_attributes_dict(self): """ Return an attributes dict for my class. """ return {k: getattr(self, k, None) for k in self._attributes} def _shallow_copy(self, values=None, name: Label = no_default): """ Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking precedence. *this is an internal non-public method* Parameters ---------- values : the values to create the new Index, optional name : Label, defaults to self.name """ name = self.name if name is no_default else name if values is not None: return self._simple_new(values, name=name) result = self._simple_new(self._values, name=name) result._cache = self._cache return result @final def is_(self, other) -> bool: """ More flexible, faster check like ``is`` but that works through views. Note: this is *not* the same as ``Index.identical()``, which checks that metadata is also the same. Parameters ---------- other : object Other object to compare against. Returns ------- bool True if both have same underlying data, False otherwise. See Also -------- Index.identical : Works like ``Index.is_`` but also checks metadata. """ if self is other: return True elif not hasattr(other, "_id"): return False elif self._id is None or other._id is None: return False else: return self._id is other._id @final def _reset_identity(self) -> None: """ Initializes or resets ``_id`` attribute with new object. """ self._id = _Identity(object()) @final def _cleanup(self): self._engine.clear_mapping() @cache_readonly def _engine(self): # property, for now, slow to look up # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. target_values = self._get_engine_target() return self._engine_type(lambda: target_values, len(self)) @cache_readonly def _dir_additions_for_owner(self) -> Set[str_t]: """ Add the string-like labels to the owner dataframe/series dir output. If this is a MultiIndex, it's first level values are used. """ return { c for c in self.unique(level=0)[:100] if isinstance(c, str) and c.isidentifier() } # -------------------------------------------------------------------- # Array-Like Methods # ndarray compat def __len__(self) -> int: """ Return the length of the Index. """ return len(self._data) def __array__(self, dtype=None) -> np.ndarray: """ The array interface, return my values. """ return np.asarray(self._data, dtype=dtype) def __array_wrap__(self, result, context=None): """ Gets called after a ufunc and other functions. 
""" result = lib.item_from_zerodim(result) if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: return result attrs = self._get_attributes_dict() return Index(result, **attrs) @cache_readonly def dtype(self): """ Return the dtype object of the underlying data. """ return self._data.dtype @final def ravel(self, order="C"): """ Return an ndarray of the flattened values of the underlying data. Returns ------- numpy.ndarray Flattened array. See Also -------- numpy.ndarray.ravel : Return a flattened array. """ warnings.warn( "Index.ravel returning ndarray is deprecated; in a future version " "this will return a view on self.", FutureWarning, stacklevel=2, ) values = self._get_engine_target() return values.ravel(order=order) def view(self, cls=None): # we need to see if we are subclassing an # index type here if cls is not None and not hasattr(cls, "_typ"): result = self._data.view(cls) else: result = self._shallow_copy() if isinstance(result, Index): result._id = self._id return result def astype(self, dtype, copy=True): """ Create an Index with values cast to dtypes. The class of a new Index is determined by dtype. When conversion is impossible, a TypeError exception is raised. Parameters ---------- dtype : numpy dtype or pandas type Note that any signed integer `dtype` is treated as ``'int64'``, and any unsigned integer `dtype` is treated as ``'uint64'``, regardless of the size. copy : bool, default True By default, astype always returns a newly allocated object. If copy is set to False and internal requirements on dtype are satisfied, the original data is used to create a new Index or the original Index is returned. Returns ------- Index Index with values cast to specified dtype. """ if dtype is not None: dtype = pandas_dtype(dtype) if is_dtype_equal(self.dtype, dtype): return self.copy() if copy else self elif is_categorical_dtype(dtype): from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex( self._values, name=self.name, dtype=dtype, copy=copy ) elif is_extension_array_dtype(dtype): return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy) try: casted = self._values.astype(dtype, copy=copy) except (TypeError, ValueError) as err: raise TypeError( f"Cannot cast {type(self).__name__} to dtype {dtype}" ) from err return Index(casted, name=self.name, dtype=dtype) _index_shared_docs[ "take" ] = """ Return a new %(klass)s of the values selected by the indices. For internal compatibility with numpy arrays. Parameters ---------- indices : list Indices to be taken. axis : int, optional The axis over which to select values, always 0. allow_fill : bool, default True fill_value : bool, default None If allow_fill=True and fill_value is not None, indices specified by -1 is regarded as NA. If Index doesn't hold NA, raise ValueError. Returns ------- numpy.ndarray Elements of given indices. See Also -------- numpy.ndarray.take: Return an array formed from the elements of a at the given indices. 
""" @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): if kwargs: nv.validate_take((), kwargs) indices = ensure_platform_int(indices) allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) # Note: we discard fill_value and use self._na_value, only relevant # in the case where allow_fill is True and fill_value is not None taken = algos.take( self._values, indices, allow_fill=allow_fill, fill_value=self._na_value ) return self._shallow_copy(taken) def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool: """ We only use pandas-style take when allow_fill is True _and_ fill_value is not None. """ if allow_fill and fill_value is not None: # only fill if we are passing a non-None fill_value if self._can_hold_na: if (indices < -1).any(): raise ValueError( "When allow_fill=True and fill_value is not None, " "all indices must be >= -1" ) else: cls_name = type(self).__name__ raise ValueError( f"Unable to fill values because {cls_name} cannot contain NA" ) else: allow_fill = False return allow_fill _index_shared_docs[ "repeat" ] = """ Repeat elements of a %(klass)s. Returns a new %(klass)s where each element of the current %(klass)s is repeated consecutively a given number of times. Parameters ---------- repeats : int or array of ints The number of repetitions for each element. This should be a non-negative integer. Repeating 0 times will return an empty %(klass)s. axis : None Must be ``None``. Has no effect but is accepted for compatibility with numpy. Returns ------- repeated_index : %(klass)s Newly created %(klass)s with repeated elements. See Also -------- Series.repeat : Equivalent function for Series. numpy.repeat : Similar method for :class:`numpy.ndarray`. Examples -------- >>> idx = pd.Index(['a', 'b', 'c']) >>> idx Index(['a', 'b', 'c'], dtype='object') >>> idx.repeat(2) Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object') >>> idx.repeat([1, 2, 3]) Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object') """ @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): repeats = ensure_platform_int(repeats) nv.validate_repeat((), {"axis": axis}) return self._shallow_copy(self._values.repeat(repeats)) # -------------------------------------------------------------------- # Copying Methods def copy( self: _IndexT, name: Optional[Label] = None, deep: bool = False, dtype: Optional[Dtype] = None, names: Optional[Sequence[Label]] = None, ) -> _IndexT: """ Make a copy of this object. Name and dtype sets those attributes on the new object. Parameters ---------- name : Label, optional Set name for new object. deep : bool, default False dtype : numpy dtype or pandas type, optional Set dtype for new object. .. deprecated:: 1.2.0 use ``astype`` method instead. names : list-like, optional Kept for compatibility with MultiIndex. Should not be used. Returns ------- Index Index refer to new object which is a copy of this object. Notes ----- In most cases, there should be no functional difference from using ``deep``, but if ``deep`` is passed it will attempt to deepcopy. """ name = self._validate_names(name=name, names=names, deep=deep)[0] if deep: new_index = self._shallow_copy(self._data.copy(), name=name) else: new_index = self._shallow_copy(name=name) if dtype: warnings.warn( "parameter dtype is deprecated and will be removed in a future " "version. 
                "version. Use the astype method instead.",
                FutureWarning,
                stacklevel=2,
            )
            new_index = new_index.astype(dtype)
        return new_index

    @final
    def __copy__(self, **kwargs):
        return self.copy(**kwargs)

    @final
    def __deepcopy__(self, memo=None):
        """
        Parameters
        ----------
        memo, default None
            Standard signature. Unused
        """
        return self.copy(deep=True)

    # --------------------------------------------------------------------
    # Rendering Methods

    def __repr__(self) -> str_t:
        """
        Return a string representation for this object.
        """
        klass_name = type(self).__name__
        data = self._format_data()
        attrs = self._format_attrs()
        space = self._format_space()
        attrs_str = [f"{k}={v}" for k, v in attrs]
        prepr = f",{space}".join(attrs_str)

        # no data provided, just attributes
        if data is None:
            data = ""

        res = f"{klass_name}({data}{prepr})"

        return res

    def _format_space(self) -> str_t:

        # using space here controls if the attributes
        # are line separated or not (the default)

        # max_seq_items = get_option('display.max_seq_items')
        # if len(self) > max_seq_items:
        #     space = "\n%s" % (' ' * (len(klass) + 1))
        return " "

    @property
    def _formatter_func(self):
        """
        Return the formatter function.
        """
        return default_pprint

    def _format_data(self, name=None) -> str_t:
        """
        Return the formatted data as a unicode string.
        """
        # do we want to justify (only do so for non-objects)
        is_justify = True

        if self.inferred_type == "string":
            is_justify = False
        elif self.inferred_type == "categorical":
            # error: "Index" has no attribute "categories"
            if is_object_dtype(self.categories):  # type: ignore[attr-defined]
                is_justify = False

        return format_object_summary(
            self, self._formatter_func, is_justify=is_justify, name=name
        )

    def _format_attrs(self):
        """
        Return a list of tuples of the (attr,formatted_value).
        """
        return format_object_attrs(self)

    def _mpl_repr(self):
        # how to represent ourselves to matplotlib
        return self.values

    def format(
        self,
        name: bool = False,
        formatter: Optional[Callable] = None,
        na_rep: str_t = "NaN",
    ) -> List[str_t]:
        """
        Render a string representation of the Index.
        """
        header = []
        if name:
            header.append(
                pprint_thing(self.name, escape_chars=("\t", "\r", "\n"))
                if self.name is not None
                else ""
            )

        if formatter is not None:
            return header + list(self.map(formatter))

        return self._format_with_header(header, na_rep=na_rep)

    def _format_with_header(
        self, header: List[str_t], na_rep: str_t = "NaN"
    ) -> List[str_t]:
        from pandas.io.formats.format import format_array

        values = self._values

        if is_object_dtype(values.dtype):
            values = lib.maybe_convert_objects(values, safe=1)

        if is_object_dtype(values.dtype):
            result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values]

            # could have nans
            mask = isna(values)
            if mask.any():
                result_arr = np.array(result)
                result_arr[mask] = na_rep
                result = result_arr.tolist()
        else:
            result = trim_front(format_array(values, None, justify="left"))
        return header + result

    def to_native_types(self, slicer=None, **kwargs):
        """
        Format specified values of `self` and return them.

        .. deprecated:: 1.2.0

        Parameters
        ----------
        slicer : int, array-like
            An indexer into `self` that specifies which values
            are used in the formatting process.
        kwargs : dict
            Options for specifying how the values should be formatted.
            These options include the following:

            1) na_rep : str
                The value that serves as a placeholder for NULL values
            2) quoting : bool or None
                Whether or not there are quoted values in `self`
            3) date_format : str
                The format used to represent date-like values.

        Returns
        -------
        numpy.ndarray
            Formatted values.
""" warnings.warn( "The 'to_native_types' method is deprecated and will be removed in " "a future version. Use 'astype(str)' instead.", FutureWarning, stacklevel=2, ) values = self if slicer is not None: values = values[slicer] return values._format_native_types(**kwargs) def _format_native_types(self, na_rep="", quoting=None, **kwargs): """ Actually format specific types of the index. """ mask = isna(self) if not self.is_object() and not quoting: values = np.asarray(self).astype(str) else: values = np.array(self, dtype=object, copy=True) values[mask] = na_rep return values def _summary(self, name=None) -> str_t: """ Return a summarized representation. Parameters ---------- name : str name to use in the summary representation Returns ------- String with a summarized representation of the index """ if len(self) > 0: head = self[0] if hasattr(head, "format") and not isinstance(head, str): head = head.format() tail = self[-1] if hasattr(tail, "format") and not isinstance(tail, str): tail = tail.format() index_summary = f", {head} to {tail}" else: index_summary = "" if name is None: name = type(self).__name__ return f"{name}: {len(self)} entries{index_summary}" # -------------------------------------------------------------------- # Conversion Methods def to_flat_index(self): """ Identity method. .. versionadded:: 0.24.0 This is implemented for compatibility with subclass implementations when chaining. Returns ------- pd.Index Caller. See Also -------- MultiIndex.to_flat_index : Subclass implementation. """ return self def to_series(self, index=None, name=None): """ Create a Series with both index and values equal to the index keys. Useful with map for returning an indexer based on an index. Parameters ---------- index : Index, optional Index of resulting Series. If None, defaults to original index. name : str, optional Name of resulting Series. If None, defaults to name of original index. Returns ------- Series The dtype will be based on the type of the Index values. See Also -------- Index.to_frame : Convert an Index to a DataFrame. Series.to_frame : Convert Series to DataFrame. Examples -------- >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') By default, the original Index and original name is reused. >>> idx.to_series() animal Ant Ant Bear Bear Cow Cow Name: animal, dtype: object To enforce a new Index, specify new labels to ``index``: >>> idx.to_series(index=[0, 1, 2]) 0 Ant 1 Bear 2 Cow Name: animal, dtype: object To override the name of the resulting column, specify `name`: >>> idx.to_series(name='zoo') animal Ant Ant Bear Bear Cow Cow Name: zoo, dtype: object """ from pandas import Series if index is None: index = self._shallow_copy() if name is None: name = self.name return Series(self.values.copy(), index=index, name=name) def to_frame(self, index: bool = True, name=None): """ Create a DataFrame with a column containing the Index. .. versionadded:: 0.24.0 Parameters ---------- index : bool, default True Set the index of the returned DataFrame as the original Index. name : object, default None The passed name should substitute for the index name (if it has one). Returns ------- DataFrame DataFrame containing the original Index data. See Also -------- Index.to_series : Convert an Index to a Series. Series.to_frame : Convert Series to DataFrame. Examples -------- >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') >>> idx.to_frame() animal animal Ant Ant Bear Bear Cow Cow By default, the original Index is reused. 
        To enforce a new Index:

        >>> idx.to_frame(index=False)
            animal
        0   Ant
        1  Bear
        2   Cow

        To override the name of the resulting column, specify `name`:

        >>> idx.to_frame(index=False, name='zoo')
            zoo
        0   Ant
        1  Bear
        2   Cow
        """
        from pandas import DataFrame

        if name is None:
            name = self.name or 0
        result = DataFrame({name: self._values.copy()})

        if index:
            result.index = self
        return result

    # --------------------------------------------------------------------
    # Name-Centric Methods

    @property
    def name(self):
        """
        Return Index or MultiIndex name.
        """
        return self._name

    @name.setter
    def name(self, value):
        if self._no_setting_name:
            # Used in MultiIndex.levels to avoid silently ignoring name updates.
            raise RuntimeError(
                "Cannot set name on a level of a MultiIndex. Use "
                "'MultiIndex.set_names' instead."
            )
        maybe_extract_name(value, None, type(self))
        self._name = value

    @final
    def _validate_names(
        self, name=None, names=None, deep: bool = False
    ) -> List[Label]:
        """
        Handles the quirks of having a singular 'name' parameter for general
        Index and plural 'names' parameter for MultiIndex.
        """
        from copy import deepcopy

        if names is not None and name is not None:
            raise TypeError("Can only provide one of `names` and `name`")
        elif names is None and name is None:
            new_names = deepcopy(self.names) if deep else self.names
        elif names is not None:
            if not is_list_like(names):
                raise TypeError("Must pass list-like as `names`.")
            new_names = names
        elif not is_list_like(name):
            new_names = [name]
        else:
            new_names = name

        if len(new_names) != len(self.names):
            raise ValueError(
                f"Length of new names must be {len(self.names)}, got {len(new_names)}"
            )

        # All items in 'new_names' need to be hashable
        validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name")

        return new_names

    def _get_names(self):
        return FrozenList((self.name,))

    def _set_names(self, values, level=None):
        """
        Set new names on index. Each name has to be a hashable type.

        Parameters
        ----------
        values : str or sequence
            name(s) to set
        level : int, level name, or sequence of int/level names (default None)
            If the index is a MultiIndex (hierarchical), level(s) to set
            (None for all levels). Otherwise level must be None

        Raises
        ------
        TypeError if each name is not hashable.
        """
        if not is_list_like(values):
            raise ValueError("Names must be a list-like")
        if len(values) != 1:
            raise ValueError(f"Length of new names must be 1, got {len(values)}")

        # GH 20527
        # All items in 'name' need to be hashable:
        validate_all_hashable(*values, error_name=f"{type(self).__name__}.name")

        self._name = values[0]

    names = property(fset=_set_names, fget=_get_names)

    @final
    def set_names(self, names, level=None, inplace: bool = False):
        """
        Set Index or MultiIndex name.

        Able to set new names partially and by level.

        Parameters
        ----------
        names : label or list of label
            Name(s) to set.
        level : int, label or list of int or label, optional
            If the index is a MultiIndex, level(s) to set (None for all
            levels). Otherwise level must be None.
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Index or
            MultiIndex.

        Returns
        -------
        Index or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Index.rename : Able to set new names without level.

        Examples
        --------
        >>> idx = pd.Index([1, 2, 3, 4])
        >>> idx
        Int64Index([1, 2, 3, 4], dtype='int64')
        >>> idx.set_names('quarter')
        Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')

        >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
        ...                                   [2018, 2019]])
        >>> idx
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   )
        >>> idx.set_names(['kind', 'year'], inplace=True)
        >>> idx
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   names=['kind', 'year'])
        >>> idx.set_names('species', level=0)
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   names=['species', 'year'])
        """
        if level is not None and not isinstance(self, ABCMultiIndex):
            raise ValueError("Level must be None for non-MultiIndex")

        if level is not None and not is_list_like(level) and is_list_like(names):
            raise TypeError("Names must be a string when a single level is provided.")

        if not is_list_like(names) and level is None and self.nlevels > 1:
            raise TypeError("Must pass list-like as `names`.")

        if not is_list_like(names):
            names = [names]
        if level is not None and not is_list_like(level):
            level = [level]

        if inplace:
            idx = self
        else:
            idx = self._shallow_copy()
        idx._set_names(names, level=level)
        if not inplace:
            return idx

    def rename(self, name, inplace=False):
        """
        Alter Index or MultiIndex name.

        Able to set new names without level. Defaults to returning new index.
        Length of names must match number of levels in MultiIndex.

        Parameters
        ----------
        name : label or list of labels
            Name(s) to set.
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Index or
            MultiIndex.

        Returns
        -------
        Index or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Index.set_names : Able to set new names partially and by level.

        Examples
        --------
        >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score')
        >>> idx.rename('grade')
        Index(['A', 'C', 'A', 'B'], dtype='object', name='grade')

        >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
        ...                                   [2018, 2019]],
        ...                                   names=['kind', 'year'])
        >>> idx
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   names=['kind', 'year'])
        >>> idx.rename(['species', 'year'])
        MultiIndex([('python', 2018),
                    ('python', 2019),
                    ( 'cobra', 2018),
                    ( 'cobra', 2019)],
                   names=['species', 'year'])
        >>> idx.rename('species')
        Traceback (most recent call last):
        TypeError: Must pass list-like as `names`.
        """
        return self.set_names([name], inplace=inplace)

    # --------------------------------------------------------------------
    # Level-Centric Methods

    @property
    def nlevels(self) -> int:
        """
        Number of levels.
        """
        return 1

    def _sort_levels_monotonic(self):
        """
        Compat with MultiIndex.
        """
        return self

    @final
    def _validate_index_level(self, level):
        """
        Validate index level.

        For single-level Index getting level number is a no-op, but some
        verification must be done like in MultiIndex.
        """
        if isinstance(level, int):
            if level < 0 and level != -1:
                raise IndexError(
                    "Too many levels: Index has only 1 level, "
                    f"{level} is not a valid level number"
                )
            elif level > 0:
                raise IndexError(
                    f"Too many levels: Index has only 1 level, not {level + 1}"
                )
        elif level != self.name:
            raise KeyError(
                f"Requested level ({level}) does not match index name ({self.name})"
            )

    def _get_level_number(self, level) -> int:
        self._validate_index_level(level)
        return 0

    def sortlevel(self, level=None, ascending=True, sort_remaining=None):
        """
        For internal compatibility with the Index API.

        Sort the Index.
        This is for compat with MultiIndex

        Parameters
        ----------
        ascending : bool, default True
            False to sort in descending order

        level, sort_remaining are compat parameters

        Returns
        -------
        Index
        """
        if not isinstance(ascending, (list, bool)):
            raise TypeError(
                "ascending must be a single bool value or "
                "a list of bool values of length 1"
            )

        if isinstance(ascending, list):
            if len(ascending) != 1:
                raise TypeError("ascending must be a list of bool values of length 1")
            ascending = ascending[0]

        if not isinstance(ascending, bool):
            raise TypeError("ascending must be a bool value")

        return self.sort_values(return_indexer=True, ascending=ascending)

    def _get_level_values(self, level):
        """
        Return an Index of values for requested level.

        This is primarily useful to get an individual level of values from a
        MultiIndex, but is provided on Index as well for compatibility.

        Parameters
        ----------
        level : int or str
            It is either the integer position or the name of the level.

        Returns
        -------
        Index
            Calling object, as there is only one level in the Index.

        See Also
        --------
        MultiIndex.get_level_values : Get values for a level of a MultiIndex.

        Notes
        -----
        For Index, level should be 0, since there are no multiple levels.

        Examples
        --------
        >>> idx = pd.Index(list('abc'))
        >>> idx
        Index(['a', 'b', 'c'], dtype='object')

        Get level values by supplying `level` as integer:

        >>> idx.get_level_values(0)
        Index(['a', 'b', 'c'], dtype='object')
        """
        self._validate_index_level(level)
        return self

    get_level_values = _get_level_values

    @final
    def droplevel(self, level=0):
        """
        Return index with requested level(s) removed.

        If resulting index has only 1 level left, the result will be
        of Index type, not MultiIndex.

        Parameters
        ----------
        level : int, str, or list-like, default 0
            If a string is given, must be the name of a level
            If list-like, elements must be names or indexes of levels.

        Returns
        -------
        Index or MultiIndex

        Examples
        --------
        >>> mi = pd.MultiIndex.from_arrays(
        ...     [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z'])
        >>> mi
        MultiIndex([(1, 3, 5),
                    (2, 4, 6)],
                   names=['x', 'y', 'z'])

        >>> mi.droplevel()
        MultiIndex([(3, 5),
                    (4, 6)],
                   names=['y', 'z'])

        >>> mi.droplevel(2)
        MultiIndex([(1, 3),
                    (2, 4)],
                   names=['x', 'y'])

        >>> mi.droplevel('z')
        MultiIndex([(1, 3),
                    (2, 4)],
                   names=['x', 'y'])

        >>> mi.droplevel(['x', 'y'])
        Int64Index([5, 6], dtype='int64', name='z')
        """
        if not isinstance(level, (tuple, list)):
            level = [level]

        levnums = sorted(self._get_level_number(lev) for lev in level)[::-1]

        return self._drop_level_numbers(levnums)

    def _drop_level_numbers(self, levnums: List[int]):
        """
        Drop MultiIndex levels by level _number_, not name.
        """
        if len(levnums) == 0:
            return self
        if len(levnums) >= self.nlevels:
            raise ValueError(
                f"Cannot remove {len(levnums)} levels from an index with "
                f"{self.nlevels} levels: at least one level must be left."
            )
        # The two checks above guarantee that here self is a MultiIndex
        self = cast("MultiIndex", self)

        new_levels = list(self.levels)
        new_codes = list(self.codes)
        new_names = list(self.names)

        for i in levnums:
            new_levels.pop(i)
            new_codes.pop(i)
            new_names.pop(i)

        if len(new_levels) == 1:

            # set nan if needed
            mask = new_codes[0] == -1
            result = new_levels[0].take(new_codes[0])
            if mask.any():
                result = result.putmask(mask, np.nan)

            result._name = new_names[0]
            return result
        else:
            from pandas.core.indexes.multi import MultiIndex

            return MultiIndex(
                levels=new_levels,
                codes=new_codes,
                names=new_names,
                verify_integrity=False,
            )

    def _get_grouper_for_level(self, mapper, level=None):
        """
        Get index grouper corresponding to an index level

        Parameters
        ----------
        mapper: Group mapping function or None
            Function mapping index values to groups
        level : int or None
            Index level

        Returns
        -------
        grouper : Index
            Index of values to group on.
        labels : ndarray of int or None
            Array of locations in level_index.
        uniques : Index or None
            Index of unique values for level.
        """
        assert level is None or level == 0
        if mapper is None:
            grouper = self
        else:
            grouper = self.map(mapper)

        return grouper, None, None

    # --------------------------------------------------------------------
    # Introspection Methods

    @final
    @property
    def is_monotonic(self) -> bool:
        """
        Alias for is_monotonic_increasing.
        """
        return self.is_monotonic_increasing

    @property
    def is_monotonic_increasing(self) -> bool:
        """
        Return if the index is monotonic increasing (only equal or
        increasing) values.

        Examples
        --------
        >>> Index([1, 2, 3]).is_monotonic_increasing
        True
        >>> Index([1, 2, 2]).is_monotonic_increasing
        True
        >>> Index([1, 3, 2]).is_monotonic_increasing
        False
        """
        return self._engine.is_monotonic_increasing

    @property
    def is_monotonic_decreasing(self) -> bool:
        """
        Return if the index is monotonic decreasing (only equal or
        decreasing) values.

        Examples
        --------
        >>> Index([3, 2, 1]).is_monotonic_decreasing
        True
        >>> Index([3, 2, 2]).is_monotonic_decreasing
        True
        >>> Index([3, 1, 2]).is_monotonic_decreasing
        False
        """
        return self._engine.is_monotonic_decreasing

    @property
    def _is_strictly_monotonic_increasing(self) -> bool:
        """
        Return if the index is strictly monotonic increasing
        (only increasing) values.

        Examples
        --------
        >>> Index([1, 2, 3])._is_strictly_monotonic_increasing
        True
        >>> Index([1, 2, 2])._is_strictly_monotonic_increasing
        False
        >>> Index([1, 3, 2])._is_strictly_monotonic_increasing
        False
        """
        return self.is_unique and self.is_monotonic_increasing

    @property
    def _is_strictly_monotonic_decreasing(self) -> bool:
        """
        Return if the index is strictly monotonic decreasing
        (only decreasing) values.

        Examples
        --------
        >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing
        True
        >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing
        False
        >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing
        False
        """
        return self.is_unique and self.is_monotonic_decreasing

    @cache_readonly
    def is_unique(self) -> bool:
        """
        Return if the index has unique values.
        """
        return self._engine.is_unique

    @property
    def has_duplicates(self) -> bool:
        """
        Check if the Index has duplicate values.

        Returns
        -------
        bool
            Whether or not the Index has duplicate values.

        Examples
        --------
        >>> idx = pd.Index([1, 5, 7, 7])
        >>> idx.has_duplicates
        True

        >>> idx = pd.Index([1, 5, 7])
        >>> idx.has_duplicates
        False

        >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.has_duplicates
        True

        >>> idx = pd.Index(["Orange", "Apple",
"Watermelon"]).astype("category") >>> idx.has_duplicates False """ return not self.is_unique @final def is_boolean(self) -> bool: """ Check if the Index only consists of booleans. Returns ------- bool Whether or not the Index only consists of booleans. See Also -------- is_integer : Check if the Index only consists of integers. is_floating : Check if the Index is a floating type. is_numeric : Check if the Index only consists of numeric data. is_object : Check if the Index is of the object dtype. is_categorical : Check if the Index holds categorical data. is_interval : Check if the Index holds Interval objects. is_mixed : Check if the Index holds data with mixed data types. Examples -------- >>> idx = pd.Index([True, False, True]) >>> idx.is_boolean() True >>> idx = pd.Index(["True", "False", "True"]) >>> idx.is_boolean() False >>> idx = pd.Index([True, False, "True"]) >>> idx.is_boolean() False """ return self.inferred_type in ["boolean"] @final def is_integer(self) -> bool: """ Check if the Index only consists of integers. Returns ------- bool Whether or not the Index only consists of integers. See Also -------- is_boolean : Check if the Index only consists of booleans. is_floating : Check if the Index is a floating type. is_numeric : Check if the Index only consists of numeric data. is_object : Check if the Index is of the object dtype. is_categorical : Check if the Index holds categorical data. is_interval : Check if the Index holds Interval objects. is_mixed : Check if the Index holds data with mixed data types. Examples -------- >>> idx = pd.Index([1, 2, 3, 4]) >>> idx.is_integer() True >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) >>> idx.is_integer() False >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) >>> idx.is_integer() False """ return self.inferred_type in ["integer"] @final def is_floating(self) -> bool: """ Check if the Index is a floating type. The Index may consist of only floats, NaNs, or a mix of floats, integers, or NaNs. Returns ------- bool Whether or not the Index only consists of only consists of floats, NaNs, or a mix of floats, integers, or NaNs. See Also -------- is_boolean : Check if the Index only consists of booleans. is_integer : Check if the Index only consists of integers. is_numeric : Check if the Index only consists of numeric data. is_object : Check if the Index is of the object dtype. is_categorical : Check if the Index holds categorical data. is_interval : Check if the Index holds Interval objects. is_mixed : Check if the Index holds data with mixed data types. Examples -------- >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) >>> idx.is_floating() True >>> idx = pd.Index([1.0, 2.0, np.nan, 4.0]) >>> idx.is_floating() True >>> idx = pd.Index([1, 2, 3, 4, np.nan]) >>> idx.is_floating() True >>> idx = pd.Index([1, 2, 3, 4]) >>> idx.is_floating() False """ return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] @final def is_numeric(self) -> bool: """ Check if the Index only consists of numeric data. Returns ------- bool Whether or not the Index only consists of numeric data. See Also -------- is_boolean : Check if the Index only consists of booleans. is_integer : Check if the Index only consists of integers. is_floating : Check if the Index is a floating type. is_object : Check if the Index is of the object dtype. is_categorical : Check if the Index holds categorical data. is_interval : Check if the Index holds Interval objects. is_mixed : Check if the Index holds data with mixed data types. 
        Examples
        --------
        >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
        >>> idx.is_numeric()
        True

        >>> idx = pd.Index([1, 2, 3, 4.0])
        >>> idx.is_numeric()
        True

        >>> idx = pd.Index([1, 2, 3, 4])
        >>> idx.is_numeric()
        True

        >>> idx = pd.Index([1, 2, 3, 4.0, np.nan])
        >>> idx.is_numeric()
        True

        >>> idx = pd.Index([1, 2, 3, 4.0, np.nan, "Apple"])
        >>> idx.is_numeric()
        False
        """
        return self.inferred_type in ["integer", "floating"]

    @final
    def is_object(self) -> bool:
        """
        Check if the Index is of the object dtype.

        Returns
        -------
        bool
            Whether or not the Index is of the object dtype.

        See Also
        --------
        is_boolean : Check if the Index only consists of booleans.
        is_integer : Check if the Index only consists of integers.
        is_floating : Check if the Index is a floating type.
        is_numeric : Check if the Index only consists of numeric data.
        is_categorical : Check if the Index holds categorical data.
        is_interval : Check if the Index holds Interval objects.
        is_mixed : Check if the Index holds data with mixed data types.

        Examples
        --------
        >>> idx = pd.Index(["Apple", "Mango", "Watermelon"])
        >>> idx.is_object()
        True

        >>> idx = pd.Index(["Apple", "Mango", 2.0])
        >>> idx.is_object()
        True

        >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.is_object()
        False

        >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
        >>> idx.is_object()
        False
        """
        return is_object_dtype(self.dtype)

    @final
    def is_categorical(self) -> bool:
        """
        Check if the Index holds categorical data.

        Returns
        -------
        bool
            True if the Index is categorical.

        See Also
        --------
        CategoricalIndex : Index for categorical data.
        is_boolean : Check if the Index only consists of booleans.
        is_integer : Check if the Index only consists of integers.
        is_floating : Check if the Index is a floating type.
        is_numeric : Check if the Index only consists of numeric data.
        is_object : Check if the Index is of the object dtype.
        is_interval : Check if the Index holds Interval objects.
        is_mixed : Check if the Index holds data with mixed data types.

        Examples
        --------
        >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
        ...                 "Watermelon"]).astype("category")
        >>> idx.is_categorical()
        True

        >>> idx = pd.Index([1, 3, 5, 7])
        >>> idx.is_categorical()
        False

        >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"])
        >>> s
        0        Peter
        1       Victor
        2    Elisabeth
        3          Mar
        dtype: object
        >>> s.index.is_categorical()
        False
        """
        return self.inferred_type in ["categorical"]

    @final
    def is_interval(self) -> bool:
        """
        Check if the Index holds Interval objects.

        Returns
        -------
        bool
            Whether or not the Index holds Interval objects.

        See Also
        --------
        IntervalIndex : Index for Interval objects.
        is_boolean : Check if the Index only consists of booleans.
        is_integer : Check if the Index only consists of integers.
        is_floating : Check if the Index is a floating type.
        is_numeric : Check if the Index only consists of numeric data.
        is_object : Check if the Index is of the object dtype.
        is_categorical : Check if the Index holds categorical data.
        is_mixed : Check if the Index holds data with mixed data types.

        Examples
        --------
        >>> idx = pd.Index([pd.Interval(left=0, right=5),
        ...                 pd.Interval(left=5, right=10)])
        >>> idx.is_interval()
        True

        >>> idx = pd.Index([1, 3, 5, 7])
        >>> idx.is_interval()
        False
        """
        return self.inferred_type in ["interval"]

    @final
    def is_mixed(self) -> bool:
        """
        Check if the Index holds data with mixed data types.

        Returns
        -------
        bool
            Whether or not the Index holds data with mixed data types.

        See Also
        --------
        is_boolean : Check if the Index only consists of booleans.
        is_integer : Check if the Index only consists of integers.
        is_floating : Check if the Index is a floating type.
        is_numeric : Check if the Index only consists of numeric data.
        is_object : Check if the Index is of the object dtype.
        is_categorical : Check if the Index holds categorical data.
        is_interval : Check if the Index holds Interval objects.

        Examples
        --------
        >>> idx = pd.Index(['a', np.nan, 'b'])
        >>> idx.is_mixed()
        True

        >>> idx = pd.Index([1.0, 2.0, 3.0, 5.0])
        >>> idx.is_mixed()
        False
        """
        warnings.warn(
            "Index.is_mixed is deprecated and will be removed in a future version. "
            "Check index.inferred_type directly instead.",
            FutureWarning,
            stacklevel=2,
        )
        return self.inferred_type in ["mixed"]

    @final
    def holds_integer(self) -> bool:
        """
        Whether the type is an integer type.
        """
        return self.inferred_type in ["integer", "mixed-integer"]

    @cache_readonly
    def inferred_type(self) -> str_t:
        """
        Return a string of the type inferred from the values.
        """
        return lib.infer_dtype(self._values, skipna=False)

    @cache_readonly
    def _is_all_dates(self) -> bool:
        """
        Whether or not the index values only consist of dates.
        """
        return is_datetime_array(ensure_object(self._values))

    @cache_readonly
    def is_all_dates(self):
        """
        Whether or not the index values only consist of dates.
        """
        warnings.warn(
            "Index.is_all_dates is deprecated, will be removed in a future version. "
            "check index.inferred_type instead",
            FutureWarning,
            stacklevel=2,
        )
        return self._is_all_dates

    # --------------------------------------------------------------------
    # Pickle Methods

    def __reduce__(self):
        d = {"data": self._data}
        d.update(self._get_attributes_dict())
        return _new_Index, (type(self), d), None

    # --------------------------------------------------------------------
    # Null Handling Methods

    _na_value = np.nan
    """The expected NA value to use with this index."""

    @cache_readonly
    def _isnan(self):
        """
        Return if each value is NaN.
        """
        if self._can_hold_na:
            return isna(self)
        else:
            # shouldn't reach to this condition by checking hasnans beforehand
            values = np.empty(len(self), dtype=np.bool_)
            values.fill(False)
            return values

    @cache_readonly
    @final
    def _nan_idxs(self):
        if self._can_hold_na:
            return self._isnan.nonzero()[0]
        else:
            return np.array([], dtype=np.int64)

    @cache_readonly
    def hasnans(self) -> bool:
        """
        Return if I have any nans; enables various perf speedups.
        """
        if self._can_hold_na:
            return bool(self._isnan.any())
        else:
            return False

    @final
    def isna(self):
        """
        Detect missing values.

        Return a boolean same-sized object indicating if the values are NA.
        NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get
        mapped to ``True`` values.
        Everything else gets mapped to ``False`` values. Characters such as
        empty strings `''` or :attr:`numpy.inf` are not considered NA values
        (unless you set ``pandas.options.mode.use_inf_as_na = True``).

        Returns
        -------
        numpy.ndarray
            A boolean array of whether my values are NA.

        See Also
        --------
        Index.notna : Boolean inverse of isna.
        Index.dropna : Omit entries with missing values.
        isna : Top-level isna.
        Series.isna : Detect missing values in Series object.

        Examples
        --------
        Show which entries in a pandas.Index are NA. The result is an
        array.

        >>> idx = pd.Index([5.2, 6.0, np.NaN])
        >>> idx
        Float64Index([5.2, 6.0, nan], dtype='float64')
        >>> idx.isna()
        array([False, False,  True])

        Empty strings are not considered NA values. None is considered an NA
        value.
        >>> idx = pd.Index(['black', '', 'red', None])
        >>> idx
        Index(['black', '', 'red', None], dtype='object')
        >>> idx.isna()
        array([False, False, False,  True])

        For datetimes, `NaT` (Not a Time) is considered as an NA value.

        >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'),
        ...                         pd.Timestamp(''), None, pd.NaT])
        >>> idx
        DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'],
                      dtype='datetime64[ns]', freq=None)
        >>> idx.isna()
        array([False,  True,  True,  True])
        """
        return self._isnan

    isnull = isna

    @final
    def notna(self):
        """
        Detect existing (non-missing) values.

        Return a boolean same-sized object indicating if the values are not
        NA. Non-missing values get mapped to ``True``. Characters such as
        empty strings ``''`` or :attr:`numpy.inf` are not considered NA
        values (unless you set ``pandas.options.mode.use_inf_as_na = True``).
        NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False``
        values.

        Returns
        -------
        numpy.ndarray
            Boolean array to indicate which entries are not NA.

        See Also
        --------
        Index.notnull : Alias of notna.
        Index.isna: Inverse of notna.
        notna : Top-level notna.

        Examples
        --------
        Show which entries in an Index are not NA. The result is an
        array.

        >>> idx = pd.Index([5.2, 6.0, np.NaN])
        >>> idx
        Float64Index([5.2, 6.0, nan], dtype='float64')
        >>> idx.notna()
        array([ True,  True, False])

        Empty strings are not considered NA values. None is considered a NA
        value.

        >>> idx = pd.Index(['black', '', 'red', None])
        >>> idx
        Index(['black', '', 'red', None], dtype='object')
        >>> idx.notna()
        array([ True,  True,  True, False])
        """
        return ~self.isna()

    notnull = notna

    def fillna(self, value=None, downcast=None):
        """
        Fill NA/NaN values with the specified value.

        Parameters
        ----------
        value : scalar
            Scalar value to use to fill holes (e.g. 0).
            This value cannot be a list-like.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Index

        See Also
        --------
        DataFrame.fillna : Fill NaN values of a DataFrame.
        Series.fillna : Fill NaN Values of a Series.
        """
        value = self._require_scalar(value)
        if self.hasnans:
            result = self.putmask(self._isnan, value)
            if downcast is None:
                # no need to care metadata other than name
                # because it can't have freq if it has NaNs
                return Index(result, name=self.name)

        return self._shallow_copy()

    def dropna(self, how="any"):
        """
        Return Index without NA/NaN values.

        Parameters
        ----------
        how : {'any', 'all'}, default 'any'
            If the Index is a MultiIndex, drop the value when any or all
            levels are NaN.

        Returns
        -------
        Index
        """
        if how not in ("any", "all"):
            raise ValueError(f"invalid how option: {how}")

        if self.hasnans:
            return self._shallow_copy(self._values[~self._isnan])
        return self._shallow_copy()

    # --------------------------------------------------------------------
    # Uniqueness Methods

    def unique(self, level=None):
        """
        Return unique values in the index.

        Unique values are returned in order of appearance, this does NOT
        sort.

        Parameters
        ----------
        level : int or str, optional, default None
            Only return values from specified level (for MultiIndex).

        Returns
        -------
        Index without duplicates

        See Also
        --------
        unique : Numpy array of unique values in that column.
        Series.unique : Return unique values of Series object.
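
        Examples
        --------
        Unique values appear in order of first occurrence:

        >>> idx = pd.Index(['b', 'b', 'a', 'c', 'a'])
        >>> idx.unique()
        Index(['b', 'a', 'c'], dtype='object')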
""" if level is not None: self._validate_index_level(level) if self.is_unique: return self._shallow_copy() result = super().unique() return self._shallow_copy(result) @final def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed. Parameters ---------- keep : {'first', 'last', ``False``}, default 'first' - 'first' : Drop duplicates except for the first occurrence. - 'last' : Drop duplicates except for the last occurrence. - ``False`` : Drop all duplicates. Returns ------- deduplicated : Index See Also -------- Series.drop_duplicates : Equivalent method on Series. DataFrame.drop_duplicates : Equivalent method on DataFrame. Index.duplicated : Related method on Index, indicating duplicate Index values. Examples -------- Generate an pandas.Index with duplicate values. >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) The `keep` parameter controls which duplicate values are removed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. >>> idx.drop_duplicates(keep='first') Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') The value 'last' keeps the last occurrence for each set of duplicated entries. >>> idx.drop_duplicates(keep='last') Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') The value ``False`` discards all sets of duplicated entries. >>> idx.drop_duplicates(keep=False) Index(['cow', 'beetle', 'hippo'], dtype='object') """ if self.is_unique: return self._shallow_copy() return super().drop_duplicates(keep=keep) def duplicated(self, keep="first"): """ Indicate duplicate index values. Duplicated values are indicated as ``True`` values in the resulting array. Either all duplicates, all except the first, or all except the last occurrence of duplicates can be indicated. Parameters ---------- keep : {'first', 'last', False}, default 'first' The value or values in a set of duplicates to mark as missing. - 'first' : Mark duplicates as ``True`` except for the first occurrence. - 'last' : Mark duplicates as ``True`` except for the last occurrence. - ``False`` : Mark all duplicates as ``True``. Returns ------- numpy.ndarray See Also -------- Series.duplicated : Equivalent method on pandas.Series. DataFrame.duplicated : Equivalent method on pandas.DataFrame. Index.drop_duplicates : Remove duplicate values from Index. Examples -------- By default, for each set of duplicated values, the first occurrence is set to False and all others to True: >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) >>> idx.duplicated() array([False, False, True, False, True]) which is equivalent to >>> idx.duplicated(keep='first') array([False, False, True, False, True]) By using 'last', the last occurrence of each set of duplicated values is set on False and all others on True: >>> idx.duplicated(keep='last') array([ True, False, True, False, False]) By setting keep on ``False``, all duplicates are True: >>> idx.duplicated(keep=False) array([ True, False, True, False, True]) """ if self.is_unique: # fastpath available bc we are immutable return np.zeros(len(self), dtype=bool) return super().duplicated(keep=keep) def _get_unique_index(self, dropna: bool = False): """ Returns an index containing unique values. Parameters ---------- dropna : bool, default False If True, NaN values are dropped. 
Returns ------- uniques : index """ if self.is_unique and not dropna: return self if not self.is_unique: values = self.unique() if not isinstance(self, ABCMultiIndex): # extract an array to pass to _shallow_copy values = values._data else: values = self._values if dropna and not isinstance(self, ABCMultiIndex): # isna not defined for MultiIndex if self.hasnans: values = values[~isna(values)] return self._shallow_copy(values) # -------------------------------------------------------------------- # Arithmetic & Logical Methods def __iadd__(self, other): # alias for __add__ return self + other @final def __and__(self, other): warnings.warn( "Index.__and__ operating as a set operation is deprecated, " "in the future this will be a logical operation matching " "Series.__and__. Use index.intersection(other) instead", FutureWarning, stacklevel=2, ) return self.intersection(other) @final def __or__(self, other): warnings.warn( "Index.__or__ operating as a set operation is deprecated, " "in the future this will be a logical operation matching " "Series.__or__. Use index.union(other) instead", FutureWarning, stacklevel=2, ) return self.union(other) @final def __xor__(self, other): warnings.warn( "Index.__xor__ operating as a set operation is deprecated, " "in the future this will be a logical operation matching " "Series.__xor__. Use index.symmetric_difference(other) instead", FutureWarning, stacklevel=2, ) return self.symmetric_difference(other) @final def __nonzero__(self): raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) __bool__ = __nonzero__ # -------------------------------------------------------------------- # Set Operation Methods @final def _get_reconciled_name_object(self, other): """ If the result of a set operation will be self, return self, unless the name changes, in which case make a shallow copy of self. """ name = get_op_result_name(self, other) if self.name != name: return self.rename(name) return self @final def _union_incompatible_dtypes(self, other, sort): """ Casts this and other index to object dtype to allow the formation of a union between incompatible types. Parameters ---------- other : Index or array-like sort : False or None, default False Whether to sort the resulting index. * False : do not sort the result. * None : sort the result, except when `self` and `other` are equal or when the values cannot be compared. Returns ------- Index """ this = self.astype(object, copy=False) # cast to Index for when `other` is list-like other = Index(other).astype(object, copy=False) return Index.union(this, other, sort=sort).astype(object, copy=False) def _can_union_without_object_cast(self, other) -> bool: """ Check whether this and the other dtype are compatible with each other. Meaning a union can be formed between them without needing to be cast to dtype object. Parameters ---------- other : Index or array-like Returns ------- bool """ return type(self) is type(other) and is_dtype_equal(self.dtype, other.dtype) @final def _validate_sort_keyword(self, sort): if sort not in [None, False]: raise ValueError( "The 'sort' keyword only takes the values of " f"None or False; {sort} was passed." ) def union(self, other, sort=None): """ Form the union of two Index objects. If the Index objects are incompatible, both Index objects will be cast to dtype('object') first. .. 
versionchanged:: 0.25.0 Parameters ---------- other : Index or array-like sort : bool or None, default None Whether to sort the resulting Index. * None : Sort the result, except when 1. `self` and `other` are equal. 2. `self` or `other` has length 0. 3. Some values in `self` or `other` cannot be compared. A RuntimeWarning is issued in this case. * False : do not sort the result. .. versionadded:: 0.24.0 .. versionchanged:: 0.24.1 Changed the default value from ``True`` to ``None`` (without change in behaviour). Returns ------- union : Index Examples -------- Union matching dtypes >>> idx1 = pd.Index([1, 2, 3, 4]) >>> idx2 = pd.Index([3, 4, 5, 6]) >>> idx1.union(idx2) Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') Union mismatched dtypes >>> idx1 = pd.Index(['a', 'b', 'c', 'd']) >>> idx2 = pd.Index([1, 2, 3, 4]) >>> idx1.union(idx2) Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other, result_name = self._convert_can_do_setop(other) if not self._can_union_without_object_cast(other): return self._union_incompatible_dtypes(other, sort=sort) result = self._union(other, sort=sort) return self._wrap_setop_result(other, result) def _union(self, other, sort): """ Specific union logic should go here. In subclasses, union behavior should be overwritten here rather than in `self.union`. Parameters ---------- other : Index or array-like sort : False or None, default False Whether to sort the resulting index. * False : do not sort the result. * None : sort the result, except when `self` and `other` are equal or when the values cannot be compared. Returns ------- Index """ if not len(other) or self.equals(other): return self if not len(self): return other # TODO(EA): setops-refactor, clean all this up lvals = self._values rvals = other._values if sort is None and self.is_monotonic and other.is_monotonic: try: result = self._outer_indexer(lvals, rvals)[0] except TypeError: # incomparable objects result = list(lvals) # worth making this faster? a very unusual case value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) result = Index(result)._values # do type inference here else: # find indexes of things in "other" that are not in "self" if self.is_unique: indexer = self.get_indexer(other) indexer = (indexer == -1).nonzero()[0] else: indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) if len(indexer) > 0: other_diff = algos.take_nd(rvals, indexer, allow_fill=False) result = concat_compat((lvals, other_diff)) else: result = lvals if sort is None: try: result = algos.safe_sort(result) except TypeError as err: warnings.warn( f"{err}, sort order is undefined for incomparable objects", RuntimeWarning, stacklevel=3, ) return result @final def _wrap_setop_result(self, other, result): if isinstance(self, (ABCDatetimeIndex, ABCTimedeltaIndex)) and isinstance( result, np.ndarray ): result = type(self._data)._simple_new(result, dtype=self.dtype) elif is_categorical_dtype(self.dtype) and isinstance(result, np.ndarray): result = Categorical(result, dtype=self.dtype) name = get_op_result_name(self, other) if isinstance(result, Index): if result.name != name: return result.rename(name) return result else: return self._shallow_copy(result, name=name) # TODO: standardize return type of non-union setops type(self vs other) def intersection(self, other, sort=False): """ Form the intersection of two Index objects. This returns a new Index with elements common to the index and `other`. 
Parameters ---------- other : Index or array-like sort : False or None, default False Whether to sort the resulting index. * False : do not sort the result. * None : sort the result, except when `self` and `other` are equal or when the values cannot be compared. .. versionadded:: 0.24.0 .. versionchanged:: 0.24.1 Changed the default from ``True`` to ``False``, to match the behaviour of 0.23.4 and earlier. Returns ------- intersection : Index Examples -------- >>> idx1 = pd.Index([1, 2, 3, 4]) >>> idx2 = pd.Index([3, 4, 5, 6]) >>> idx1.intersection(idx2) Int64Index([3, 4], dtype='int64') """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other, _ = self._convert_can_do_setop(other) if self.equals(other): if self.has_duplicates: return self.unique()._get_reconciled_name_object(other) return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): dtype = find_common_type([self.dtype, other.dtype]) this = self.astype(dtype, copy=False) other = other.astype(dtype, copy=False) return this.intersection(other, sort=sort) result = self._intersection(other, sort=sort) return self._wrap_setop_result(other, result) def _intersection(self, other, sort=False): """ intersection specialized to the case with matching dtypes. """ # TODO(EA): setops-refactor, clean all this up lvals = self._values rvals = other._values if self.is_monotonic and other.is_monotonic: try: result = self._inner_indexer(lvals, rvals)[0] except TypeError: pass else: return algos.unique1d(result) try: indexer = Index(rvals).get_indexer(lvals) indexer = indexer.take((indexer != -1).nonzero()[0]) except (InvalidIndexError, IncompatibleFrequency): # InvalidIndexError raised by get_indexer if non-unique # IncompatibleFrequency raised by PeriodIndex.get_indexer indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] result = other.take(indexer).unique()._values if sort is None: result = algos.safe_sort(result) # Intersection has to be unique assert Index(result).is_unique return result def difference(self, other, sort=None): """ Return a new Index with elements of index not in `other`. This is the set difference of two Index objects. Parameters ---------- other : Index or array-like sort : False or None, default None Whether to sort the resulting index. By default, the values are attempted to be sorted, but any TypeError from incomparable elements is caught by pandas. * None : Attempt to sort the result, but catch any TypeErrors from comparing incomparable elements. * False : Do not sort the result. .. versionadded:: 0.24.0 .. versionchanged:: 0.24.1 Changed the default value from ``True`` to ``None`` (without change in behaviour). 
Returns ------- difference : Index Examples -------- >>> idx1 = pd.Index([2, 1, 3, 4]) >>> idx2 = pd.Index([3, 4, 5, 6]) >>> idx1.difference(idx2) Int64Index([1, 2], dtype='int64') >>> idx1.difference(idx2, sort=False) Int64Index([2, 1], dtype='int64') """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other, result_name = self._convert_can_do_setop(other) if self.equals(other): return self[:0].rename(result_name) result = self._difference(other, sort=sort) return self._wrap_setop_result(other, result) def _difference(self, other, sort): this = self._get_unique_index() indexer = this.get_indexer(other) indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this._values.take(label_diff) if sort is None: try: the_diff = algos.safe_sort(the_diff) except TypeError: pass return the_diff def symmetric_difference(self, other, result_name=None, sort=None): """ Compute the symmetric difference of two Index objects. Parameters ---------- other : Index or array-like result_name : str sort : False or None, default None Whether to sort the resulting index. By default, the values are attempted to be sorted, but any TypeError from incomparable elements is caught by pandas. * None : Attempt to sort the result, but catch any TypeErrors from comparing incomparable elements. * False : Do not sort the result. .. versionadded:: 0.24.0 .. versionchanged:: 0.24.1 Changed the default value from ``True`` to ``None`` (without change in behaviour). Returns ------- symmetric_difference : Index Notes ----- ``symmetric_difference`` contains elements that appear in either ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates dropped. Examples -------- >>> idx1 = pd.Index([1, 2, 3, 4]) >>> idx2 = pd.Index([2, 3, 4, 5]) >>> idx1.symmetric_difference(idx2) Int64Index([1, 5], dtype='int64') You can also use the ``^`` operator: >>> idx1 ^ idx2 Int64Index([1, 5], dtype='int64') """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other, result_name_update = self._convert_can_do_setop(other) if result_name is None: result_name = result_name_update this = self._get_unique_index() other = other._get_unique_index() indexer = this.get_indexer(other) # {this} minus {other} common_indexer = indexer.take((indexer != -1).nonzero()[0]) left_indexer = np.setdiff1d( np.arange(this.size), common_indexer, assume_unique=True ) left_diff = this._values.take(left_indexer) # {other} minus {this} right_indexer = (indexer == -1).nonzero()[0] right_diff = other._values.take(right_indexer) the_diff = concat_compat([left_diff, right_diff]) if sort is None: try: the_diff = algos.safe_sort(the_diff) except TypeError: pass return Index(the_diff, dtype=self.dtype, name=result_name) def _assert_can_do_setop(self, other): if not is_list_like(other): raise TypeError("Input must be Index or array-like") return True def _convert_can_do_setop(self, other): if not isinstance(other, Index): other = Index(other, name=self.name) result_name = self.name else: result_name = get_op_result_name(self, other) return other, result_name # -------------------------------------------------------------------- # Indexing Methods def get_loc(self, key, method=None, tolerance=None): """ Get integer location, slice or boolean mask for requested label. 
Parameters ---------- key : label method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. * backfill / bfill: use NEXT index value if no exact match * nearest: use the NEAREST index value if no exact match. Tied distances are broken by preferring the larger index value. tolerance : int or float, optional Maximum distance from index value for inexact matches. The value of the index at the matching location must satisfy the equation ``abs(index[loc] - key) <= tolerance``. Returns ------- loc : int if unique index, slice if monotonic index, else mask Examples -------- >>> unique_index = pd.Index(list('abc')) >>> unique_index.get_loc('b') 1 >>> monotonic_index = pd.Index(list('abbc')) >>> monotonic_index.get_loc('b') slice(1, 3, None) >>> non_monotonic_index = pd.Index(list('abcb')) >>> non_monotonic_index.get_loc('b') array([False, True, False, True]) """ if method is None: if tolerance is not None: raise ValueError( "tolerance argument only valid if using pad, " "backfill or nearest lookups" ) casted_key = self._maybe_cast_indexer(key) try: return self._engine.get_loc(casted_key) except KeyError as err: raise KeyError(key) from err if tolerance is not None: tolerance = self._convert_tolerance(tolerance, np.asarray(key)) indexer = self.get_indexer([key], method=method, tolerance=tolerance) if indexer.ndim > 1 or indexer.size > 1: raise TypeError("get_loc requires scalar valued input") loc = indexer.item() if loc == -1: raise KeyError(key) return loc _index_shared_docs[ "get_indexer" ] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the current data to the new index. Parameters ---------- target : %(target_klass)s method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. * backfill / bfill: use NEXT index value if no exact match * nearest: use the NEAREST index value if no exact match. Tied distances are broken by preferring the larger index value. limit : int, optional Maximum number of consecutive labels in ``target`` to match for inexact matches. tolerance : optional Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Tolerance may be a scalar value, which applies the same tolerance to all values, or list-like, which applies variable tolerance per element. List-like includes list, tuple, array, Series, and must be the same size as the index and its dtype must exactly match the index's type. Returns ------- indexer : ndarray of int Integers from 0 to n - 1 indicating that the index at these positions matches the corresponding target values. Missing values in the target are marked by -1. %(raises_section)s Examples -------- >>> index = pd.Index(['c', 'a', 'b']) >>> index.get_indexer(['a', 'b', 'x']) array([ 1, 2, -1]) Notice that the return value is an array of locations in ``index`` and ``x`` is marked by -1, as it is not in ``index``. 
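When ``method`` is specified, inexact matches are also resolved. As a
small illustrative doctest, ``method='pad'`` matches each target to the
position of the closest preceding index label:

>>> index = pd.Index([10, 20, 30])
>>> index.get_indexer([15, 25], method='pad')
array([0, 1])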
""" @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer( self, target, method=None, limit=None, tolerance=None ) -> np.ndarray: method = missing.clean_reindex_fill_method(method) target = ensure_index(target) if tolerance is not None: tolerance = self._convert_tolerance(tolerance, target) # Treat boolean labels passed to a numeric index as not found. Without # this fix False and True would be treated as 0 and 1 respectively. # (GH #16877) if target.is_boolean() and self.is_numeric(): return ensure_platform_int(np.repeat(-1, target.size)) pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer( ptarget, method=method, limit=limit, tolerance=tolerance ) if not is_dtype_equal(self.dtype, target.dtype): this = self.astype(object) target = target.astype(object) return this.get_indexer( target, method=method, limit=limit, tolerance=tolerance ) if not self.is_unique: raise InvalidIndexError( "Reindexing only valid with uniquely valued Index objects" ) if method == "pad" or method == "backfill": indexer = self._get_fill_indexer(target, method, limit, tolerance) elif method == "nearest": indexer = self._get_nearest_indexer(target, limit, tolerance) else: if tolerance is not None: raise ValueError( "tolerance argument only valid if doing pad, " "backfill or nearest reindexing" ) if limit is not None: raise ValueError( "limit argument only valid if doing pad, " "backfill or nearest reindexing" ) indexer = self._engine.get_indexer(target._get_engine_target()) return ensure_platform_int(indexer) def _convert_tolerance(self, tolerance, target): # override this method on subclasses tolerance = np.asarray(tolerance) if target.size != tolerance.size and tolerance.size > 1: raise ValueError("list-like tolerance size must match target index size") return tolerance @final def _get_fill_indexer( self, target: "Index", method: str_t, limit=None, tolerance=None ) -> np.ndarray: target_values = target._get_engine_target() if self.is_monotonic_increasing and target.is_monotonic_increasing: engine_method = ( self._engine.get_pad_indexer if method == "pad" else self._engine.get_backfill_indexer ) indexer = engine_method(target_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None and len(self): indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer @final def _get_fill_indexer_searchsorted( self, target: "Index", method: str_t, limit=None ) -> np.ndarray: """ Fallback pad/backfill get_indexer that works for monotonic decreasing indexes and non-monotonic targets. """ if limit is not None: raise ValueError( f"limit argument for {repr(method)} method only well-defined " "if index and target are monotonic" ) side = "left" if method == "pad" else "right" # find exact matches first (this simplifies the algorithm) indexer = self.get_indexer(target) nonexact = indexer == -1 indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side) if side == "left": # searchsorted returns "indices into a sorted array such that, # if the corresponding elements in v were inserted before the # indices, the order of a would be preserved". # Thus, we need to subtract 1 to find values to the left. 
indexer[nonexact] -= 1 # This also mapped not found values (values of 0 from # np.searchsorted) to -1, which conveniently is also our # sentinel for missing values else: # Mark indices to the right of the largest value as not found indexer[indexer == len(self)] = -1 return indexer @final def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: """ Get the indexer for the nearest index labels; requires an index with values that can be subtracted from each other (e.g., not strings or tuples). """ if not len(self): return self._get_fill_indexer(target, "pad") left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) target_values = target._values # error: Unsupported left operand type for - ("ExtensionArray") left_distances = np.abs( self._values[left_indexer] - target_values # type: ignore[operator] ) # error: Unsupported left operand type for - ("ExtensionArray") right_distances = np.abs( self._values[right_indexer] - target_values # type: ignore[operator] ) op = operator.lt if self.is_monotonic_increasing else operator.le indexer = np.where( op(left_distances, right_distances) | (right_indexer == -1), left_indexer, right_indexer, ) if tolerance is not None: indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer @final def _filter_indexer_tolerance( self, target: Union["Index", np.ndarray, ExtensionArray], indexer: np.ndarray, tolerance, ) -> np.ndarray: # error: Unsupported left operand type for - ("ExtensionArray") distance = abs(self._values[indexer] - target) # type: ignore[operator] indexer = np.where(distance <= tolerance, indexer, -1) return indexer # -------------------------------------------------------------------- # Indexer Conversion Methods def _get_partial_string_timestamp_match_key(self, key): """ Translate any partial string timestamp matches in key, returning the new key. Only relevant for MultiIndex. """ # GH#10331 return key @final def _validate_positional_slice(self, key: slice): """ For positional indexing, a slice must have either int or None for each of start, stop, and step. """ self._validate_indexer("positional", key.start, "iloc") self._validate_indexer("positional", key.stop, "iloc") self._validate_indexer("positional", key.step, "iloc") def _convert_slice_indexer(self, key: slice, kind: str_t): """ Convert a slice indexer. By definition, these are labels unless 'iloc' is passed in. Floats are not allowed as the start, step, or stop of the slice. 
Parameters ---------- key : label of the slice bound kind : {'loc', 'getitem'} """ assert kind in ["loc", "getitem"], kind # potentially cast the bounds to integers start, stop, step = key.start, key.stop, key.step # figure out if this is a positional indexer def is_int(v): return v is None or is_integer(v) is_index_slice = is_int(start) and is_int(stop) and is_int(step) is_positional = is_index_slice and not ( self.is_integer() or self.is_categorical() ) if kind == "getitem": """ called from the getitem slicers, validate that we are in fact integers """ if self.is_integer() or is_index_slice: self._validate_indexer("slice", key.start, "getitem") self._validate_indexer("slice", key.stop, "getitem") self._validate_indexer("slice", key.step, "getitem") return key # convert the slice to an indexer here # if we are mixed and have integers if is_positional: try: # Validate start & stop if start is not None: self.get_loc(start) if stop is not None: self.get_loc(stop) is_positional = False except KeyError: pass if com.is_null_slice(key): # It doesn't matter if we are positional or label based indexer = key elif is_positional: if kind == "loc": # GH#16121, GH#24612, GH#31810 warnings.warn( "Slicing a positional slice with .loc is not supported, " "and will raise TypeError in a future version. " "Use .loc with labels or .iloc with positions instead.", FutureWarning, stacklevel=6, ) indexer = key else: indexer = self.slice_indexer(start, stop, step, kind=kind) return indexer def _convert_listlike_indexer(self, keyarr): """ Parameters ---------- keyarr : list-like Indexer to convert. Returns ------- indexer : numpy.ndarray or None Return an ndarray or None if cannot convert. keyarr : numpy.ndarray Return tuple-safe keys. """ if isinstance(keyarr, Index): pass else: keyarr = self._convert_arr_indexer(keyarr) indexer = self._convert_list_indexer(keyarr) return indexer, keyarr def _convert_arr_indexer(self, keyarr): """ Convert an array-like indexer to the appropriate dtype. Parameters ---------- keyarr : array-like Indexer to convert. Returns ------- converted_keyarr : array-like """ keyarr = com.asarray_tuplesafe(keyarr) return keyarr def _convert_list_indexer(self, keyarr): """ Convert a list-like indexer to the appropriate dtype. Parameters ---------- keyarr : Index (or sub-class) Indexer to convert. kind : iloc, loc, optional Returns ------- positional indexer or None """ return None @final def _invalid_indexer(self, form: str_t, key) -> TypeError: """ Consistent invalid indexer message. """ return TypeError( f"cannot do {form} indexing on {type(self).__name__} with these " f"indexers [{key}] of type {type(key).__name__}" ) # -------------------------------------------------------------------- # Reindex Methods @final def _can_reindex(self, indexer): """ Check if we are allowing reindexing with this particular indexer. Parameters ---------- indexer : an integer indexer Raises ------ ValueError if its a duplicate axis """ # trying to reindex on an axis with duplicates if not self._index_as_unique and len(indexer): raise ValueError("cannot reindex from a duplicate axis") def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values. Parameters ---------- target : an iterable Returns ------- new_index : pd.Index Resulting index. indexer : np.ndarray or None Indices of output values in original index. """ # GH6552: preserve names when reindexing to non-named target # (i.e. neither Index nor Series). 
preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: target = self[:0] else: target = ensure_index(target) if level is not None: if method is not None: raise TypeError("Fill method not supported if level passed") _, indexer, _ = self._join_level( target, level, how="right", return_indexers=True ) else: if self.equals(target): indexer = None else: if self._index_as_unique: indexer = self.get_indexer( target, method=method, limit=limit, tolerance=tolerance ) else: if method is not None or limit is not None: raise ValueError( "cannot reindex a non-unique index " "with a method or limit" ) indexer, missing = self.get_indexer_non_unique(target) if preserve_names and target.nlevels == 1 and target.name != self.name: target = target.copy() target.name = self.name return target, indexer def _reindex_non_unique(self, target): """ Create a new index with target's values (move/add/delete values as necessary) use with non-unique Index and a possibly non-unique target. Parameters ---------- target : an iterable Returns ------- new_index : pd.Index Resulting index. indexer : np.ndarray or None Indices of output values in original index. """ target = ensure_index(target) if len(target) == 0: # GH#13691 return self[:0], np.array([], dtype=np.intp), None indexer, missing = self.get_indexer_non_unique(target) check = indexer != -1 new_labels = self.take(indexer[check]) new_indexer = None if len(missing): length = np.arange(len(indexer)) missing = ensure_platform_int(missing) missing_labels = target.take(missing) missing_indexer = ensure_int64(length[~check]) cur_labels = self.take(indexer[check]).values cur_indexer = ensure_int64(length[check]) new_labels = np.empty((len(indexer),), dtype=object) new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels # a unique indexer if target.is_unique: # see GH5553, make sure we use the right indexer new_indexer = np.arange(len(indexer)) new_indexer[cur_indexer] = np.arange(len(cur_labels)) new_indexer[missing_indexer] = -1 # we have a non_unique selector, need to use the original # indexer here else: # need to retake to have the same size as the indexer indexer[~check] = -1 # reset the new indexer to account for the new size new_indexer = np.arange(len(self.take(indexer))) new_indexer[~check] = -1 if isinstance(self, ABCMultiIndex): new_index = type(self).from_tuples(new_labels, names=self.names) else: new_index = Index(new_labels, name=self.name) return new_index, indexer, new_indexer # -------------------------------------------------------------------- # Join Methods def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ Compute join_index and indexers to conform data structures to the new index. Parameters ---------- other : Index how : {'left', 'right', 'inner', 'outer'} level : int or level name, default None return_indexers : bool, default False sort : bool, default False Sort the join keys lexicographically in the result Index. If False, the order of the join keys depends on the join type (how keyword). 
Returns ------- join_index, (left_indexer, right_indexer) """ other = ensure_index(other) self_is_mi = isinstance(self, ABCMultiIndex) other_is_mi = isinstance(other, ABCMultiIndex) # try to figure out the join level # GH3662 if level is None and (self_is_mi or other_is_mi): # have the same levels/names so a simple join if self.names == other.names: pass else: return self._join_multi(other, how=how, return_indexers=return_indexers) # join on the level if level is not None and (self_is_mi or other_is_mi): return self._join_level( other, level, how=how, return_indexers=return_indexers ) if len(other) == 0 and how in ("left", "outer"): join_index = self._shallow_copy() if return_indexers: rindexer = np.repeat(-1, len(join_index)) return join_index, None, rindexer else: return join_index if len(self) == 0 and how in ("right", "outer"): join_index = other._shallow_copy() if return_indexers: lindexer = np.repeat(-1, len(join_index)) return join_index, lindexer, None else: return join_index if self._join_precedence < other._join_precedence: how = {"right": "left", "left": "right"}.get(how, how) result = other.join( self, how=how, level=level, return_indexers=return_indexers ) if return_indexers: x, y, z = result result = x, z, y return result if not is_dtype_equal(self.dtype, other.dtype): this = self.astype("O") other = other.astype("O") return this.join(other, how=how, return_indexers=return_indexers) _validate_join_method(how) if not self.is_unique and not other.is_unique: return self._join_non_unique( other, how=how, return_indexers=return_indexers ) elif not self.is_unique or not other.is_unique: if self.is_monotonic and other.is_monotonic: return self._join_monotonic( other, how=how, return_indexers=return_indexers ) else: return self._join_non_unique( other, how=how, return_indexers=return_indexers ) elif self.is_monotonic and other.is_monotonic: try: return self._join_monotonic( other, how=how, return_indexers=return_indexers ) except TypeError: pass if how == "left": join_index = self elif how == "right": join_index = other elif how == "inner": # TODO: sort=False here for backwards compat. It may # be better to use the sort parameter passed into join join_index = self.intersection(other, sort=False) elif how == "outer": # TODO: sort=True here for backwards compat. 
It may # be better to use the sort parameter passed into join join_index = self.union(other) if sort: join_index = join_index.sort_values() if return_indexers: if join_index is self: lindexer = None else: lindexer = self.get_indexer(join_index) if join_index is other: rindexer = None else: rindexer = other.get_indexer(join_index) return join_index, lindexer, rindexer else: return join_index @final def _join_multi(self, other, how, return_indexers=True): from pandas.core.indexes.multi import MultiIndex from pandas.core.reshape.merge import restore_dropped_levels_multijoin # figure out join names self_names_list = list(com.not_none(*self.names)) other_names_list = list(com.not_none(*other.names)) self_names_order = self_names_list.index other_names_order = other_names_list.index self_names = set(self_names_list) other_names = set(other_names_list) overlap = self_names & other_names # need at least 1 in common if not overlap: raise ValueError("cannot join with no overlapping index names") if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): # Drop the non-matching levels from left and right respectively ldrop_names = sorted(self_names - overlap, key=self_names_order) rdrop_names = sorted(other_names - overlap, key=other_names_order) # if only the order differs if not len(ldrop_names + rdrop_names): self_jnlevels = self other_jnlevels = other.reorder_levels(self.names) else: self_jnlevels = self.droplevel(ldrop_names) other_jnlevels = other.droplevel(rdrop_names) # Join left and right # Join on same leveled multi-index frames is supported join_idx, lidx, ridx = self_jnlevels.join( other_jnlevels, how, return_indexers=True ) # Restore the dropped levels # Returned index level order is # common levels, ldrop_names, rdrop_names dropped_names = ldrop_names + rdrop_names levels, codes, names = restore_dropped_levels_multijoin( self, other, dropped_names, join_idx, lidx, ridx ) # Re-create the multi-index multi_join_idx = MultiIndex( levels=levels, codes=codes, names=names, verify_integrity=False ) multi_join_idx = multi_join_idx.remove_unused_levels() if return_indexers: return multi_join_idx, lidx, ridx else: return multi_join_idx jl = list(overlap)[0] # Case where only one index is multi # make the indices into mi's that match flip_order = False if isinstance(self, MultiIndex): self, other = other, self flip_order = True # flip if join method is right or left how = {"right": "left", "left": "right"}.get(how, how) level = other.names.index(jl) result = self._join_level( other, level, how=how, return_indexers=return_indexers ) if flip_order: if isinstance(result, tuple): return result[0], result[2], result[1] return result @final def _join_non_unique(self, other, how="left", return_indexers=False): from pandas.core.reshape.merge import get_join_indexers # We only get here if dtypes match assert self.dtype == other.dtype lvalues = self._get_engine_target() rvalues = other._get_engine_target() left_idx, right_idx = get_join_indexers( [lvalues], [rvalues], how=how, sort=True ) left_idx = ensure_platform_int(left_idx) right_idx = ensure_platform_int(right_idx) join_index = np.asarray(lvalues.take(left_idx)) mask = left_idx == -1 np.putmask(join_index, mask, rvalues.take(right_idx)) join_index = self._wrap_joined_index(join_index, other) if return_indexers: return join_index, left_idx, right_idx else: return join_index @final def _join_level( self, other, level, how="left", return_indexers=False, keep_order=True ): """ The join method *only* affects the level of the resulting MultiIndex. 
Otherwise it just exactly aligns the Index data to the labels of the level in the MultiIndex. If ```keep_order == True```, the order of the data indexed by the MultiIndex will not be changed; otherwise, it will tie out with `other`. """ from pandas.core.indexes.multi import MultiIndex def _get_leaf_sorter(labels): """ Returns sorter for the inner most level while preserving the order of higher levels. """ if labels[0].size == 0: return np.empty(0, dtype="int64") if len(labels) == 1: lab = ensure_int64(labels[0]) sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) return sorter # find indexers of beginning of each set of # same-key labels w.r.t all but last level tic = labels[0][:-1] != labels[0][1:] for lab in labels[1:-1]: tic |= lab[:-1] != lab[1:] starts = np.hstack(([True], tic, [True])).nonzero()[0] lab = ensure_int64(labels[-1]) return lib.get_level_sorter(lab, ensure_int64(starts)) if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): raise TypeError("Join on level between two MultiIndex objects is ambiguous") left, right = self, other flip_order = not isinstance(self, MultiIndex) if flip_order: left, right = right, left how = {"right": "left", "left": "right"}.get(how, how) assert isinstance(left, MultiIndex) level = left._get_level_number(level) old_level = left.levels[level] if not right.is_unique: raise NotImplementedError( "Index._join_level on non-unique index is not implemented" ) new_level, left_lev_indexer, right_lev_indexer = old_level.join( right, how=how, return_indexers=True ) if left_lev_indexer is None: if keep_order or len(left) == 0: left_indexer = None join_index = left else: # sort the leaves left_indexer = _get_leaf_sorter(left.codes[: level + 1]) join_index = left[left_indexer] else: left_lev_indexer = ensure_int64(left_lev_indexer) rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) old_codes = left.codes[level] new_lev_codes = algos.take_nd( rev_indexer, old_codes[old_codes != -1], allow_fill=False ) new_codes = list(left.codes) new_codes[level] = new_lev_codes new_levels = list(left.levels) new_levels[level] = new_level if keep_order: # just drop missing values. o.w. keep order left_indexer = np.arange(len(left), dtype=np.intp) mask = new_lev_codes != -1 if not mask.all(): new_codes = [lab[mask] for lab in new_codes] left_indexer = left_indexer[mask] else: # tie out the order with other if level == 0: # outer most level, take the fast route ngroups = 1 + new_lev_codes.max() left_indexer, counts = libalgos.groupsort_indexer( new_lev_codes, ngroups ) # missing values are placed first; drop them! left_indexer = left_indexer[counts[0] :] new_codes = [lab[left_indexer] for lab in new_codes] else: # sort the leaves mask = new_lev_codes != -1 mask_all = mask.all() if not mask_all: new_codes = [lab[mask] for lab in new_codes] left_indexer = _get_leaf_sorter(new_codes[: level + 1]) new_codes = [lab[left_indexer] for lab in new_codes] # left_indexers are w.r.t masked frame. # reverse to original frame! 
if not mask_all: left_indexer = mask.nonzero()[0][left_indexer] join_index = MultiIndex( levels=new_levels, codes=new_codes, names=left.names, verify_integrity=False, ) if right_lev_indexer is not None: right_indexer = algos.take_nd( right_lev_indexer, join_index.codes[level], allow_fill=False ) else: right_indexer = join_index.codes[level] if flip_order: left_indexer, right_indexer = right_indexer, left_indexer if return_indexers: left_indexer = ( None if left_indexer is None else ensure_platform_int(left_indexer) ) right_indexer = ( None if right_indexer is None else ensure_platform_int(right_indexer) ) return join_index, left_indexer, right_indexer else: return join_index @final def _join_monotonic(self, other, how="left", return_indexers=False): # We only get here with matching dtypes assert other.dtype == self.dtype if self.equals(other): ret_index = other if how == "right" else self if return_indexers: return ret_index, None, None else: return ret_index sv = self._get_engine_target() ov = other._get_engine_target() if self.is_unique and other.is_unique: # We can perform much better than the general case if how == "left": join_index = self lidx = None ridx = self._left_indexer_unique(sv, ov) elif how == "right": join_index = other lidx = self._left_indexer_unique(ov, sv) ridx = None elif how == "inner": join_index, lidx, ridx = self._inner_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) elif how == "outer": join_index, lidx, ridx = self._outer_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) else: if how == "left": join_index, lidx, ridx = self._left_indexer(sv, ov) elif how == "right": join_index, ridx, lidx = self._left_indexer(ov, sv) elif how == "inner": join_index, lidx, ridx = self._inner_indexer(sv, ov) elif how == "outer": join_index, lidx, ridx = self._outer_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) if return_indexers: lidx = None if lidx is None else ensure_platform_int(lidx) ridx = None if ridx is None else ensure_platform_int(ridx) return join_index, lidx, ridx else: return join_index def _wrap_joined_index( self: _IndexT, joined: np.ndarray, other: _IndexT ) -> _IndexT: assert other.dtype == self.dtype if isinstance(self, ABCMultiIndex): name = self.names if self.names == other.names else None else: name = get_op_result_name(self, other) return self._constructor(joined, name=name) # -------------------------------------------------------------------- # Uncategorized Methods @property def values(self) -> np.ndarray: """ Return an array representing the data in the Index. .. warning:: We recommend using :attr:`Index.array` or :meth:`Index.to_numpy`, depending on whether you need a reference to the underlying data or a NumPy array. Returns ------- array: numpy.ndarray or ExtensionArray See Also -------- Index.array : Reference to the underlying data. Index.to_numpy : A NumPy array representing the underlying data. """ return self._data.view(np.ndarray) @cache_readonly @doc(IndexOpsMixin.array) def array(self) -> ExtensionArray: array = self._data if isinstance(array, np.ndarray): from pandas.core.arrays.numpy_ import PandasArray array = PandasArray(array) return array @property def _values(self) -> Union[ExtensionArray, np.ndarray]: """ The best array representation. This is an ndarray or ExtensionArray. ``_values`` are consistent between ``Series`` and ``Index``. It may differ from the public '.values' method. 
index             | values          | _values
----------------- | --------------- | -------------
Index             | ndarray         | ndarray
CategoricalIndex  | Categorical     | Categorical
DatetimeIndex     | ndarray[M8ns]   | DatetimeArray
DatetimeIndex[tz] | ndarray[M8ns]   | DatetimeArray
PeriodIndex       | ndarray[object] | PeriodArray
IntervalIndex     | IntervalArray   | IntervalArray
See Also -------- values : Values """ return self._data def _get_engine_target(self) -> np.ndarray: """ Get the ndarray that we can pass to the IndexEngine constructor. """ return self._values @doc(IndexOpsMixin.memory_usage) def memory_usage(self, deep: bool = False) -> int: result = super().memory_usage(deep=deep) # include our engine hashtable result += self._engine.sizeof(deep=deep) return result def where(self, cond, other=None): """ Replace values where the condition is False. The replacement is taken from other. Parameters ---------- cond : bool array-like with the same length as self Condition to select the values on. other : scalar, or array-like, default None Replacement if the condition is False. Returns ------- pandas.Index A copy of self with values replaced from other where the condition is False. See Also -------- Series.where : Same method for Series. DataFrame.where : Same method for DataFrame. Examples -------- >>> idx = pd.Index(['car', 'bike', 'train', 'tractor']) >>> idx Index(['car', 'bike', 'train', 'tractor'], dtype='object') >>> idx.where(idx.isin(['car', 'train']), 'other') Index(['car', 'other', 'train', 'other'], dtype='object') """ if other is None: other = self._na_value values = self.values try: self._validate_fill_value(other) except (ValueError, TypeError): return self.astype(object).where(cond, other) values = np.where(cond, values, other) return Index(values, name=self.name) # construction helpers @final @classmethod def _scalar_data_error(cls, data): # We return the TypeError so that we can raise it from the constructor # in order to keep mypy happy return TypeError( f"{cls.__name__}(...) must be called with a collection of some " f"kind, {repr(data)} was passed" ) @final @classmethod def _string_data_error(cls, data): raise TypeError( "String dtype not supported, you may need " "to explicitly cast to a numeric type" ) @final def _coerce_scalar_to_index(self, item): """ Coerce a scalar item to a value compatible with our index type. Parameters ---------- item : scalar item to coerce """ dtype = self.dtype if self._is_numeric_dtype and isna(item): # We can't coerce to the numeric dtype of "self" (unless # it's float) if there are NaN values in our output. dtype = None return Index([item], dtype=dtype, **self._get_attributes_dict()) def _validate_fill_value(self, value): """ Check if the value can be inserted into our array, and convert it to an appropriate native type if necessary. """ return value @final def _require_scalar(self, value): """ Check that this is a scalar value that we can use for setitem-like operations without changing dtype. """ if not is_scalar(value): raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") return value @property def _has_complex_internals(self) -> bool: """ Indicates if an index is not directly backed by a numpy array. """ # used to avoid libreduction code paths, which raise or require conversion return False def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ return self.is_object() def is_type_compatible(self, kind: str_t) -> bool: """ Whether the index type is compatible with the provided type.
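Returns
-------
bool
    Whether ``kind`` matches this index's ``inferred_type``.

Examples
--------
A short illustrative doctest; the inferred type of an integer index is
``'integer'``:

>>> idx = pd.Index([1, 2, 3])
>>> idx.is_type_compatible('integer')
True
>>> idx.is_type_compatible('floating')
False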
""" return kind == self.inferred_type def __contains__(self, key: Any) -> bool: """ Return a boolean indicating whether the provided key is in the index. Parameters ---------- key : label The key to check if it is present in the index. Returns ------- bool Whether the key search is in the index. Raises ------ TypeError If the key is not hashable. See Also -------- Index.isin : Returns an ndarray of boolean dtype indicating whether the list-like key is in the index. Examples -------- >>> idx = pd.Index([1, 2, 3, 4]) >>> idx Int64Index([1, 2, 3, 4], dtype='int64') >>> 2 in idx True >>> 6 in idx False """ hash(key) try: return key in self._engine except (OverflowError, TypeError, ValueError): return False @final def __hash__(self): raise TypeError(f"unhashable type: {repr(type(self).__name__)}") @final def __setitem__(self, key, value): raise TypeError("Index does not support mutable operations") def __getitem__(self, key): """ Override numpy.ndarray's __getitem__ method to work as desired. This function adds lists and Series as valid boolean indexers (ndarrays only supports ndarray with dtype=bool). If resulting ndim != 1, plain ndarray is returned instead of corresponding `Index` subclass. """ # There's no custom logic to be implemented in __getslice__, so it's # not overloaded intentionally. getitem = self._data.__getitem__ promote = self._shallow_copy if is_scalar(key): key = com.cast_scalar_indexer(key, warn_float=True) return getitem(key) if isinstance(key, slice): # This case is separated from the conditional above to avoid # pessimization of basic indexing. return promote(getitem(key)) if com.is_bool_indexer(key): key = np.asarray(key, dtype=bool) result = getitem(key) if not is_scalar(result): if np.ndim(result) > 1: deprecate_ndim_indexing(result) return result return promote(result) else: return result @final def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ Faster check for ``name in self`` when we know `name` is a Python identifier (e.g. in NDFrame.__getattr__, which hits this to support . key lookup). For indexes that can't hold identifiers (everything but object & categorical) we just return False. https://github.com/pandas-dev/pandas/issues/19764 """ if self.is_object() or self.is_categorical(): return name in self return False def append(self, other): """ Append a collection of Index options together. Parameters ---------- other : Index or list/tuple of indices Returns ------- appended : Index """ to_concat = [self] if isinstance(other, (list, tuple)): to_concat = to_concat + list(other) else: to_concat.append(other) for obj in to_concat: if not isinstance(obj, Index): raise TypeError("all inputs must be Index") names = {obj.name for obj in to_concat} name = None if len(names) > 1 else self.name return self._concat(to_concat, name) def _concat(self, to_concat: List["Index"], name: Label) -> "Index": """ Concatenate multiple Index objects. """ to_concat_vals = [x._values for x in to_concat] result = concat_compat(to_concat_vals) return Index(result, name=name) def putmask(self, mask, value): """ Return a new Index of the values set with the mask. Returns ------- Index See Also -------- numpy.ndarray.putmask : Changes elements of an array based on conditional and input values. 
""" values = self._values.copy() try: converted = self._validate_fill_value(value) except (ValueError, TypeError) as err: if is_object_dtype(self): raise err # coerces to object return self.astype(object).putmask(mask, value) np.putmask(values, mask, converted) return self._shallow_copy(values) def equals(self, other: object) -> bool: """ Determine if two Index object are equal. The things that are being compared are: * The elements inside the Index object. * The order of the elements inside the Index object. Parameters ---------- other : Any The other object to compare against. Returns ------- bool True if "other" is an Index and it has the same elements and order as the calling index; False otherwise. Examples -------- >>> idx1 = pd.Index([1, 2, 3]) >>> idx1 Int64Index([1, 2, 3], dtype='int64') >>> idx1.equals(pd.Index([1, 2, 3])) True The elements inside are compared >>> idx2 = pd.Index(["1", "2", "3"]) >>> idx2 Index(['1', '2', '3'], dtype='object') >>> idx1.equals(idx2) False The order is compared >>> ascending_idx = pd.Index([1, 2, 3]) >>> ascending_idx Int64Index([1, 2, 3], dtype='int64') >>> descending_idx = pd.Index([3, 2, 1]) >>> descending_idx Int64Index([3, 2, 1], dtype='int64') >>> ascending_idx.equals(descending_idx) False The dtype is *not* compared >>> int64_idx = pd.Int64Index([1, 2, 3]) >>> int64_idx Int64Index([1, 2, 3], dtype='int64') >>> uint64_idx = pd.UInt64Index([1, 2, 3]) >>> uint64_idx UInt64Index([1, 2, 3], dtype='uint64') >>> int64_idx.equals(uint64_idx) True """ if self.is_(other): return True if not isinstance(other, Index): return False if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): # if other is not object, use other's logic for coercion return other.equals(self) if isinstance(other, ABCMultiIndex): # d-level MultiIndex can equal d-tuple Index return other.equals(self) if is_extension_array_dtype(other.dtype): # All EA-backed Index subclasses override equals return other.equals(self) return array_equivalent(self._values, other._values) @final def identical(self, other) -> bool: """ Similar to equals, but checks that object attributes and types are also equal. Returns ------- bool If two Index objects have equal elements and same type True, otherwise False. """ return ( self.equals(other) and all( getattr(self, c, None) == getattr(other, c, None) for c in self._comparables ) and type(self) == type(other) ) @final def asof(self, label): """ Return the label from the index, or, if not present, the previous one. Assuming that the index is sorted, return the passed index label if it is in the index, or return the previous index label if the passed one is not in the index. Parameters ---------- label : object The label up to which the method returns the latest index label. Returns ------- object The passed label if it is in the index. The previous label if the passed label is not in the sorted index or `NaN` if there is no such label. See Also -------- Series.asof : Return the latest value in a Series up to the passed index. merge_asof : Perform an asof merge (similar to left join but it matches on nearest key rather than equal key). Index.get_loc : An `asof` is a thin wrapper around `get_loc` with method='pad'. Examples -------- `Index.asof` returns the latest index label up to the passed label. >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) >>> idx.asof('2014-01-01') '2013-12-31' If the label is in the index, the method returns the passed label. 
>>> idx.asof('2014-01-02') '2014-01-02' If all of the labels in the index are later than the passed label, NaN is returned. >>> idx.asof('1999-01-02') nan If the index is not sorted, an error is raised. >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', ... '2014-01-03']) >>> idx_not_sorted.asof('2013-12-31') Traceback (most recent call last): ValueError: index must be monotonic increasing or decreasing """ try: loc = self.get_loc(label, method="pad") except KeyError: return self._na_value else: if isinstance(loc, slice): loc = loc.indices(len(self))[-1] return self[loc] def asof_locs(self, where: "Index", mask) -> np.ndarray: """ Return the locations (indices) of labels in the index. As in the `asof` function, if the label (a particular entry in `where`) is not in the index, the latest index label up to the passed label is chosen and its index returned. If all of the labels in the index are later than a label in `where`, -1 is returned. `mask` is used to ignore NA values in the index during calculation. Parameters ---------- where : Index An Index consisting of an array of timestamps. mask : array-like Array of booleans denoting where values in the original data are not NA. Returns ------- numpy.ndarray An array of locations (indices) of the labels from the Index which correspond to the return values of the `asof` function for every element in `where`. """ locs = self._values[mask].searchsorted(where._values, side="right") locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) # TODO: overload return type of ExtensionArray.__getitem__ first_value = cast(Any, self._values[mask.argmax()]) result[(locs == 0) & (where._values < first_value)] = -1 return result @final def sort_values( self, return_indexer: bool = False, ascending: bool = True, na_position: str_t = "last", key: Optional[Callable] = None, ): """ Return a sorted copy of the index. Return a sorted copy of the index, and optionally return the indices that sorted the index itself. Parameters ---------- return_indexer : bool, default False Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. na_position : {'first' or 'last'}, default 'last' Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. .. versionadded:: 1.2.0 key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect an ``Index`` and return an ``Index`` of the same shape. .. versionadded:: 1.1.0 Returns ------- sorted_index : pandas.Index Sorted copy of the index. indexer : numpy.ndarray, optional The indices that the index itself was sorted by. See Also -------- Series.sort_values : Sort values of a Series. DataFrame.sort_values : Sort values in a DataFrame. Examples -------- >>> idx = pd.Index([10, 100, 1, 1000]) >>> idx Int64Index([10, 100, 1, 1000], dtype='int64') Sort values in ascending order (default behavior). >>> idx.sort_values() Int64Index([1, 10, 100, 1000], dtype='int64') Sort values in descending order, and also get the indices `idx` was sorted by. >>> idx.sort_values(ascending=False, return_indexer=True) (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ idx = ensure_key_mapped(self, key) # GH 35584. 
Sort missing values according to na_position kwarg # ignore na_position for MultiIndex if not isinstance(self, ABCMultiIndex): _as = nargsort( items=idx, ascending=ascending, na_position=na_position, key=key ) else: _as = idx.argsort() if not ascending: _as = _as[::-1] sorted_index = self.take(_as) if return_indexer: return sorted_index, _as else: return sorted_index @final def sort(self, *args, **kwargs): """ Use sort_values instead. """ raise TypeError("cannot sort an Index object in-place, use sort_values instead") def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. This method is for shifting the values of datetime-like indexes by a specified time increment a given number of times. Parameters ---------- periods : int, default 1 Number of periods (or increments) to shift by, can be positive or negative. freq : pandas.DateOffset, pandas.Timedelta or str, optional Frequency increment to shift by. If None, the index is shifted by its own `freq` attribute. Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. Returns ------- pandas.Index Shifted index. See Also -------- Series.shift : Shift values of Series. Notes ----- This method is only implemented for datetime-like index classes, i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. Examples -------- Put the first 5 month starts of 2011 into an index. >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') >>> month_starts DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', '2011-05-01'], dtype='datetime64[ns]', freq='MS') Shift the index by 10 days. >>> month_starts.shift(10, freq='D') DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', '2011-05-11'], dtype='datetime64[ns]', freq=None) The default value of `freq` is the `freq` attribute of the index, which is 'MS' (month start) in this example. >>> month_starts.shift(10) DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', '2012-03-01'], dtype='datetime64[ns]', freq='MS') """ raise NotImplementedError( f"This method is only implemented for DatetimeIndex, PeriodIndex and " f"TimedeltaIndex; got type {type(self).__name__}" ) def argsort(self, *args, **kwargs) -> np.ndarray: """ Return the integer indices that would sort the index. Parameters ---------- *args Passed to `numpy.ndarray.argsort`. **kwargs Passed to `numpy.ndarray.argsort`. Returns ------- numpy.ndarray Integer indices that would sort the index if used as an indexer. See Also -------- numpy.argsort : Similar method for NumPy arrays. Index.sort_values : Return sorted copy of Index. Examples -------- >>> idx = pd.Index(['b', 'a', 'd', 'c']) >>> idx Index(['b', 'a', 'd', 'c'], dtype='object') >>> order = idx.argsort() >>> order array([1, 0, 3, 2]) >>> idx[order] Index(['a', 'b', 'c', 'd'], dtype='object') """ if needs_i8_conversion(self.dtype): # TODO: these do not match the underlying EA argsort methods GH#37863 return self.asi8.argsort(*args, **kwargs) # This works for either ndarray or EA, is overridden # by RangeIndex, MultiIndex return self._data.argsort(*args, **kwargs) @final def get_value(self, series: "Series", key): """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing. Returns ------- scalar or Series """ warnings.warn( "get_value is deprecated and will be removed in a future version. 
" "Use Series[key] instead", FutureWarning, stacklevel=2, ) self._check_indexing_error(key) try: # GH 20882, 21257 # First try to convert the key to a location # If that fails, raise a KeyError if an integer # index, otherwise, see if key is an integer, and # try that loc = self.get_loc(key) except KeyError: if not self._should_fallback_to_positional(): raise elif is_integer(key): # If the Index cannot hold integer, then this is unambiguously # a locational lookup. loc = key else: raise return self._get_values_for_loc(series, loc, key) def _check_indexing_error(self, key): if not is_scalar(key): # if key is not a scalar, directly raise an error (the code below # would convert to numpy arrays and raise later any way) - GH29926 raise InvalidIndexError(key) def _should_fallback_to_positional(self) -> bool: """ Should an integer key be treated as positional? """ if self.holds_integer() or self.is_boolean(): return False return True def _get_values_for_loc(self, series: "Series", loc, key): """ Do a positional lookup on the given Series, returning either a scalar or a Series. Assumes that `series.index is self` key is included for MultiIndex compat. """ if is_integer(loc): return series._values[loc] return series.iloc[loc] @final def set_value(self, arr, key, value): """ Fast lookup of value from 1-dimensional ndarray. .. deprecated:: 1.0 Notes ----- Only use this if you know what you're doing. """ warnings.warn( ( "The 'set_value' method is deprecated, and " "will be removed in a future version." ), FutureWarning, stacklevel=2, ) loc = self._engine.get_loc(key) validate_numeric_casting(arr.dtype, value) arr[loc] = value _index_shared_docs[ "get_indexer_non_unique" ] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the current data to the new index. Parameters ---------- target : %(target_klass)s Returns ------- indexer : ndarray of int Integers from 0 to n - 1 indicating that the index at these positions matches the corresponding target values. Missing values in the target are marked by -1. missing : ndarray of int An indexer into the target of the values not found. These correspond to the -1 in the indexer array. """ @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ensure_index(target) if target.is_boolean() and self.is_numeric(): # Treat boolean labels passed to a numeric index as not found. Without # this fix False and True would be treated as 0 and 1 respectively. # (GH #16877) return self._get_indexer_non_comparable(target, method=None, unique=False) pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) if not self._should_compare(target): return self._get_indexer_non_comparable(target, method=None, unique=False) if not is_dtype_equal(self.dtype, target.dtype): # TODO: if object, could use infer_dtype to pre-empt costly # conversion if still non-comparable? dtype = find_common_type([self.dtype, target.dtype]) if ( dtype.kind in ["i", "u"] and is_categorical_dtype(target.dtype) and target.hasnans ): # FIXME: find_common_type incorrect with Categorical GH#38240 # FIXME: some cases where float64 cast can be lossy? 
dtype = np.dtype(np.float64) this = self.astype(dtype, copy=False) that = target.astype(dtype, copy=False) return this.get_indexer_non_unique(that) if is_categorical_dtype(target.dtype): tgt_values = np.asarray(target) else: tgt_values = target._get_engine_target() indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), missing @final def get_indexer_for(self, target, **kwargs): """ Guaranteed return of an indexer even when non-unique. This dispatches to get_indexer or get_indexer_non_unique as appropriate. Returns ------- numpy.ndarray List of indices. """ if self._index_as_unique: return self.get_indexer(target, **kwargs) indexer, _ = self.get_indexer_non_unique(target) return indexer def _get_indexer_non_comparable(self, target: "Index", method, unique: bool = True): """ Called from get_indexer or get_indexer_non_unique when the target is of a non-comparable dtype. For get_indexer lookups with method=None, get_indexer is an _equality_ check, so non-comparable dtypes mean we will always have no matches. For get_indexer lookups with a method, get_indexer is an _inequality_ check, so non-comparable dtypes mean we will always raise TypeError. Parameters ---------- target : Index method : str or None unique : bool, default True * True if called from get_indexer. * False if called from get_indexer_non_unique. Raises ------ TypeError If doing an inequality check, i.e. method is not None. """ if method is not None: other = unpack_nested_dtype(target) raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") no_matches = -1 * np.ones(target.shape, dtype=np.intp) if unique: # This is for get_indexer return no_matches else: # This is for get_indexer_non_unique missing = np.arange(len(target), dtype=np.intp) return no_matches, missing @property def _index_as_unique(self): """ Whether we should treat this as unique for the sake of get_indexer vs get_indexer_non_unique. For IntervalIndex compat. """ return self.is_unique @final def _maybe_promote(self, other: "Index"): """ When dealing with an object-dtype Index and a non-object Index, see if we can upcast the object-dtype one to improve performance. """ if self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex): try: return type(other)(self), other except OutOfBoundsDatetime: return self, other elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex): # TODO: we dont have tests that get here return type(other)(self), other elif self.inferred_type == "boolean": if not is_object_dtype(self.dtype): return self.astype("object"), other.astype("object") if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses other, self = other._maybe_promote(self) return self, other def _should_compare(self, other: "Index") -> bool: """ Check if `self == other` can ever have non-False entries. """ other = unpack_nested_dtype(other) dtype = other.dtype return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? """ return True @final def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: """ Group the index labels by a given array of values. Parameters ---------- values : array Values used to determine the groups. 
Returns ------- dict {group name -> group labels} """ # TODO: if we are a MultiIndex, we can do better # than converting to tuples if isinstance(values, ABCMultiIndex): values = values._values values = Categorical(values) result = values._reverse_indexer() # map to the label result = {k: self.take(v) for k, v in result.items()} return PrettyDict(result) def map(self, mapper, na_action=None): """ Map values using input correspondence (a dict, Series, or function). Parameters ---------- mapper : function, dict, or Series Mapping correspondence. na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping correspondence. Returns ------- applied : Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element, a MultiIndex will be returned. """ from pandas.core.indexes.multi import MultiIndex new_values = super()._map_values(mapper, na_action=na_action) attributes = self._get_attributes_dict() # we can return a MultiIndex if new_values.size and isinstance(new_values[0], tuple): if isinstance(self, MultiIndex): names = self.names elif attributes.get("name"): names = [attributes.get("name")] * len(new_values[0]) else: names = None return MultiIndex.from_tuples(new_values, names=names) attributes["copy"] = False if not new_values.size: # empty attributes["dtype"] = self.dtype return Index(new_values, **attributes) # TODO: De-duplicate with map, xref GH#32349 @final def _transform_index(self, func, level=None) -> "Index": """ Apply function to all values found in index. This includes transforming multiindex entries separately. Only apply function to one level of the MultiIndex if level is specified. """ if isinstance(self, ABCMultiIndex): if level is not None: items = [ tuple(func(y) if i == level else y for i, y in enumerate(x)) for x in self ] else: items = [tuple(func(y) for y in x) for x in self] return type(self).from_tuples(items, names=self.names) else: items = [func(x) for x in self] return Index(items, name=self.name, tupleize_cols=False) def isin(self, values, level=None): """ Return a boolean array where the index values are in `values`. Compute boolean array of whether each index value is found in the passed set of values. The length of the returned boolean array matches the length of the index. Parameters ---------- values : set or list-like Sought values. level : str or int, optional Name or position of the index level to use (if the index is a `MultiIndex`). Returns ------- is_contained : ndarray NumPy array of boolean values. See Also -------- Series.isin : Same for Series. DataFrame.isin : Same method for DataFrames. Notes ----- In the case of `MultiIndex` you must either specify `values` as a list-like object containing tuples that are the same length as the number of levels, or specify `level`. Otherwise it will raise a ``ValueError``. If `level` is specified: - if it is the name of one *and only one* index level, use that level; - otherwise it should be a number indicating level position. Examples -------- >>> idx = pd.Index([1,2,3]) >>> idx Int64Index([1, 2, 3], dtype='int64') Check whether each index value is in a list of values. >>> idx.isin([1, 4]) array([ True, False, False]) >>> midx = pd.MultiIndex.from_arrays([[1,2,3], ... ['red', 'blue', 'green']], ...
names=('number', 'color')) >>> midx MultiIndex([(1, 'red'), (2, 'blue'), (3, 'green')], names=['number', 'color']) Check whether the strings in the 'color' level of the MultiIndex are in a list of colors. >>> midx.isin(['red', 'orange', 'yellow'], level='color') array([ True, False, False]) To check across the levels of a MultiIndex, pass a list of tuples: >>> midx.isin([(1, 'red'), (3, 'red')]) array([ True, False, False]) For a DatetimeIndex, string values in `values` are converted to Timestamps. >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] >>> dti = pd.to_datetime(dates) >>> dti DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], dtype='datetime64[ns]', freq=None) >>> dti.isin(['2000-03-11']) array([ True, False, False]) """ if level is not None: self._validate_index_level(level) return algos.isin(self._values, values) def _get_string_slice(self, key: str_t): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex raise NotImplementedError def slice_indexer( self, start: Optional[Label] = None, end: Optional[Label] = None, step: Optional[int] = None, kind: Optional[str_t] = None, ) -> slice: """ Compute the slice indexer for input labels and step. Index needs to be ordered and unique. Parameters ---------- start : label, default None If None, defaults to the beginning. end : label, default None If None, defaults to the end. step : int, default None kind : str, default None Returns ------- indexer : slice Raises ------ KeyError : If key does not exist, or key is not unique and index is not ordered. Notes ----- This function assumes that the data is sorted, so use at your own peril. Examples -------- This is a method on all index types. For example you can do: >>> idx = pd.Index(list('abcd')) >>> idx.slice_indexer(start='b', end='c') slice(1, 3, None) >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')]) >>> idx.slice_indexer(start='b', end=('c', 'g')) slice(1, 3, None) """ start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind) # return a slice if not is_scalar(start_slice): raise AssertionError("Start slice bound is non-scalar") if not is_scalar(end_slice): raise AssertionError("End slice bound is non-scalar") return slice(start_slice, end_slice, step) def _maybe_cast_indexer(self, key): """ If we have a float key and are not a floating index, then try to cast to an int if equivalent. """ if not self.is_floating(): return com.cast_scalar_indexer(key) return key @final def _validate_indexer(self, form: str_t, key, kind: str_t): """ If we are a positional indexer, validate that we have an appropriately typed bound; it must be an integer or None. """ assert kind in ["getitem", "iloc"] if key is None: pass elif is_integer(key): pass else: raise self._invalid_indexer(form, key) def _maybe_cast_slice_bound(self, label, side: str_t, kind): """ This function should be overloaded in subclasses that allow non-trivial casting on label-slice bounds, e.g. datetime-like indices allowing strings containing formatted datetimes. Parameters ---------- label : object side : {'left', 'right'} kind : {'loc', 'getitem'} or None Returns ------- label : object Notes ----- Value of `side` parameter should be validated in caller. """ assert kind in ["loc", "getitem", None] # We are a plain index here (sub-class override this method if they # wish to have special treatment for floats/ints, e.g.
Float64Index and # datetimelike Indexes # reject them, if index does not contain label if (is_float(label) or is_integer(label)) and label not in self.values: raise self._invalid_indexer("slice", label) return label def _searchsorted_monotonic(self, label, side="left"): if self.is_monotonic_increasing: return self.searchsorted(label, side=side) elif self.is_monotonic_decreasing: # np.searchsorted expects ascending sort order, have to reverse # everything for it to work (element ordering, search side and # resulting value). pos = self[::-1].searchsorted( label, side="right" if side == "left" else "left" ) return len(self) - pos raise ValueError("index must be monotonic increasing or decreasing") def get_slice_bound(self, label, side: str_t, kind) -> int: """ Calculate slice bound that corresponds to given label. Returns leftmost (one-past-the-rightmost if ``side=='right'``) position of given label. Parameters ---------- label : object side : {'left', 'right'} kind : {'loc', 'getitem'} or None Returns ------- int Index of label. """ assert kind in ["loc", "getitem", None] if side not in ("left", "right"): raise ValueError( "Invalid value for side kwarg, must be either " f"'left' or 'right': {side}" ) original_label = label # For datetime indices label may be a string that has to be converted # to datetime boundary according to its resolution. label = self._maybe_cast_slice_bound(label, side, kind) # we need to look up the label try: slc = self.get_loc(label) except KeyError as err: try: return self._searchsorted_monotonic(label, side) except ValueError: # raise the original KeyError raise err if isinstance(slc, np.ndarray): # get_loc may return a boolean array or an array of indices, which # is OK as long as they are representable by a slice. if is_bool_dtype(slc): slc = lib.maybe_booleans_to_slice(slc.view("u1")) else: slc = lib.maybe_indices_to_slice( slc.astype(np.intp, copy=False), len(self) ) if isinstance(slc, np.ndarray): raise KeyError( f"Cannot get {side} slice bound for non-unique " f"label: {repr(original_label)}" ) if isinstance(slc, slice): if side == "left": return slc.start else: return slc.stop else: if side == "right": return slc + 1 else: return slc def slice_locs(self, start=None, end=None, step=None, kind=None): """ Compute slice locations for input labels. Parameters ---------- start : label, default None If None, defaults to the beginning. end : label, default None If None, defaults to the end. step : int, defaults None If None, defaults to 1. kind : {'loc', 'getitem'} or None Returns ------- start, end : int See Also -------- Index.get_loc : Get location for a single label. Notes ----- This method only works if the index is monotonic or unique. Examples -------- >>> idx = pd.Index(list('abcd')) >>> idx.slice_locs(start='b', end='c') (1, 3) """ inc = step is None or step >= 0 if not inc: # If it's a reverse slice, temporarily swap bounds. 
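# A hedged doctest-style sketch of this negative-step path (labels assumed;
# bounds are computed on the swapped labels, then shifted back below):
#   >>> pd.Index(list('abcd')).slice_locs(start='c', end='b', step=-1)
#   (2, 0)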
start, end = end, start # GH 16785: If start and end happen to be date strings with UTC offsets # attempt to parse and check that the offsets are the same if isinstance(start, (str, datetime)) and isinstance(end, (str, datetime)): try: ts_start = Timestamp(start) ts_end = Timestamp(end) except (ValueError, TypeError): pass else: if not tz_compare(ts_start.tzinfo, ts_end.tzinfo): raise ValueError("Both dates must have the same UTC offset") start_slice = None if start is not None: start_slice = self.get_slice_bound(start, "left", kind) if start_slice is None: start_slice = 0 end_slice = None if end is not None: end_slice = self.get_slice_bound(end, "right", kind) if end_slice is None: end_slice = len(self) if not inc: # Bounds at this moment are swapped, swap them back and shift by 1. # # slice_locs('B', 'A', step=-1): s='B', e='A' # # s='A' e='B' # AFTER SWAP: | | # v ------------------> V # ----------------------------------- # | | |A|A|A|A| | | | | |B|B| | | | | # ----------------------------------- # ^ <------------------ ^ # SHOULD BE: | | # end=s-1 start=e-1 # end_slice, start_slice = start_slice - 1, end_slice - 1 # i == -1 triggers ``len(self) + i`` selection that points to the # last element, not before-the-first one, subtracting len(self) # compensates that. if end_slice == -1: end_slice -= len(self) if start_slice == -1: start_slice -= len(self) return start_slice, end_slice def delete(self, loc): """ Make new Index with passed location(-s) deleted. Parameters ---------- loc : int or list of int Location of item(-s) which will be deleted. Use a list of locations to delete more than one value at the same time. Returns ------- Index New Index with passed location(-s) deleted. See Also -------- numpy.delete : Delete any rows and column from NumPy array (ndarray). Examples -------- >>> idx = pd.Index(['a', 'b', 'c']) >>> idx.delete(1) Index(['a', 'c'], dtype='object') >>> idx = pd.Index(['a', 'b', 'c']) >>> idx.delete([0, 2]) Index(['b'], dtype='object') """ return self._shallow_copy(np.delete(self._data, loc)) def insert(self, loc: int, item): """ Make new Index inserting new item at location. Follows Python list.append semantics for negative values. Parameters ---------- loc : int item : object Returns ------- new_index : Index """ # Note: this method is overridden by all ExtensionIndex subclasses, # so self is never backed by an EA. arr = np.asarray(self) item = self._coerce_scalar_to_index(item)._values idx = np.concatenate((arr[:loc], item, arr[loc:])) return Index(idx, name=self.name) def drop(self, labels, errors: str_t = "raise"): """ Make new Index with passed list of labels deleted. Parameters ---------- labels : array-like errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and existing labels are dropped. Returns ------- dropped : Index Raises ------ KeyError If not all of the labels are found in the selected axis """ arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) indexer = self.get_indexer_for(labels) mask = indexer == -1 if mask.any(): if errors != "ignore": raise KeyError(f"{labels[mask]} not found in axis") indexer = indexer[~mask] return self.delete(indexer) # -------------------------------------------------------------------- # Generated Arithmetic, Comparison, and Unary Methods def _cmp_method(self, other, op): """ Wrapper used to dispatch comparison operations. 
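Array-like operands must match ``len(self)`` or a ``ValueError`` is raised; comparison results come back as boolean arrays rather than new Index objects.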
""" if self.is_(other): # fastpath if op in {operator.eq, operator.le, operator.ge}: arr = np.ones(len(self), dtype=bool) if self._can_hold_na and not isinstance(self, ABCMultiIndex): # TODO: should set MultiIndex._can_hold_na = False? arr[self.isna()] = False return arr elif op in {operator.ne, operator.lt, operator.gt}: return np.zeros(len(self), dtype=bool) if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): if len(self) != len(other): raise ValueError("Lengths must match to compare") if not isinstance(other, ABCMultiIndex): other = extract_array(other, extract_numpy=True) else: other = np.asarray(other) if is_object_dtype(self.dtype) and isinstance(other, ExtensionArray): # e.g. PeriodArray, Categorical with np.errstate(all="ignore"): result = op(self._values, other) elif is_object_dtype(self.dtype) and not isinstance(self, ABCMultiIndex): # don't pass MultiIndex with np.errstate(all="ignore"): result = ops.comp_method_OBJECT_ARRAY(op, self._values, other) elif is_interval_dtype(self.dtype): with np.errstate(all="ignore"): result = op(self._values, np.asarray(other)) else: with np.errstate(all="ignore"): result = ops.comparison_op(self._values, other, op) return result def _arith_method(self, other, op): """ Wrapper used to dispatch arithmetic operations. """ from pandas import Series result = op(Series(self), other) if isinstance(result, tuple): return (Index(result[0]), Index(result[1])) return Index(result) def _unary_method(self, op): result = op(self._values) return Index(result, name=self.name) def __abs__(self): return self._unary_method(operator.abs) def __neg__(self): return self._unary_method(operator.neg) def __pos__(self): return self._unary_method(operator.pos) def __inv__(self): # TODO: why not operator.inv? # TODO: __inv__ vs __invert__? return self._unary_method(lambda x: -x) def any(self, *args, **kwargs): """ Return whether any element is Truthy. Parameters ---------- *args These parameters will be passed to numpy.any. **kwargs These parameters will be passed to numpy.any. Returns ------- any : bool or array_like (if axis is specified) A single element array_like may be converted to bool. See Also -------- Index.all : Return whether all elements are True. Series.all : Return whether all elements are True. Notes ----- Not a Number (NaN), positive infinity and negative infinity evaluate to True because these are not equal to zero. Examples -------- >>> index = pd.Index([0, 1, 2]) >>> index.any() True >>> index = pd.Index([0, 0, 0]) >>> index.any() False """ # FIXME: docstr inaccurate, args/kwargs not passed self._maybe_disable_logical_methods("any") return np.any(self.values) def all(self): """ Return whether all elements are Truthy. Parameters ---------- *args These parameters will be passed to numpy.all. **kwargs These parameters will be passed to numpy.all. Returns ------- all : bool or array_like (if axis is specified) A single element array_like may be converted to bool. See Also -------- Index.any : Return whether any element in an Index is True. Series.any : Return whether any element in a Series is True. Series.all : Return whether all elements in a Series are True. Notes ----- Not a Number (NaN), positive infinity and negative infinity evaluate to True because these are not equal to zero. Examples -------- **all** True, because nonzero integers are considered True. >>> pd.Index([1, 2, 3]).all() True False, because ``0`` is considered False. >>> pd.Index([0, 1, 2]).all() False **any** True, because ``1`` is considered True. 
>>> pd.Index([0, 0, 1]).any() True False, because ``0`` is considered False. >>> pd.Index([0, 0, 0]).any() False """ # FIXME: docstr inaccurate, args/kwargs not passed self._maybe_disable_logical_methods("all") return np.all(self.values) @final def _maybe_disable_logical_methods(self, opname: str_t): """ Raise if this Index subclass does not support any or all. """ if ( isinstance(self, ABCMultiIndex) or needs_i8_conversion(self.dtype) or is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype) or is_float_dtype(self.dtype) ): # This call will raise a TypeError. make_invalid_op(opname)(self) @property def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. """ # not using "(len(self), )" to return "correct" shape if the values # consist of a >1 D array (see GH-27775) # overridden in MultiIndex.shape to avoid materializing the values return self._values.shape def ensure_index_from_sequences(sequences, names=None): """ Construct an index from sequences of data. A single sequence returns an Index. Many sequences return a MultiIndex. Parameters ---------- sequences : sequence of sequences names : sequence of str Returns ------- index : Index or MultiIndex Examples -------- >>> ensure_index_from_sequences([[1, 2, 3]], names=["name"]) Int64Index([1, 2, 3], dtype='int64', name='name') >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"]) MultiIndex([('a', 'a'), ('a', 'b')], names=['L1', 'L2']) See Also -------- ensure_index """ from pandas.core.indexes.multi import MultiIndex if len(sequences) == 1: if names is not None: names = names[0] return Index(sequences[0], name=names) else: return MultiIndex.from_arrays(sequences, names=names) def ensure_index( index_like: Union[AnyArrayLike, Sequence], copy: bool = False ) -> Index: """ Ensure that we have an index from some index-like object.
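If `index_like` is already an Index, it is returned unchanged (copied only when ``copy=True``); a list of equal-length lists is promoted to a MultiIndex, as the examples below show.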
Parameters ---------- index_like : sequence An Index or other sequence copy : bool, default False Returns ------- index : Index or MultiIndex See Also -------- ensure_index_from_sequences Examples -------- >>> ensure_index(['a', 'b']) Index(['a', 'b'], dtype='object') >>> ensure_index([('a', 'a'), ('b', 'c')]) Index([('a', 'a'), ('b', 'c')], dtype='object') >>> ensure_index([['a', 'a'], ['b', 'c']]) MultiIndex([('a', 'b'), ('a', 'c')], ) """ if isinstance(index_like, Index): if copy: index_like = index_like.copy() return index_like if hasattr(index_like, "name"): # https://github.com/python/mypy/issues/1424 # error: Item "ExtensionArray" of "Union[ExtensionArray, # Sequence[Any]]" has no attribute "name" [union-attr] # error: Item "Sequence[Any]" of "Union[ExtensionArray, Sequence[Any]]" # has no attribute "name" [union-attr] # error: "Sequence[Any]" has no attribute "name" [attr-defined] # error: Item "Sequence[Any]" of "Union[Series, Sequence[Any]]" has no # attribute "name" [union-attr] # error: Item "Sequence[Any]" of "Union[Any, Sequence[Any]]" has no # attribute "name" [union-attr] name = index_like.name # type: ignore[union-attr, attr-defined] return Index(index_like, name=name, copy=copy) if is_iterator(index_like): index_like = list(index_like) # must check for exactly list here because of strict type # check in clean_index_list if isinstance(index_like, list): if type(index_like) != list: index_like = list(index_like) converted, all_arrays = lib.clean_index_list(index_like) if len(converted) > 0 and all_arrays: from pandas.core.indexes.multi import MultiIndex return MultiIndex.from_arrays(converted) else: if isinstance(converted, np.ndarray) and converted.dtype == np.int64: # Check for overflows if we should actually be uint64 # xref GH#35481 alt = np.asarray(index_like) if alt.dtype == np.uint64: converted = alt index_like = converted else: # clean_index_list does the equivalent of copying # so only need to do this if not list instance if copy: index_like = copy_func(index_like) return Index(index_like) def ensure_has_len(seq): """ If seq is an iterator, put its values into a list. """ try: len(seq) except TypeError: return list(seq) else: return seq def trim_front(strings: List[str]) -> List[str]: """ Trims leading spaces that are shared by all strings. """ trimmed = strings while len(trimmed) > 0 and all(x and x[0] == " " for x in trimmed): trimmed = [x[1:] for x in trimmed] return trimmed def _validate_join_method(method: str): if method not in ["left", "right", "inner", "outer"]: raise ValueError(f"do not recognize join method {method}") def default_index(n: int) -> "RangeIndex": from pandas.core.indexes.range import RangeIndex return RangeIndex(0, n, name=None) def maybe_extract_name(name, obj, cls) -> Label: """ If no name is passed, then extract it from data, validating hashability. """ if name is None and isinstance(obj, (Index, ABCSeries)): # Note we don't just check for "name" attribute since that would # pick up e.g. dtype.name name = obj.name # GH#29069 if not is_hashable(name): raise TypeError(f"{cls.__name__}.name must be a hashable type") return name def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: """ If a dtype is passed, cast to the closest matching dtype that is supported by Index. Parameters ---------- data : np.ndarray dtype : np.dtype copy : bool Returns ------- np.ndarray """ # we need to avoid having numpy coerce # things that look like ints/floats to ints unless # they are actually ints, e.g.
'0' and 0.0 # should not be coerced # GH 11836 if is_integer_dtype(dtype): inferred = lib.infer_dtype(data, skipna=False) if inferred == "integer": data = maybe_cast_to_integer_array(data, dtype, copy=copy) elif inferred in ["floating", "mixed-integer-float"]: if isna(data).any(): raise ValueError("cannot convert float NaN to integer") if inferred == "mixed-integer-float": data = maybe_cast_to_integer_array(data, dtype) # If we are actually all equal to integers, # then coerce to integer. try: data = _try_convert_to_int_array(data, copy, dtype) except ValueError: data = np.array(data, dtype=np.float64, copy=copy) elif inferred == "string": pass else: data = data.astype(dtype) elif is_float_dtype(dtype): inferred = lib.infer_dtype(data, skipna=False) if inferred == "string": pass else: data = data.astype(dtype) else: data = np.array(data, dtype=dtype, copy=copy) return data def _maybe_cast_data_without_dtype(subarr): """ If we have an arraylike input but no passed dtype, try to infer a supported dtype. Parameters ---------- subarr : np.ndarray, Index, or Series Returns ------- converted : np.ndarray or ExtensionArray dtype : np.dtype or ExtensionDtype """ # Runtime import needed bc IntervalArray imports Index from pandas.core.arrays import ( DatetimeArray, IntervalArray, PeriodArray, TimedeltaArray, ) inferred = lib.infer_dtype(subarr, skipna=False) if inferred == "integer": try: data = _try_convert_to_int_array(subarr, False, None) return data, data.dtype except ValueError: pass return subarr, object elif inferred in ["floating", "mixed-integer-float", "integer-na"]: # TODO: Returns IntegerArray for integer-na case in the future return subarr, np.float64 elif inferred == "interval": try: data = IntervalArray._from_sequence(subarr, copy=False) return data, data.dtype except ValueError: # GH27172: mixed closed Intervals --> object dtype pass elif inferred == "boolean": # don't support boolean explicitly ATM pass elif inferred != "string": if inferred.startswith("datetime"): try: data = DatetimeArray._from_sequence(subarr, copy=False) return data, data.dtype except (ValueError, OutOfBoundsDatetime): # GH 27011 # If we have mixed timezones, just send it # down the base constructor pass elif inferred.startswith("timedelta"): data = TimedeltaArray._from_sequence(subarr, copy=False) return data, data.dtype elif inferred == "period": try: data = PeriodArray._from_sequence(subarr) return data, data.dtype except IncompatibleFrequency: pass return subarr, subarr.dtype def _try_convert_to_int_array( data: np.ndarray, copy: bool, dtype: np.dtype ) -> np.ndarray: """ Attempt to convert an array of data into an integer array. Parameters ---------- data : The data to convert. copy : bool Whether to copy the data or not. dtype : np.dtype Returns ------- int_array : data converted to either an ndarray[int64] or ndarray[uint64] Raises ------ ValueError if the conversion was not successful. """ if not is_unsigned_integer_dtype(dtype): # skip int64 conversion attempt if uint-like dtype is passed, as # this could return Int64Index when UInt64Index is what's desired try: res = data.astype("i8", copy=False) if (res == data).all(): return res # TODO: might still need to copy except (OverflowError, TypeError, ValueError): pass # Conversion to int64 failed (possibly due to overflow) or was skipped, # so let's try now with uint64. 
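# A hedged illustration (values assumed): an object array holding 2 ** 63
# overflows the signed "i8" cast above but round-trips through "u8" below,
# which is how a UInt64Index rather than an Int64Index ends up being built.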
try: res = data.astype("u8", copy=False) if (res == data).all(): return res # TODO: might still need to copy except (OverflowError, TypeError, ValueError): pass raise ValueError def _maybe_asobject(dtype, klass, data, copy: bool, name: Label, **kwargs): """ If an object dtype was specified, create the non-object Index and then convert it to object. Parameters ---------- dtype : np.dtype, ExtensionDtype, str klass : Index subclass data : list-like copy : bool name : hashable **kwargs Returns ------- Index Notes ----- We assume that calling .astype(object) on this klass will make a copy. """ # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, # will raise in the where `data` is already tz-aware. So # we leave it out of this step and cast to object-dtype after # the DatetimeIndex construction. if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy index = klass(data, copy=False, name=name, **kwargs) return index.astype(object) return klass(data, dtype=dtype, copy=copy, name=name, **kwargs) def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]: """ Return common name if all indices agree, otherwise None (level-by-level). Parameters ---------- indexes : list of Index objects Returns ------- list A list representing the unanimous 'names' found. """ name_tups = [tuple(i.names) for i in indexes] name_sets = [{*ns} for ns in zip_longest(*name_tups)] names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets) return names def unpack_nested_dtype(other: Index) -> Index: """ When checking if our dtype is comparable with another, we need to unpack CategoricalDtype to look at its categories.dtype. Parameters ---------- other : Index Returns ------- Index """ dtype = other.dtype if is_categorical_dtype(dtype): # If there is ever a SparseIndex, this could get dispatched # here too. return dtype.categories return other
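# A hedged doctest-style sketch (labels assumed) of the unwrapping above:
#   >>> unpack_nested_dtype(pd.CategoricalIndex([1, 2, 3])).dtype
#   dtype('int64')
# comparability checks thus consult the categories' dtype rather than the
# CategoricalDtype wrapper itself.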