""" Base and utility classes for pandas objects. """ from __future__ import annotations import textwrap from typing import ( TYPE_CHECKING, Any, Generic, Hashable, Iterator, Literal, TypeVar, cast, final, overload, ) import numpy as np from pandas._config import using_copy_on_write from pandas._libs import lib from pandas._typing import ( Axis, AxisInt, DtypeObj, IndexLabel, NDFrameT, Shape, npt, ) from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import ( cache_readonly, doc, ) from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( is_categorical_dtype, is_dict_like, is_extension_array_dtype, is_object_dtype, is_scalar, ) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndex, ABCSeries, ) from pandas.core.dtypes.missing import ( isna, remove_na_arraylike, ) from pandas.core import ( algorithms, nanops, ops, ) from pandas.core.accessor import DirNamesMixin from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, ) if TYPE_CHECKING: from pandas._typing import ( DropKeep, NumpySorter, NumpyValueArrayLike, ScalarLike_co, ) from pandas import ( Categorical, Index, Series, ) _shared_docs: dict[str, str] = {} _indexops_doc_kwargs = { "klass": "IndexOpsMixin", "inplace": "", "unique": "IndexOpsMixin", "duplicated": "IndexOpsMixin", } _T = TypeVar("_T", bound="IndexOpsMixin") class PandasObject(DirNamesMixin): """ Baseclass for various pandas objects. """ # results from calls to methods decorated with cache_readonly get added to _cache _cache: dict[str, Any] @property def _constructor(self): """ Class constructor (for this class it's just `__class__`. """ return type(self) def __repr__(self) -> str: """ Return a string representation for a particular object. """ # Should be overwritten by base classes return object.__repr__(self) def _reset_cache(self, key: str | None = None) -> None: """ Reset cached properties. If ``key`` is passed, only clears that key. """ if not hasattr(self, "_cache"): return if key is None: self._cache.clear() else: self._cache.pop(key, None) def __sizeof__(self) -> int: """ Generates the total memory usage for an object that returns either a value or Series of values """ memory_usage = getattr(self, "memory_usage", None) if memory_usage: mem = memory_usage(deep=True) # pylint: disable=not-callable return int(mem if is_scalar(mem) else mem.sum()) # no memory_usage attribute, so fall back to object's 'sizeof' return super().__sizeof__() class NoNewAttributesMixin: """ Mixin which prevents adding new attributes. Prevents additional attributes via xxx.attribute = "something" after a call to `self.__freeze()`. Mainly used to prevent the user from using wrong attributes on an accessor (`Series.cat/.str/.dt`). If you really want to add a new attribute at a later time, you need to use `object.__setattr__(self, key, value)`. """ def _freeze(self) -> None: """ Prevents setting additional attributes. """ object.__setattr__(self, "__frozen", True) # prevent adding any attribute via s.xxx.new_attribute = ... def __setattr__(self, key: str, value) -> None: # _cache is used by a decorator # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key) # because # 1.) getattr is false for attributes that raise errors # 2.) cls.__dict__ doesn't traverse into base classes if getattr(self, "__frozen", False) and not ( key == "_cache" or key in type(self).__dict__ or getattr(self, key, None) is not None ): raise AttributeError(f"You cannot add any new attribute '{key}'") object.__setattr__(self, key, value) class SelectionMixin(Generic[NDFrameT]): """ mixin implementing the selection & aggregation interface on a group-like object sub-classes need to define: obj, exclusions """ obj: NDFrameT _selection: IndexLabel | None = None exclusions: frozenset[Hashable] _internal_names = ["_cache", "__setstate__"] _internal_names_set = set(_internal_names) @final @property def _selection_list(self): if not isinstance( self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray) ): return [self._selection] return self._selection @cache_readonly def _selected_obj(self): if self._selection is None or isinstance(self.obj, ABCSeries): return self.obj else: return self.obj[self._selection] @final @cache_readonly def ndim(self) -> int: return self._selected_obj.ndim @final @cache_readonly def _obj_with_exclusions(self): if isinstance(self.obj, ABCSeries): return self.obj if self._selection is not None: return self.obj._getitem_nocopy(self._selection_list) if len(self.exclusions) > 0: # equivalent to `self.obj.drop(self.exclusions, axis=1) # but this avoids consolidating and making a copy # TODO: following GH#45287 can we now use .drop directly without # making a copy? return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True) else: return self.obj def __getitem__(self, key): if self._selection is not None: raise IndexError(f"Column(s) {self._selection} already selected") if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(set(key)): bad_keys = list(set(key).difference(self.obj.columns)) raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) else: if key not in self.obj: raise KeyError(f"Column not found: {key}") ndim = self.obj[key].ndim return self._gotitem(key, ndim=ndim) def _gotitem(self, key, ndim: int, subset=None): """ sub-classes to define return a sliced object Parameters ---------- key : str / list of selections ndim : {1, 2} requested ndim of result subset : object, default None subset to act on """ raise AbstractMethodError(self) def aggregate(self, func, *args, **kwargs): raise AbstractMethodError(self) agg = aggregate class IndexOpsMixin(OpsMixin): """ Common ops mixin to support a unified interface / docs for Series / Index """ # ndarray compatibility __array_priority__ = 1000 _hidden_attrs: frozenset[str] = frozenset( ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ ) @property def dtype(self) -> DtypeObj: # must be defined here as a property for mypy raise AbstractMethodError(self) @property def _values(self) -> ExtensionArray | np.ndarray: # must be defined here as a property for mypy raise AbstractMethodError(self) @final def transpose(self: _T, *args, **kwargs) -> _T: """ Return the transpose, which is by definition self. Returns ------- %(klass)s """ nv.validate_transpose(args, kwargs) return self T = property( transpose, doc=""" Return the transpose, which is by definition self. """, ) @property def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. Examples -------- >>> s = pd.Series([1, 2, 3]) >>> s.shape (3,) """ return self._values.shape def __len__(self) -> int: # We need this defined here for mypy raise AbstractMethodError(self) @property def ndim(self) -> Literal[1]: """ Number of dimensions of the underlying data, by definition 1. """ return 1 @final def item(self): """ Return the first element of the underlying data as a Python scalar. Returns ------- scalar The first element of %(klass)s. Raises ------ ValueError If the data is not length-1. """ if len(self) == 1: return next(iter(self)) raise ValueError("can only convert an array of size 1 to a Python scalar") @property def nbytes(self) -> int: """ Return the number of bytes in the underlying data. """ return self._values.nbytes @property def size(self) -> int: """ Return the number of elements in the underlying data. """ return len(self._values) @property def array(self) -> ExtensionArray: """ The ExtensionArray of the data backing this Series or Index. Returns ------- ExtensionArray An ExtensionArray of the values stored within. For extension types, this is the actual array. For NumPy native types, this is a thin (no copy) wrapper around :class:`numpy.ndarray`. ``.array`` differs ``.values`` which may require converting the data to a different form. See Also -------- Index.to_numpy : Similar method that always returns a NumPy array. Series.to_numpy : Similar method that always returns a NumPy array. Notes ----- This table lays out the different array types for each extension dtype within pandas. ================== ============================= dtype array type ================== ============================= category Categorical period PeriodArray interval IntervalArray IntegerNA IntegerArray string StringArray boolean BooleanArray datetime64[ns, tz] DatetimeArray ================== ============================= For any 3rd-party extension types, the array type will be an ExtensionArray. For all remaining dtypes ``.array`` will be a :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray stored within. If you absolutely need a NumPy array (possibly with copying / coercing data), then use :meth:`Series.to_numpy` instead. Examples -------- For regular NumPy types like int, and float, a PandasArray is returned. >>> pd.Series([1, 2, 3]).array [1, 2, 3] Length: 3, dtype: int64 For extension types, like Categorical, the actual ExtensionArray is returned >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.array ['a', 'b', 'a'] Categories (2, object): ['a', 'b'] """ raise AbstractMethodError(self) @final def to_numpy( self, dtype: npt.DTypeLike | None = None, copy: bool = False, na_value: object = lib.no_default, **kwargs, ) -> np.ndarray: """ A NumPy ndarray representing the values in this Series or Index. Parameters ---------- dtype : str or numpy.dtype, optional The dtype to pass to :meth:`numpy.asarray`. copy : bool, default False Whether to ensure that the returned value is not a view on another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. na_value : Any, optional The value to use for missing values. The default value depends on `dtype` and the type of the array. **kwargs Additional keywords passed through to the ``to_numpy`` method of the underlying array (for extension arrays). Returns ------- numpy.ndarray See Also -------- Series.array : Get the actual data stored within. Index.array : Get the actual data stored within. DataFrame.to_numpy : Similar method for DataFrame. Notes ----- The returned array will be the same up to equality (values equal in `self` will be equal in the returned array; likewise for values that are not equal). When `self` contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, ``to_numpy()`` will return a NumPy array and the categorical dtype will be lost. For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index (assuming ``copy=False``). Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). For extension types, ``to_numpy()`` *may* require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, :attr:`Series.array` should be used instead. This table lays out the different dtypes and default return types of ``to_numpy()`` for various dtypes within pandas. ================== ================================ dtype array type ================== ================================ category[T] ndarray[T] (same dtype as input) period ndarray[object] (Periods) interval ndarray[object] (Intervals) IntegerNA ndarray[object] datetime64[ns] datetime64[ns] datetime64[ns, tz] ndarray[object] (Timestamps) ================== ================================ Examples -------- >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) Specify the `dtype` to control how datetime-aware data is represented. Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` objects, each with the correct ``tz``. >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'), Timestamp('2000-01-02 00:00:00+0100', tz='CET')], dtype=object) Or ``dtype='datetime64[ns]'`` to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. >>> ser.to_numpy(dtype="datetime64[ns]") ... # doctest: +ELLIPSIS array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ if is_extension_array_dtype(self.dtype): return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) elif kwargs: bad_keys = list(kwargs.keys())[0] raise TypeError( f"to_numpy() got an unexpected keyword argument '{bad_keys}'" ) if na_value is not lib.no_default: values = self._values if not can_hold_element(values, na_value): # if we can't hold the na_value asarray either makes a copy or we # error before modifying values. The asarray later on thus won't make # another copy values = np.asarray(values, dtype=dtype) else: values = values.copy() values[np.asanyarray(self.isna())] = na_value else: values = self._values result = np.asarray(values, dtype=dtype) if (copy and na_value is lib.no_default) or ( not copy and using_copy_on_write() ): if np.shares_memory(self._values[:2], result[:2]): # Take slices to improve performance of check if using_copy_on_write() and not copy: result = result.view() result.flags.writeable = False else: result = result.copy() return result @final @property def empty(self) -> bool: return not self.size def max(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs): """ Return the maximum value of the Index. Parameters ---------- axis : int, optional For compatibility with NumPy. Only 0 or None are allowed. skipna : bool, default True Exclude NA/null values when showing the result. *args, **kwargs Additional arguments and keywords for compatibility with NumPy. Returns ------- scalar Maximum value. See Also -------- Index.min : Return the minimum value in an Index. Series.max : Return the maximum value in a Series. DataFrame.max : Return the maximum values in a DataFrame. Examples -------- >>> idx = pd.Index([3, 2, 1]) >>> idx.max() 3 >>> idx = pd.Index(['c', 'b', 'a']) >>> idx.max() 'c' For a MultiIndex, the maximum is determined lexicographically. >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) >>> idx.max() ('b', 2) """ nv.validate_minmax_axis(axis) nv.validate_max(args, kwargs) return nanops.nanmax(self._values, skipna=skipna) @doc(op="max", oppose="min", value="largest") def argmax( self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs ) -> int: """ Return int position of the {value} value in the Series. If the {op}imum is achieved in multiple locations, the first row position is returned. Parameters ---------- axis : {{None}} Unused. Parameter needed for compatibility with DataFrame. skipna : bool, default True Exclude NA/null values when showing the result. *args, **kwargs Additional arguments and keywords for compatibility with NumPy. Returns ------- int Row position of the {op}imum value. See Also -------- Series.arg{op} : Return position of the {op}imum value. Series.arg{oppose} : Return position of the {oppose}imum value. numpy.ndarray.arg{op} : Equivalent method for numpy arrays. Series.idxmax : Return index label of the maximum values. Series.idxmin : Return index label of the minimum values. Examples -------- Consider dataset containing cereal calories >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0, ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}}) >>> s Corn Flakes 100.0 Almond Delight 110.0 Cinnamon Toast Crunch 120.0 Cocoa Puff 110.0 dtype: float64 >>> s.argmax() 2 >>> s.argmin() 0 The maximum cereal calories is the third element and the minimum cereal calories is the first element, since series is zero-indexed. """ delegate = self._values nv.validate_minmax_axis(axis) skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) if isinstance(delegate, ExtensionArray): if not skipna and delegate.isna().any(): return -1 else: return delegate.argmax() else: # error: Incompatible return value type (got "Union[int, ndarray]", expected # "int") return nanops.nanargmax( # type: ignore[return-value] delegate, skipna=skipna ) def min(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs): """ Return the minimum value of the Index. Parameters ---------- axis : {None} Dummy argument for consistency with Series. skipna : bool, default True Exclude NA/null values when showing the result. *args, **kwargs Additional arguments and keywords for compatibility with NumPy. Returns ------- scalar Minimum value. See Also -------- Index.max : Return the maximum value of the object. Series.min : Return the minimum value in a Series. DataFrame.min : Return the minimum values in a DataFrame. Examples -------- >>> idx = pd.Index([3, 2, 1]) >>> idx.min() 1 >>> idx = pd.Index(['c', 'b', 'a']) >>> idx.min() 'a' For a MultiIndex, the minimum is determined lexicographically. >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) >>> idx.min() ('a', 1) """ nv.validate_minmax_axis(axis) nv.validate_min(args, kwargs) return nanops.nanmin(self._values, skipna=skipna) @doc(argmax, op="min", oppose="max", value="smallest") def argmin( self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs ) -> int: delegate = self._values nv.validate_minmax_axis(axis) skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) if isinstance(delegate, ExtensionArray): if not skipna and delegate.isna().any(): return -1 else: return delegate.argmin() else: # error: Incompatible return value type (got "Union[int, ndarray]", expected # "int") return nanops.nanargmin( # type: ignore[return-value] delegate, skipna=skipna ) def tolist(self): """ Return a list of the values. These are each a scalar type, which is a Python scalar (for str, int, float) or a pandas scalar (for Timestamp/Timedelta/Interval/Period) Returns ------- list See Also -------- numpy.ndarray.tolist : Return the array as an a.ndim-levels deep nested list of Python scalars. """ return self._values.tolist() to_list = tolist def __iter__(self) -> Iterator: """ Return an iterator of the values. These are each a scalar type, which is a Python scalar (for str, int, float) or a pandas scalar (for Timestamp/Timedelta/Interval/Period) Returns ------- iterator """ # We are explicitly making element iterators. if not isinstance(self._values, np.ndarray): # Check type instead of dtype to catch DTA/TDA return iter(self._values) else: return map(self._values.item, range(self._values.size)) @cache_readonly def hasnans(self) -> bool: """ Return True if there are any NaNs. Enables various performance speedups. Returns ------- bool """ # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]" # has no attribute "any" return bool(isna(self).any()) # type: ignore[union-attr] def isna(self) -> npt.NDArray[np.bool_]: return isna(self._values) def _reduce( self, op, name: str, *, axis: Axis = 0, skipna: bool = True, numeric_only=None, filter_type=None, **kwds, ): """ Perform the reduction type operation if we can. """ func = getattr(self, name, None) if func is None: raise TypeError( f"{type(self).__name__} cannot perform the operation {name}" ) return func(skipna=skipna, **kwds) @final def _map_values(self, mapper, na_action=None): """ An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- mapper : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function Returns ------- Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. """ # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types if is_dict_like(mapper): if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper mapper = lambda x: dict_with_default[ np.nan if isinstance(x, float) and np.isnan(x) else x ] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples # The return value of mapping with an empty mapper is # expected to be pd.Series(np.nan, ...). As np.nan is # of dtype float64 the return value of this method should # be float64 as well from pandas import Series if len(mapper) == 0: mapper = Series(mapper, dtype=np.float64) else: mapper = Series(mapper) if isinstance(mapper, ABCSeries): if na_action not in (None, "ignore"): msg = ( "na_action must either be 'ignore' or None, " f"{na_action} was passed" ) raise ValueError(msg) if na_action == "ignore": mapper = mapper[mapper.index.notna()] # Since values were input this means we came from either # a dict or a series and mapper should be an index if is_categorical_dtype(self.dtype): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values cat = cast("Categorical", self._values) return cat.map(mapper) values = self._values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_nd(mapper._values, indexer) return new_values # we must convert to python types if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"): # GH#23179 some EAs do not have `map` values = self._values if na_action is not None: raise NotImplementedError map_f = lambda values, f: values.map(f) else: values = self._values.astype(object) if na_action == "ignore": map_f = lambda values, f: lib.map_infer_mask( values, f, isna(values).view(np.uint8) ) elif na_action is None: map_f = lib.map_infer else: msg = ( "na_action must either be 'ignore' or None, " f"{na_action} was passed" ) raise ValueError(msg) # mapper is a function new_values = map_f(values, mapper) return new_values @final def value_counts( self, normalize: bool = False, sort: bool = True, ascending: bool = False, bins=None, dropna: bool = True, ) -> Series: """ Return a Series containing counts of unique values. The resulting object will be in descending order so that the first element is the most frequently-occurring element. Excludes NA values by default. Parameters ---------- normalize : bool, default False If True then the object returned will contain the relative frequencies of the unique values. sort : bool, default True Sort by frequencies. ascending : bool, default False Sort in ascending order. bins : int, optional Rather than count values, group them into half-open bins, a convenience for ``pd.cut``, only works with numeric data. dropna : bool, default True Don't include counts of NaN. Returns ------- Series See Also -------- Series.count: Number of non-NA elements in a Series. DataFrame.count: Number of non-NA elements in a DataFrame. DataFrame.value_counts: Equivalent method on DataFrames. Examples -------- >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 1.0 1 2.0 1 4.0 1 Name: count, dtype: int64 With `normalize` set to `True`, returns the relative frequency by dividing all values by the sum of values. >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(normalize=True) 3.0 0.4 1.0 0.2 2.0 0.2 4.0 0.2 Name: proportion, dtype: float64 **bins** Bins can be useful for going from a continuous variable to a categorical variable; instead of counting unique apparitions of values, divide the index in the specified number of half-open bins. >>> s.value_counts(bins=3) (0.996, 2.0] 2 (2.0, 3.0] 2 (3.0, 4.0] 1 Name: count, dtype: int64 **dropna** With `dropna` set to `False` we can also see NaN index values. >>> s.value_counts(dropna=False) 3.0 2 1.0 1 2.0 1 4.0 1 NaN 1 Name: count, dtype: int64 """ return algorithms.value_counts( self, sort=sort, ascending=ascending, normalize=normalize, bins=bins, dropna=dropna, ) def unique(self): values = self._values if not isinstance(values, np.ndarray): # i.e. ExtensionArray result = values.unique() else: result = algorithms.unique1d(values) return result @final def nunique(self, dropna: bool = True) -> int: """ Return number of unique elements in the object. Excludes NA values by default. Parameters ---------- dropna : bool, default True Don't include NaN in the count. Returns ------- int See Also -------- DataFrame.nunique: Method nunique for DataFrame. Series.count: Count non-NA/null observations in the Series. Examples -------- >>> s = pd.Series([1, 3, 5, 7, 7]) >>> s 0 1 1 3 2 5 3 7 4 7 dtype: int64 >>> s.nunique() 4 """ uniqs = self.unique() if dropna: uniqs = remove_na_arraylike(uniqs) return len(uniqs) @property def is_unique(self) -> bool: """ Return boolean if values in the object are unique. Returns ------- bool """ return self.nunique(dropna=False) == len(self) @property def is_monotonic_increasing(self) -> bool: """ Return boolean if values in the object are monotonically increasing. Returns ------- bool """ from pandas import Index return Index(self).is_monotonic_increasing @property def is_monotonic_decreasing(self) -> bool: """ Return boolean if values in the object are monotonically decreasing. Returns ------- bool """ from pandas import Index return Index(self).is_monotonic_decreasing @final def _memory_usage(self, deep: bool = False) -> int: """ Memory usage of the values. Parameters ---------- deep : bool, default False Introspect the data deeply, interrogate `object` dtypes for system-level memory consumption. Returns ------- bytes used See Also -------- numpy.ndarray.nbytes : Total bytes consumed by the elements of the array. Notes ----- Memory usage does not include memory consumed by elements that are not components of the array if deep=False or if used on PyPy """ if hasattr(self.array, "memory_usage"): return self.array.memory_usage( # pyright: ignore[reportGeneralTypeIssues] deep=deep, ) v = self.array.nbytes if deep and is_object_dtype(self) and not PYPY: values = cast(np.ndarray, self._values) v += lib.memory_usage_of_objects(values) return v @doc( algorithms.factorize, values="", order="", size_hint="", sort=textwrap.dedent( """\ sort : bool, default False Sort `uniques` and shuffle `codes` to maintain the relationship. """ ), ) def factorize( self, sort: bool = False, use_na_sentinel: bool = True, ) -> tuple[npt.NDArray[np.intp], Index]: codes, uniques = algorithms.factorize( self._values, sort=sort, use_na_sentinel=use_na_sentinel ) if uniques.dtype == np.float16: uniques = uniques.astype(np.float32) if isinstance(self, ABCIndex): # preserve e.g. MultiIndex uniques = self._constructor(uniques) else: from pandas import Index uniques = Index(uniques) return codes, uniques _shared_docs[ "searchsorted" ] = """ Find indices where elements should be inserted to maintain order. Find the indices into a sorted {klass} `self` such that, if the corresponding elements in `value` were inserted before the indices, the order of `self` would be preserved. .. note:: The {klass} *must* be monotonically sorted, otherwise wrong locations will likely be returned. Pandas does *not* check this for you. Parameters ---------- value : array-like or scalar Values to insert into `self`. side : {{'left', 'right'}}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). sorter : 1-D array-like, optional Optional array of integer indices that sort `self` into ascending order. They are typically the result of ``np.argsort``. Returns ------- int or array of int A scalar or array of insertion points with the same shape as `value`. See Also -------- sort_values : Sort by the values along either axis. numpy.searchsorted : Similar method from NumPy. Notes ----- Binary search is used to find the required insertion points. Examples -------- >>> ser = pd.Series([1, 2, 3]) >>> ser 0 1 1 2 2 3 dtype: int64 >>> ser.searchsorted(4) 3 >>> ser.searchsorted([0, 4]) array([0, 3]) >>> ser.searchsorted([1, 3], side='left') array([0, 2]) >>> ser.searchsorted([1, 3], side='right') array([1, 3]) >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'])) >>> ser 0 2000-03-11 1 2000-03-12 2 2000-03-13 dtype: datetime64[ns] >>> ser.searchsorted('3/14/2000') 3 >>> ser = pd.Categorical( ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True ... ) >>> ser ['apple', 'bread', 'bread', 'cheese', 'milk'] Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk'] >>> ser.searchsorted('bread') 1 >>> ser.searchsorted(['bread'], side='right') array([3]) If the values are not monotonically sorted, wrong locations may be returned: >>> ser = pd.Series([2, 1, 3]) >>> ser 0 2 1 1 2 3 dtype: int64 >>> ser.searchsorted(1) # doctest: +SKIP 0 # wrong result, correct would be 1 """ # This overload is needed so that the call to searchsorted in # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result @overload # The following ignore is also present in numpy/__init__.pyi # Possibly a mypy bug?? # error: Overloaded function signatures 1 and 2 overlap with incompatible # return types [misc] def searchsorted( # type: ignore[misc] self, value: ScalarLike_co, side: Literal["left", "right"] = ..., sorter: NumpySorter = ..., ) -> np.intp: ... @overload def searchsorted( self, value: npt.ArrayLike | ExtensionArray, side: Literal["left", "right"] = ..., sorter: NumpySorter = ..., ) -> npt.NDArray[np.intp]: ... @doc(_shared_docs["searchsorted"], klass="Index") def searchsorted( self, value: NumpyValueArrayLike | ExtensionArray, side: Literal["left", "right"] = "left", sorter: NumpySorter = None, ) -> npt.NDArray[np.intp] | np.intp: if isinstance(value, ABCDataFrame): msg = ( "Value must be 1-D array-like or scalar, " f"{type(value).__name__} is not supported" ) raise ValueError(msg) values = self._values if not isinstance(values, np.ndarray): # Going through EA.searchsorted directly improves performance GH#38083 return values.searchsorted(value, side=side, sorter=sorter) return algorithms.searchsorted( values, value, side=side, sorter=sorter, ) def drop_duplicates(self, *, keep: DropKeep = "first"): duplicated = self._duplicated(keep=keep) # error: Value of type "IndexOpsMixin" is not indexable return self[~duplicated] # type: ignore[index] @final def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: return algorithms.duplicated(self._values, keep=keep) def _arith_method(self, other, op): res_name = ops.get_op_result_name(self, other) lvalues = self._values rvalues = extract_array(other, extract_numpy=True, extract_range=True) rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape) rvalues = ensure_wrapped_if_datetimelike(rvalues) with np.errstate(all="ignore"): result = ops.arithmetic_op(lvalues, rvalues, op) return self._construct_result(result, name=res_name) def _construct_result(self, result, name): """ Construct an appropriately-wrapped result from the ArrayLike result of an arithmetic-like operation. """ raise AbstractMethodError(self)