cimport cython import numpy as np cimport numpy as cnp from numpy cimport ( int64_t, intp_t, ndarray, uint8_t, uint64_t, ) cnp.import_array() from pandas._libs cimport util from pandas._libs.hashtable cimport HashTable from pandas._libs.tslibs.nattype cimport c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, get_unit_from_dtype, ) from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.tslibs.timedeltas cimport _Timedelta from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs import ( algos, hashtable as _hash, ) from pandas._libs.lib cimport eq_NA_compat from pandas._libs.missing cimport ( C_NA, checknull, is_matching_na, ) # Defines shift of MultiIndex codes to avoid negative codes (missing values) multiindex_nulls_shift = 2 cdef bint is_definitely_invalid_key(object val): try: hash(val) except TypeError: return True return False cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): """ Return a ndarray[bool] of locations where val matches self.values. If val is not NA, this is equivalent to `self.values == val` """ # Caller is responsible for ensuring _check_type has already been called cdef: ndarray[uint8_t, ndim=1, cast=True] indexer Py_ssize_t i object item if values.descr.type_num == cnp.NPY_OBJECT: assert mask is None # no mask for object dtype # i.e. values.dtype == object if not checknull(val): indexer = eq_NA_compat(values, val) else: # We need to check for _matching_ NA values indexer = np.empty(len(values), dtype=np.uint8) for i in range(len(values)): item = values[i] indexer[i] = is_matching_na(item, val) else: if mask is not None: if val is C_NA: indexer = mask == 1 else: indexer = (values == val) & ~mask else: if util.is_nan(val): indexer = np.isnan(values) else: indexer = values == val return indexer.view(bool) # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val): """ Possibly unpack a boolean mask to a single indexer. """ # Returns ndarray[bool] or int cdef: ndarray[intp_t, ndim=1] found int count found = np.where(indexer)[0] count = len(found) if count > 1: return indexer if count == 1: return int(found[0]) raise KeyError(val) @cython.freelist(32) cdef class IndexEngine: cdef readonly: ndarray values ndarray mask HashTable mapping bint over_size_threshold cdef: bint unique, monotonic_inc, monotonic_dec bint need_monotonic_check, need_unique_check object _np_type def __init__(self, ndarray values): self.values = values self.mask = None self.over_size_threshold = len(values) >= _SIZE_CUTOFF self.clear_mapping() self._np_type = values.dtype.type def __contains__(self, val: object) -> bool: hash(val) try: self.get_loc(val) except KeyError: return False return True cpdef get_loc(self, object val): # -> Py_ssize_t | slice | ndarray[bool] cdef: Py_ssize_t loc if is_definitely_invalid_key(val): raise TypeError(f"'{val}' is an invalid key") val = self._check_type(val) if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: return self._get_loc_duplicates(val) values = self.values loc = self._searchsorted_left(val) if loc >= len(values): raise KeyError(val) if values[loc] != val: raise KeyError(val) return loc self._ensure_mapping_populated() if not self.unique: return self._get_loc_duplicates(val) if self.mask is not None and val is C_NA: return self.mapping.get_na() try: return self.mapping.get_item(val) except OverflowError as err: # GH#41775 OverflowError e.g. if we are uint64 and val is -1 # or if we are int64 and value is np.iinfo(np.int64).max+1 # (the uint64 with -1 case should actually be excluded by _check_type) raise KeyError(val) from err cdef Py_ssize_t _searchsorted_left(self, val) except? -1: """ See ObjectEngine._searchsorted_left.__doc__. """ # Caller is responsible for ensuring _check_type has already been called loc = self.values.searchsorted(self._np_type(val), side="left") return loc cdef _get_loc_duplicates(self, object val): # -> Py_ssize_t | slice | ndarray[bool] cdef: Py_ssize_t diff, left, right if self.is_monotonic_increasing: values = self.values try: left = values.searchsorted(val, side="left") right = values.searchsorted(val, side="right") except TypeError: # e.g. GH#29189 get_loc(None) with a Float64Index # 2021-09-29 Now only reached for object-dtype raise KeyError(val) diff = right - left if diff == 0: raise KeyError(val) elif diff == 1: return left else: return slice(left, right) return self._maybe_get_bool_indexer(val) cdef _maybe_get_bool_indexer(self, object val): # Returns ndarray[bool] or int cdef: ndarray[uint8_t, ndim=1, cast=True] indexer indexer = _get_bool_indexer(self.values, val, self.mask) return _unpack_bool_indexer(indexer, val) def sizeof(self, deep: bool = False) -> int: """ return the sizeof our mapping """ if not self.is_mapping_populated: return 0 return self.mapping.sizeof(deep=deep) def __sizeof__(self) -> int: return self.sizeof() @property def is_unique(self) -> bool: if self.need_unique_check: self._do_unique_check() return self.unique == 1 cdef _do_unique_check(self): self._ensure_mapping_populated() @property def is_monotonic_increasing(self) -> bool: if self.need_monotonic_check: self._do_monotonic_check() return self.monotonic_inc == 1 @property def is_monotonic_decreasing(self) -> bool: if self.need_monotonic_check: self._do_monotonic_check() return self.monotonic_dec == 1 cdef _do_monotonic_check(self): cdef: bint is_unique if self.mask is not None and np.any(self.mask): self.monotonic_inc = 0 self.monotonic_dec = 0 else: try: values = self.values self.monotonic_inc, self.monotonic_dec, is_unique = \ self._call_monotonic(values) except TypeError: self.monotonic_inc = 0 self.monotonic_dec = 0 is_unique = 0 self.need_monotonic_check = 0 # we can only be sure of uniqueness if is_unique=1 if is_unique: self.unique = 1 self.need_unique_check = 0 cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=False) cdef _make_hash_table(self, Py_ssize_t n): raise NotImplementedError # pragma: no cover cdef _check_type(self, object val): hash(val) return val @property def is_mapping_populated(self) -> bool: return self.mapping is not None cdef _ensure_mapping_populated(self): # this populates the mapping # if its not already populated # also satisfies the need_unique_check if not self.is_mapping_populated: values = self.values self.mapping = self._make_hash_table(len(values)) self.mapping.map_locations(values, self.mask) if len(self.mapping) == len(values): self.unique = 1 self.need_unique_check = 0 def clear_mapping(self): self.mapping = None self.need_monotonic_check = 1 self.need_unique_check = 1 self.unique = 0 self.monotonic_inc = 0 self.monotonic_dec = 0 def get_indexer(self, ndarray values) -> np.ndarray: self._ensure_mapping_populated() return self.mapping.lookup(values) def get_indexer_non_unique(self, ndarray targets): """ Return an indexer suitable for taking from a non unique index return the labels in the same order as the target and a missing indexer into the targets (which correspond to the -1 indices in the results Returns ------- indexer : np.ndarray[np.intp] missing : np.ndarray[np.intp] """ cdef: ndarray values ndarray[intp_t] result, missing set stargets, remaining_stargets, found_nas dict d = {} object val Py_ssize_t count = 0, count_missing = 0 Py_ssize_t i, j, n, n_t, n_alloc, start, end bint check_na_values = False values = self.values stargets = set(targets) n = len(values) n_t = len(targets) if n > 10_000: n_alloc = 10_000 else: n_alloc = n result = np.empty(n_alloc, dtype=np.intp) missing = np.empty(n_t, dtype=np.intp) # map each starget to its position in the index if ( stargets and len(stargets) < 5 and not any([checknull(t) for t in stargets]) and self.is_monotonic_increasing ): # if there are few enough stargets and the index is monotonically # increasing, then use binary search for each starget remaining_stargets = set() for starget in stargets: try: start = values.searchsorted(starget, side="left") end = values.searchsorted(starget, side="right") except TypeError: # e.g. if we tried to search for string in int array remaining_stargets.add(starget) else: if start != end: d[starget] = list(range(start, end)) stargets = remaining_stargets if stargets: # otherwise, map by iterating through all items in the index # short-circuit na check if values.dtype == object: check_na_values = True # keep track of nas in values found_nas = set() for i in range(n): val = values[i] # GH#43870 # handle lookup for nas # (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat) if check_na_values and checknull(val): match = [na for na in found_nas if is_matching_na(val, na)] # matching na not found if not len(match): found_nas.add(val) # add na to stargets to utilize `in` for stargets/d lookup match_stargets = [ x for x in stargets if is_matching_na(val, x) ] if len(match_stargets): # add our 'standardized' na stargets.add(val) # matching na found else: assert len(match) == 1 val = match[0] if val in stargets: if val not in d: d[val] = [] d[val].append(i) for i in range(n_t): val = targets[i] # ensure there are nas in values before looking for a matching na if check_na_values and checknull(val): match = [na for na in found_nas if is_matching_na(val, na)] if len(match): assert len(match) == 1 val = match[0] # found if val in d: key = val for j in d[key]: # realloc if needed if count >= n_alloc: n_alloc += 10_000 result = np.resize(result, n_alloc) result[count] = j count += 1 # value not found else: if count >= n_alloc: n_alloc += 10_000 result = np.resize(result, n_alloc) result[count] = -1 count += 1 missing[count_missing] = i count_missing += 1 return result[0:count], missing[0:count_missing] cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: # GH#1757 ndarray.searchsorted is not safe to use with array of tuples # (treats a tuple `val` as a sequence of keys instead of a single key), # so we implement something similar. # This is equivalent to the stdlib's bisect.bisect_left cdef: Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1 object pval if hi == 0 or (hi > 0 and val > values[hi]): return len(values) while lo < hi: mid = (lo + hi) // 2 pval = values[mid] if val < pval: hi = mid elif val > pval: lo = mid + 1 else: while mid > 0 and val == values[mid - 1]: mid -= 1 return mid if val <= values[mid]: return mid else: return mid + 1 cdef class ObjectEngine(IndexEngine): """ Index Engine for use with object-dtype Index, namely the base class Index. """ cdef _make_hash_table(self, Py_ssize_t n): return _hash.PyObjectHashTable(n) cdef Py_ssize_t _searchsorted_left(self, val) except? -1: # using values.searchsorted here would treat a tuple `val` as a sequence # instead of a single key, so we use a different implementation try: loc = _bin_search(self.values, val) except TypeError as err: raise KeyError(val) from err return loc cdef class DatetimeEngine(Int64Engine): cdef: NPY_DATETIMEUNIT _creso def __init__(self, ndarray values): super().__init__(values.view("i8")) self._creso = get_unit_from_dtype(values.dtype) cdef int64_t _unbox_scalar(self, scalar) except? -1: # NB: caller is responsible for ensuring tzawareness compat # before we get here if scalar is NaT: return NaT._value elif isinstance(scalar, _Timestamp): if scalar._creso == self._creso: return scalar._value else: # Note: caller is responsible for catching potential ValueError # from _as_creso return ( (<_Timestamp>scalar)._as_creso(self._creso, round_ok=False)._value ) raise TypeError(scalar) def __contains__(self, val: object) -> bool: # We assume before we get here: # - val is hashable try: self._unbox_scalar(val) except ValueError: return False try: self.get_loc(val) return True except KeyError: return False cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=True) cpdef get_loc(self, object val): # NB: the caller is responsible for ensuring that we are called # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine) cdef: Py_ssize_t loc if is_definitely_invalid_key(val): raise TypeError(f"'{val}' is an invalid key") try: conv = self._unbox_scalar(val) except (TypeError, ValueError) as err: raise KeyError(val) from err # Welcome to the spaghetti factory if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: return self._get_loc_duplicates(conv) values = self.values loc = values.searchsorted(conv, side="left") if loc == len(values) or values[loc] != conv: raise KeyError(val) return loc self._ensure_mapping_populated() if not self.unique: return self._get_loc_duplicates(conv) try: return self.mapping.get_item(conv) except KeyError: raise KeyError(val) cdef class TimedeltaEngine(DatetimeEngine): cdef int64_t _unbox_scalar(self, scalar) except? -1: if scalar is NaT: return NaT._value elif isinstance(scalar, _Timedelta): if scalar._creso == self._creso: return scalar._value else: # Note: caller is responsible for catching potential ValueError # from _as_creso return ( (<_Timedelta>scalar)._as_creso(self._creso, round_ok=False)._value ) raise TypeError(scalar) cdef class PeriodEngine(Int64Engine): cdef int64_t _unbox_scalar(self, scalar) except? -1: if scalar is NaT: return scalar._value if is_period_object(scalar): # NB: we assume that we have the correct freq here. return scalar.ordinal raise TypeError(scalar) cpdef get_loc(self, object val): # NB: the caller is responsible for ensuring that we are called # with either a Period or NaT cdef: int64_t conv try: conv = self._unbox_scalar(val) except TypeError: raise KeyError(val) return Int64Engine.get_loc(self, conv) cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=True) cdef class BaseMultiIndexCodesEngine: """ Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which represent each label in a MultiIndex as an integer, by juxtaposing the bits encoding each level, with appropriate offsets. For instance: if 3 levels have respectively 3, 6 and 1 possible values, then their labels can be represented using respectively 2, 3 and 1 bits, as follows: _ _ _ _____ _ __ __ __ |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level) — — — ————— — —— —— —— |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level) — — — ————— — —— —— —— |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels) ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ and the resulting unsigned integer representation will be: _ _ _ _____ _ __ __ __ __ __ __ |0|0|0| ... |0|c0|b2|b1|b0|a1|a0| ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ Offsets are calculated at initialization, labels are transformed by method _codes_to_ints. Keys are located by first locating each component against the respective level, then locating (the integer representation of) codes. """ def __init__(self, object levels, object labels, ndarray[uint64_t, ndim=1] offsets): """ Parameters ---------- levels : list-like of numpy arrays Levels of the MultiIndex. labels : list-like of numpy arrays of integer dtype Labels of the MultiIndex. offsets : numpy array of uint64 dtype Pre-calculated offsets, one for each level of the index. """ self.levels = levels self.offsets = offsets # Transform labels in a single array, and add 2 so that we are working # with positive integers (-1 for NaN becomes 1). This enables us to # differentiate between values that are missing in other and matching # NaNs. We will set values that are not found to 0 later: labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift codes = labels_arr.astype("uint64", copy=False) self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously # (no collisions possible), based on the "offsets", which describe the # number of bits to switch labels for each level: lab_ints = self._codes_to_ints(codes) # Initialize underlying index (e.g. libindex.UInt64Engine) with # integers representing labels: we will use its get_loc and get_indexer self._base.__init__(self, lab_ints) def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray: raise NotImplementedError("Implemented by subclass") # pragma: no cover def _extract_level_codes(self, target) -> np.ndarray: """ Map the requested list of (tuple) keys to their integer representations for searching in the underlying integer index. Parameters ---------- target : MultiIndex Returns ------ int_keys : 1-dimensional array of dtype uint64 or object Integers representing one combination each """ zt = [target._get_level_values(i) for i in range(target.nlevels)] level_codes = [] for i, (lev, codes) in enumerate(zip(self.levels, zt)): result = lev.get_indexer_for(codes) + 1 result[result > 0] += 1 if self.level_has_nans[i] and codes.hasnans: result[codes.isna()] += 1 level_codes.append(result) return self._codes_to_ints(np.array(level_codes, dtype="uint64").T) def get_indexer(self, target: np.ndarray) -> np.ndarray: """ Returns an array giving the positions of each value of `target` in `self.values`, where -1 represents a value in `target` which does not appear in `self.values` Parameters ---------- target : np.ndarray Returns ------- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `self.values` """ return self._base.get_indexer(self, target) def get_indexer_with_fill(self, ndarray target, ndarray values, str method, object limit) -> np.ndarray: """ Returns an array giving the positions of each value of `target` in `values`, where -1 represents a value in `target` which does not appear in `values` If `method` is "backfill" then the position for a value in `target` which does not appear in `values` is that of the next greater value in `values` (if one exists), and -1 if there is no such value. Similarly, if the method is "pad" then the position for a value in `target` which does not appear in `values` is that of the next smaller value in `values` (if one exists), and -1 if there is no such value. Parameters ---------- target: ndarray[object] of tuples need not be sorted, but all must have the same length, which must be the same as the length of all tuples in `values` values : ndarray[object] of tuples must be sorted and all have the same length. Should be the set of the MultiIndex's values. method: string "backfill" or "pad" limit: int or None if provided, limit the number of fills to this value Returns ------- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`, filled with the `method` (and optionally `limit`) specified """ assert method in ("backfill", "pad") cdef: int64_t i, j, next_code int64_t num_values, num_target_values ndarray[int64_t, ndim=1] target_order ndarray[object, ndim=1] target_values ndarray[int64_t, ndim=1] new_codes, new_target_codes ndarray[intp_t, ndim=1] sorted_indexer target_order = np.argsort(target).astype("int64") target_values = target[target_order] num_values, num_target_values = len(values), len(target_values) new_codes, new_target_codes = ( np.empty((num_values,)).astype("int64"), np.empty((num_target_values,)).astype("int64"), ) # `values` and `target_values` are both sorted, so we walk through them # and memoize the (ordered) set of indices in the (implicit) merged-and # sorted list of the two which belong to each of them # the effect of this is to create a factorization for the (sorted) # merger of the index values, where `new_codes` and `new_target_codes` # are the subset of the factors which appear in `values` and `target`, # respectively i, j, next_code = 0, 0, 0 while i < num_values and j < num_target_values: val, target_val = values[i], target_values[j] if val <= target_val: new_codes[i] = next_code i += 1 if target_val <= val: new_target_codes[j] = next_code j += 1 next_code += 1 # at this point, at least one should have reached the end # the remaining values of the other should be added to the end assert i == num_values or j == num_target_values while i < num_values: new_codes[i] = next_code i += 1 next_code += 1 while j < num_target_values: new_target_codes[j] = next_code j += 1 next_code += 1 # get the indexer, and undo the sorting of `target.values` algo = algos.backfill if method == "backfill" else algos.pad sorted_indexer = algo(new_codes, new_target_codes, limit=limit) return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): if is_definitely_invalid_key(key): raise TypeError(f"'{key}' is an invalid key") if not isinstance(key, tuple): raise KeyError(key) try: indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift for lev, v in zip(self.levels, key)] except KeyError: raise KeyError(key) # Transform indices into single integer: lab_int = self._codes_to_ints(np.array(indices, dtype="uint64")) return self._base.get_loc(self, lab_int) def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray: indexer = self._base.get_indexer_non_unique(self, target) return indexer def __contains__(self, val: object) -> bool: # We assume before we get here: # - val is hashable # Default __contains__ looks in the underlying mapping, which in this # case only contains integer representations. try: self.get_loc(val) return True except (KeyError, TypeError, ValueError): return False # Generated from template. include "index_class_helper.pxi" cdef class BoolEngine(UInt8Engine): cdef _check_type(self, object val): if not util.is_bool_object(val): raise KeyError(val) return val cdef class MaskedBoolEngine(MaskedUInt8Engine): cdef _check_type(self, object val): if val is C_NA: return val if not util.is_bool_object(val): raise KeyError(val) return val @cython.internal @cython.freelist(32) cdef class SharedEngine: cdef readonly: object values # ExtensionArray bint over_size_threshold cdef: bint unique, monotonic_inc, monotonic_dec bint need_monotonic_check, need_unique_check def __contains__(self, val: object) -> bool: # We assume before we get here: # - val is hashable try: self.get_loc(val) return True except KeyError: return False def clear_mapping(self): # for compat with IndexEngine pass @property def is_unique(self) -> bool: if self.need_unique_check: arr = self.values.unique() self.unique = len(arr) == len(self.values) self.need_unique_check = False return self.unique cdef _do_monotonic_check(self): raise NotImplementedError @property def is_monotonic_increasing(self) -> bool: if self.need_monotonic_check: self._do_monotonic_check() return self.monotonic_inc == 1 @property def is_monotonic_decreasing(self) -> bool: if self.need_monotonic_check: self._do_monotonic_check() return self.monotonic_dec == 1 cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=False) def sizeof(self, deep: bool = False) -> int: """ return the sizeof our mapping """ return 0 def __sizeof__(self) -> int: return self.sizeof() cdef _check_type(self, object obj): raise NotImplementedError cpdef get_loc(self, object val): # -> Py_ssize_t | slice | ndarray[bool] cdef: Py_ssize_t loc if is_definitely_invalid_key(val): raise TypeError(f"'{val}' is an invalid key") self._check_type(val) if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: return self._get_loc_duplicates(val) values = self.values loc = self._searchsorted_left(val) if loc >= len(values): raise KeyError(val) if values[loc] != val: raise KeyError(val) return loc if not self.unique: return self._get_loc_duplicates(val) return self._get_loc_duplicates(val) cdef _get_loc_duplicates(self, object val): # -> Py_ssize_t | slice | ndarray[bool] cdef: Py_ssize_t diff if self.is_monotonic_increasing: values = self.values try: left = values.searchsorted(val, side="left") right = values.searchsorted(val, side="right") except TypeError: # e.g. GH#29189 get_loc(None) with a Float64Index raise KeyError(val) diff = right - left if diff == 0: raise KeyError(val) elif diff == 1: return left else: return slice(left, right) return self._maybe_get_bool_indexer(val) cdef Py_ssize_t _searchsorted_left(self, val) except? -1: """ See ObjectEngine._searchsorted_left.__doc__. """ try: loc = self.values.searchsorted(val, side="left") except TypeError as err: # GH#35788 e.g. val=None with float64 values raise KeyError(val) return loc cdef ndarray _get_bool_indexer(self, val): raise NotImplementedError cdef _maybe_get_bool_indexer(self, object val): # Returns ndarray[bool] or int cdef: ndarray[uint8_t, ndim=1, cast=True] indexer indexer = self._get_bool_indexer(val) return _unpack_bool_indexer(indexer, val) def get_indexer(self, values) -> np.ndarray: # values : type(self.values) # Note: we only get here with self.is_unique cdef: Py_ssize_t i, N = len(values) res = np.empty(N, dtype=np.intp) for i in range(N): val = values[i] try: loc = self.get_loc(val) # Because we are unique, loc should always be an integer except KeyError: loc = -1 else: assert util.is_integer_object(loc), (loc, val) res[i] = loc return res def get_indexer_non_unique(self, targets): """ Return an indexer suitable for taking from a non unique index return the labels in the same order as the target and a missing indexer into the targets (which correspond to the -1 indices in the results Parameters ---------- targets : type(self.values) Returns ------- indexer : np.ndarray[np.intp] missing : np.ndarray[np.intp] """ cdef: Py_ssize_t i, N = len(targets) indexer = [] missing = [] # See also IntervalIndex.get_indexer_pointwise for i in range(N): val = targets[i] try: locs = self.get_loc(val) except KeyError: locs = np.array([-1], dtype=np.intp) missing.append(i) else: if isinstance(locs, slice): # Only needed for get_indexer_non_unique locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp) elif util.is_integer_object(locs): locs = np.array([locs], dtype=np.intp) else: assert locs.dtype.kind == "b" locs = locs.nonzero()[0] indexer.append(locs) try: indexer = np.concatenate(indexer, dtype=np.intp) except TypeError: # numpy<1.20 doesn't accept dtype keyword indexer = np.concatenate(indexer).astype(np.intp, copy=False) missing = np.array(missing, dtype=np.intp) return indexer, missing cdef class ExtensionEngine(SharedEngine): def __init__(self, values: "ExtensionArray"): self.values = values self.over_size_threshold = len(values) >= _SIZE_CUTOFF self.need_unique_check = True self.need_monotonic_check = True self.need_unique_check = True cdef _do_monotonic_check(self): cdef: bint is_unique values = self.values if values._hasna: self.monotonic_inc = 0 self.monotonic_dec = 0 nunique = len(values.unique()) self.unique = nunique == len(values) self.need_unique_check = 0 return try: ranks = values._rank() except TypeError: self.monotonic_inc = 0 self.monotonic_dec = 0 is_unique = 0 else: self.monotonic_inc, self.monotonic_dec, is_unique = \ self._call_monotonic(ranks) self.need_monotonic_check = 0 # we can only be sure of uniqueness if is_unique=1 if is_unique: self.unique = 1 self.need_unique_check = 0 cdef ndarray _get_bool_indexer(self, val): if checknull(val): return self.values.isna() try: return self.values == val except TypeError: # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool] try: return (self.values == val).to_numpy(dtype=bool, na_value=False) except (TypeError, AttributeError) as err: # e.g. (self.values == val) returned a bool # see test_get_loc_generator[string[pyarrow]] # e.g. self.value == val raises TypeError bc generator has no len # see test_get_loc_generator[string[python]] raise KeyError from err cdef _check_type(self, object val): hash(val) cdef class MaskedIndexEngine(IndexEngine): def __init__(self, object values): super().__init__(self._get_data(values)) self.mask = self._get_mask(values) def _get_data(self, object values) -> np.ndarray: if hasattr(values, "_mask"): return values._data # We are an ArrowExtensionArray # Set 1 as na_value to avoid ending up with NA and an object array # TODO: Remove when arrow engine is implemented return values.to_numpy(na_value=1, dtype=values.dtype.numpy_dtype) def _get_mask(self, object values) -> np.ndarray: if hasattr(values, "_mask"): return values._mask # We are an ArrowExtensionArray return values.isna() def get_indexer(self, object values) -> np.ndarray: self._ensure_mapping_populated() return self.mapping.lookup(self._get_data(values), self._get_mask(values)) def get_indexer_non_unique(self, object targets): """ Return an indexer suitable for taking from a non unique index return the labels in the same order as the target and a missing indexer into the targets (which correspond to the -1 indices in the results Returns ------- indexer : np.ndarray[np.intp] missing : np.ndarray[np.intp] """ # TODO: Unify with parent class cdef: ndarray values, mask, target_vals, target_mask ndarray[intp_t] result, missing set stargets list na_pos dict d = {} object val Py_ssize_t count = 0, count_missing = 0 Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx target_vals = self._get_data(targets) target_mask = self._get_mask(targets) values = self.values assert not values.dtype == object # go through object path instead mask = self.mask stargets = set(target_vals[~target_mask]) n = len(values) n_t = len(target_vals) if n > 10_000: n_alloc = 10_000 else: n_alloc = n result = np.empty(n_alloc, dtype=np.intp) missing = np.empty(n_t, dtype=np.intp) # map each starget to its position in the index if ( stargets and len(stargets) < 5 and not np.any(target_mask) and self.is_monotonic_increasing ): # if there are few enough stargets and the index is monotonically # increasing, then use binary search for each starget for starget in stargets: start = values.searchsorted(starget, side="left") end = values.searchsorted(starget, side="right") if start != end: d[starget] = list(range(start, end)) stargets = set() if stargets: # otherwise, map by iterating through all items in the index na_pos = [] for i in range(n): val = values[i] if mask[i]: na_pos.append(i) else: if val in stargets: if val not in d: d[val] = [] d[val].append(i) for i in range(n_t): val = target_vals[i] if target_mask[i]: if na_pos: for na_idx in na_pos: # realloc if needed if count >= n_alloc: n_alloc += 10_000 result = np.resize(result, n_alloc) result[count] = na_idx count += 1 continue elif val in d: # found key = val for j in d[key]: # realloc if needed if count >= n_alloc: n_alloc += 10_000 result = np.resize(result, n_alloc) result[count] = j count += 1 continue # value not found if count >= n_alloc: n_alloc += 10_000 result = np.resize(result, n_alloc) result[count] = -1 count += 1 missing[count_missing] = i count_missing += 1 return result[0:count], missing[0:count_missing]