1281 lines
40 KiB
Cython
1281 lines
40 KiB
Cython
cimport cython
|
|
|
|
import numpy as np
|
|
|
|
cimport numpy as cnp
|
|
from numpy cimport (
|
|
int64_t,
|
|
intp_t,
|
|
ndarray,
|
|
uint8_t,
|
|
uint64_t,
|
|
)
|
|
|
|
cnp.import_array()
|
|
|
|
|
|
from pandas._libs cimport util
|
|
from pandas._libs.hashtable cimport HashTable
|
|
from pandas._libs.tslibs.nattype cimport c_NaT as NaT
|
|
from pandas._libs.tslibs.np_datetime cimport (
|
|
NPY_DATETIMEUNIT,
|
|
get_unit_from_dtype,
|
|
)
|
|
from pandas._libs.tslibs.period cimport is_period_object
|
|
from pandas._libs.tslibs.timedeltas cimport _Timedelta
|
|
from pandas._libs.tslibs.timestamps cimport _Timestamp
|
|
|
|
from pandas._libs import (
|
|
algos,
|
|
hashtable as _hash,
|
|
)
|
|
|
|
from pandas._libs.lib cimport eq_NA_compat
|
|
from pandas._libs.missing cimport (
|
|
C_NA,
|
|
checknull,
|
|
is_matching_na,
|
|
)
|
|
|
|
# Defines shift of MultiIndex codes to avoid negative codes (missing values)
|
|
# With this shift a code of -1 (missing value) is stored as 1; it is added to
# the labels in BaseMultiIndexCodesEngine.__init__ and to level positions in
# BaseMultiIndexCodesEngine.get_loc.
multiindex_nulls_shift = 2
|
|
|
|
|
|
cdef bint is_definitely_invalid_key(object val):
    """
    Report whether ``val`` is unhashable and therefore unusable as a key.

    Returns True when ``hash(val)`` raises TypeError, False otherwise.
    """
    try:
        hash(val)
    except TypeError:
        return True
    else:
        return False
|
|
|
|
|
|
cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
    """
    Build a boolean ndarray marking the positions of ``values`` that match ``val``.

    For a non-NA ``val`` this is equivalent to ``values == val``.  NA keys are
    matched with ``is_matching_na`` (object dtype), via ``mask`` (masked
    values, ``val is C_NA``) or with ``np.isnan`` (unmasked, NaN key).

    The caller is responsible for having run ``_check_type`` on ``val``.
    """
    cdef:
        ndarray[uint8_t, ndim=1, cast=True] indexer
        Py_ssize_t i, n
        object item

    if values.descr.type_num == cnp.NPY_OBJECT:
        # i.e. values.dtype == object
        assert mask is None  # no mask for object dtype
        if checknull(val):
            # Only NA values of the _matching_ kind count as equal to the key.
            n = len(values)
            indexer = np.empty(n, dtype=np.uint8)
            for i in range(n):
                item = values[i]
                indexer[i] = is_matching_na(item, val)
        else:
            indexer = eq_NA_compat(values, val)

    elif mask is not None:
        if val is C_NA:
            indexer = mask == 1
        else:
            # Entries hidden by the mask never match a non-NA key.
            indexer = (values == val) & ~mask

    elif util.is_nan(val):
        indexer = np.isnan(values)

    else:
        indexer = values == val

    return indexer.view(bool)
|
|
|
|
|
|
# Don't populate hash tables in monotonic indexes larger than this
|
|
# Compared (>=) against len(values) in IndexEngine.__init__ and
# ExtensionEngine.__init__ to set ``over_size_threshold``.
_SIZE_CUTOFF = 1_000_000
|
|
|
|
|
|
cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
    """
    Possibly unpack a boolean mask to a single integer position.

    Returns
    -------
    ndarray[bool] or int
        ``indexer`` itself when more than one entry is set, or the position
        of the single set entry as a Python int.

    Raises
    ------
    KeyError
        With ``val`` as argument when no entry of ``indexer`` is set.
    """
    cdef:
        ndarray[intp_t, ndim=1] found
        # Py_ssize_t rather than int: the number of matches is bounded by the
        # array length, which may not fit in a C int.
        Py_ssize_t count

    found = np.where(indexer)[0]
    count = len(found)

    if count == 0:
        raise KeyError(val)
    if count == 1:
        return int(found[0])
    return indexer
|
|
|
|
|
|
@cython.freelist(32)
cdef class IndexEngine:
    """
    Lookup engine over a 1-dimensional ndarray of index values.

    Lookups use a lazily built hash table (``mapping``), except for indexes
    at or above ``_SIZE_CUTOFF`` that are monotonic increasing, where binary
    search is used instead.  Uniqueness and monotonicity are computed on
    demand and cached until ``clear_mapping`` is called.

    Subclasses must implement ``_make_hash_table``.
    """

    cdef readonly:
        ndarray values
        ndarray mask
        HashTable mapping
        bint over_size_threshold

    cdef:
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check
        object _np_type

    def __init__(self, ndarray values):
        self.values = values
        self.mask = None

        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.clear_mapping()
        self._np_type = values.dtype.type

    def __contains__(self, val: object) -> bool:
        # Unhashable values raise TypeError here rather than returning False.
        hash(val)
        try:
            self.get_loc(val)
        except KeyError:
            return False
        else:
            return True

    cpdef get_loc(self, object val):
        """
        Locate ``val`` in ``self.values``.

        Returns
        -------
        Py_ssize_t, slice or ndarray[bool]

        Raises
        ------
        TypeError
            If ``val`` is not hashable.
        KeyError
            If ``val`` is not present.
        """
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        val = self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            # Large monotonic index: binary search, no hash table.
            if not self.is_unique:
                return self._get_loc_duplicates(val)
            values = self.values

            loc = self._searchsorted_left(val)
            if loc >= len(values) or values[loc] != val:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(val)
        if self.mask is not None and val is C_NA:
            return self.mapping.get_na()

        try:
            return self.mapping.get_item(val)
        except OverflowError as err:
            # GH#41775 OverflowError e.g. if we are uint64 and val is -1
            #  or if we are int64 and value is np.iinfo(np.int64).max+1
            #  (the uint64 with -1 case should actually be excluded by _check_type)
            raise KeyError(val) from err

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        Return the left insertion position of ``val`` in ``self.values``.

        ``val`` is first converted with the scalar type of the values' dtype.
        The caller is responsible for having run ``_check_type`` on ``val``.
        """
        loc = self.values.searchsorted(self._np_type(val), side="left")
        return loc

    cdef _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff, left, right

        if not self.is_monotonic_increasing:
            return self._maybe_get_bool_indexer(val)

        values = self.values
        try:
            left = values.searchsorted(val, side="left")
            right = values.searchsorted(val, side="right")
        except TypeError:
            # e.g. GH#29189 get_loc(None) with a Float64Index
            #  2021-09-29 Now only reached for object-dtype
            raise KeyError(val)

        diff = right - left
        if diff == 0:
            raise KeyError(val)
        if diff == 1:
            return left
        return slice(left, right)

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = _get_bool_indexer(self.values, val, self.mask)
        return _unpack_bool_indexer(indexer, val)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        if not self.is_mapping_populated:
            return 0
        return self.mapping.sizeof(deep=deep)

    def __sizeof__(self) -> int:
        return self.sizeof()

    @property
    def is_unique(self) -> bool:
        if self.need_unique_check:
            self._do_unique_check()

        return self.unique == 1

    cdef _do_unique_check(self):
        # Populating the mapping also settles uniqueness.
        self._ensure_mapping_populated()

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef _do_monotonic_check(self):
        """
        Compute and cache the monotonicity flags.

        A mask with any set entry yields neither increasing nor decreasing.
        A TypeError from the monotonicity routine is treated the same way.
        """
        cdef:
            # Initialised so the masked branch never reads an unset value.
            bint is_unique = 0

        if self.mask is not None and np.any(self.mask):
            self.monotonic_inc = 0
            self.monotonic_dec = 0
        else:
            try:
                self.monotonic_inc, self.monotonic_dec, is_unique = \
                    self._call_monotonic(self.values)
            except TypeError:
                self.monotonic_inc = 0
                self.monotonic_dec = 0
                is_unique = 0

        # The outcome is cached in every branch, including the masked one,
        # so the mask is not re-scanned on each property access.
        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=False)

    cdef _make_hash_table(self, Py_ssize_t n):
        raise NotImplementedError  # pragma: no cover

    cdef _check_type(self, object val):
        # Base implementation only requires hashability and returns val as-is.
        hash(val)
        return val

    @property
    def is_mapping_populated(self) -> bool:
        return self.mapping is not None

    cdef _ensure_mapping_populated(self):
        # Populate the mapping if it is not already populated; this also
        # satisfies the need_unique_check.
        if not self.is_mapping_populated:
            values = self.values
            self.mapping = self._make_hash_table(len(values))
            self.mapping.map_locations(values, self.mask)

            if len(self.mapping) == len(values):
                self.unique = 1

        self.need_unique_check = 0

    def clear_mapping(self):
        """Drop the hash table and reset all cached uniqueness/monotonicity flags."""
        self.mapping = None
        self.need_monotonic_check = 1
        self.need_unique_check = 1

        self.unique = 0
        self.monotonic_inc = 0
        self.monotonic_dec = 0

    def get_indexer(self, ndarray values) -> np.ndarray:
        self._ensure_mapping_populated()
        return self.mapping.lookup(values)

    def get_indexer_non_unique(self, ndarray targets):
        """
        Return an indexer suitable for taking from a non unique index.

        The labels are returned in the same order as the targets, together
        with a missing indexer into the targets (the positions that
        correspond to the -1 entries of the result).

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            ndarray values
            ndarray[intp_t] result, missing
            set stargets, remaining_stargets, found_nas
            dict d = {}
            object val
            Py_ssize_t count = 0, count_missing = 0
            Py_ssize_t i, j, n, n_t, n_alloc, start, end
            bint check_na_values = False

        values = self.values
        stargets = set(targets)

        n = len(values)
        n_t = len(targets)
        # Initial result capacity; grown in steps of 10_000 when exhausted.
        n_alloc = 10_000 if n > 10_000 else n

        result = np.empty(n_alloc, dtype=np.intp)
        missing = np.empty(n_t, dtype=np.intp)

        # map each starget to its position in the index
        if (
            stargets and
            len(stargets) < 5 and
            not any([checknull(t) for t in stargets]) and
            self.is_monotonic_increasing
        ):
            # if there are few enough stargets and the index is monotonically
            # increasing, then use binary search for each starget
            remaining_stargets = set()
            for starget in stargets:
                try:
                    start = values.searchsorted(starget, side="left")
                    end = values.searchsorted(starget, side="right")
                except TypeError:  # e.g. if we tried to search for string in int array
                    remaining_stargets.add(starget)
                else:
                    if start != end:
                        d[starget] = list(range(start, end))

            stargets = remaining_stargets

        if stargets:
            # otherwise, map by iterating through all items in the index

            # short-circuit na check
            if values.dtype == object:
                check_na_values = True
                # keep track of nas in values
                found_nas = set()

            for i in range(n):
                val = values[i]

                # GH#43870
                # handle lookup for nas
                # (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
                if check_na_values and checknull(val):
                    match = [na for na in found_nas if is_matching_na(val, na)]

                    if not len(match):
                        # matching na not found
                        found_nas.add(val)

                        # add na to stargets to utilize `in` for stargets/d lookup
                        match_stargets = [
                            x for x in stargets if is_matching_na(val, x)
                        ]

                        if len(match_stargets):
                            # add our 'standardized' na
                            stargets.add(val)

                    else:
                        # matching na found
                        assert len(match) == 1
                        val = match[0]

                if val in stargets:
                    if val not in d:
                        d[val] = []
                    d[val].append(i)

        for i in range(n_t):
            val = targets[i]

            # ensure there are nas in values before looking for a matching na
            if check_na_values and checknull(val):
                match = [na for na in found_nas if is_matching_na(val, na)]
                if len(match):
                    assert len(match) == 1
                    val = match[0]

            if val in d:
                # found
                for j in d[val]:

                    # realloc if needed
                    if count >= n_alloc:
                        n_alloc += 10_000
                        result = np.resize(result, n_alloc)

                    result[count] = j
                    count += 1

            else:
                # value not found
                if count >= n_alloc:
                    n_alloc += 10_000
                    result = np.resize(result, n_alloc)
                result[count] = -1
                count += 1
                missing[count_missing] = i
                count_missing += 1

        return result[0:count], missing[0:count_missing]
|
|
|
|
|
|
cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
    """
    Return the leftmost position at which ``val`` could be inserted into the
    sorted ``values`` while keeping it sorted (as ``bisect.bisect_left``).

    GH#1757 ndarray.searchsorted is not safe to use with array of tuples
    (treats a tuple `val` as a sequence of keys instead of a single key),
    so we implement something similar.

    Empty and single-element arrays are handled like any other length: the
    result is 0 for an empty array, and 0 or 1 for a single element.

    Comparison errors (e.g. TypeError for incomparable objects) propagate.
    """
    cdef:
        Py_ssize_t mid, lo = 0, hi = len(values)

    # Invariant: everything left of ``lo`` is < val, everything from ``hi``
    # on is >= val.
    while lo < hi:
        mid = lo + (hi - lo) // 2
        if val > values[mid]:
            lo = mid + 1
        else:
            hi = mid

    return lo
|
|
|
|
|
|
cdef class ObjectEngine(IndexEngine):
    """
    Index Engine for use with object-dtype Index, namely the base class Index.
    """

    cdef _make_hash_table(self, Py_ssize_t n):
        return _hash.PyObjectHashTable(n)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        Return the left insertion position of ``val`` in ``self.values``.

        ``values.searchsorted`` would treat a tuple ``val`` as a sequence of
        keys instead of a single key, so ``_bin_search`` is used instead.
        A TypeError from the search is re-raised as KeyError(val).
        """
        try:
            return _bin_search(self.values, val)
        except TypeError as err:
            raise KeyError(val) from err
|
|
|
|
|
|
cdef class DatetimeEngine(Int64Engine):
    """
    Engine whose values are stored as their "i8" view, together with the
    resolution (``_creso``) read from the dtype of the original values.
    """

    cdef:
        NPY_DATETIMEUNIT _creso

    def __init__(self, ndarray values):
        super().__init__(values.view("i8"))
        self._creso = get_unit_from_dtype(values.dtype)

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        """
        Convert NaT or a Timestamp to its integer value at this engine's resolution.

        Raises TypeError for any other scalar.  The caller is responsible for
        ensuring tz-awareness compatibility beforehand, and for catching a
        potential ValueError from ``_as_creso``.
        """
        if scalar is NaT:
            return NaT._value

        if not isinstance(scalar, _Timestamp):
            raise TypeError(scalar)

        if scalar._creso == self._creso:
            return scalar._value

        # Resolutions differ: convert, refusing any rounding.
        return (<_Timestamp>scalar)._as_creso(self._creso, round_ok=False)._value

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        try:
            self._unbox_scalar(val)
        except ValueError:
            return False

        try:
            self.get_loc(val)
        except KeyError:
            return False
        else:
            return True

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)

    cpdef get_loc(self, object val):
        """
        Locate ``val`` after unboxing it to its integer representation.

        The caller is responsible for ensuring that we are called with either
        a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine).

        Raises
        ------
        TypeError
            If ``val`` is not hashable.
        KeyError
            If ``val`` cannot be unboxed or is not present.
        """
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        try:
            conv = self._unbox_scalar(val)
        except (TypeError, ValueError) as err:
            raise KeyError(val) from err

        # Welcome to the spaghetti factory
        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(conv)

            values = self.values
            loc = values.searchsorted(conv, side="left")

            if loc == len(values) or values[loc] != conv:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(conv)

        try:
            return self.mapping.get_item(conv)
        except KeyError:
            # Report the original key rather than its integer representation.
            raise KeyError(val)
|
|
|
|
|
|
cdef class TimedeltaEngine(DatetimeEngine):
    """DatetimeEngine variant whose scalars are Timedelta (or NaT)."""

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        """
        Convert NaT or a Timedelta to its integer value at this engine's resolution.

        Raises TypeError for any other scalar.  The caller is responsible for
        catching a potential ValueError from ``_as_creso``.
        """
        if scalar is NaT:
            return NaT._value

        if not isinstance(scalar, _Timedelta):
            raise TypeError(scalar)

        if scalar._creso == self._creso:
            return scalar._value

        # Resolutions differ: convert, refusing any rounding.
        return (<_Timedelta>scalar)._as_creso(self._creso, round_ok=False)._value
|
|
|
|
|
|
cdef class PeriodEngine(Int64Engine):
    """Int64Engine variant looked up with Period (or NaT) scalars."""

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        """
        Convert NaT to its integer value, or a Period to its ordinal.

        Raises TypeError for any other scalar.
        """
        if scalar is NaT:
            return NaT._value
        if not is_period_object(scalar):
            raise TypeError(scalar)
        # NB: we assume that we have the correct freq here.
        return scalar.ordinal

    cpdef get_loc(self, object val):
        """
        Locate ``val`` via its integer representation.

        The caller is responsible for ensuring that we are called with either
        a Period or NaT; anything else raises KeyError(val).
        """
        cdef:
            int64_t conv

        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        return Int64Engine.get_loc(self, conv)

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)
|
|
|
|
|
|
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex.
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex.
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index.
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 2 so that we are working
        # with positive integers (-1 for NaN becomes 1). This enables us to
        # differentiate between values that are missing in other and matching
        # NaNs. We will set values that are not found to 0 later:
        labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
        codes = labels_arr.astype("uint64", copy=False)
        self.level_has_nans = [-1 in lab for lab in labels]

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lab_ints)

    def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
        raise NotImplementedError("Implemented by subclass")  # pragma: no cover

    def _extract_level_codes(self, target) -> np.ndarray:
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : MultiIndex

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """
        zt = [target._get_level_values(i) for i in range(target.nlevels)]
        level_codes = []
        for i, (lev, codes) in enumerate(zip(self.levels, zt)):
            # Not found (-1) becomes 0; found positions are shifted by 2.
            result = lev.get_indexer_for(codes) + 1
            result[result > 0] += 1
            if self.level_has_nans[i] and codes.hasnans:
                # NaN entries get one more added when this level has NaNs.
                result[codes.isna()] += 1
            level_codes.append(result)
        return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)

    def get_indexer(self, target: np.ndarray) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `self.values`, where -1 represents a value in `target` which does not
        appear in `self.values`

        Parameters
        ----------
        target : np.ndarray

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into
        `self.values`
        """
        return self._base.get_indexer(self, target)

    def get_indexer_with_fill(self, ndarray target, ndarray values,
                              str method, object limit) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `values`, where -1 represents a value in `target` which does not
        appear in `values`

        If `method` is "backfill" then the position for a value in `target`
        which does not appear in `values` is that of the next greater value
        in `values` (if one exists), and -1 if there is no such value.

        Similarly, if the method is "pad" then the position for a value in
        `target` which does not appear in `values` is that of the next smaller
        value in `values` (if one exists), and -1 if there is no such value.

        Parameters
        ----------
        target: ndarray[object] of tuples
            need not be sorted, but all must have the same length, which must be
            the same as the length of all tuples in `values`
        values : ndarray[object] of tuples
            must be sorted and all have the same length.  Should be the set of
            the MultiIndex's values.
        method: string
            "backfill" or "pad"
        limit: int or None
            if provided, limit the number of fills to this value

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
        filled with the `method` (and optionally `limit`) specified
        """
        cdef:
            int64_t i, j, next_code
            int64_t num_values, num_target_values
            ndarray[int64_t, ndim=1] target_order
            ndarray[object, ndim=1] target_values
            ndarray[int64_t, ndim=1] new_codes, new_target_codes
            ndarray[intp_t, ndim=1] sorted_indexer

        assert method in ("backfill", "pad")

        target_order = np.argsort(target).astype("int64")
        target_values = target[target_order]
        num_values, num_target_values = len(values), len(target_values)

        # Allocate as int64 directly; every slot is written by the loops
        # below, so no float buffer or cast of uninitialised data is needed.
        new_codes = np.empty((num_values,), dtype="int64")
        new_target_codes = np.empty((num_target_values,), dtype="int64")

        # `values` and `target_values` are both sorted, so we walk through them
        # and memoize the (ordered) set of indices in the (implicit) merged-and
        # sorted list of the two which belong to each of them
        # the effect of this is to create a factorization for the (sorted)
        # merger of the index values, where `new_codes` and `new_target_codes`
        # are the subset of the factors which appear in `values` and `target`,
        # respectively
        i, j, next_code = 0, 0, 0
        while i < num_values and j < num_target_values:
            val, target_val = values[i], target_values[j]
            if val <= target_val:
                new_codes[i] = next_code
                i += 1
            if target_val <= val:
                new_target_codes[j] = next_code
                j += 1
            next_code += 1

        # at this point, at least one should have reached the end
        # the remaining values of the other should be added to the end
        assert i == num_values or j == num_target_values
        while i < num_values:
            new_codes[i] = next_code
            i += 1
            next_code += 1
        while j < num_target_values:
            new_target_codes[j] = next_code
            j += 1
            next_code += 1

        # get the indexer, and undo the sorting of `target.values`
        algo = algos.backfill if method == "backfill" else algos.pad
        sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
        return sorted_indexer[np.argsort(target_order)]

    def get_loc(self, object key):
        """
        Locate the tuple ``key`` through its integer representation.

        Raises
        ------
        TypeError
            If ``key`` is not hashable.
        KeyError
            If ``key`` is not a tuple or a component is not found in its level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError(f"'{key}' is an invalid key")
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            # A null component maps to code 1; others to their level position
            # plus multiindex_nulls_shift.
            indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            raise KeyError(key)

        # Transform indices into single integer:
        lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
        # NOTE(review): the value is whatever ``self._base.get_indexer_non_unique``
        # returns; IndexEngine's version returns an (indexer, missing) tuple,
        # so the ``np.ndarray`` annotation looks inaccurate — confirm.
        return self._base.get_indexer_non_unique(self, target)

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
        except (KeyError, TypeError, ValueError):
            return False
        else:
            return True
|
|
|
|
|
|
# Generated from template.
|
|
# NOTE(review): presumably this supplies the dtype-specific engines subclassed
# in this file (Int64Engine, UInt8Engine, MaskedUInt8Engine) — confirm against
# the template.
include "index_class_helper.pxi"
|
|
|
|
|
|
cdef class BoolEngine(UInt8Engine):
    """UInt8Engine that only accepts boolean keys."""

    cdef _check_type(self, object val):
        # Non-boolean keys are rejected with KeyError; booleans are cast to uint8.
        if util.is_bool_object(val):
            return <uint8_t>val
        raise KeyError(val)
|
|
|
|
|
|
cdef class MaskedBoolEngine(MaskedUInt8Engine):
    """MaskedUInt8Engine that only accepts boolean keys or C_NA."""

    cdef _check_type(self, object val):
        # C_NA passes through unchanged; booleans are cast to uint8; anything
        # else is rejected with KeyError.
        if val is C_NA:
            return val
        if util.is_bool_object(val):
            return <uint8_t>val
        raise KeyError(val)
|
|
|
|
|
|
@cython.internal
@cython.freelist(32)
cdef class SharedEngine:
    """
    Shared lookup logic for engines that keep no hash table.

    Lookups go through ``searchsorted`` on monotonic increasing values and
    through a boolean indexer otherwise.  Subclasses must implement
    ``_do_monotonic_check``, ``_check_type`` and ``_get_bool_indexer``.
    """

    cdef readonly:
        object values  # ExtensionArray
        bint over_size_threshold

    cdef:
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        try:
            self.get_loc(val)
        except KeyError:
            return False
        else:
            return True

    def clear_mapping(self):
        # for compat with IndexEngine
        pass

    @property
    def is_unique(self) -> bool:
        if self.need_unique_check:
            arr = self.values.unique()
            self.unique = len(arr) == len(self.values)

            self.need_unique_check = False
        return self.unique

    cdef _do_monotonic_check(self):
        raise NotImplementedError

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=False)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        # There is no mapping in this engine.
        return 0

    def __sizeof__(self) -> int:
        return self.sizeof()

    cdef _check_type(self, object obj):
        raise NotImplementedError

    cpdef get_loc(self, object val):
        """
        Locate ``val`` in ``self.values``.

        Returns
        -------
        Py_ssize_t, slice or ndarray[bool]

        Raises
        ------
        TypeError
            If ``val`` is not hashable.
        KeyError
            If ``val`` is not present.
        """
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        self._check_type(val)

        if (
            self.over_size_threshold
            and self.is_monotonic_increasing
            and self.is_unique
        ):
            # Large, sorted and unique: a single binary search suffices.
            values = self.values

            loc = self._searchsorted_left(val)
            if loc >= len(values) or values[loc] != val:
                raise KeyError(val)
            return loc

        # Every other case goes through the duplicate-aware path; the
        # original had two identical branches here.
        return self._get_loc_duplicates(val)

    cdef _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff

        if not self.is_monotonic_increasing:
            return self._maybe_get_bool_indexer(val)

        values = self.values
        try:
            left = values.searchsorted(val, side="left")
            right = values.searchsorted(val, side="right")
        except TypeError:
            # e.g. GH#29189 get_loc(None) with a Float64Index
            raise KeyError(val)

        diff = right - left
        if diff == 0:
            raise KeyError(val)
        if diff == 1:
            return left
        return slice(left, right)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        Return the left insertion position of ``val`` in ``self.values``.

        A TypeError from ``searchsorted`` is re-raised as KeyError(val),
        chained to the original error.
        """
        try:
            loc = self.values.searchsorted(val, side="left")
        except TypeError as err:
            # GH#35788 e.g. val=None with float64 values
            raise KeyError(val) from err
        return loc

    cdef ndarray _get_bool_indexer(self, val):
        raise NotImplementedError

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = self._get_bool_indexer(val)
        return _unpack_bool_indexer(indexer, val)

    def get_indexer(self, values) -> np.ndarray:
        """
        Return the position of each element of ``values``, or -1 if absent.

        ``values`` is of type(self.values).  We only get here with
        self.is_unique, so every located position must be an integer.
        """
        cdef:
            Py_ssize_t i, N = len(values)

        res = np.empty(N, dtype=np.intp)

        for i in range(N):
            val = values[i]
            try:
                loc = self.get_loc(val)
            except KeyError:
                loc = -1
            else:
                # Because we are unique, loc should always be an integer
                assert util.is_integer_object(loc), (loc, val)
            res[i] = loc

        return res

    def get_indexer_non_unique(self, targets):
        """
        Return an indexer suitable for taking from a non unique index.

        The labels are returned in the same order as the targets, together
        with a missing indexer into the targets (the positions that
        correspond to the -1 entries of the result).

        Parameters
        ----------
        targets : type(self.values)

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            Py_ssize_t i, N = len(targets)

        if N == 0:
            # np.concatenate raises ValueError on an empty sequence, so empty
            # targets are answered directly.
            return np.empty(0, dtype=np.intp), np.empty(0, dtype=np.intp)

        indexer = []
        missing = []

        # See also IntervalIndex.get_indexer_pointwise
        for i in range(N):
            val = targets[i]

            try:
                locs = self.get_loc(val)
            except KeyError:
                locs = np.array([-1], dtype=np.intp)
                missing.append(i)
            else:
                if isinstance(locs, slice):
                    # Only needed for get_indexer_non_unique
                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
                elif util.is_integer_object(locs):
                    locs = np.array([locs], dtype=np.intp)
                else:
                    assert locs.dtype.kind == "b"
                    locs = locs.nonzero()[0]

            indexer.append(locs)

        try:
            indexer = np.concatenate(indexer, dtype=np.intp)
        except TypeError:
            # numpy<1.20 doesn't accept dtype keyword
            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
        missing = np.array(missing, dtype=np.intp)

        return indexer, missing
|
|
|
|
|
|
cdef class ExtensionEngine(SharedEngine):
    """SharedEngine implementation operating directly on an ExtensionArray."""

    def __init__(self, values: "ExtensionArray"):
        self.values = values

        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.need_unique_check = True
        self.need_monotonic_check = True

    cdef _do_monotonic_check(self):
        """
        Compute and cache the monotonicity flags (and uniqueness when known).

        Values containing NA are neither increasing nor decreasing; their
        uniqueness is settled from ``values.unique()``.  Otherwise
        monotonicity is evaluated on ``values._rank()``; a TypeError from
        ranking yields non-monotonic.
        """
        cdef:
            bint is_unique

        values = self.values
        if values._hasna:
            self.monotonic_inc = 0
            self.monotonic_dec = 0

            nunique = len(values.unique())
            self.unique = nunique == len(values)
            self.need_unique_check = 0
            # Cache this outcome too, otherwise every property access would
            # recompute ``values.unique()``.
            self.need_monotonic_check = 0
            return

        try:
            ranks = values._rank()
        except TypeError:
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0
        else:
            self.monotonic_inc, self.monotonic_dec, is_unique = \
                self._call_monotonic(ranks)

        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef ndarray _get_bool_indexer(self, val):
        """
        Return the boolean mask of positions in ``self.values`` matching ``val``.

        A null ``val`` matches the NA positions.  Raises KeyError(val) when
        the comparison cannot produce a boolean ndarray.
        """
        if checknull(val):
            return self.values.isna()

        try:
            return self.values == val
        except TypeError:
            # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
            try:
                return (self.values == val).to_numpy(dtype=bool, na_value=False)
            except (TypeError, AttributeError) as err:
                # e.g. (self.values == val) returned a bool
                #  see test_get_loc_generator[string[pyarrow]]
                # e.g. self.value == val raises TypeError bc generator has no len
                #  see test_get_loc_generator[string[python]]
                # Include the key so the error identifies what was looked up.
                raise KeyError(val) from err

    cdef _check_type(self, object val):
        # Only requires hashability; the return value is not used by get_loc.
        hash(val)
|
|
|
|
|
|
cdef class MaskedIndexEngine(IndexEngine):
    """
    IndexEngine over values that come with a boolean mask.

    The data array is stored as ``self.values`` and the mask as ``self.mask``.
    """

    def __init__(self, object values):
        super().__init__(self._get_data(values))
        self.mask = self._get_mask(values)

    def _get_data(self, object values) -> np.ndarray:
        """Return the data ndarray behind ``values``."""
        if hasattr(values, "_mask"):
            return values._data
        # We are an ArrowExtensionArray
        # Set 1 as na_value to avoid ending up with NA and an object array
        # TODO: Remove when arrow engine is implemented
        return values.to_numpy(na_value=1, dtype=values.dtype.numpy_dtype)

    def _get_mask(self, object values) -> np.ndarray:
        """Return the boolean mask behind ``values``."""
        if hasattr(values, "_mask"):
            return values._mask
        # We are an ArrowExtensionArray
        return values.isna()

    def get_indexer(self, object values) -> np.ndarray:
        self._ensure_mapping_populated()
        return self.mapping.lookup(self._get_data(values), self._get_mask(values))

    def get_indexer_non_unique(self, object targets):
        """
        Return an indexer suitable for taking from a non unique index.

        The labels are returned in the same order as the targets, together
        with a missing indexer into the targets (the positions that
        correspond to the -1 entries of the result).  A masked target matches
        every masked position of the index, if there are any.

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        # TODO: Unify with parent class
        cdef:
            ndarray values, mask, target_vals, target_mask
            ndarray[intp_t] result, missing
            set stargets
            list na_pos = []
            dict d = {}
            object val
            Py_ssize_t count = 0, count_missing = 0
            Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
            bint targets_have_na

        target_vals = self._get_data(targets)
        target_mask = self._get_mask(targets)
        targets_have_na = np.any(target_mask)

        values = self.values
        assert not values.dtype == object  # go through object path instead

        mask = self.mask
        stargets = set(target_vals[~target_mask])

        n = len(values)
        n_t = len(target_vals)
        # Initial result capacity; grown in steps of 10_000 when exhausted.
        n_alloc = 10_000 if n > 10_000 else n

        result = np.empty(n_alloc, dtype=np.intp)
        missing = np.empty(n_t, dtype=np.intp)

        # map each starget to its position in the index
        if (
            stargets and
            len(stargets) < 5 and
            not targets_have_na and
            self.is_monotonic_increasing
        ):
            # if there are few enough stargets and the index is monotonically
            # increasing, then use binary search for each starget
            for starget in stargets:
                start = values.searchsorted(starget, side="left")
                end = values.searchsorted(starget, side="right")
                if start != end:
                    d[starget] = list(range(start, end))

            stargets = set()

        if stargets or targets_have_na:
            # otherwise, map by iterating through all items in the index.
            # The scan also runs when every target is NA (``stargets`` empty),
            # so that the NA positions of the index are still collected.
            for i in range(n):
                if mask[i]:
                    na_pos.append(i)
                else:
                    val = values[i]
                    if val in stargets:
                        if val not in d:
                            d[val] = []
                        d[val].append(i)

        for i in range(n_t):
            val = target_vals[i]

            if target_mask[i]:
                if na_pos:
                    for na_idx in na_pos:
                        # realloc if needed
                        if count >= n_alloc:
                            n_alloc += 10_000
                            result = np.resize(result, n_alloc)

                        result[count] = na_idx
                        count += 1
                    continue

            elif val in d:
                # found
                for j in d[val]:

                    # realloc if needed
                    if count >= n_alloc:
                        n_alloc += 10_000
                        result = np.resize(result, n_alloc)

                    result[count] = j
                    count += 1
                continue

            # value not found
            if count >= n_alloc:
                n_alloc += 10_000
                result = np.resize(result, n_alloc)
            result[count] = -1
            count += 1
            missing[count_missing] = i
            count_missing += 1

        return result[0:count], missing[0:count_missing]
|