1507 lines
50 KiB
Cython
1507 lines
50 KiB
Cython
"""
|
|
Template for each `dtype` helper function for hashtable
|
|
|
|
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
|
|
"""
|
|
|
|
|
|
{{py:
|
|
|
|
# name
|
|
complex_types = ['complex64',
|
|
'complex128']
|
|
}}
|
|
|
|
{{for name in complex_types}}
|
|
cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
|
|
cdef kh{{name}}_t res
|
|
res.real = val.real
|
|
res.imag = val.imag
|
|
return res
|
|
|
|
{{endfor}}
|
|
|
|
|
|
{{py:
|
|
|
|
|
|
# name
|
|
c_types = ['khcomplex128_t',
|
|
'khcomplex64_t',
|
|
'float64_t',
|
|
'float32_t',
|
|
'int64_t',
|
|
'int32_t',
|
|
'int16_t',
|
|
'int8_t',
|
|
'uint64_t',
|
|
'uint32_t',
|
|
'uint16_t',
|
|
'uint8_t']
|
|
}}
|
|
|
|
{{for c_type in c_types}}
|
|
|
|
cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
|
|
{{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
|
|
return val.real != val.real or val.imag != val.imag
|
|
{{elif c_type in {'float64_t', 'float32_t'} }}
|
|
return val != val
|
|
{{else}}
|
|
return False
|
|
{{endif}}
|
|
|
|
|
|
{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
|
|
# are_equivalent_{{c_type}} is cimported via khash.pxd
|
|
{{else}}
|
|
cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
|
|
return val1 == val2
|
|
{{endif}}
|
|
|
|
{{endfor}}
|
|
|
|
|
|
{{py:
|
|
|
|
# name
|
|
cimported_types = ['complex64',
|
|
'complex128',
|
|
'float32',
|
|
'float64',
|
|
'int8',
|
|
'int16',
|
|
'int32',
|
|
'int64',
|
|
'pymap',
|
|
'str',
|
|
'strbox',
|
|
'uint8',
|
|
'uint16',
|
|
'uint32',
|
|
'uint64']
|
|
}}
|
|
|
|
{{for name in cimported_types}}
|
|
from pandas._libs.khash cimport (
|
|
kh_destroy_{{name}},
|
|
kh_exist_{{name}},
|
|
kh_get_{{name}},
|
|
kh_init_{{name}},
|
|
kh_put_{{name}},
|
|
kh_resize_{{name}},
|
|
)
|
|
|
|
{{endfor}}
|
|
|
|
# ----------------------------------------------------------------------
|
|
# VectorData
|
|
# ----------------------------------------------------------------------
|
|
|
|
from pandas._libs.tslibs.util cimport get_c_string
|
|
from pandas._libs.missing cimport C_NA
|
|
|
|
|
|
{{py:
|
|
|
|
# name, dtype, c_type
|
|
# the generated StringVector is not actually used
|
|
# but is included for completeness (rather ObjectVector is used
|
|
# for uniques in hashtables)
|
|
|
|
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
|
|
('Complex64', 'complex64', 'khcomplex64_t'),
|
|
('Float64', 'float64', 'float64_t'),
|
|
('Float32', 'float32', 'float32_t'),
|
|
('Int64', 'int64', 'int64_t'),
|
|
('Int32', 'int32', 'int32_t'),
|
|
('Int16', 'int16', 'int16_t'),
|
|
('Int8', 'int8', 'int8_t'),
|
|
('String', 'string', 'char *'),
|
|
('UInt64', 'uint64', 'uint64_t'),
|
|
('UInt32', 'uint32', 'uint32_t'),
|
|
('UInt16', 'uint16', 'uint16_t'),
|
|
('UInt8', 'uint8', 'uint8_t')]
|
|
}}
|
|
|
|
{{for name, dtype, c_type in dtypes}}
|
|
|
|
|
|
{{if dtype != 'int64'}}
|
|
# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
|
|
# by IntervalTree
|
|
|
|
ctypedef struct {{name}}VectorData:
|
|
{{c_type}} *data
|
|
Py_ssize_t n, m
|
|
|
|
{{endif}}
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
cdef void append_data_{{dtype}}({{name}}VectorData *data,
|
|
{{c_type}} x) nogil:
|
|
|
|
data.data[data.n] = x
|
|
data.n += 1
|
|
|
|
{{endfor}}
|
|
|
|
ctypedef fused vector_data:
|
|
Int64VectorData
|
|
Int32VectorData
|
|
Int16VectorData
|
|
Int8VectorData
|
|
UInt64VectorData
|
|
UInt32VectorData
|
|
UInt16VectorData
|
|
UInt8VectorData
|
|
Float64VectorData
|
|
Float32VectorData
|
|
Complex128VectorData
|
|
Complex64VectorData
|
|
StringVectorData
|
|
|
|
cdef bint needs_resize(vector_data *data) nogil:
|
|
return data.n == data.m
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Vector
|
|
# ----------------------------------------------------------------------
|
|
|
|
cdef class Vector:
|
|
# cdef readonly:
|
|
# bint external_view_exists
|
|
|
|
def __cinit__(self):
|
|
self.external_view_exists = False
|
|
|
|
|
|
{{py:
|
|
|
|
# name, dtype, c_type
|
|
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
|
|
('Complex64', 'complex64', 'khcomplex64_t'),
|
|
('Float64', 'float64', 'float64_t'),
|
|
('UInt64', 'uint64', 'uint64_t'),
|
|
('Int64', 'int64', 'int64_t'),
|
|
('Float32', 'float32', 'float32_t'),
|
|
('UInt32', 'uint32', 'uint32_t'),
|
|
('Int32', 'int32', 'int32_t'),
|
|
('UInt16', 'uint16', 'uint16_t'),
|
|
('Int16', 'int16', 'int16_t'),
|
|
('UInt8', 'uint8', 'uint8_t'),
|
|
('Int8', 'int8', 'int8_t')]
|
|
|
|
}}
|
|
|
|
{{for name, dtype, c_type in dtypes}}
|
|
|
|
cdef class {{name}}Vector(Vector):
|
|
|
|
# For int64 we have to put this declaration in the .pxd file;
|
|
# Int64Vector is the only one we need exposed for other cython files.
|
|
{{if dtype != 'int64'}}
|
|
cdef:
|
|
{{name}}VectorData *data
|
|
ndarray ao
|
|
{{endif}}
|
|
|
|
def __cinit__(self):
|
|
self.data = <{{name}}VectorData *>PyMem_Malloc(
|
|
sizeof({{name}}VectorData))
|
|
if not self.data:
|
|
raise MemoryError()
|
|
self.data.n = 0
|
|
self.data.m = _INIT_VEC_CAP
|
|
self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
|
|
self.data.data = <{{c_type}}*>self.ao.data
|
|
|
|
cdef resize(self):
|
|
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
|
|
self.ao.resize(self.data.m, refcheck=False)
|
|
self.data.data = <{{c_type}}*>self.ao.data
|
|
|
|
def __dealloc__(self):
|
|
if self.data is not NULL:
|
|
PyMem_Free(self.data)
|
|
self.data = NULL
|
|
|
|
def __len__(self) -> int:
|
|
return self.data.n
|
|
|
|
cpdef ndarray to_array(self):
|
|
if self.data.m != self.data.n:
|
|
if self.external_view_exists:
|
|
# should never happen
|
|
raise ValueError("should have raised on append()")
|
|
self.ao.resize(self.data.n, refcheck=False)
|
|
self.data.m = self.data.n
|
|
self.external_view_exists = True
|
|
return self.ao
|
|
|
|
cdef void append(self, {{c_type}} x):
|
|
|
|
if needs_resize(self.data):
|
|
if self.external_view_exists:
|
|
raise ValueError("external reference but "
|
|
"Vector.resize() needed")
|
|
self.resize()
|
|
|
|
append_data_{{dtype}}(self.data, x)
|
|
|
|
cdef extend(self, const {{c_type}}[:] x):
|
|
for i in range(len(x)):
|
|
self.append(x[i])
|
|
|
|
{{endfor}}
|
|
|
|
cdef class StringVector(Vector):
|
|
|
|
cdef:
|
|
StringVectorData *data
|
|
|
|
def __cinit__(self):
|
|
self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
|
|
if not self.data:
|
|
raise MemoryError()
|
|
self.data.n = 0
|
|
self.data.m = _INIT_VEC_CAP
|
|
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
|
|
if not self.data.data:
|
|
raise MemoryError()
|
|
|
|
cdef resize(self):
|
|
cdef:
|
|
char **orig_data
|
|
Py_ssize_t i, m
|
|
|
|
m = self.data.m
|
|
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
|
|
|
|
orig_data = self.data.data
|
|
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
|
|
if not self.data.data:
|
|
raise MemoryError()
|
|
for i in range(m):
|
|
self.data.data[i] = orig_data[i]
|
|
|
|
def __dealloc__(self):
|
|
if self.data is not NULL:
|
|
if self.data.data is not NULL:
|
|
free(self.data.data)
|
|
PyMem_Free(self.data)
|
|
self.data = NULL
|
|
|
|
def __len__(self) -> int:
|
|
return self.data.n
|
|
|
|
cpdef ndarray[object, ndim=1] to_array(self):
|
|
cdef:
|
|
ndarray ao
|
|
Py_ssize_t n
|
|
object val
|
|
|
|
ao = np.empty(self.data.n, dtype=object)
|
|
for i in range(self.data.n):
|
|
val = self.data.data[i]
|
|
ao[i] = val
|
|
self.external_view_exists = True
|
|
self.data.m = self.data.n
|
|
return ao
|
|
|
|
cdef void append(self, char *x):
|
|
|
|
if needs_resize(self.data):
|
|
self.resize()
|
|
|
|
append_data_string(self.data, x)
|
|
|
|
cdef extend(self, ndarray[object] x):
|
|
for i in range(len(x)):
|
|
self.append(x[i])
|
|
|
|
|
|
cdef class ObjectVector(Vector):
|
|
|
|
cdef:
|
|
PyObject **data
|
|
Py_ssize_t n, m
|
|
ndarray ao
|
|
|
|
def __cinit__(self):
|
|
self.n = 0
|
|
self.m = _INIT_VEC_CAP
|
|
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
|
|
self.data = <PyObject**>self.ao.data
|
|
|
|
def __len__(self) -> int:
|
|
return self.n
|
|
|
|
cdef append(self, object obj):
|
|
if self.n == self.m:
|
|
if self.external_view_exists:
|
|
raise ValueError("external reference but "
|
|
"Vector.resize() needed")
|
|
self.m = max(self.m * 2, _INIT_VEC_CAP)
|
|
self.ao.resize(self.m, refcheck=False)
|
|
self.data = <PyObject**>self.ao.data
|
|
|
|
Py_INCREF(obj)
|
|
self.data[self.n] = <PyObject*>obj
|
|
self.n += 1
|
|
|
|
cpdef ndarray[object, ndim=1] to_array(self):
|
|
if self.m != self.n:
|
|
if self.external_view_exists:
|
|
raise ValueError("should have raised on append()")
|
|
self.ao.resize(self.n, refcheck=False)
|
|
self.m = self.n
|
|
self.external_view_exists = True
|
|
return self.ao
|
|
|
|
cdef extend(self, ndarray[object] x):
|
|
for i in range(len(x)):
|
|
self.append(x[i])
|
|
|
|
# ----------------------------------------------------------------------
|
|
# HashTable
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
cdef class HashTable:
|
|
|
|
pass
|
|
|
|
{{py:
|
|
|
|
# name, dtype, c_type, to_c_type
|
|
dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
|
|
('Float64', 'float64', 'float64_t', ''),
|
|
('UInt64', 'uint64', 'uint64_t', ''),
|
|
('Int64', 'int64', 'int64_t', ''),
|
|
('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
|
|
('Float32', 'float32', 'float32_t', ''),
|
|
('UInt32', 'uint32', 'uint32_t', ''),
|
|
('Int32', 'int32', 'int32_t', ''),
|
|
('UInt16', 'uint16', 'uint16_t', ''),
|
|
('Int16', 'int16', 'int16_t', ''),
|
|
('UInt8', 'uint8', 'uint8_t', ''),
|
|
('Int8', 'int8', 'int8_t', '')]
|
|
|
|
}}
|
|
|
|
|
|
{{for name, dtype, c_type, to_c_type in dtypes}}
|
|
|
|
cdef class {{name}}HashTable(HashTable):
|
|
|
|
def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
|
|
self.table = kh_init_{{dtype}}()
|
|
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
|
|
kh_resize_{{dtype}}(self.table, size_hint)
|
|
|
|
self.uses_mask = uses_mask
|
|
self.na_position = -1
|
|
|
|
def __len__(self) -> int:
|
|
return self.table.size + (0 if self.na_position == -1 else 1)
|
|
|
|
def __dealloc__(self):
|
|
if self.table is not NULL:
|
|
kh_destroy_{{dtype}}(self.table)
|
|
self.table = NULL
|
|
|
|
def __contains__(self, object key) -> bool:
|
|
# The caller is responsible to check for compatible NA values in case
|
|
# of masked arrays.
|
|
cdef:
|
|
khiter_t k
|
|
{{c_type}} ckey
|
|
|
|
if self.uses_mask and checknull(key):
|
|
return -1 != self.na_position
|
|
|
|
ckey = {{to_c_type}}(key)
|
|
k = kh_get_{{dtype}}(self.table, ckey)
|
|
return k != self.table.n_buckets
|
|
|
|
def sizeof(self, deep: bool = False) -> int:
|
|
""" return the size of my table in bytes """
|
|
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
|
|
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
|
|
for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
|
|
sizeof(Py_ssize_t)) # vals
|
|
return overhead + for_flags + for_pairs
|
|
|
|
def get_state(self) -> dict[str, int]:
|
|
""" returns infos about the state of the hashtable"""
|
|
return {
|
|
'n_buckets' : self.table.n_buckets,
|
|
'size' : self.table.size,
|
|
'n_occupied' : self.table.n_occupied,
|
|
'upper_bound' : self.table.upper_bound,
|
|
}
|
|
|
|
cpdef get_item(self, {{dtype}}_t val):
|
|
"""Extracts the position of val from the hashtable.
|
|
|
|
Parameters
|
|
----------
|
|
val : Scalar
|
|
The value that is looked up in the hashtable
|
|
|
|
Returns
|
|
-------
|
|
The position of the requested integer.
|
|
"""
|
|
|
|
# Used in core.sorting, IndexEngine.get_loc
|
|
# Caller is responsible for checking for pd.NA
|
|
cdef:
|
|
khiter_t k
|
|
{{c_type}} cval
|
|
|
|
cval = {{to_c_type}}(val)
|
|
k = kh_get_{{dtype}}(self.table, cval)
|
|
if k != self.table.n_buckets:
|
|
return self.table.vals[k]
|
|
else:
|
|
raise KeyError(val)
|
|
|
|
cpdef get_na(self):
|
|
"""Extracts the position of na_value from the hashtable.
|
|
|
|
Returns
|
|
-------
|
|
The position of the last na value.
|
|
"""
|
|
|
|
if not self.uses_mask:
|
|
raise NotImplementedError
|
|
|
|
if self.na_position == -1:
|
|
raise KeyError("NA")
|
|
return self.na_position
|
|
|
|
cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
|
|
# Used in libjoin
|
|
# Caller is responsible for checking for pd.NA
|
|
cdef:
|
|
khiter_t k
|
|
int ret = 0
|
|
{{c_type}} ckey
|
|
|
|
ckey = {{to_c_type}}(key)
|
|
k = kh_put_{{dtype}}(self.table, ckey, &ret)
|
|
if kh_exist_{{dtype}}(self.table, k):
|
|
self.table.vals[k] = val
|
|
else:
|
|
raise KeyError(key)
|
|
|
|
cpdef set_na(self, Py_ssize_t val):
|
|
# Caller is responsible for checking for pd.NA
|
|
cdef:
|
|
khiter_t k
|
|
int ret = 0
|
|
{{c_type}} ckey
|
|
|
|
if not self.uses_mask:
|
|
raise NotImplementedError
|
|
|
|
self.na_position = val
|
|
|
|
{{if dtype == "int64" }}
|
|
# We only use this for int64, can reduce build size and make .pyi
|
|
# more accurate by only implementing it for int64
|
|
@cython.boundscheck(False)
|
|
def map_keys_to_values(
|
|
self, const {{dtype}}_t[:] keys, const int64_t[:] values
|
|
) -> None:
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
int ret = 0
|
|
{{c_type}} key
|
|
khiter_t k
|
|
|
|
with nogil:
|
|
for i in range(n):
|
|
key = {{to_c_type}}(keys[i])
|
|
k = kh_put_{{dtype}}(self.table, key, &ret)
|
|
self.table.vals[k] = <Py_ssize_t>values[i]
|
|
{{endif}}
|
|
|
|
@cython.boundscheck(False)
|
|
def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None:
|
|
# Used in libindex, safe_sort
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
int ret = 0
|
|
{{c_type}} val
|
|
khiter_t k
|
|
int8_t na_position = self.na_position
|
|
|
|
if self.uses_mask and mask is None:
|
|
raise NotImplementedError # pragma: no cover
|
|
|
|
with nogil:
|
|
if self.uses_mask:
|
|
for i in range(n):
|
|
if mask[i]:
|
|
na_position = i
|
|
else:
|
|
val= {{to_c_type}}(values[i])
|
|
k = kh_put_{{dtype}}(self.table, val, &ret)
|
|
self.table.vals[k] = i
|
|
else:
|
|
for i in range(n):
|
|
val= {{to_c_type}}(values[i])
|
|
k = kh_put_{{dtype}}(self.table, val, &ret)
|
|
self.table.vals[k] = i
|
|
self.na_position = na_position
|
|
|
|
@cython.boundscheck(False)
|
|
def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray:
|
|
# -> np.ndarray[np.intp]
|
|
# Used in safe_sort, IndexEngine.get_indexer
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
int ret = 0
|
|
{{c_type}} val
|
|
khiter_t k
|
|
intp_t[::1] locs = np.empty(n, dtype=np.intp)
|
|
int8_t na_position = self.na_position
|
|
|
|
if self.uses_mask and mask is None:
|
|
raise NotImplementedError # pragma: no cover
|
|
|
|
with nogil:
|
|
for i in range(n):
|
|
if self.uses_mask and mask[i]:
|
|
locs[i] = na_position
|
|
else:
|
|
val = {{to_c_type}}(values[i])
|
|
k = kh_get_{{dtype}}(self.table, val)
|
|
if k != self.table.n_buckets:
|
|
locs[i] = self.table.vals[k]
|
|
else:
|
|
locs[i] = -1
|
|
|
|
return np.asarray(locs)
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
|
|
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None, bint ignore_na=False,
|
|
object mask=None, bint return_inverse=False, bint use_result_mask=False):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[{{dtype}}]
|
|
Array of values of which unique will be calculated
|
|
uniques : {{name}}Vector
|
|
Vector into which uniques will be written
|
|
count_prior : Py_ssize_t, default 0
|
|
Number of existing entries in uniques
|
|
na_sentinel : Py_ssize_t, default -1
|
|
Sentinel value used for all NA-values in inverse
|
|
na_value : object, default None
|
|
Value to identify as missing. If na_value is None, then
|
|
any value "val" satisfying val != val is considered missing.
|
|
If na_value is not None, then _additionally_, any value "val"
|
|
satisfying val == na_value is considered missing.
|
|
ignore_na : bool, default False
|
|
Whether NA-values should be ignored for calculating the uniques. If
|
|
True, the labels corresponding to missing values will be set to
|
|
na_sentinel.
|
|
mask : ndarray[bool], optional
|
|
If not None, the mask is used as indicator for missing values
|
|
(True = missing, False = valid) instead of `na_value` or
|
|
condition "val != val".
|
|
return_inverse : bool, default False
|
|
Whether the mapping of the original array values to their location
|
|
in the vector of uniques should be returned.
|
|
use_result_mask: bool, default False
|
|
Whether to create a result mask for the unique values. Not supported
|
|
with return_inverse=True.
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[{{dtype}}]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp_t] (if return_inverse=True)
|
|
The labels from values to uniques
|
|
result_mask: ndarray[bool], if use_result_mask is true
|
|
The mask for the result values.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, idx, count = count_prior, n = len(values)
|
|
intp_t[::1] labels
|
|
int ret = 0
|
|
{{c_type}} val, na_value2
|
|
khiter_t k
|
|
{{name}}VectorData *ud
|
|
UInt8Vector result_mask
|
|
UInt8VectorData *rmd
|
|
bint use_na_value, use_mask, seen_na = False
|
|
uint8_t[:] mask_values
|
|
|
|
if return_inverse:
|
|
labels = np.empty(n, dtype=np.intp)
|
|
ud = uniques.data
|
|
use_na_value = na_value is not None
|
|
use_mask = mask is not None
|
|
if not use_mask and use_result_mask:
|
|
raise NotImplementedError # pragma: no cover
|
|
|
|
if use_result_mask and return_inverse:
|
|
raise NotImplementedError # pragma: no cover
|
|
|
|
result_mask = UInt8Vector()
|
|
rmd = result_mask.data
|
|
|
|
if use_mask:
|
|
mask_values = mask.view("uint8")
|
|
|
|
if use_na_value:
|
|
# We need this na_value2 because we want to allow users
|
|
# to *optionally* specify an NA sentinel *of the correct* type.
|
|
# We use None, to make it optional, which requires `object` type
|
|
# for the parameter. To please the compiler, we use na_value2,
|
|
# which is only used if it's *specified*.
|
|
na_value2 = {{to_c_type}}(na_value)
|
|
else:
|
|
na_value2 = {{to_c_type}}(0)
|
|
|
|
with nogil:
|
|
for i in range(n):
|
|
val = {{to_c_type}}(values[i])
|
|
|
|
if ignore_na and use_mask:
|
|
if mask_values[i]:
|
|
labels[i] = na_sentinel
|
|
continue
|
|
elif ignore_na and (
|
|
is_nan_{{c_type}}(val) or
|
|
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
|
|
):
|
|
# if missing values do not count as unique values (i.e. if
|
|
# ignore_na is True), skip the hashtable entry for them,
|
|
# and replace the corresponding label with na_sentinel
|
|
labels[i] = na_sentinel
|
|
continue
|
|
elif not ignore_na and use_result_mask:
|
|
if mask_values[i]:
|
|
if seen_na:
|
|
continue
|
|
|
|
seen_na = True
|
|
if needs_resize(ud):
|
|
with gil:
|
|
if uniques.external_view_exists:
|
|
raise ValueError("external reference to "
|
|
"uniques held, but "
|
|
"Vector.resize() needed")
|
|
uniques.resize()
|
|
if result_mask.external_view_exists:
|
|
raise ValueError("external reference to "
|
|
"result_mask held, but "
|
|
"Vector.resize() needed")
|
|
result_mask.resize()
|
|
append_data_{{dtype}}(ud, val)
|
|
append_data_uint8(rmd, 1)
|
|
continue
|
|
|
|
k = kh_get_{{dtype}}(self.table, val)
|
|
|
|
if k == self.table.n_buckets:
|
|
# k hasn't been seen yet
|
|
k = kh_put_{{dtype}}(self.table, val, &ret)
|
|
|
|
if needs_resize(ud):
|
|
with gil:
|
|
if uniques.external_view_exists:
|
|
raise ValueError("external reference to "
|
|
"uniques held, but "
|
|
"Vector.resize() needed")
|
|
uniques.resize()
|
|
if use_result_mask:
|
|
if result_mask.external_view_exists:
|
|
raise ValueError("external reference to "
|
|
"result_mask held, but "
|
|
"Vector.resize() needed")
|
|
result_mask.resize()
|
|
append_data_{{dtype}}(ud, val)
|
|
if use_result_mask:
|
|
append_data_uint8(rmd, 0)
|
|
|
|
if return_inverse:
|
|
self.table.vals[k] = count
|
|
labels[i] = count
|
|
count += 1
|
|
elif return_inverse:
|
|
# k falls into a previous bucket
|
|
# only relevant in case we need to construct the inverse
|
|
idx = self.table.vals[k]
|
|
labels[i] = idx
|
|
|
|
if return_inverse:
|
|
return uniques.to_array(), labels.base # .base -> underlying ndarray
|
|
if use_result_mask:
|
|
return uniques.to_array(), result_mask.to_array()
|
|
return uniques.to_array()
|
|
|
|
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[{{dtype}}]
|
|
Array of values of which unique will be calculated
|
|
return_inverse : bool, default False
|
|
Whether the mapping of the original array values to their location
|
|
in the vector of uniques should be returned.
|
|
mask : ndarray[bool], optional
|
|
If not None, the mask is used as indicator for missing values
|
|
(True = missing, False = valid) instead of `na_value` or
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[{{dtype}}]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp_t] (if return_inverse)
|
|
The labels from values to uniques
|
|
result_mask: ndarray[bool], if mask is given as input
|
|
The mask for the result values.
|
|
"""
|
|
uniques = {{name}}Vector()
|
|
use_result_mask = True if mask is not None else False
|
|
return self._unique(values, uniques, ignore_na=False,
|
|
return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
|
|
|
|
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None, object mask=None, ignore_na=True):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Missing values are not included in the "uniques" for this method.
|
|
The labels for any missing values will be set to "na_sentinel"
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[{{dtype}}]
|
|
Array of values of which unique will be calculated
|
|
na_sentinel : Py_ssize_t, default -1
|
|
Sentinel value used for all NA-values in inverse
|
|
na_value : object, default None
|
|
Value to identify as missing. If na_value is None, then
|
|
any value "val" satisfying val != val is considered missing.
|
|
If na_value is not None, then _additionally_, any value "val"
|
|
satisfying val == na_value is considered missing.
|
|
mask : ndarray[bool], optional
|
|
If not None, the mask is used as indicator for missing values
|
|
(True = missing, False = valid) instead of `na_value` or
|
|
condition "val != val".
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[{{dtype}}]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp_t]
|
|
The labels from values to uniques
|
|
"""
|
|
uniques_vector = {{name}}Vector()
|
|
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
|
|
na_value=na_value, ignore_na=ignore_na, mask=mask,
|
|
return_inverse=True)
|
|
|
|
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
|
|
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None, object mask=None):
|
|
# -> np.ndarray[np.intp]
|
|
_, labels = self._unique(values, uniques, count_prior=count_prior,
|
|
na_sentinel=na_sentinel, na_value=na_value,
|
|
ignore_na=True, return_inverse=True, mask=mask)
|
|
return labels
|
|
|
|
{{if dtype == 'int64'}}
|
|
@cython.boundscheck(False)
|
|
def get_labels_groupby(
|
|
self, const {{dtype}}_t[:] values
|
|
) -> tuple[ndarray, ndarray]:
|
|
# tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
intp_t[::1] labels
|
|
Py_ssize_t idx, count = 0
|
|
int ret = 0
|
|
{{c_type}} val
|
|
khiter_t k
|
|
{{name}}Vector uniques = {{name}}Vector()
|
|
{{name}}VectorData *ud
|
|
|
|
labels = np.empty(n, dtype=np.intp)
|
|
ud = uniques.data
|
|
|
|
with nogil:
|
|
for i in range(n):
|
|
val = {{to_c_type}}(values[i])
|
|
|
|
# specific for groupby
|
|
if val < 0:
|
|
labels[i] = -1
|
|
continue
|
|
|
|
k = kh_get_{{dtype}}(self.table, val)
|
|
if k != self.table.n_buckets:
|
|
idx = self.table.vals[k]
|
|
labels[i] = idx
|
|
else:
|
|
k = kh_put_{{dtype}}(self.table, val, &ret)
|
|
self.table.vals[k] = count
|
|
|
|
if needs_resize(ud):
|
|
with gil:
|
|
uniques.resize()
|
|
append_data_{{dtype}}(ud, val)
|
|
labels[i] = count
|
|
count += 1
|
|
|
|
arr_uniques = uniques.to_array()
|
|
|
|
return np.asarray(labels), arr_uniques
|
|
{{endif}}
|
|
|
|
|
|
cdef class {{name}}Factorizer(Factorizer):
|
|
cdef public:
|
|
{{name}}HashTable table
|
|
{{name}}Vector uniques
|
|
|
|
def __cinit__(self, size_hint: int):
|
|
self.table = {{name}}HashTable(size_hint)
|
|
self.uniques = {{name}}Vector()
|
|
|
|
def factorize(self, const {{c_type}}[:] values,
|
|
na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
|
|
"""
|
|
Returns
|
|
-------
|
|
ndarray[intp_t]
|
|
|
|
Examples
|
|
--------
|
|
Factorize values with nans replaced by na_sentinel
|
|
|
|
>>> fac = {{name}}Factorizer(3)
|
|
>>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
|
|
array([0, 1, 2])
|
|
"""
|
|
cdef:
|
|
ndarray[intp_t] labels
|
|
|
|
if self.uniques.external_view_exists:
|
|
uniques = {{name}}Vector()
|
|
uniques.extend(self.uniques.to_array())
|
|
self.uniques = uniques
|
|
labels = self.table.get_labels(values, self.uniques,
|
|
self.count, na_sentinel,
|
|
na_value=na_value, mask=mask)
|
|
self.count = len(self.uniques)
|
|
return labels
|
|
|
|
{{endfor}}
|
|
|
|
|
|
cdef class StringHashTable(HashTable):
|
|
# these by-definition *must* be strings
|
|
# or a sentinel np.nan / None missing value
|
|
na_string_sentinel = '__nan__'
|
|
|
|
def __init__(self, int64_t size_hint=1):
|
|
self.table = kh_init_str()
|
|
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
|
|
kh_resize_str(self.table, size_hint)
|
|
|
|
def __dealloc__(self):
|
|
if self.table is not NULL:
|
|
kh_destroy_str(self.table)
|
|
self.table = NULL
|
|
|
|
def sizeof(self, deep: bool = False) -> int:
|
|
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
|
|
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
|
|
for_pairs = self.table.n_buckets * (sizeof(char *) + # keys
|
|
sizeof(Py_ssize_t)) # vals
|
|
return overhead + for_flags + for_pairs
|
|
|
|
def get_state(self) -> dict[str, int]:
|
|
""" returns infos about the state of the hashtable"""
|
|
return {
|
|
'n_buckets' : self.table.n_buckets,
|
|
'size' : self.table.size,
|
|
'n_occupied' : self.table.n_occupied,
|
|
'upper_bound' : self.table.upper_bound,
|
|
}
|
|
|
|
cpdef get_item(self, str val):
|
|
cdef:
|
|
khiter_t k
|
|
const char *v
|
|
v = get_c_string(val)
|
|
|
|
k = kh_get_str(self.table, v)
|
|
if k != self.table.n_buckets:
|
|
return self.table.vals[k]
|
|
else:
|
|
raise KeyError(val)
|
|
|
|
cpdef set_item(self, str key, Py_ssize_t val):
|
|
cdef:
|
|
khiter_t k
|
|
int ret = 0
|
|
const char *v
|
|
|
|
v = get_c_string(key)
|
|
|
|
k = kh_put_str(self.table, v, &ret)
|
|
if kh_exist_str(self.table, k):
|
|
self.table.vals[k] = val
|
|
else:
|
|
raise KeyError(key)
|
|
|
|
@cython.boundscheck(False)
|
|
def get_indexer(self, ndarray[object] values) -> ndarray:
|
|
# -> np.ndarray[np.intp]
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
|
|
intp_t *resbuf = <intp_t*>labels.data
|
|
khiter_t k
|
|
kh_str_t *table = self.table
|
|
const char *v
|
|
const char **vecs
|
|
|
|
vecs = <const char **>malloc(n * sizeof(char *))
|
|
for i in range(n):
|
|
val = values[i]
|
|
v = get_c_string(val)
|
|
vecs[i] = v
|
|
|
|
with nogil:
|
|
for i in range(n):
|
|
k = kh_get_str(table, vecs[i])
|
|
if k != table.n_buckets:
|
|
resbuf[i] = table.vals[k]
|
|
else:
|
|
resbuf[i] = -1
|
|
|
|
free(vecs)
|
|
return labels
|
|
|
|
@cython.boundscheck(False)
|
|
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
|
|
# -> np.ndarray[np.intp]
|
|
# mask not yet implemented
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
int ret = 0
|
|
object val
|
|
const char *v
|
|
khiter_t k
|
|
intp_t[::1] locs = np.empty(n, dtype=np.intp)
|
|
|
|
# these by-definition *must* be strings
|
|
vecs = <const char **>malloc(n * sizeof(char *))
|
|
for i in range(n):
|
|
val = values[i]
|
|
|
|
if isinstance(val, str):
|
|
# GH#31499 if we have a np.str_ get_c_string won't recognize
|
|
# it as a str, even though isinstance does.
|
|
v = get_c_string(<str>val)
|
|
else:
|
|
v = get_c_string(self.na_string_sentinel)
|
|
vecs[i] = v
|
|
|
|
with nogil:
|
|
for i in range(n):
|
|
v = vecs[i]
|
|
k = kh_get_str(self.table, v)
|
|
if k != self.table.n_buckets:
|
|
locs[i] = self.table.vals[k]
|
|
else:
|
|
locs[i] = -1
|
|
|
|
free(vecs)
|
|
return np.asarray(locs)
|
|
|
|
@cython.boundscheck(False)
|
|
def map_locations(self, ndarray[object] values, object mask = None) -> None:
|
|
# mask not yet implemented
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
int ret = 0
|
|
object val
|
|
const char *v
|
|
const char **vecs
|
|
khiter_t k
|
|
|
|
# these by-definition *must* be strings
|
|
vecs = <const char **>malloc(n * sizeof(char *))
|
|
for i in range(n):
|
|
val = values[i]
|
|
|
|
if isinstance(val, str):
|
|
# GH#31499 if we have a np.str_ get_c_string won't recognize
|
|
# it as a str, even though isinstance does.
|
|
v = get_c_string(<str>val)
|
|
else:
|
|
v = get_c_string(self.na_string_sentinel)
|
|
vecs[i] = v
|
|
|
|
with nogil:
|
|
for i in range(n):
|
|
v = vecs[i]
|
|
k = kh_put_str(self.table, v, &ret)
|
|
self.table.vals[k] = i
|
|
free(vecs)
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def _unique(self, ndarray[object] values, ObjectVector uniques,
|
|
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None, bint ignore_na=False,
|
|
bint return_inverse=False):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[object]
|
|
Array of values of which unique will be calculated
|
|
uniques : ObjectVector
|
|
Vector into which uniques will be written
|
|
count_prior : Py_ssize_t, default 0
|
|
Number of existing entries in uniques
|
|
na_sentinel : Py_ssize_t, default -1
|
|
Sentinel value used for all NA-values in inverse
|
|
na_value : object, default None
|
|
Value to identify as missing. If na_value is None, then any value
|
|
that is not a string is considered missing. If na_value is
|
|
not None, then _additionally_ any value "val" satisfying
|
|
val == na_value is considered missing.
|
|
ignore_na : bool, default False
|
|
Whether NA-values should be ignored for calculating the uniques. If
|
|
True, the labels corresponding to missing values will be set to
|
|
na_sentinel.
|
|
return_inverse : bool, default False
|
|
Whether the mapping of the original array values to their location
|
|
in the vector of uniques should be returned.
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[object]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp_t] (if return_inverse=True)
|
|
The labels from values to uniques
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, idx, count = count_prior, n = len(values)
|
|
intp_t[::1] labels
|
|
int64_t[::1] uindexer
|
|
int ret = 0
|
|
object val
|
|
const char *v
|
|
const char **vecs
|
|
khiter_t k
|
|
bint use_na_value
|
|
|
|
if return_inverse:
|
|
labels = np.zeros(n, dtype=np.intp)
|
|
uindexer = np.empty(n, dtype=np.int64)
|
|
use_na_value = na_value is not None
|
|
|
|
# assign pointers and pre-filter out missing (if ignore_na)
|
|
vecs = <const char **>malloc(n * sizeof(char *))
|
|
for i in range(n):
|
|
val = values[i]
|
|
|
|
if (ignore_na
|
|
and (not isinstance(val, str)
|
|
or (use_na_value and val == na_value))):
|
|
# if missing values do not count as unique values (i.e. if
|
|
# ignore_na is True), we can skip the actual value, and
|
|
# replace the label with na_sentinel directly
|
|
labels[i] = na_sentinel
|
|
else:
|
|
# if ignore_na is False, we also stringify NaN/None/etc.
|
|
try:
|
|
v = get_c_string(<str>val)
|
|
except UnicodeEncodeError:
|
|
v = get_c_string(<str>repr(val))
|
|
vecs[i] = v
|
|
|
|
# compute
|
|
with nogil:
|
|
for i in range(n):
|
|
if ignore_na and labels[i] == na_sentinel:
|
|
# skip entries for ignored missing values (see above)
|
|
continue
|
|
|
|
v = vecs[i]
|
|
k = kh_get_str(self.table, v)
|
|
if k == self.table.n_buckets:
|
|
# k hasn't been seen yet
|
|
k = kh_put_str(self.table, v, &ret)
|
|
uindexer[count] = i
|
|
if return_inverse:
|
|
self.table.vals[k] = count
|
|
labels[i] = count
|
|
count += 1
|
|
elif return_inverse:
|
|
# k falls into a previous bucket
|
|
# only relevant in case we need to construct the inverse
|
|
idx = self.table.vals[k]
|
|
labels[i] = idx
|
|
|
|
free(vecs)
|
|
|
|
# uniques
|
|
for i in range(count):
|
|
uniques.append(values[uindexer[i]])
|
|
|
|
if return_inverse:
|
|
return uniques.to_array(), labels.base # .base -> underlying ndarray
|
|
return uniques.to_array()
|
|
|
|
def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[object]
|
|
Array of values of which unique will be calculated
|
|
return_inverse : bool, default False
|
|
Whether the mapping of the original array values to their location
|
|
in the vector of uniques should be returned.
|
|
mask : ndarray[bool], optional
|
|
Not yet implemented for StringHashTable
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[object]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp_t] (if return_inverse)
|
|
The labels from values to uniques
|
|
"""
|
|
uniques = ObjectVector()
|
|
return self._unique(values, uniques, ignore_na=False,
|
|
return_inverse=return_inverse)
|
|
|
|
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None, object mask=None, ignore_na=True):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Missing values are not included in the "uniques" for this method.
|
|
The labels for any missing values will be set to "na_sentinel"
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[object]
|
|
Array of values of which unique will be calculated
|
|
na_sentinel : Py_ssize_t, default -1
|
|
Sentinel value used for all NA-values in inverse
|
|
na_value : object, default None
|
|
Value to identify as missing. If na_value is None, then any value
|
|
that is not a string is considered missing. If na_value is
|
|
not None, then _additionally_ any value "val" satisfying
|
|
val == na_value is considered missing.
|
|
mask : ndarray[bool], optional
|
|
Not yet implemented for StringHashTable.
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[object]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp]
|
|
The labels from values to uniques
|
|
"""
|
|
uniques_vector = ObjectVector()
|
|
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
|
|
na_value=na_value, ignore_na=ignore_na,
|
|
return_inverse=True)
|
|
|
|
def get_labels(self, ndarray[object] values, ObjectVector uniques,
|
|
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None):
|
|
# -> np.ndarray[np.intp]
|
|
_, labels = self._unique(values, uniques, count_prior=count_prior,
|
|
na_sentinel=na_sentinel, na_value=na_value,
|
|
ignore_na=True, return_inverse=True)
|
|
return labels
|
|
|
|
|
|
cdef class PyObjectHashTable(HashTable):
|
|
|
|
def __init__(self, int64_t size_hint=1):
|
|
self.table = kh_init_pymap()
|
|
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
|
|
kh_resize_pymap(self.table, size_hint)
|
|
|
|
def __dealloc__(self):
|
|
if self.table is not NULL:
|
|
kh_destroy_pymap(self.table)
|
|
self.table = NULL
|
|
|
|
def __len__(self) -> int:
|
|
return self.table.size
|
|
|
|
def __contains__(self, object key) -> bool:
|
|
cdef:
|
|
khiter_t k
|
|
hash(key)
|
|
|
|
k = kh_get_pymap(self.table, <PyObject*>key)
|
|
return k != self.table.n_buckets
|
|
|
|
def sizeof(self, deep: bool = False) -> int:
|
|
""" return the size of my table in bytes """
|
|
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
|
|
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
|
|
for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys
|
|
sizeof(Py_ssize_t)) # vals
|
|
return overhead + for_flags + for_pairs
|
|
|
|
def get_state(self) -> dict[str, int]:
|
|
"""
|
|
returns infos about the current state of the hashtable like size,
|
|
number of buckets and so on.
|
|
"""
|
|
return {
|
|
'n_buckets' : self.table.n_buckets,
|
|
'size' : self.table.size,
|
|
'n_occupied' : self.table.n_occupied,
|
|
'upper_bound' : self.table.upper_bound,
|
|
}
|
|
|
|
cpdef get_item(self, object val):
|
|
cdef:
|
|
khiter_t k
|
|
|
|
k = kh_get_pymap(self.table, <PyObject*>val)
|
|
if k != self.table.n_buckets:
|
|
return self.table.vals[k]
|
|
else:
|
|
raise KeyError(val)
|
|
|
|
cpdef set_item(self, object key, Py_ssize_t val):
|
|
cdef:
|
|
khiter_t k
|
|
int ret = 0
|
|
char* buf
|
|
|
|
hash(key)
|
|
|
|
k = kh_put_pymap(self.table, <PyObject*>key, &ret)
|
|
if kh_exist_pymap(self.table, k):
|
|
self.table.vals[k] = val
|
|
else:
|
|
raise KeyError(key)
|
|
|
|
def map_locations(self, ndarray[object] values, object mask = None) -> None:
|
|
# mask not yet implemented
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
int ret = 0
|
|
object val
|
|
khiter_t k
|
|
|
|
for i in range(n):
|
|
val = values[i]
|
|
hash(val)
|
|
|
|
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
|
|
self.table.vals[k] = i
|
|
|
|
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
|
|
# -> np.ndarray[np.intp]
|
|
# mask not yet implemented
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
int ret = 0
|
|
object val
|
|
khiter_t k
|
|
intp_t[::1] locs = np.empty(n, dtype=np.intp)
|
|
|
|
for i in range(n):
|
|
val = values[i]
|
|
hash(val)
|
|
|
|
k = kh_get_pymap(self.table, <PyObject*>val)
|
|
if k != self.table.n_buckets:
|
|
locs[i] = self.table.vals[k]
|
|
else:
|
|
locs[i] = -1
|
|
|
|
return np.asarray(locs)
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def _unique(self, ndarray[object] values, ObjectVector uniques,
|
|
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None, bint ignore_na=False,
|
|
bint return_inverse=False):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[object]
|
|
Array of values of which unique will be calculated
|
|
uniques : ObjectVector
|
|
Vector into which uniques will be written
|
|
count_prior : Py_ssize_t, default 0
|
|
Number of existing entries in uniques
|
|
na_sentinel : Py_ssize_t, default -1
|
|
Sentinel value used for all NA-values in inverse
|
|
na_value : object, default None
|
|
Value to identify as missing. If na_value is None, then None _plus_
|
|
any value "val" satisfying val != val is considered missing.
|
|
If na_value is not None, then _additionally_, any value "val"
|
|
satisfying val == na_value is considered missing.
|
|
ignore_na : bool, default False
|
|
Whether NA-values should be ignored for calculating the uniques. If
|
|
True, the labels corresponding to missing values will be set to
|
|
na_sentinel.
|
|
return_inverse : bool, default False
|
|
Whether the mapping of the original array values to their location
|
|
in the vector of uniques should be returned.
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[object]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp_t] (if return_inverse=True)
|
|
The labels from values to uniques
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, idx, count = count_prior, n = len(values)
|
|
intp_t[::1] labels
|
|
int ret = 0
|
|
object val
|
|
khiter_t k
|
|
bint use_na_value
|
|
|
|
if return_inverse:
|
|
labels = np.empty(n, dtype=np.intp)
|
|
use_na_value = na_value is not None
|
|
|
|
for i in range(n):
|
|
val = values[i]
|
|
hash(val)
|
|
|
|
if ignore_na and (
|
|
checknull(val)
|
|
or (use_na_value and val == na_value)
|
|
):
|
|
# if missing values do not count as unique values (i.e. if
|
|
# ignore_na is True), skip the hashtable entry for them, and
|
|
# replace the corresponding label with na_sentinel
|
|
labels[i] = na_sentinel
|
|
continue
|
|
|
|
k = kh_get_pymap(self.table, <PyObject*>val)
|
|
if k == self.table.n_buckets:
|
|
# k hasn't been seen yet
|
|
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
|
|
uniques.append(val)
|
|
if return_inverse:
|
|
self.table.vals[k] = count
|
|
labels[i] = count
|
|
count += 1
|
|
elif return_inverse:
|
|
# k falls into a previous bucket
|
|
# only relevant in case we need to construct the inverse
|
|
idx = self.table.vals[k]
|
|
labels[i] = idx
|
|
|
|
if return_inverse:
|
|
return uniques.to_array(), labels.base # .base -> underlying ndarray
|
|
return uniques.to_array()
|
|
|
|
def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[object]
|
|
Array of values of which unique will be calculated
|
|
return_inverse : bool, default False
|
|
Whether the mapping of the original array values to their location
|
|
in the vector of uniques should be returned.
|
|
mask : ndarray[bool], optional
|
|
Not yet implemented for PyObjectHashTable
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[object]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp_t] (if return_inverse)
|
|
The labels from values to uniques
|
|
"""
|
|
uniques = ObjectVector()
|
|
return self._unique(values, uniques, ignore_na=False,
|
|
return_inverse=return_inverse)
|
|
|
|
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None, object mask=None, ignore_na=True):
|
|
"""
|
|
Calculate unique values and labels (no sorting!)
|
|
|
|
Missing values are not included in the "uniques" for this method.
|
|
The labels for any missing values will be set to "na_sentinel"
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[object]
|
|
Array of values of which unique will be calculated
|
|
na_sentinel : Py_ssize_t, default -1
|
|
Sentinel value used for all NA-values in inverse
|
|
na_value : object, default None
|
|
Value to identify as missing. If na_value is None, then None _plus_
|
|
any value "val" satisfying val != val is considered missing.
|
|
If na_value is not None, then _additionally_, any value "val"
|
|
satisfying val == na_value is considered missing.
|
|
mask : ndarray[bool], optional
|
|
Not yet implemented for PyObjectHashTable.
|
|
|
|
Returns
|
|
-------
|
|
uniques : ndarray[object]
|
|
Unique values of input, not sorted
|
|
labels : ndarray[intp_t]
|
|
The labels from values to uniques
|
|
"""
|
|
uniques_vector = ObjectVector()
|
|
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
|
|
na_value=na_value, ignore_na=ignore_na,
|
|
return_inverse=True)
|
|
|
|
def get_labels(self, ndarray[object] values, ObjectVector uniques,
|
|
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
|
|
object na_value=None):
|
|
# -> np.ndarray[np.intp]
|
|
_, labels = self._unique(values, uniques, count_prior=count_prior,
|
|
na_sentinel=na_sentinel, na_value=na_value,
|
|
ignore_na=True, return_inverse=True)
|
|
return labels
|