Inzynierka/Lib/site-packages/pandas/_libs/hashtable_class_helper.pxi.in

1507 lines
50 KiB
Cython
Raw Permalink Normal View History

2023-06-02 12:51:02 +02:00
"""
Template for each `dtype` helper function for hashtable
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""
{{py:
# name
complex_types = ['complex64',
'complex128']
}}
{{for name in complex_types}}
cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
cdef kh{{name}}_t res
res.real = val.real
res.imag = val.imag
return res
{{endfor}}
{{py:
# name
c_types = ['khcomplex128_t',
'khcomplex64_t',
'float64_t',
'float32_t',
'int64_t',
'int32_t',
'int16_t',
'int8_t',
'uint64_t',
'uint32_t',
'uint16_t',
'uint8_t']
}}
{{for c_type in c_types}}
cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
{{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
return val.real != val.real or val.imag != val.imag
{{elif c_type in {'float64_t', 'float32_t'} }}
return val != val
{{else}}
return False
{{endif}}
{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
# are_equivalent_{{c_type}} is cimported via khash.pxd
{{else}}
cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
return val1 == val2
{{endif}}
{{endfor}}
{{py:
# name
cimported_types = ['complex64',
'complex128',
'float32',
'float64',
'int8',
'int16',
'int32',
'int64',
'pymap',
'str',
'strbox',
'uint8',
'uint16',
'uint32',
'uint64']
}}
{{for name in cimported_types}}
from pandas._libs.khash cimport (
kh_destroy_{{name}},
kh_exist_{{name}},
kh_get_{{name}},
kh_init_{{name}},
kh_put_{{name}},
kh_resize_{{name}},
)
{{endfor}}
# ----------------------------------------------------------------------
# VectorData
# ----------------------------------------------------------------------
from pandas._libs.tslibs.util cimport get_c_string
from pandas._libs.missing cimport C_NA
{{py:
# name, dtype, c_type
# the generated StringVector is not actually used
# but is included for completeness (rather ObjectVector is used
# for uniques in hashtables)
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
('Complex64', 'complex64', 'khcomplex64_t'),
('Float64', 'float64', 'float64_t'),
('Float32', 'float32', 'float32_t'),
('Int64', 'int64', 'int64_t'),
('Int32', 'int32', 'int32_t'),
('Int16', 'int16', 'int16_t'),
('Int8', 'int8', 'int8_t'),
('String', 'string', 'char *'),
('UInt64', 'uint64', 'uint64_t'),
('UInt32', 'uint32', 'uint32_t'),
('UInt16', 'uint16', 'uint16_t'),
('UInt8', 'uint8', 'uint8_t')]
}}
{{for name, dtype, c_type in dtypes}}
{{if dtype != 'int64'}}
# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
# by IntervalTree
ctypedef struct {{name}}VectorData:
{{c_type}} *data
Py_ssize_t n, m
{{endif}}
@cython.wraparound(False)
@cython.boundscheck(False)
cdef void append_data_{{dtype}}({{name}}VectorData *data,
{{c_type}} x) nogil:
data.data[data.n] = x
data.n += 1
{{endfor}}
ctypedef fused vector_data:
Int64VectorData
Int32VectorData
Int16VectorData
Int8VectorData
UInt64VectorData
UInt32VectorData
UInt16VectorData
UInt8VectorData
Float64VectorData
Float32VectorData
Complex128VectorData
Complex64VectorData
StringVectorData
cdef bint needs_resize(vector_data *data) nogil:
return data.n == data.m
# ----------------------------------------------------------------------
# Vector
# ----------------------------------------------------------------------
cdef class Vector:
# cdef readonly:
# bint external_view_exists
def __cinit__(self):
self.external_view_exists = False
{{py:
# name, dtype, c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
('Complex64', 'complex64', 'khcomplex64_t'),
('Float64', 'float64', 'float64_t'),
('UInt64', 'uint64', 'uint64_t'),
('Int64', 'int64', 'int64_t'),
('Float32', 'float32', 'float32_t'),
('UInt32', 'uint32', 'uint32_t'),
('Int32', 'int32', 'int32_t'),
('UInt16', 'uint16', 'uint16_t'),
('Int16', 'int16', 'int16_t'),
('UInt8', 'uint8', 'uint8_t'),
('Int8', 'int8', 'int8_t')]
}}
{{for name, dtype, c_type in dtypes}}
cdef class {{name}}Vector(Vector):
# For int64 we have to put this declaration in the .pxd file;
# Int64Vector is the only one we need exposed for other cython files.
{{if dtype != 'int64'}}
cdef:
{{name}}VectorData *data
ndarray ao
{{endif}}
def __cinit__(self):
self.data = <{{name}}VectorData *>PyMem_Malloc(
sizeof({{name}}VectorData))
if not self.data:
raise MemoryError()
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
self.data.data = <{{c_type}}*>self.ao.data
cdef resize(self):
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
self.ao.resize(self.data.m, refcheck=False)
self.data.data = <{{c_type}}*>self.ao.data
def __dealloc__(self):
if self.data is not NULL:
PyMem_Free(self.data)
self.data = NULL
def __len__(self) -> int:
return self.data.n
cpdef ndarray to_array(self):
if self.data.m != self.data.n:
if self.external_view_exists:
# should never happen
raise ValueError("should have raised on append()")
self.ao.resize(self.data.n, refcheck=False)
self.data.m = self.data.n
self.external_view_exists = True
return self.ao
cdef void append(self, {{c_type}} x):
if needs_resize(self.data):
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.resize()
append_data_{{dtype}}(self.data, x)
cdef extend(self, const {{c_type}}[:] x):
for i in range(len(x)):
self.append(x[i])
{{endfor}}
cdef class StringVector(Vector):
cdef:
StringVectorData *data
def __cinit__(self):
self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
if not self.data:
raise MemoryError()
self.data.n = 0
self.data.m = _INIT_VEC_CAP
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()
cdef resize(self):
cdef:
char **orig_data
Py_ssize_t i, m
m = self.data.m
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
orig_data = self.data.data
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
if not self.data.data:
raise MemoryError()
for i in range(m):
self.data.data[i] = orig_data[i]
def __dealloc__(self):
if self.data is not NULL:
if self.data.data is not NULL:
free(self.data.data)
PyMem_Free(self.data)
self.data = NULL
def __len__(self) -> int:
return self.data.n
cpdef ndarray[object, ndim=1] to_array(self):
cdef:
ndarray ao
Py_ssize_t n
object val
ao = np.empty(self.data.n, dtype=object)
for i in range(self.data.n):
val = self.data.data[i]
ao[i] = val
self.external_view_exists = True
self.data.m = self.data.n
return ao
cdef void append(self, char *x):
if needs_resize(self.data):
self.resize()
append_data_string(self.data, x)
cdef extend(self, ndarray[object] x):
for i in range(len(x)):
self.append(x[i])
cdef class ObjectVector(Vector):
cdef:
PyObject **data
Py_ssize_t n, m
ndarray ao
def __cinit__(self):
self.n = 0
self.m = _INIT_VEC_CAP
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
self.data = <PyObject**>self.ao.data
def __len__(self) -> int:
return self.n
cdef append(self, object obj):
if self.n == self.m:
if self.external_view_exists:
raise ValueError("external reference but "
"Vector.resize() needed")
self.m = max(self.m * 2, _INIT_VEC_CAP)
self.ao.resize(self.m, refcheck=False)
self.data = <PyObject**>self.ao.data
Py_INCREF(obj)
self.data[self.n] = <PyObject*>obj
self.n += 1
cpdef ndarray[object, ndim=1] to_array(self):
if self.m != self.n:
if self.external_view_exists:
raise ValueError("should have raised on append()")
self.ao.resize(self.n, refcheck=False)
self.m = self.n
self.external_view_exists = True
return self.ao
cdef extend(self, ndarray[object] x):
for i in range(len(x)):
self.append(x[i])
# ----------------------------------------------------------------------
# HashTable
# ----------------------------------------------------------------------
cdef class HashTable:
pass
{{py:
# name, dtype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
('Float64', 'float64', 'float64_t', ''),
('UInt64', 'uint64', 'uint64_t', ''),
('Int64', 'int64', 'int64_t', ''),
('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
('Float32', 'float32', 'float32_t', ''),
('UInt32', 'uint32', 'uint32_t', ''),
('Int32', 'int32', 'int32_t', ''),
('UInt16', 'uint16', 'uint16_t', ''),
('Int16', 'int16', 'int16_t', ''),
('UInt8', 'uint8', 'uint8_t', ''),
('Int8', 'int8', 'int8_t', '')]
}}
{{for name, dtype, c_type, to_c_type in dtypes}}
cdef class {{name}}HashTable(HashTable):
def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
self.table = kh_init_{{dtype}}()
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_{{dtype}}(self.table, size_hint)
self.uses_mask = uses_mask
self.na_position = -1
def __len__(self) -> int:
return self.table.size + (0 if self.na_position == -1 else 1)
def __dealloc__(self):
if self.table is not NULL:
kh_destroy_{{dtype}}(self.table)
self.table = NULL
def __contains__(self, object key) -> bool:
# The caller is responsible to check for compatible NA values in case
# of masked arrays.
cdef:
khiter_t k
{{c_type}} ckey
if self.uses_mask and checknull(key):
return -1 != self.na_position
ckey = {{to_c_type}}(key)
k = kh_get_{{dtype}}(self.table, ckey)
return k != self.table.n_buckets
def sizeof(self, deep: bool = False) -> int:
""" return the size of my table in bytes """
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs
def get_state(self) -> dict[str, int]:
""" returns infos about the state of the hashtable"""
return {
'n_buckets' : self.table.n_buckets,
'size' : self.table.size,
'n_occupied' : self.table.n_occupied,
'upper_bound' : self.table.upper_bound,
}
cpdef get_item(self, {{dtype}}_t val):
"""Extracts the position of val from the hashtable.
Parameters
----------
val : Scalar
The value that is looked up in the hashtable
Returns
-------
The position of the requested integer.
"""
# Used in core.sorting, IndexEngine.get_loc
# Caller is responsible for checking for pd.NA
cdef:
khiter_t k
{{c_type}} cval
cval = {{to_c_type}}(val)
k = kh_get_{{dtype}}(self.table, cval)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
raise KeyError(val)
cpdef get_na(self):
"""Extracts the position of na_value from the hashtable.
Returns
-------
The position of the last na value.
"""
if not self.uses_mask:
raise NotImplementedError
if self.na_position == -1:
raise KeyError("NA")
return self.na_position
cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
# Used in libjoin
# Caller is responsible for checking for pd.NA
cdef:
khiter_t k
int ret = 0
{{c_type}} ckey
ckey = {{to_c_type}}(key)
k = kh_put_{{dtype}}(self.table, ckey, &ret)
if kh_exist_{{dtype}}(self.table, k):
self.table.vals[k] = val
else:
raise KeyError(key)
cpdef set_na(self, Py_ssize_t val):
# Caller is responsible for checking for pd.NA
cdef:
khiter_t k
int ret = 0
{{c_type}} ckey
if not self.uses_mask:
raise NotImplementedError
self.na_position = val
{{if dtype == "int64" }}
# We only use this for int64, can reduce build size and make .pyi
# more accurate by only implementing it for int64
@cython.boundscheck(False)
def map_keys_to_values(
self, const {{dtype}}_t[:] keys, const int64_t[:] values
) -> None:
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{c_type}} key
khiter_t k
with nogil:
for i in range(n):
key = {{to_c_type}}(keys[i])
k = kh_put_{{dtype}}(self.table, key, &ret)
self.table.vals[k] = <Py_ssize_t>values[i]
{{endif}}
@cython.boundscheck(False)
def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None:
# Used in libindex, safe_sort
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{c_type}} val
khiter_t k
int8_t na_position = self.na_position
if self.uses_mask and mask is None:
raise NotImplementedError # pragma: no cover
with nogil:
if self.uses_mask:
for i in range(n):
if mask[i]:
na_position = i
else:
val= {{to_c_type}}(values[i])
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = i
else:
for i in range(n):
val= {{to_c_type}}(values[i])
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = i
self.na_position = na_position
@cython.boundscheck(False)
def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray:
# -> np.ndarray[np.intp]
# Used in safe_sort, IndexEngine.get_indexer
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
{{c_type}} val
khiter_t k
intp_t[::1] locs = np.empty(n, dtype=np.intp)
int8_t na_position = self.na_position
if self.uses_mask and mask is None:
raise NotImplementedError # pragma: no cover
with nogil:
for i in range(n):
if self.uses_mask and mask[i]:
locs[i] = na_position
else:
val = {{to_c_type}}(values[i])
k = kh_get_{{dtype}}(self.table, val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
locs[i] = -1
return np.asarray(locs)
@cython.boundscheck(False)
@cython.wraparound(False)
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
object mask=None, bint return_inverse=False, bint use_result_mask=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated
uniques : {{name}}Vector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
ignore_na : bool, default False
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
use_result_mask: bool, default False
Whether to create a result mask for the unique values. Not supported
with return_inverse=True.
Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
result_mask: ndarray[bool], if use_result_mask is true
The mask for the result values.
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
intp_t[::1] labels
int ret = 0
{{c_type}} val, na_value2
khiter_t k
{{name}}VectorData *ud
UInt8Vector result_mask
UInt8VectorData *rmd
bint use_na_value, use_mask, seen_na = False
uint8_t[:] mask_values
if return_inverse:
labels = np.empty(n, dtype=np.intp)
ud = uniques.data
use_na_value = na_value is not None
use_mask = mask is not None
if not use_mask and use_result_mask:
raise NotImplementedError # pragma: no cover
if use_result_mask and return_inverse:
raise NotImplementedError # pragma: no cover
result_mask = UInt8Vector()
rmd = result_mask.data
if use_mask:
mask_values = mask.view("uint8")
if use_na_value:
# We need this na_value2 because we want to allow users
# to *optionally* specify an NA sentinel *of the correct* type.
# We use None, to make it optional, which requires `object` type
# for the parameter. To please the compiler, we use na_value2,
# which is only used if it's *specified*.
na_value2 = {{to_c_type}}(na_value)
else:
na_value2 = {{to_c_type}}(0)
with nogil:
for i in range(n):
val = {{to_c_type}}(values[i])
if ignore_na and use_mask:
if mask_values[i]:
labels[i] = na_sentinel
continue
elif ignore_na and (
is_nan_{{c_type}}(val) or
(use_na_value and are_equivalent_{{c_type}}(val, na_value2))
):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), skip the hashtable entry for them,
# and replace the corresponding label with na_sentinel
labels[i] = na_sentinel
continue
elif not ignore_na and use_result_mask:
if mask_values[i]:
if seen_na:
continue
seen_na = True
if needs_resize(ud):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
append_data_{{dtype}}(ud, val)
append_data_uint8(rmd, 1)
continue
k = kh_get_{{dtype}}(self.table, val)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_{{dtype}}(self.table, val, &ret)
if needs_resize(ud):
with gil:
if uniques.external_view_exists:
raise ValueError("external reference to "
"uniques held, but "
"Vector.resize() needed")
uniques.resize()
if use_result_mask:
if result_mask.external_view_exists:
raise ValueError("external reference to "
"result_mask held, but "
"Vector.resize() needed")
result_mask.resize()
append_data_{{dtype}}(ud, val)
if use_result_mask:
append_data_uint8(rmd, 0)
if return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
elif return_inverse:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = idx
if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
if use_result_mask:
return uniques.to_array(), result_mask.to_array()
return uniques.to_array()
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
result_mask: ndarray[bool], if mask is given as input
The mask for the result values.
"""
uniques = {{name}}Vector()
use_result_mask = True if mask is not None else False
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None, ignore_na=True):
"""
Calculate unique values and labels (no sorting!)
Missing values are not included in the "uniques" for this method.
The labels for any missing values will be set to "na_sentinel"
Parameters
----------
values : ndarray[{{dtype}}]
Array of values of which unique will be calculated
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[intp_t]
The labels from values to uniques
"""
uniques_vector = {{name}}Vector()
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=ignore_na, mask=mask,
return_inverse=True)
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True, mask=mask)
return labels
{{if dtype == 'int64'}}
@cython.boundscheck(False)
def get_labels_groupby(
self, const {{dtype}}_t[:] values
) -> tuple[ndarray, ndarray]:
# tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
cdef:
Py_ssize_t i, n = len(values)
intp_t[::1] labels
Py_ssize_t idx, count = 0
int ret = 0
{{c_type}} val
khiter_t k
{{name}}Vector uniques = {{name}}Vector()
{{name}}VectorData *ud
labels = np.empty(n, dtype=np.intp)
ud = uniques.data
with nogil:
for i in range(n):
val = {{to_c_type}}(values[i])
# specific for groupby
if val < 0:
labels[i] = -1
continue
k = kh_get_{{dtype}}(self.table, val)
if k != self.table.n_buckets:
idx = self.table.vals[k]
labels[i] = idx
else:
k = kh_put_{{dtype}}(self.table, val, &ret)
self.table.vals[k] = count
if needs_resize(ud):
with gil:
uniques.resize()
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
arr_uniques = uniques.to_array()
return np.asarray(labels), arr_uniques
{{endif}}
cdef class {{name}}Factorizer(Factorizer):
cdef public:
{{name}}HashTable table
{{name}}Vector uniques
def __cinit__(self, size_hint: int):
self.table = {{name}}HashTable(size_hint)
self.uniques = {{name}}Vector()
def factorize(self, const {{c_type}}[:] values,
na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
"""
Returns
-------
ndarray[intp_t]
Examples
--------
Factorize values with nans replaced by na_sentinel
>>> fac = {{name}}Factorizer(3)
>>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
array([0, 1, 2])
"""
cdef:
ndarray[intp_t] labels
if self.uniques.external_view_exists:
uniques = {{name}}Vector()
uniques.extend(self.uniques.to_array())
self.uniques = uniques
labels = self.table.get_labels(values, self.uniques,
self.count, na_sentinel,
na_value=na_value, mask=mask)
self.count = len(self.uniques)
return labels
{{endfor}}
cdef class StringHashTable(HashTable):
# these by-definition *must* be strings
# or a sentinel np.nan / None missing value
na_string_sentinel = '__nan__'
def __init__(self, int64_t size_hint=1):
self.table = kh_init_str()
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_str(self.table, size_hint)
def __dealloc__(self):
if self.table is not NULL:
kh_destroy_str(self.table)
self.table = NULL
def sizeof(self, deep: bool = False) -> int:
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
for_pairs = self.table.n_buckets * (sizeof(char *) + # keys
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs
def get_state(self) -> dict[str, int]:
""" returns infos about the state of the hashtable"""
return {
'n_buckets' : self.table.n_buckets,
'size' : self.table.size,
'n_occupied' : self.table.n_occupied,
'upper_bound' : self.table.upper_bound,
}
cpdef get_item(self, str val):
cdef:
khiter_t k
const char *v
v = get_c_string(val)
k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
raise KeyError(val)
cpdef set_item(self, str key, Py_ssize_t val):
cdef:
khiter_t k
int ret = 0
const char *v
v = get_c_string(key)
k = kh_put_str(self.table, v, &ret)
if kh_exist_str(self.table, k):
self.table.vals[k] = val
else:
raise KeyError(key)
@cython.boundscheck(False)
def get_indexer(self, ndarray[object] values) -> ndarray:
# -> np.ndarray[np.intp]
cdef:
Py_ssize_t i, n = len(values)
ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
intp_t *resbuf = <intp_t*>labels.data
khiter_t k
kh_str_t *table = self.table
const char *v
const char **vecs
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
v = get_c_string(val)
vecs[i] = v
with nogil:
for i in range(n):
k = kh_get_str(table, vecs[i])
if k != table.n_buckets:
resbuf[i] = table.vals[k]
else:
resbuf[i] = -1
free(vecs)
return labels
@cython.boundscheck(False)
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
# -> np.ndarray[np.intp]
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
const char *v
khiter_t k
intp_t[::1] locs = np.empty(n, dtype=np.intp)
# these by-definition *must* be strings
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
if isinstance(val, str):
# GH#31499 if we have a np.str_ get_c_string won't recognize
# it as a str, even though isinstance does.
v = get_c_string(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
vecs[i] = v
with nogil:
for i in range(n):
v = vecs[i]
k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
locs[i] = -1
free(vecs)
return np.asarray(locs)
@cython.boundscheck(False)
def map_locations(self, ndarray[object] values, object mask = None) -> None:
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
const char *v
const char **vecs
khiter_t k
# these by-definition *must* be strings
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
if isinstance(val, str):
# GH#31499 if we have a np.str_ get_c_string won't recognize
# it as a str, even though isinstance does.
v = get_c_string(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
vecs[i] = v
with nogil:
for i in range(n):
v = vecs[i]
k = kh_put_str(self.table, v, &ret)
self.table.vals[k] = i
free(vecs)
@cython.boundscheck(False)
@cython.wraparound(False)
def _unique(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
uniques : ObjectVector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then any value
that is not a string is considered missing. If na_value is
not None, then _additionally_ any value "val" satisfying
val == na_value is considered missing.
ignore_na : bool, default False
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
intp_t[::1] labels
int64_t[::1] uindexer
int ret = 0
object val
const char *v
const char **vecs
khiter_t k
bint use_na_value
if return_inverse:
labels = np.zeros(n, dtype=np.intp)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None
# assign pointers and pre-filter out missing (if ignore_na)
vecs = <const char **>malloc(n * sizeof(char *))
for i in range(n):
val = values[i]
if (ignore_na
and (not isinstance(val, str)
or (use_na_value and val == na_value))):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), we can skip the actual value, and
# replace the label with na_sentinel directly
labels[i] = na_sentinel
else:
# if ignore_na is False, we also stringify NaN/None/etc.
try:
v = get_c_string(<str>val)
except UnicodeEncodeError:
v = get_c_string(<str>repr(val))
vecs[i] = v
# compute
with nogil:
for i in range(n):
if ignore_na and labels[i] == na_sentinel:
# skip entries for ignored missing values (see above)
continue
v = vecs[i]
k = kh_get_str(self.table, v)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_str(self.table, v, &ret)
uindexer[count] = i
if return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
elif return_inverse:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = idx
free(vecs)
# uniques
for i in range(count):
uniques.append(values[uindexer[i]])
if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()
def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
mask : ndarray[bool], optional
Not yet implemented for StringHashTable
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
"""
uniques = ObjectVector()
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None, ignore_na=True):
"""
Calculate unique values and labels (no sorting!)
Missing values are not included in the "uniques" for this method.
The labels for any missing values will be set to "na_sentinel"
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then any value
that is not a string is considered missing. If na_value is
not None, then _additionally_ any value "val" satisfying
val == na_value is considered missing.
mask : ndarray[bool], optional
Not yet implemented for StringHashTable.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp]
The labels from values to uniques
"""
uniques_vector = ObjectVector()
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=ignore_na,
return_inverse=True)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
return labels
cdef class PyObjectHashTable(HashTable):
def __init__(self, int64_t size_hint=1):
self.table = kh_init_pymap()
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
kh_resize_pymap(self.table, size_hint)
def __dealloc__(self):
if self.table is not NULL:
kh_destroy_pymap(self.table)
self.table = NULL
def __len__(self) -> int:
return self.table.size
def __contains__(self, object key) -> bool:
cdef:
khiter_t k
hash(key)
k = kh_get_pymap(self.table, <PyObject*>key)
return k != self.table.n_buckets
def sizeof(self, deep: bool = False) -> int:
""" return the size of my table in bytes """
overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys
sizeof(Py_ssize_t)) # vals
return overhead + for_flags + for_pairs
def get_state(self) -> dict[str, int]:
"""
returns infos about the current state of the hashtable like size,
number of buckets and so on.
"""
return {
'n_buckets' : self.table.n_buckets,
'size' : self.table.size,
'n_occupied' : self.table.n_occupied,
'upper_bound' : self.table.upper_bound,
}
cpdef get_item(self, object val):
cdef:
khiter_t k
k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
return self.table.vals[k]
else:
raise KeyError(val)
cpdef set_item(self, object key, Py_ssize_t val):
cdef:
khiter_t k
int ret = 0
char* buf
hash(key)
k = kh_put_pymap(self.table, <PyObject*>key, &ret)
if kh_exist_pymap(self.table, k):
self.table.vals[k] = val
else:
raise KeyError(key)
def map_locations(self, ndarray[object] values, object mask = None) -> None:
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
khiter_t k
for i in range(n):
val = values[i]
hash(val)
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = i
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
# -> np.ndarray[np.intp]
# mask not yet implemented
cdef:
Py_ssize_t i, n = len(values)
int ret = 0
object val
khiter_t k
intp_t[::1] locs = np.empty(n, dtype=np.intp)
for i in range(n):
val = values[i]
hash(val)
k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
else:
locs[i] = -1
return np.asarray(locs)
@cython.boundscheck(False)
@cython.wraparound(False)
def _unique(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
uniques : ObjectVector
Vector into which uniques will be written
count_prior : Py_ssize_t, default 0
Number of existing entries in uniques
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then None _plus_
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
ignore_na : bool, default False
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
intp_t[::1] labels
int ret = 0
object val
khiter_t k
bint use_na_value
if return_inverse:
labels = np.empty(n, dtype=np.intp)
use_na_value = na_value is not None
for i in range(n):
val = values[i]
hash(val)
if ignore_na and (
checknull(val)
or (use_na_value and val == na_value)
):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), skip the hashtable entry for them, and
# replace the corresponding label with na_sentinel
labels[i] = na_sentinel
continue
k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
if return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
elif return_inverse:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = idx
if return_inverse:
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()
def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
"""
Calculate unique values and labels (no sorting!)
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
return_inverse : bool, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
mask : ndarray[bool], optional
Not yet implemented for PyObjectHashTable
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
"""
uniques = ObjectVector()
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None, object mask=None, ignore_na=True):
"""
Calculate unique values and labels (no sorting!)
Missing values are not included in the "uniques" for this method.
The labels for any missing values will be set to "na_sentinel"
Parameters
----------
values : ndarray[object]
Array of values of which unique will be calculated
na_sentinel : Py_ssize_t, default -1
Sentinel value used for all NA-values in inverse
na_value : object, default None
Value to identify as missing. If na_value is None, then None _plus_
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
mask : ndarray[bool], optional
Not yet implemented for PyObjectHashTable.
Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[intp_t]
The labels from values to uniques
"""
uniques_vector = ObjectVector()
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=ignore_na,
return_inverse=True)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
# -> np.ndarray[np.intp]
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
return labels