Inzynierka/Lib/site-packages/pandas/_libs/hashtable.pyx
2023-06-02 12:51:02 +02:00

126 lines
2.9 KiB
Cython

cimport cython
from cpython.mem cimport (
PyMem_Free,
PyMem_Malloc,
)
from cpython.ref cimport (
Py_INCREF,
PyObject,
)
from libc.stdlib cimport (
free,
malloc,
)
import numpy as np
cimport numpy as cnp
from numpy cimport ndarray
cnp.import_array()
from pandas._libs cimport util
from pandas._libs.dtypes cimport numeric_object_t
from pandas._libs.khash cimport (
KHASH_TRACE_DOMAIN,
are_equivalent_float32_t,
are_equivalent_float64_t,
are_equivalent_khcomplex64_t,
are_equivalent_khcomplex128_t,
kh_needed_n_buckets,
kh_python_hash_equal,
kh_python_hash_func,
khiter_t,
)
from pandas._libs.missing cimport checknull
def get_hashtable_trace_domain():
    """
    Return the ``KHASH_TRACE_DOMAIN`` constant.

    The constant is cimported from ``pandas._libs.khash``; this function
    makes its value available to Python-level callers.

    NOTE(review): presumably the tracemalloc domain id used for khash
    allocations -- confirm against khash.
    """
    return KHASH_TRACE_DOMAIN
def object_hash(obj):
    """
    Return the result of ``kh_python_hash_func`` for ``obj``.

    Thin Python-callable wrapper around the function cimported from
    ``pandas._libs.khash``.
    """
    return kh_python_hash_func(obj)
def objects_are_equal(a, b):
    """
    Return the result of ``kh_python_hash_equal`` for ``a`` and ``b``.

    Thin Python-callable wrapper around the function cimported from
    ``pandas._libs.khash``.

    NOTE(review): presumably the same equality the object hash table uses
    when comparing keys -- confirm against khash.
    """
    return kh_python_hash_equal(a, b)
# Module-level constants. They are declared before the ``include`` lines
# below, so the included helper code can refer to them.

# Value returned by util.get_nat(), stored once as a C int64
# (presumably the integer NaT sentinel -- confirm in util).
cdef int64_t NPY_NAT = util.get_nat()
# (1 << 20) + 7 == 1048583.
# NOTE(review): presumably a cap applied to table size hints by the included
# helper code -- confirm there.
SIZE_HINT_LIMIT = (1 << 20) + 7
# NOTE(review): presumably the initial capacity of the vector classes defined
# in the included class helper -- confirm there.
cdef Py_ssize_t _INIT_VEC_CAP = 128
# Cython ``include`` pastes the named files into this module at this point.
include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"
# Bind the pointer-sized ("intp") names to the fixed-width implementation
# whose dtype equals np.intp on this build:
if np.dtype(np.intp) == np.dtype(np.int64):
    IntpHashTable = Int64HashTable
    unique_label_indices = _unique_label_indices_int64
elif np.dtype(np.intp) == np.dtype(np.int32):
    IntpHashTable = Int32HashTable
    unique_label_indices = _unique_label_indices_int32
else:
    # np.intp is neither int64 nor int32: fail while the module is being
    # initialised rather than bind the aliases to a mismatched width.
    raise ValueError(np.dtype(np.intp))
cdef class Factorizer:
    """
    Base class for factorizers.

    Stores the ``count`` attribute (readable from Python) and declares the
    ``factorize`` method, which this class does not implement.
    """

    cdef readonly:
        Py_ssize_t count

    def __cinit__(self, size_hint: int):
        # ``size_hint`` is accepted but not used by the base class.
        self.count = 0

    def get_count(self) -> int:
        """Return the current value of ``count``."""
        return self.count

    def factorize(
        self, values, na_sentinel=-1, na_value=None, mask=None
    ) -> np.ndarray:
        """
        Not implemented in the base class.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError
cdef class ObjectFactorizer(Factorizer):
    """
    Factorizer for object-dtype arrays.

    Owns a ``PyObjectHashTable`` and an ``ObjectVector`` of uniques (both
    public attributes) and passes them, together with the running ``count``,
    to every ``factorize`` call.
    """

    cdef public:
        PyObjectHashTable table
        ObjectVector uniques

    def __cinit__(self, size_hint: int):
        # The table receives the size hint; the uniques vector starts empty.
        self.table = PyObjectHashTable(size_hint)
        self.uniques = ObjectVector()

    def factorize(
        self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None
    ) -> np.ndarray:
        """
        Compute labels for ``values`` using the table held by this instance.

        Parameters
        ----------
        values : ndarray[object]
        na_sentinel : default -1
            Forwarded to ``table.get_labels``; per the example below, the
            label assigned to nan entries.
        na_value : default None
            Forwarded unchanged to ``table.get_labels``.
        mask : default None
            Must be None; masks are not supported by this class.

        Returns
        -------
        np.ndarray[np.intp]

        Raises
        ------
        NotImplementedError
            If ``mask`` is not None.

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = ObjectFactorizer(3)
        >>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
        array([ 0,  1, 20])
        """
        cdef:
            ndarray[intp_t] labels

        if mask is not None:
            raise NotImplementedError("mask not supported for ObjectFactorizer.")

        if self.uniques.external_view_exists:
            # The current vector reports an external view, so continue with a
            # new vector holding the same items instead of growing this one.
            # NOTE(review): presumably growing it would invalidate the array
            # already handed out -- confirm in ObjectVector.
            fresh = ObjectVector()
            fresh.extend(self.uniques.to_array())
            self.uniques = fresh

        labels = self.table.get_labels(
            values, self.uniques, self.count, na_sentinel, na_value
        )
        # Keep ``count`` equal to the number of uniques collected so far.
        self.count = len(self.uniques)
        return labels