3RNN/Lib/site-packages/sklearn/feature_extraction/_hashing_fast.pyx

90 lines
2.9 KiB
Cython
Raw Permalink Normal View History

2024-05-26 19:49:15 +02:00
# Author: Lars Buitinck
# License: BSD 3 clause
from libc.stdlib cimport abs
from libcpp.vector cimport vector
cimport numpy as cnp
import numpy as np
from ..utils._typedefs cimport int32_t, int64_t
from ..utils.murmurhash cimport murmurhash3_bytes_s32
from ..utils._vector_sentinel cimport vector_to_nd_array
cnp.import_array()
def transform(raw_X, Py_ssize_t n_features, dtype,
bint alternate_sign=1, unsigned int seed=0):
"""Guts of FeatureHasher.transform.
Returns
-------
n_samples : integer
indices, indptr, values : lists
For constructing a scipy.sparse.csr_matrix.
"""
cdef int32_t h
cdef double value
cdef vector[int32_t] indices
cdef vector[int64_t] indptr
indptr.push_back(0)
# Since Python array does not understand Numpy dtypes, we grow the indices
# and values arrays ourselves. Use a Py_ssize_t capacity for safety.
cdef Py_ssize_t capacity = 8192 # arbitrary
cdef int64_t size = 0
cdef cnp.ndarray values = np.empty(capacity, dtype=dtype)
for x in raw_X:
for f, v in x:
if isinstance(v, (str, unicode)):
f = "%s%s%s" % (f, '=', v)
value = 1
else:
value = v
if value == 0:
continue
if isinstance(f, unicode):
f = (<unicode>f).encode("utf-8")
# Need explicit type check because Murmurhash does not propagate
# all exceptions. Add "except *" there?
elif not isinstance(f, bytes):
raise TypeError("feature names must be strings")
h = murmurhash3_bytes_s32(<bytes>f, seed)
if h == - 2147483648:
# abs(-2**31) is undefined behavior because h is a `np.int32`
# The following is defined such that it is equal to: abs(-2**31) % n_features
indices.push_back((2147483647 - (n_features - 1)) % n_features)
else:
indices.push_back(abs(h) % n_features)
# improve inner product preservation in the hashed space
if alternate_sign:
value *= (h >= 0) * 2 - 1
values[size] = value
size += 1
if size == capacity:
capacity *= 2
# can't use resize member because there might be multiple
# references to the arrays due to Cython's error checking
values = np.resize(values, capacity)
indptr.push_back(size)
indices_array = vector_to_nd_array(&indices)
indptr_array = vector_to_nd_array(&indptr)
if indptr_array[indptr_array.shape[0]-1] > np.iinfo(np.int32).max: # = 2**31 - 1
# both indices and indptr have the same dtype in CSR arrays
indices_array = indices_array.astype(np.int64, copy=False)
else:
indptr_array = indptr_array.astype(np.int32, copy=False)
return (indices_array, indptr_array, values[:size])