137 lines
4.4 KiB
Cython
137 lines
4.4 KiB
Cython
|
"""Cython wrapper for MurmurHash3 non-cryptographic hash function.
|
||
|
|
||
|
MurmurHash is an extensively tested and very fast hash function that has
|
||
|
good distribution properties suitable for machine learning use cases
|
||
|
such as feature hashing and random projections.
|
||
|
|
||
|
The original C++ code by Austin Appleby is released the public domain
|
||
|
and can be found here:
|
||
|
|
||
|
https://code.google.com/p/smhasher/
|
||
|
|
||
|
"""
|
||
|
# Author: Olivier Grisel <olivier.grisel@ensta.org>
|
||
|
#
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
from ..utils._typedefs cimport int32_t, uint32_t
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
cdef extern from "src/MurmurHash3.h":
|
||
|
void MurmurHash3_x86_32(void *key, int len, uint32_t seed, void *out)
|
||
|
void MurmurHash3_x86_128(void *key, int len, uint32_t seed, void *out)
|
||
|
void MurmurHash3_x64_128 (void *key, int len, uint32_t seed, void *out)
|
||
|
|
||
|
|
||
|
cpdef uint32_t murmurhash3_int_u32(int key, unsigned int seed):
|
||
|
"""Compute the 32bit murmurhash3 of a int key at seed."""
|
||
|
cdef uint32_t out
|
||
|
MurmurHash3_x86_32(&key, sizeof(int), seed, &out)
|
||
|
return out
|
||
|
|
||
|
|
||
|
cpdef int32_t murmurhash3_int_s32(int key, unsigned int seed):
|
||
|
"""Compute the 32bit murmurhash3 of a int key at seed."""
|
||
|
cdef int32_t out
|
||
|
MurmurHash3_x86_32(&key, sizeof(int), seed, &out)
|
||
|
return out
|
||
|
|
||
|
|
||
|
cpdef uint32_t murmurhash3_bytes_u32(bytes key, unsigned int seed):
|
||
|
"""Compute the 32bit murmurhash3 of a bytes key at seed."""
|
||
|
cdef uint32_t out
|
||
|
MurmurHash3_x86_32(<char*> key, len(key), seed, &out)
|
||
|
return out
|
||
|
|
||
|
|
||
|
cpdef int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed):
|
||
|
"""Compute the 32bit murmurhash3 of a bytes key at seed."""
|
||
|
cdef int32_t out
|
||
|
MurmurHash3_x86_32(<char*> key, len(key), seed, &out)
|
||
|
return out
|
||
|
|
||
|
|
||
|
def _murmurhash3_bytes_array_u32(
|
||
|
const int32_t[:] key,
|
||
|
unsigned int seed,
|
||
|
):
|
||
|
"""Compute 32bit murmurhash3 hashes of a key int array at seed."""
|
||
|
# TODO make it possible to pass preallocated output array
|
||
|
cdef:
|
||
|
uint32_t[:] out = np.zeros(key.size, np.uint32)
|
||
|
Py_ssize_t i
|
||
|
for i in range(key.shape[0]):
|
||
|
out[i] = murmurhash3_int_u32(key[i], seed)
|
||
|
return np.asarray(out)
|
||
|
|
||
|
|
||
|
def _murmurhash3_bytes_array_s32(
|
||
|
const int32_t[:] key,
|
||
|
unsigned int seed,
|
||
|
):
|
||
|
"""Compute 32bit murmurhash3 hashes of a key int array at seed."""
|
||
|
# TODO make it possible to pass preallocated output array
|
||
|
cdef:
|
||
|
int32_t[:] out = np.zeros(key.size, np.int32)
|
||
|
Py_ssize_t i
|
||
|
for i in range(key.shape[0]):
|
||
|
out[i] = murmurhash3_int_s32(key[i], seed)
|
||
|
return np.asarray(out)
|
||
|
|
||
|
|
||
|
def murmurhash3_32(key, seed=0, positive=False):
|
||
|
"""Compute the 32bit murmurhash3 of key at seed.
|
||
|
|
||
|
The underlying implementation is MurmurHash3_x86_32 generating low
|
||
|
latency 32bits hash suitable for implementing lookup tables, Bloom
|
||
|
filters, count min sketch or feature hashing.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
key : np.int32, bytes, unicode or ndarray of dtype=np.int32
|
||
|
The physical object to hash.
|
||
|
|
||
|
seed : int, default=0
|
||
|
Integer seed for the hashing algorithm.
|
||
|
|
||
|
positive : bool, default=False
|
||
|
True: the results is casted to an unsigned int
|
||
|
from 0 to 2 ** 32 - 1
|
||
|
False: the results is casted to a signed int
|
||
|
from -(2 ** 31) to 2 ** 31 - 1
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> from sklearn.utils import murmurhash3_32
|
||
|
>>> murmurhash3_32(b"Hello World!", seed=42)
|
||
|
3565178
|
||
|
"""
|
||
|
if isinstance(key, bytes):
|
||
|
if positive:
|
||
|
return murmurhash3_bytes_u32(key, seed)
|
||
|
else:
|
||
|
return murmurhash3_bytes_s32(key, seed)
|
||
|
elif isinstance(key, unicode):
|
||
|
if positive:
|
||
|
return murmurhash3_bytes_u32(key.encode('utf-8'), seed)
|
||
|
else:
|
||
|
return murmurhash3_bytes_s32(key.encode('utf-8'), seed)
|
||
|
elif isinstance(key, int) or isinstance(key, np.int32):
|
||
|
if positive:
|
||
|
return murmurhash3_int_u32(<int32_t>key, seed)
|
||
|
else:
|
||
|
return murmurhash3_int_s32(<int32_t>key, seed)
|
||
|
elif isinstance(key, np.ndarray):
|
||
|
if key.dtype != np.int32:
|
||
|
raise TypeError(
|
||
|
"key.dtype should be int32, got %s" % key.dtype)
|
||
|
if positive:
|
||
|
return _murmurhash3_bytes_array_u32(key.ravel(), seed).reshape(key.shape)
|
||
|
else:
|
||
|
return _murmurhash3_bytes_array_s32(key.ravel(), seed).reshape(key.shape)
|
||
|
else:
|
||
|
raise TypeError(
|
||
|
"key %r with type %s is not supported. "
|
||
|
"Explicit conversion to bytes is required" % (key, type(key)))
|