337 lines
11 KiB
Python
337 lines
11 KiB
Python
|
from contextlib import contextmanager
|
||
|
import tracemalloc
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas._libs import hashtable as ht
|
||
|
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
@contextmanager
def activated_tracemalloc():
    """Context manager that keeps ``tracemalloc`` tracing on for the ``with`` body.

    Tracing is started on entry and stopped on exit, including when the
    body raises.
    """
    tracemalloc.start()
    try:
        # Hand control to the caller while allocations are being traced.
        yield
    finally:
        # Always switch tracing off again so later tests are not affected.
        tracemalloc.stop()
|
||
|
|
||
|
|
||
|
def get_allocated_khash_memory():
    """Return the total size of the traced allocations in the hashtable domain.

    A ``tracemalloc`` snapshot is taken and reduced to the traces whose domain
    equals ``ht.get_hashtable_trace_domain()``; the sizes of the remaining
    traces are summed.  ``tracemalloc`` must be tracing when this is called,
    because a snapshot is taken.
    """
    everything = tracemalloc.take_snapshot()
    # inclusive=True keeps only the traces belonging to the given domain.
    khash_only = tracemalloc.DomainFilter(True, ht.get_hashtable_trace_domain())
    khash_traces = everything.filter_traces((khash_only,)).traces
    return sum(trace.size for trace in khash_traces)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "table_type, dtype",
    [
        (ht.PyObjectHashTable, np.object_),
        (ht.Int64HashTable, np.int64),
        (ht.UInt64HashTable, np.uint64),
        (ht.Float64HashTable, np.float64),
        (ht.Int32HashTable, np.int32),
        (ht.UInt32HashTable, np.uint32),
        (ht.Float32HashTable, np.float32),
        (ht.Int16HashTable, np.int16),
        (ht.UInt16HashTable, np.uint16),
        (ht.Int8HashTable, np.int8),
        (ht.UInt8HashTable, np.uint8),
    ],
)
class TestHashTable:
    """Basic behaviour checks, run once per ``(table_type, dtype)`` pair above."""

    def test_get_set_contains_len(self, table_type, dtype):
        """set_item/get_item/``in``/``len`` agree while items are added and replaced."""
        first = 5
        second = first + 1
        absent = first + 2

        table = table_type(55)
        assert len(table) == 0
        assert first not in table

        # Inserting a new key grows the table by one.
        table.set_item(first, 42)
        assert len(table) == 1
        assert first in table
        assert table.get_item(first) == 42

        # A second key does not disturb the first one.
        table.set_item(second, 41)
        assert first in table
        assert second in table
        assert len(table) == 2
        assert table.get_item(first) == 42
        assert table.get_item(second) == 41

        # Re-setting an existing key overwrites its value without growing the table.
        table.set_item(first, 21)
        assert first in table
        assert second in table
        assert len(table) == 2
        assert table.get_item(first) == 21
        assert table.get_item(second) == 41
        assert absent not in table

        # Looking up a missing key raises KeyError mentioning that key.
        with pytest.raises(KeyError, match=str(absent)):
            table.get_item(absent)

    def test_map(self, table_type, dtype):
        """``map(keys, vals)`` stores ``vals[i]`` under ``keys[i]``."""
        # PyObjectHashTable has no map-method
        if table_type == ht.PyObjectHashTable:
            return

        N = 77
        table = table_type()
        keys = np.arange(N).astype(dtype)
        vals = np.arange(N).astype(np.int64) + N
        table.map(keys, vals)
        for pos, key in enumerate(keys):
            assert table.get_item(key) == pos + N

    def test_map_locations(self, table_type, dtype):
        """``map_locations(keys)`` stores each key's position as its value."""
        N = 8
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        table.map_locations(keys)
        for pos, key in enumerate(keys):
            assert table.get_item(key) == pos

    def test_lookup(self, table_type, dtype):
        """``lookup`` returns the stored positions for keys that are present."""
        N = 3
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        table.map_locations(keys)

        found = table.lookup(keys).astype(np.int64)
        wanted = np.arange(N).astype(np.int64)
        tm.assert_numpy_array_equal(found, wanted)

    def test_lookup_wrong(self, table_type, dtype):
        """``lookup`` returns -1 for every key that was never inserted."""
        N = 100 if dtype in (np.int8, np.uint8) else 512
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        table.map_locations(keys)

        wrong_keys = np.arange(N).astype(dtype)
        result = table.lookup(wrong_keys)
        assert np.all(result == -1)

    def test_unique(self, table_type, dtype):
        """``unique`` collapses repeated keys, keeping first-appearance order."""
        N = 88 if dtype in (np.int8, np.uint8) else 1000
        table = table_type()
        expected = (np.arange(N) + N).astype(dtype)
        repeated = np.repeat(expected, 5)
        tm.assert_numpy_array_equal(table.unique(repeated), expected)

    def test_tracemalloc_works(self, table_type, dtype):
        """Traced khash memory equals ``table.sizeof()`` and drops to 0 on delete."""
        N = 256 if dtype in (np.int8, np.uint8) else 30000
        keys = np.arange(N).astype(dtype)
        with activated_tracemalloc():
            table = table_type()
            table.map_locations(keys)
            traced = get_allocated_khash_memory()
            reported = table.sizeof()
            assert traced == reported
            # Dropping the only reference must release all traced memory.
            del table
            assert get_allocated_khash_memory() == 0

    def test_tracemalloc_for_empty(self, table_type, dtype):
        """Same accounting check as above, for a table without any items."""
        with activated_tracemalloc():
            table = table_type()
            traced = get_allocated_khash_memory()
            reported = table.sizeof()
            assert traced == reported
            del table
            assert get_allocated_khash_memory() == 0
|
||
|
|
||
|
|
||
|
def test_tracemalloc_works_for_StringHashTable():
    """Traced khash memory equals ``StringHashTable.sizeof()`` and is freed on delete.

    The table is filled with 1000 distinct string keys held in an object array.
    """
    N = 1000
    # ``np.str_`` is NumPy's unicode string type and yields the same keys as the
    # former ``np.compat.unicode`` (an alias of ``str`` kept for Python 2
    # compatibility); the ``np.compat`` module is deprecated in NumPy 2.0.
    keys = np.arange(N).astype(np.str_).astype(np.object_)
    with activated_tracemalloc():
        table = ht.StringHashTable()
        table.map_locations(keys)
        used = get_allocated_khash_memory()
        my_size = table.sizeof()
        assert used == my_size
        # Dropping the only reference must release all traced memory.
        del table
        assert get_allocated_khash_memory() == 0
|
||
|
|
||
|
|
||
|
def test_tracemalloc_for_empty_StringHashTable():
    """Traced khash memory equals ``sizeof()`` for a StringHashTable with no items."""
    with activated_tracemalloc():
        table = ht.StringHashTable()
        traced = get_allocated_khash_memory()
        reported = table.sizeof()
        assert traced == reported
        # Dropping the only reference must release all traced memory.
        del table
        assert get_allocated_khash_memory() == 0
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"table_type, dtype",
|
||
|
[
|
||
|
(ht.Float64HashTable, np.float64),
|
||
|
(ht.Float32HashTable, np.float32),
|
||
|
],
|
||
|
)
|
||
|
class TestHashTableWithNans:
|
||
|
def test_get_set_contains_len(self, table_type, dtype):
|
||
|
index = float("nan")
|
||
|
table = table_type()
|
||
|
assert index not in table
|
||
|
|
||
|
table.set_item(index, 42)
|
||
|
assert len(table) == 1
|
||
|
assert index in table
|
||
|
assert table.get_item(index) == 42
|
||
|
|
||
|
table.set_item(index, 41)
|
||
|
assert len(table) == 1
|
||
|
assert index in table
|
||
|
assert table.get_item(index) == 41
|
||
|
|
||
|
def test_map(self, table_type, dtype):
|
||
|
N = 332
|
||
|
table = table_type()
|
||
|
keys = np.full(N, np.nan, dtype=dtype)
|
||
|
vals = (np.arange(N) + N).astype(np.int64)
|
||
|
table.map(keys, vals)
|
||
|
assert len(table) == 1
|
||
|
assert table.get_item(np.nan) == 2 * N - 1
|
||
|
|
||
|
def test_map_locations(self, table_type, dtype):
|
||
|
N = 10
|
||
|
table = table_type()
|
||
|
keys = np.full(N, np.nan, dtype=dtype)
|
||
|
table.map_locations(keys)
|
||
|
assert len(table) == 1
|
||
|
assert table.get_item(np.nan) == N - 1
|
||
|
|
||
|
def test_unique(self, table_type, dtype):
|
||
|
N = 1020
|
||
|
table = table_type()
|
||
|
keys = np.full(N, np.nan, dtype=dtype)
|
||
|
unique = table.unique(keys)
|
||
|
assert np.all(np.isnan(unique)) and len(unique) == 1
|
||
|
|
||
|
|
||
|
def get_ht_function(fun_name, type_suffix):
    """Fetch the attribute named ``<fun_name>_<type_suffix>`` from ``ht``.

    Raises AttributeError if ``ht`` has no attribute with that name.
    """
    attr_name = "_".join((fun_name, type_suffix))
    return getattr(ht, attr_name)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "dtype, type_suffix",
    [
        (np.object_, "object"),
        (np.int64, "int64"),
        (np.uint64, "uint64"),
        (np.float64, "float64"),
        (np.int32, "int32"),
        (np.uint32, "uint32"),
        (np.float32, "float32"),
        (np.int16, "int16"),
        (np.uint16, "uint16"),
        (np.int8, "int8"),
        (np.uint8, "uint8"),
    ],
)
class TestHelpFunctions:
    """Checks for the dtype-suffixed helper functions looked up on ``ht``."""

    def test_value_count(self, dtype, type_suffix):
        """Every distinct value repeated five times is counted five times."""
        N = 43
        value_count = get_ht_function("value_count", type_suffix)
        expected = (np.arange(N) + N).astype(dtype)
        repeated = np.repeat(expected, 5)
        keys, counts = value_count(repeated, False)
        # The keys are sorted before comparing, so their order is not checked.
        tm.assert_numpy_array_equal(np.sort(keys), expected)
        assert np.all(counts == 5)

    def test_duplicated_first(self, dtype, type_suffix):
        """Only the first element of each run of five is not a duplicate."""
        N = 100
        duplicated = get_ht_function("duplicated", type_suffix)
        repeated = np.repeat(np.arange(N).astype(dtype), 5)
        result = duplicated(repeated)

        expected = np.ones_like(repeated, dtype=np.bool_)
        expected[::5] = False
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_yes(self, dtype, type_suffix):
        """Every element is reported as a member of an identical array."""
        N = 127
        ismember = get_ht_function("ismember", type_suffix)
        arr = np.arange(N).astype(dtype)
        values = np.arange(N).astype(dtype)
        result = ismember(arr, values)
        all_true = np.ones_like(values, dtype=np.bool_)
        tm.assert_numpy_array_equal(result, all_true)

    def test_ismember_no(self, dtype, type_suffix):
        """No element is reported as a member of a disjoint array."""
        N = 17
        ismember = get_ht_function("ismember", type_suffix)
        arr = np.arange(N).astype(dtype)
        values = (np.arange(N) + N).astype(dtype)
        result = ismember(arr, values)
        all_false = np.zeros_like(values, dtype=np.bool_)
        tm.assert_numpy_array_equal(result, all_false)

    def test_mode(self, dtype, type_suffix):
        """The value made more frequent than all others is returned as the mode."""
        N = 53 if dtype in (np.int8, np.uint8) else 11111
        mode = get_ht_function("mode", type_suffix)
        values = np.repeat(np.arange(N).astype(dtype), 5)
        # Overwrite one 0 with 42, so 42 occurs six times and everything else at most five.
        values[0] = 42
        assert mode(values, False) == 42
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "dtype, type_suffix",
    [
        (np.float64, "float64"),
        (np.float32, "float32"),
    ],
)
class TestHelpFunctionsWithNans:
    """Checks how the float helper functions looked up on ``ht`` treat NaN."""

    def test_value_count(self, dtype, type_suffix):
        """NaNs are dropped with flag True and counted as one key with flag False."""
        value_count = get_ht_function("value_count", type_suffix)
        nans = np.array([np.nan, np.nan, np.nan], dtype=dtype)

        keys, _ = value_count(nans, True)
        assert len(keys) == 0

        keys, counts = value_count(nans, False)
        assert len(keys) == 1 and np.all(np.isnan(keys))
        assert counts[0] == 3

    def test_duplicated_first(self, dtype, type_suffix):
        """Every NaN after the first one is flagged as a duplicate."""
        duplicated = get_ht_function("duplicated", type_suffix)
        nans = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        flags = duplicated(nans)
        tm.assert_numpy_array_equal(flags, np.array([False, True, True]))

    def test_ismember_yes(self, dtype, type_suffix):
        """NaN is reported as a member of an array that contains NaN."""
        ismember = get_ht_function("ismember", type_suffix)
        arr = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        values = np.array([np.nan, np.nan], dtype=dtype)
        flags = ismember(arr, values)
        all_true = np.array([True, True, True], dtype=np.bool_)
        tm.assert_numpy_array_equal(flags, all_true)

    def test_ismember_no(self, dtype, type_suffix):
        """NaN is not reported as a member of an array without NaN."""
        ismember = get_ht_function("ismember", type_suffix)
        arr = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        values = np.array([1], dtype=dtype)
        flags = ismember(arr, values)
        all_false = np.array([False, False, False], dtype=np.bool_)
        tm.assert_numpy_array_equal(flags, all_false)

    def test_mode(self, dtype, type_suffix):
        """The mode is 42 with flag True and NaN with flag False."""
        mode = get_ht_function("mode", type_suffix)
        values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype)
        assert mode(values, True) == 42
        assert np.isnan(mode(values, False))
|