# Intelegentny_Pszczelarz/.venv/Lib/site-packages/sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py

import numpy as np
from numpy.testing import assert_array_equal, assert_allclose
import pytest
from sklearn.ensemble._hist_gradient_boosting.binning import (
_BinMapper,
_find_binning_thresholds,
_map_to_bins,
)
from sklearn.ensemble._hist_gradient_boosting.common import X_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import ALMOST_INF
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

n_threads = _openmp_effective_n_threads()
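
# DATA holds 1e6 samples with two continuous features: feature 0 is a standard
# normal centered at 0 and feature 1 is a very narrow normal centered at 10, so
# the two columns exercise very different value scales in the binning code.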
DATA = (
np.random.RandomState(42)
.normal(loc=[0, 10], scale=[1, 0.01], size=(int(1e6), 2))
.astype(X_DTYPE)
)


def test_find_binning_thresholds_regular_data():
data = np.linspace(0, 10, 1001)
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9])
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds, [2, 4, 6, 8])
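

# Illustrative sketch only (not part of scikit-learn): for data with many
# distinct values, thresholds like the ones asserted above can be reproduced
# with plain NumPy percentile midpoints; the real implementation lives in
# sklearn.ensemble._hist_gradient_boosting.binning and may differ in details.
# For example, _percentile_midpoints_sketch(np.linspace(0, 10, 1001), max_bins=10)
# gives approximately [1, 2, ..., 9], matching the first assertion above.
def _percentile_midpoints_sketch(data, max_bins):
    # interior cut points at fractions 1/max_bins, ..., (max_bins - 1)/max_bins
    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
    return np.percentile(data, percentiles, method="midpoint")

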
def test_find_binning_thresholds_small_regular_data():
data = np.linspace(0, 10, 11)
bin_thresholds = _find_binning_thresholds(data, max_bins=5)
assert_allclose(bin_thresholds, [2, 4, 6, 8])
bin_thresholds = _find_binning_thresholds(data, max_bins=10)
assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9])
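    # with max_bins >= the number of distinct values (11 here), the thresholds
    # are simply the midpoints between consecutive distinct values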
bin_thresholds = _find_binning_thresholds(data, max_bins=11)
assert_allclose(bin_thresholds, np.arange(10) + 0.5)
bin_thresholds = _find_binning_thresholds(data, max_bins=255)
assert_allclose(bin_thresholds, np.arange(10) + 0.5)


def test_find_binning_thresholds_random_data():
bin_thresholds = [
_find_binning_thresholds(DATA[:, i], max_bins=255) for i in range(2)
]
for i in range(len(bin_thresholds)):
assert bin_thresholds[i].shape == (254,) # 255 - 1
assert bin_thresholds[i].dtype == DATA.dtype
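    # bins 64, 128 and 192 out of 255 sit near the 25th, 50th and 75th
    # percentiles; for the standard normal feature these quantiles are about
    # -0.67, 0 and +0.67, and for the narrow normal around 10 (scale 0.01)
    # about 9.993, 10 and 10.007, hence the expected values and tolerances.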
assert_allclose(
bin_thresholds[0][[64, 128, 192]], np.array([-0.7, 0.0, 0.7]), atol=1e-1
)
assert_allclose(
bin_thresholds[1][[64, 128, 192]], np.array([9.99, 10.00, 10.01]), atol=1e-2
)


def test_find_binning_thresholds_low_n_bins():
bin_thresholds = [
_find_binning_thresholds(DATA[:, i], max_bins=128) for i in range(2)
]
for i in range(len(bin_thresholds)):
assert bin_thresholds[i].shape == (127,) # 128 - 1
assert bin_thresholds[i].dtype == DATA.dtype
@pytest.mark.parametrize("n_bins", (2, 257))
def test_invalid_n_bins(n_bins):
err_msg = "n_bins={} should be no smaller than 3 and no larger than 256".format(
n_bins
)
with pytest.raises(ValueError, match=err_msg):
_BinMapper(n_bins=n_bins).fit(DATA)


def test_bin_mapper_n_features_transform():
mapper = _BinMapper(n_bins=42, random_state=42).fit(DATA)
err_msg = "This estimator was fitted with 2 features but 4 got passed"
with pytest.raises(ValueError, match=err_msg):
mapper.transform(np.repeat(DATA, 2, axis=1))
@pytest.mark.parametrize("max_bins", [16, 128, 255])
def test_map_to_bins(max_bins):
bin_thresholds = [
_find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2)
]
binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F")
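    # the last bin index (max_bins) is the one reserved for missing values,
    # mirroring missing_values_bin_idx_ = n_bins - 1 in _BinMapper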
last_bin_idx = max_bins
_map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned)
assert binned.shape == DATA.shape
assert binned.dtype == np.uint8
assert binned.flags.f_contiguous
min_indices = DATA.argmin(axis=0)
max_indices = DATA.argmax(axis=0)
for feature_idx, min_idx in enumerate(min_indices):
assert binned[min_idx, feature_idx] == 0
for feature_idx, max_idx in enumerate(max_indices):
assert binned[max_idx, feature_idx] == max_bins - 1
@pytest.mark.parametrize("max_bins", [5, 10, 42])
def test_bin_mapper_random_data(max_bins):
n_samples, n_features = DATA.shape
expected_count_per_bin = n_samples // max_bins
tol = int(0.05 * expected_count_per_bin)
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42).fit(DATA)
binned = mapper.transform(DATA)
assert binned.shape == (n_samples, n_features)
assert binned.dtype == np.uint8
assert_array_equal(binned.min(axis=0), np.array([0, 0]))
assert_array_equal(binned.max(axis=0), np.array([max_bins - 1, max_bins - 1]))
assert len(mapper.bin_thresholds_) == n_features
for bin_thresholds_feature in mapper.bin_thresholds_:
assert bin_thresholds_feature.shape == (max_bins - 1,)
assert bin_thresholds_feature.dtype == DATA.dtype
assert np.all(mapper.n_bins_non_missing_ == max_bins)
# Check that the binned data is approximately balanced across bins.
for feature_idx in range(n_features):
for bin_idx in range(max_bins):
count = (binned[:, feature_idx] == bin_idx).sum()
assert abs(count - expected_count_per_bin) < tol
@pytest.mark.parametrize("n_samples, max_bins", [(5, 5), (5, 10), (5, 11), (42, 255)])
def test_bin_mapper_small_random_data(n_samples, max_bins):
data = np.random.RandomState(42).normal(size=n_samples).reshape(-1, 1)
assert len(np.unique(data)) == n_samples
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
mapper = _BinMapper(n_bins=n_bins, random_state=42)
binned = mapper.fit_transform(data)
assert binned.shape == data.shape
assert binned.dtype == np.uint8
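    # all values are distinct and n_samples <= max_bins, so each value gets its
    # own bin and the bin index equals the rank of the value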
assert_array_equal(binned.ravel()[np.argsort(data.ravel())], np.arange(n_samples))


@pytest.mark.parametrize(
"max_bins, n_distinct, multiplier",
[
(5, 5, 1),
(5, 5, 3),
(255, 12, 42),
],
)
def test_bin_mapper_identity_repeated_values(max_bins, n_distinct, multiplier):
data = np.array(list(range(n_distinct)) * multiplier).reshape(-1, 1)
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
assert_array_equal(data, binned)
@pytest.mark.parametrize("n_distinct", [2, 7, 42])
def test_bin_mapper_repeated_values_invariance(n_distinct):
rng = np.random.RandomState(42)
distinct_values = rng.normal(size=n_distinct)
assert len(np.unique(distinct_values)) == n_distinct
repeated_indices = rng.randint(low=0, high=n_distinct, size=1000)
data = distinct_values[repeated_indices]
rng.shuffle(data)
assert_array_equal(np.unique(data), np.sort(distinct_values))
data = data.reshape(-1, 1)
mapper_1 = _BinMapper(n_bins=n_distinct + 1)
binned_1 = mapper_1.fit_transform(data)
assert_array_equal(np.unique(binned_1[:, 0]), np.arange(n_distinct))
# Adding more bins to the mapper yields the same results (same thresholds)
mapper_2 = _BinMapper(n_bins=min(256, n_distinct * 3) + 1)
binned_2 = mapper_2.fit_transform(data)
assert_allclose(mapper_1.bin_thresholds_[0], mapper_2.bin_thresholds_[0])
assert_array_equal(binned_1, binned_2)


@pytest.mark.parametrize(
"max_bins, scale, offset",
[
(3, 2, -1),
(42, 1, 0),
(255, 0.3, 42),
],
)
def test_bin_mapper_identity_small(max_bins, scale, offset):
data = np.arange(max_bins).reshape(-1, 1) * scale + offset
# max_bins is the number of bins for non-missing values
n_bins = max_bins + 1
binned = _BinMapper(n_bins=n_bins).fit_transform(data)
assert_array_equal(binned, np.arange(max_bins).reshape(-1, 1))


@pytest.mark.parametrize(
"max_bins_small, max_bins_large",
[
(2, 2),
(3, 3),
(4, 4),
(42, 42),
(255, 255),
(5, 17),
(42, 255),
],
)
def test_bin_mapper_idempotence(max_bins_small, max_bins_large):
assert max_bins_large >= max_bins_small
data = np.random.RandomState(42).normal(size=30000).reshape(-1, 1)
mapper_small = _BinMapper(n_bins=max_bins_small + 1)
    mapper_large = _BinMapper(n_bins=max_bins_large + 1)
binned_small = mapper_small.fit_transform(data)
binned_large = mapper_large.fit_transform(binned_small)
assert_array_equal(binned_small, binned_large)
@pytest.mark.parametrize("n_bins", [10, 100, 256])
@pytest.mark.parametrize("diff", [-5, 0, 5])
def test_n_bins_non_missing(n_bins, diff):
# Check that n_bins_non_missing is n_unique_values when
# there are not a lot of unique values, else n_bins - 1.
n_unique_values = n_bins + diff
X = list(range(n_unique_values)) * 2
X = np.array(X).reshape(-1, 1)
mapper = _BinMapper(n_bins=n_bins).fit(X)
assert np.all(mapper.n_bins_non_missing_ == min(n_bins - 1, n_unique_values))


def test_subsample():
    # Make sure that fitting on a small random subsample (subsample=256) yields
    # different bin thresholds than fitting on the full data (subsample=None).
mapper_no_subsample = _BinMapper(subsample=None, random_state=0).fit(DATA)
mapper_subsample = _BinMapper(subsample=256, random_state=0).fit(DATA)
for feature in range(DATA.shape[1]):
assert not np.allclose(
mapper_no_subsample.bin_thresholds_[feature],
mapper_subsample.bin_thresholds_[feature],
rtol=1e-4,
)


@pytest.mark.parametrize(
"n_bins, n_bins_non_missing, X_trans_expected",
[
(
256,
[4, 2, 2],
[
[0, 0, 0], # 255 <=> missing value
[255, 255, 0],
[1, 0, 0],
[255, 1, 1],
[2, 1, 1],
[3, 0, 0],
],
),
(
3,
[2, 2, 2],
[
[0, 0, 0], # 2 <=> missing value
[2, 2, 0],
[0, 0, 0],
[2, 1, 1],
[1, 1, 1],
[1, 0, 0],
],
),
],
)
def test_missing_values_support(n_bins, n_bins_non_missing, X_trans_expected):
    # Check support for missing values: make sure NaNs are mapped to the last
    # bin and that the _BinMapper attributes are correct.
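    # feature 0 has 4 distinct non-missing values {1, 2, 3, 4} while features 1
    # and 2 have only 2 each, which is what n_bins_non_missing_ reports when
    # n_bins is large enough; with n_bins=3 every feature is capped at 2
    # non-missing bins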
X = [
[1, 1, 0],
        [np.nan, np.nan, 0],
        [2, 1, 0],
        [np.nan, 2, 1],
[3, 2, 1],
[4, 1, 0],
]
X = np.array(X)
mapper = _BinMapper(n_bins=n_bins)
mapper.fit(X)
assert_array_equal(mapper.n_bins_non_missing_, n_bins_non_missing)
for feature_idx in range(X.shape[1]):
assert (
len(mapper.bin_thresholds_[feature_idx])
== n_bins_non_missing[feature_idx] - 1
)
assert mapper.missing_values_bin_idx_ == n_bins - 1
X_trans = mapper.transform(X)
assert_array_equal(X_trans, X_trans_expected)


def test_infinite_values():
# Make sure infinite values are properly handled.
bin_mapper = _BinMapper()
X = np.array([-np.inf, 0, 1, np.inf]).reshape(-1, 1)
bin_mapper.fit(X)
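    # the midpoint between 1 and +inf would be infinite; it is presumably
    # clipped to the large finite constant ALMOST_INF so that threshold
    # comparisons stay well defined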
assert_allclose(bin_mapper.bin_thresholds_[0], [-np.inf, 0.5, ALMOST_INF])
assert bin_mapper.n_bins_non_missing_ == [4]
expected_binned_X = np.array([0, 1, 2, 3]).reshape(-1, 1)
assert_array_equal(bin_mapper.transform(X), expected_binned_X)
@pytest.mark.parametrize("n_bins", [15, 256])
def test_categorical_feature(n_bins):
# Basic test for categorical features
# we make sure that categories are mapped into [0, n_categories - 1] and
# that nans are mapped to the last bin
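    # for a categorical feature, bin_thresholds_ stores the sorted known
    # categories and each raw category value is mapped to its index in that
    # array (see the asserts below)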
X = np.array(
[[4] * 500 + [1] * 3 + [10] * 4 + [0] * 4 + [13] + [7] * 5 + [np.nan] * 2],
dtype=X_DTYPE,
).T
known_categories = [np.unique(X[~np.isnan(X)])]
bin_mapper = _BinMapper(
n_bins=n_bins,
is_categorical=np.array([True]),
known_categories=known_categories,
).fit(X)
assert bin_mapper.n_bins_non_missing_ == [6]
assert_array_equal(bin_mapper.bin_thresholds_[0], [0, 1, 4, 7, 10, 13])
X = np.array([[0, 1, 4, np.nan, 7, 10, 13]], dtype=X_DTYPE).T
expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T
assert_array_equal(bin_mapper.transform(X), expected_trans)
    # For unknown categories, the mapping is incorrect / undefined. This never
    # happens in practice; the check below is only for illustration purposes.
X = np.array([[-1, 100]], dtype=X_DTYPE).T
expected_trans = np.array([[0, 6]]).T
assert_array_equal(bin_mapper.transform(X), expected_trans)
@pytest.mark.parametrize("n_bins", (128, 256))
def test_categorical_with_numerical_features(n_bins):
# basic check for binmapper with mixed data
X1 = np.arange(10, 20).reshape(-1, 1) # numerical
X2 = np.arange(10, 15).reshape(-1, 1) # categorical
X2 = np.r_[X2, X2]
X = np.c_[X1, X2]
known_categories = [None, np.unique(X2).astype(X_DTYPE)]
bin_mapper = _BinMapper(
n_bins=n_bins,
is_categorical=np.array([False, True]),
known_categories=known_categories,
).fit(X)
assert_array_equal(bin_mapper.n_bins_non_missing_, [10, 5])
bin_thresholds = bin_mapper.bin_thresholds_
assert len(bin_thresholds) == 2
assert_array_equal(bin_thresholds[1], np.arange(10, 15))
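    # column 0: the numerical values 10..19 fall into bins 0..9; column 1: the
    # five categories 10..14 (each repeated twice) map to bins 0..4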
expected_X_trans = [
[0, 0],
[1, 1],
[2, 2],
[3, 3],
[4, 4],
[5, 0],
[6, 1],
[7, 2],
[8, 3],
[9, 4],
]
assert_array_equal(bin_mapper.transform(X), expected_X_trans)


def test_make_known_categories_bitsets():
# Check the output of make_known_categories_bitsets
X = np.array(
[[14, 2, 30], [30, 4, 70], [40, 10, 180], [40, 240, 180]], dtype=X_DTYPE
)
bin_mapper = _BinMapper(
n_bins=256,
is_categorical=np.array([False, True, True]),
known_categories=[None, X[:, 1], X[:, 2]],
)
bin_mapper.fit(X)
known_cat_bitsets, f_idx_map = bin_mapper.make_known_categories_bitsets()
    # Note that for non-categorical features, values are left at 0
expected_f_idx_map = np.array([0, 0, 1], dtype=np.uint8)
assert_allclose(expected_f_idx_map, f_idx_map)
expected_cat_bitset = np.zeros((2, 8), dtype=np.uint32)
# first categorical feature: [2, 4, 10, 240]
f_idx = 1
mapped_f_idx = f_idx_map[f_idx]
expected_cat_bitset[mapped_f_idx, 0] = 2**2 + 2**4 + 2**10
    # 240 = 32 * 7 + 16, therefore bit 16 of word 7 is set to 1.
expected_cat_bitset[mapped_f_idx, 7] = 2**16
# second categorical feature [30, 70, 180]
f_idx = 2
mapped_f_idx = f_idx_map[f_idx]
expected_cat_bitset[mapped_f_idx, 0] = 2**30
expected_cat_bitset[mapped_f_idx, 2] = 2**6
expected_cat_bitset[mapped_f_idx, 5] = 2**20
assert_allclose(expected_cat_bitset, known_cat_bitsets)
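

# Layout note (illustrative sketch, not a scikit-learn API): each categorical
# feature gets a row of 8 uint32 words (256 bits) in the bitset checked above,
# and a raw category value v sets bit v % 32 of word v // 32 (e.g. 240 -> word
# 7, bit 16). Filling a fresh np.zeros(8, dtype=np.uint32) row with the
# categories 2, 4, 10 and 240 via this hypothetical helper reproduces
# expected_cat_bitset[0] asserted above.
def _set_category_bit_sketch(bitset_row, value):
    # bitset_row: length-8 uint32 array; value: integer category in [0, 255]
    bitset_row[value // 32] |= np.uint32(2 ** (value % 32))

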
@pytest.mark.parametrize(
"is_categorical, known_categories, match",
[
(np.array([True]), [None], "Known categories for feature 0 must be provided"),
(
np.array([False]),
np.array([1, 2, 3]),
"isn't marked as a categorical feature, but categories were passed",
),
],
)
def test_categorical_parameters(is_categorical, known_categories, match):
# test the validation of the is_categorical and known_categories parameters
X = np.array([[1, 2, 3]], dtype=X_DTYPE)
bin_mapper = _BinMapper(
is_categorical=is_categorical, known_categories=known_categories
)
with pytest.raises(ValueError, match=match):
bin_mapper.fit(X)