import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE
from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint
from sklearn.ensemble._hist_gradient_boosting.splitting import (
    Splitter,
    compute_node_value,
)
from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder
from sklearn.utils._testing import skip_if_32bit
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads

n_threads = _openmp_effective_n_threads()


@pytest.mark.parametrize("n_bins", [3, 32, 256])
def test_histogram_split(n_bins):
    rng = np.random.RandomState(42)
    feature_idx = 0
    l2_regularization = 0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0
    X_binned = np.asfortranarray(
        rng.randint(0, n_bins - 1, size=(int(1e4), 1)), dtype=X_BINNED_DTYPE
    )
    binned_feature = X_binned.T[feature_idx]
    sample_indices = np.arange(binned_feature.shape[0], dtype=np.uint32)
    ordered_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    all_hessians = ordered_hessians
    sum_hessians = all_hessians.sum()
    hessians_are_constant = False
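    # Gradients are constructed so that samples in bins <= true_bin get
    # gradient -sign and samples in higher bins get +sign: the best split
    # must therefore separate the two groups exactly at true_bin, which is
    # what the assertions at the end of the loop check.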
    for true_bin in range(1, n_bins - 2):
        for sign in [-1, 1]:
            ordered_gradients = np.full_like(binned_feature, sign, dtype=G_H_DTYPE)
            ordered_gradients[binned_feature <= true_bin] *= -1
            all_gradients = ordered_gradients
            sum_gradients = all_gradients.sum()

            builder = HistogramBuilder(
                X_binned,
                n_bins,
                all_gradients,
                all_hessians,
                hessians_are_constant,
                n_threads,
            )
            n_bins_non_missing = np.array(
                [n_bins - 1] * X_binned.shape[1], dtype=np.uint32
            )
            has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
            monotonic_cst = np.array(
                [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
            )
            is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
            missing_values_bin_idx = n_bins - 1
            splitter = Splitter(
                X_binned,
                n_bins_non_missing,
                missing_values_bin_idx,
                has_missing_values,
                is_categorical,
                monotonic_cst,
                l2_regularization,
                min_hessian_to_split,
                min_samples_leaf,
                min_gain_to_split,
                hessians_are_constant,
            )

            histograms = builder.compute_histograms_brute(sample_indices)
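            # compute_node_value returns the value the node would predict,
            # roughly -sum_gradients / (sum_hessians + l2_regularization),
            # clipped to the given lower/upper monotonic-constraint bounds
            # (-inf/+inf here, i.e. unconstrained).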
            value = compute_node_value(
                sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
            )
            split_info = splitter.find_node_split(
                sample_indices.shape[0], histograms, sum_gradients, sum_hessians, value
            )

            assert split_info.bin_idx == true_bin
            assert split_info.gain >= 0
            assert split_info.feature_idx == feature_idx
            assert (
                split_info.n_samples_left + split_info.n_samples_right
                == sample_indices.shape[0]
            )
            # Constant hessian: 1. per sample.
            assert split_info.n_samples_left == split_info.sum_hessian_left


@skip_if_32bit
@pytest.mark.parametrize("constant_hessian", [True, False])
def test_gradient_and_hessian_sanity(constant_hessian):
    # This test checks that the values of gradients and hessians are
    # consistent in different places:
    # - in split_info: si.sum_gradient_left + si.sum_gradient_right must be
    #   equal to the gradient at the node. Same for hessians.
    # - in the histograms: summing 'sum_gradients' over the bins must be
    #   constant across all features, and those sums must be equal to the
    #   node's gradient. Same for hessians.

    rng = np.random.RandomState(42)

    n_bins = 10
    n_features = 20
    n_samples = 500
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    X_binned = rng.randint(
        0, n_bins, size=(n_samples, n_features), dtype=X_BINNED_DTYPE
    )
    X_binned = np.asfortranarray(X_binned)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    sum_gradients = all_gradients.sum()
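    # When hessians are constant, only a single-element hessian array is
    # passed to the builder and the total hessian is simply n_samples times
    # that value; otherwise a full per-sample hessian array is used.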
    if constant_hessian:
        all_hessians = np.ones(1, dtype=G_H_DTYPE)
        sum_hessians = 1 * n_samples
    else:
        all_hessians = rng.lognormal(size=n_samples).astype(G_H_DTYPE)
        sum_hessians = all_hessians.sum()

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, constant_hessian, n_threads
    )
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        constant_hessian,
    )

    hists_parent = builder.compute_histograms_brute(sample_indices)
    value_parent = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    si_parent = splitter.find_node_split(
        n_samples, hists_parent, sum_gradients, sum_hessians, value_parent
    )
    sample_indices_left, sample_indices_right, _ = splitter.split_indices(
        si_parent, sample_indices
    )

    hists_left = builder.compute_histograms_brute(sample_indices_left)
    value_left = compute_node_value(
        si_parent.sum_gradient_left,
        si_parent.sum_hessian_left,
        -np.inf,
        np.inf,
        l2_regularization,
    )
    hists_right = builder.compute_histograms_brute(sample_indices_right)
    value_right = compute_node_value(
        si_parent.sum_gradient_right,
        si_parent.sum_hessian_right,
        -np.inf,
        np.inf,
        l2_regularization,
    )
    si_left = splitter.find_node_split(
        n_samples,
        hists_left,
        si_parent.sum_gradient_left,
        si_parent.sum_hessian_left,
        value_left,
    )
    si_right = splitter.find_node_split(
        n_samples,
        hists_right,
        si_parent.sum_gradient_right,
        si_parent.sum_hessian_right,
        value_right,
    )

    # make sure that si.sum_gradient_left + si.sum_gradient_right has its
    # expected value, same for hessians
    for si, indices in (
        (si_parent, sample_indices),
        (si_left, sample_indices_left),
        (si_right, sample_indices_right),
    ):
        gradient = si.sum_gradient_right + si.sum_gradient_left
        expected_gradient = all_gradients[indices].sum()
        hessian = si.sum_hessian_right + si.sum_hessian_left
        if constant_hessian:
            expected_hessian = indices.shape[0] * all_hessians[0]
        else:
            expected_hessian = all_hessians[indices].sum()

        assert np.isclose(gradient, expected_gradient)
        assert np.isclose(hessian, expected_hessian)

    # make sure the sums of gradients in the histograms are the same for all
    # features, and make sure they're equal to their expected value
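    # Each histogram is a structured HISTOGRAM_DTYPE array with one row per
    # feature and one column per bin, carrying 'sum_gradients' and
    # 'sum_hessians' fields (plus a sample 'count').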
    hists_parent = np.asarray(hists_parent, dtype=HISTOGRAM_DTYPE)
    hists_left = np.asarray(hists_left, dtype=HISTOGRAM_DTYPE)
    hists_right = np.asarray(hists_right, dtype=HISTOGRAM_DTYPE)
    for hists, indices in (
        (hists_parent, sample_indices),
        (hists_left, sample_indices_left),
        (hists_right, sample_indices_right),
    ):
        # note: gradients and hessians have shape (n_features,),
        # we're comparing them to *scalars*. This has the benefit of also
        # making sure that all the entries are equal across features.
        gradients = hists["sum_gradients"].sum(axis=1)  # shape = (n_features,)
        expected_gradient = all_gradients[indices].sum()  # scalar
        hessians = hists["sum_hessians"].sum(axis=1)
        if constant_hessian:
            # 0 is not the actual hessian, but it's not computed in this case
            expected_hessian = 0.0
        else:
            expected_hessian = all_hessians[indices].sum()

        assert np.allclose(gradients, expected_gradient)
        assert np.allclose(hessians, expected_hessian)


def test_split_indices():
    # Check that split_indices returns the correct splits and that
    # splitter.partition is consistent with what is returned.
    rng = np.random.RandomState(421)

    n_bins = 5
    n_samples = 10
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    # split will happen on feature 1 and on bin 3
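    # A split on bin_idx=3 sends samples whose binned value is <= 3 to the
    # left child and the rest to the right: with the data below, rows
    # {0, 1, 3, 4, 5, 6, 8} (values 0 or 3) go left and rows {2, 7, 9}
    # (value 4) go right.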
    X_binned = [
        [0, 0],
        [0, 3],
        [0, 4],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 4],
        [0, 0],
        [0, 4],
    ]
    X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE)
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = rng.randn(n_samples).astype(G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = 1 * n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    assert np.all(sample_indices == splitter.partition)

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    si_root = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    # sanity checks for best split
    assert si_root.feature_idx == 1
    assert si_root.bin_idx == 3

    samples_left, samples_right, position_right = splitter.split_indices(
        si_root, splitter.partition
    )
    assert set(samples_left) == set([0, 1, 3, 4, 5, 6, 8])
    assert set(samples_right) == set([2, 7, 9])

    assert list(samples_left) == list(splitter.partition[:position_right])
    assert list(samples_right) == list(splitter.partition[position_right:])

    # Check that the resulting split indices sizes are consistent with the
    # count statistics anticipated when looking for the best split.
    assert samples_left.shape[0] == si_root.n_samples_left
    assert samples_right.shape[0] == si_root.n_samples_right


def test_min_gain_to_split():
    # Try to split a pure node (all gradients are equal, same for hessians)
    # with min_gain_to_split = 0 and make sure that the node is not split (best
    # possible gain = -1). Note: before the strict inequality comparison, this
    # test would fail because the node would be split with a gain of 0.
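    # With every gradient and hessian equal to 1 and no regularization, any
    # left/right partition with n_l and n_r samples gives, up to a constant
    # factor, gain ~ n_l**2 / n_l + n_r**2 / n_r - (n_l + n_r)**2 / (n_l + n_r),
    # which is 0, so no candidate split can exceed min_gain_to_split.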
    rng = np.random.RandomState(42)
    l2_regularization = 0
    min_hessian_to_split = 0
    min_samples_leaf = 1
    min_gain_to_split = 0.0
    n_bins = 255
    n_samples = 100
    X_binned = np.asfortranarray(
        rng.randint(0, n_bins, size=(n_samples, 1)), dtype=X_BINNED_DTYPE
    )
    binned_feature = X_binned[:, 0]
    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_hessians = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    all_gradients = np.ones_like(binned_feature, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = all_hessians.sum()
    hessians_are_constant = False

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )
    n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32)
    has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )
    assert split_info.gain == -1


@pytest.mark.parametrize(
    "X_binned, all_gradients, has_missing_values, n_bins_non_missing, "
    " expected_split_on_nan, expected_bin_idx, expected_go_to_left",
    [
        # basic sanity check with no missing values: given the gradient
        # values, the split must occur on bin_idx=3
        (
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],  # X_binned
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],  # gradients
            False,  # no missing values
            10,  # n_bins_non_missing
            False,  # don't split on nans
            3,  # expected_bin_idx
            "not_applicable",
        ),
        # We replace 2 samples by NaNs (bin_idx=8)
        # These 2 samples were mapped to the left node before, so they should
        # be mapped to left node again
        # Notice how the bin_idx threshold changes from 3 to 1.
        (
            [8, 0, 1, 8, 2, 3, 4, 5, 6, 7],  # 8 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            8,  # n_bins_non_missing
            False,  # don't split on nans
            1,  # cut on bin_idx=1
            True,
        ),  # missing values go to left
        # same as above, but with non-consecutive missing_values_bin
        (
            [9, 0, 1, 9, 2, 3, 4, 5, 6, 7],  # 9 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            8,  # n_bins_non_missing
            False,  # don't split on nans
            1,  # cut on bin_idx=1
            True,
        ),  # missing values go to left
        # this time replacing 2 samples that were on the right.
        (
            [0, 1, 2, 3, 8, 4, 8, 5, 6, 7],  # 8 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            8,  # n_bins_non_missing
            False,  # don't split on nans
            3,  # cut on bin_idx=3 (like in first case)
            False,
        ),  # missing values go to right
        # same as above, but with non-consecutive missing_values_bin
        (
            [0, 1, 2, 3, 9, 4, 9, 5, 6, 7],  # 9 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            8,  # n_bins_non_missing
            False,  # don't split on nans
            3,  # cut on bin_idx=3 (like in first case)
            False,
        ),  # missing values go to right
        # For the following cases, split_on_nans is True (we replace all of
        # the samples with nans, instead of just 2).
        (
            [0, 1, 2, 3, 4, 4, 4, 4, 4, 4],  # 4 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            4,  # n_bins_non_missing
            True,  # split on nans
            3,  # cut on bin_idx=3
            False,
        ),  # missing values go to right
        # same as above, but with non-consecutive missing_values_bin
        (
            [0, 1, 2, 3, 9, 9, 9, 9, 9, 9],  # 9 <=> missing
            [1, 1, 1, 1, 1, 1, 5, 5, 5, 5],
            True,  # missing values
            4,  # n_bins_non_missing
            True,  # split on nans
            3,  # cut on bin_idx=3
            False,
        ),  # missing values go to right
        (
            [6, 6, 6, 6, 0, 1, 2, 3, 4, 5],  # 6 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            6,  # n_bins_non_missing
            True,  # split on nans
            5,  # cut on bin_idx=5
            False,
        ),  # missing values go to right
        # same as above, but with non-consecutive missing_values_bin
        (
            [9, 9, 9, 9, 0, 1, 2, 3, 4, 5],  # 9 <=> missing
            [1, 1, 1, 1, 5, 5, 5, 5, 5, 5],
            True,  # missing values
            6,  # n_bins_non_missing
            True,  # split on nans
            5,  # cut on bin_idx=5
            False,
        ),  # missing values go to right
    ],
)
def test_splitting_missing_values(
    X_binned,
    all_gradients,
    has_missing_values,
    n_bins_non_missing,
    expected_split_on_nan,
    expected_bin_idx,
    expected_go_to_left,
):
    # Make sure missing values are properly supported.
    # we build an artificial example with gradients such that the best split
    # is on bin_idx=3, when there are no missing values.
    # Then we introduce missing values and:
    # - make sure the chosen bin is correct (find_best_bin()): it's
    #   still the same split, even though the index of the bin may change
    # - make sure the missing values are mapped to the correct child
    #   (split_indices())
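    # Missing values are binned into missing_values_bin_idx = n_bins - 1, and
    # "splitting on nans" below means that the chosen bin is the last
    # non-missing bin (n_bins_non_missing - 1), i.e. going to the right child
    # is equivalent to being missing.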
    n_bins = max(X_binned) + 1
    n_samples = len(X_binned)
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    sample_indices = np.arange(n_samples, dtype=np.uint32)
    X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
    X_binned = np.asfortranarray(X_binned)
    all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = 1 * n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )

    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1
    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    assert split_info.bin_idx == expected_bin_idx
    if has_missing_values:
        assert split_info.missing_go_to_left == expected_go_to_left

    split_on_nan = split_info.bin_idx == n_bins_non_missing[0] - 1
    assert split_on_nan == expected_split_on_nan

    # Make sure the split is properly computed.
    # This also makes sure missing values are properly assigned to the correct
    # child in split_indices()
    samples_left, samples_right, _ = splitter.split_indices(
        split_info, splitter.partition
    )

    if not expected_split_on_nan:
        # When we don't split on nans, the split should always be the same.
        assert set(samples_left) == set([0, 1, 2, 3])
        assert set(samples_right) == set([4, 5, 6, 7, 8, 9])
    else:
        # When we split on nans, samples with missing values are always mapped
        # to the right child.
        missing_samples_indices = np.flatnonzero(
            np.array(X_binned) == missing_values_bin_idx
        )
        non_missing_samples_indices = np.flatnonzero(
            np.array(X_binned) != missing_values_bin_idx
        )

        assert set(samples_right) == set(missing_samples_indices)
        assert set(samples_left) == set(non_missing_samples_indices)


@pytest.mark.parametrize(
    "X_binned, has_missing_values, n_bins_non_missing, ",
    [
        # one category
        ([0] * 20, False, 1),
        # all categories appear less than MIN_CAT_SUPPORT (hardcoded to 10)
        ([0] * 9 + [1] * 8, False, 2),
        # only one category appears more than MIN_CAT_SUPPORT
        ([0] * 12 + [1] * 8, False, 2),
        # missing values + category appear less than MIN_CAT_SUPPORT
        # 9 is missing
        ([0] * 9 + [1] * 8 + [9] * 4, True, 2),
        # no non-missing category
        ([9] * 11, True, 0),
    ],
)
def test_splitting_categorical_cat_smooth(
    X_binned, has_missing_values, n_bins_non_missing
):
    # Check that no categorical split is found when the MIN_CAT_SUPPORT
    # constraint isn't respected: no split should be returned at all.
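    # With unit hessians, as used here, a category's support is simply its
    # number of samples, so categories seen fewer than MIN_CAT_SUPPORT = 10
    # times cannot take part in a categorical split.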
    n_bins = max(X_binned) + 1
    n_samples = len(X_binned)
    X_binned = np.array([X_binned], dtype=X_BINNED_DTYPE).T
    X_binned = np.asfortranarray(X_binned)

    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = np.ones(n_samples, dtype=G_H_DTYPE)
    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_gradients = all_gradients.sum()
    sum_hessians = n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )

    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8)
    missing_values_bin_idx = n_bins - 1

    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)
    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    # no split found
    assert split_info.gain == -1


def _assert_categories_equals_bitset(categories, bitset):
    # assert that the bitset exactly corresponds to the categories
    # bitset is assumed to be an array of 8 uint32 elements
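    # Each category maps to one bit: e.g. category 35 sets bit 35 % 32 = 3 of
    # word 35 // 32 = 1, so 8 uint32 words cover the 256 possible bins.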
    # form expected bitset from the categories
    expected_bitset = np.zeros(8, dtype=np.uint32)
    for cat in categories:
        idx = cat // 32
        shift = cat % 32
        expected_bitset[idx] |= 1 << shift

    # check for equality
    assert_array_equal(expected_bitset, bitset)


@pytest.mark.parametrize(
    "X_binned, all_gradients, expected_categories_left, n_bins_non_missing,"
    "missing_values_bin_idx, has_missing_values, expected_missing_go_to_left",
    [
        # 4 categories
        (
            [0, 1, 2, 3] * 11,  # X_binned
            [10, 1, 10, 10] * 11,  # all_gradients
            [1],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,
        ),  # expected_missing_go_to_left, unchecked
        # Make sure that the categories that are on the right (second half) of
        # the sorted categories array can still go in the left child. In this
        # case, the best split was found when scanning from right to left.
        (
            [0, 1, 2, 3] * 11,  # X_binned
            [10, 10, 10, 1] * 11,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,
        ),  # expected_missing_go_to_left, unchecked
        # categories that don't respect MIN_CAT_SUPPORT (cat 4) are always
        # mapped to the right child
        (
            [0, 1, 2, 3] * 11 + [4] * 5,  # X_binned
            [10, 10, 10, 1] * 11 + [10] * 5,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,
        ),  # expected_missing_go_to_left, unchecked
        # categories that don't respect MIN_CAT_SUPPORT are always mapped to
        # the right child: in this case a more sensible split could have been
        # 3, 4 - 0, 1, 2
        # But the split is still 3 - 0, 1, 2, 4. This is because we only scan
        # up to the middle of the sorted category array (0, 1, 2, 3), and
        # because we exclude cat 4 in this array.
        (
            [0, 1, 2, 3] * 11 + [4] * 5,  # X_binned
            [10, 10, 10, 1] * 11 + [1] * 5,  # all_gradients
            [3],  # expected_categories_left
            4,  # n_bins_non_missing
            4,  # missing_values_bin_idx
            False,  # has_missing_values
            None,
        ),  # expected_missing_go_to_left, unchecked
        # 4 categories with missing values that go to the right
        (
            [0, 1, 2] * 11 + [9] * 11,  # X_binned
            [10, 1, 10] * 11 + [10] * 11,  # all_gradients
            [1],  # expected_categories_left
            3,  # n_bins_non_missing
            9,  # missing_values_bin_idx
            True,  # has_missing_values
            False,
        ),  # expected_missing_go_to_left
        # 4 categories with missing values that go to the left
        (
            [0, 1, 2] * 11 + [9] * 11,  # X_binned
            [10, 1, 10] * 11 + [1] * 11,  # all_gradients
            [1, 9],  # expected_categories_left
            3,  # n_bins_non_missing
            9,  # missing_values_bin_idx
            True,  # has_missing_values
            True,
        ),  # expected_missing_go_to_left
        # split is on the missing value
        (
            [0, 1, 2, 3, 4] * 11 + [255] * 12,  # X_binned
            [10, 10, 10, 10, 10] * 11 + [1] * 12,  # all_gradients
            [255],  # expected_categories_left
            5,  # n_bins_non_missing
            255,  # missing_values_bin_idx
            True,  # has_missing_values
            True,
        ),  # expected_missing_go_to_left
        # split on even categories
        (
            list(range(60)) * 12,  # X_binned
            [10, 1] * 360,  # all_gradients
            list(range(1, 60, 2)),  # expected_categories_left
            59,  # n_bins_non_missing
            59,  # missing_values_bin_idx
            True,  # has_missing_values
            True,
        ),  # expected_missing_go_to_left
        # split on every 8 categories
        (
            list(range(256)) * 12,  # X_binned
            [10, 10, 10, 10, 10, 10, 10, 1] * 384,  # all_gradients
            list(range(7, 256, 8)),  # expected_categories_left
            255,  # n_bins_non_missing
            255,  # missing_values_bin_idx
            True,  # has_missing_values
            True,
        ),  # expected_missing_go_to_left
    ],
)
def test_splitting_categorical_sanity(
    X_binned,
    all_gradients,
    expected_categories_left,
    n_bins_non_missing,
    missing_values_bin_idx,
    has_missing_values,
    expected_missing_go_to_left,
):
    # Tests various combinations of categorical splits
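    # Roughly speaking, candidate categorical splits are found by sorting the
    # categories by their sum_gradients / sum_hessians ratio and scanning that
    # ordering like a numerical feature, which is why an arbitrary subset such
    # as {1} or {3} can end up in the left child.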
    n_samples = len(X_binned)
    n_bins = max(X_binned) + 1

    X_binned = np.array(X_binned, dtype=X_BINNED_DTYPE).reshape(-1, 1)
    X_binned = np.asfortranarray(X_binned)

    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_gradients = np.array(all_gradients, dtype=G_H_DTYPE)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    has_missing_values = np.array([has_missing_values], dtype=np.uint8)
    sum_gradients = all_gradients.sum()
    sum_hessians = n_samples
    hessians_are_constant = True

    builder = HistogramBuilder(
        X_binned, n_bins, all_gradients, all_hessians, hessians_are_constant, n_threads
    )

    n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32)
    monotonic_cst = np.array(
        [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
    )
    is_categorical = np.ones_like(monotonic_cst, dtype=np.uint8)

    splitter = Splitter(
        X_binned,
        n_bins_non_missing,
        missing_values_bin_idx,
        has_missing_values,
        is_categorical,
        monotonic_cst,
        l2_regularization,
        min_hessian_to_split,
        min_samples_leaf,
        min_gain_to_split,
        hessians_are_constant,
    )

    histograms = builder.compute_histograms_brute(sample_indices)

    value = compute_node_value(
        sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
    )
    split_info = splitter.find_node_split(
        n_samples, histograms, sum_gradients, sum_hessians, value
    )

    assert split_info.is_categorical
    assert split_info.gain > 0
    _assert_categories_equals_bitset(
        expected_categories_left, split_info.left_cat_bitset
    )
    if has_missing_values:
        assert split_info.missing_go_to_left == expected_missing_go_to_left
    # If there is no missing value during training, the flag missing_go_to_left
    # is set later in the grower.

    # make sure samples are split correctly
    samples_left, samples_right, _ = splitter.split_indices(
        split_info, splitter.partition
    )

    left_mask = np.isin(X_binned.ravel(), expected_categories_left)
    assert_array_equal(sample_indices[left_mask], samples_left)
    assert_array_equal(sample_indices[~left_mask], samples_right)


def test_split_interaction_constraints():
    """Check that allowed_features are respected."""
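    # When allowed_features is passed to find_node_split, only those feature
    # indices are considered as split candidates; passing None means that all
    # features may be split on.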
    n_features = 4
    # features 1 and 2 are not allowed to be split on
    allowed_features = np.array([0, 3], dtype=np.uint32)
    n_bins = 5
    n_samples = 10
    l2_regularization = 0.0
    min_hessian_to_split = 1e-3
    min_samples_leaf = 1
    min_gain_to_split = 0.0

    sample_indices = np.arange(n_samples, dtype=np.uint32)
    all_hessians = np.ones(1, dtype=G_H_DTYPE)
    sum_hessians = n_samples
    hessians_are_constant = True

    split_features = []

    # The loop is to ensure that we split at least once on each allowed feature (0, 3).
    # This is tracked by split_features and checked at the end.
    for i in range(10):
        rng = np.random.RandomState(919 + i)
        X_binned = np.asfortranarray(
            rng.randint(0, n_bins - 1, size=(n_samples, n_features)),
            dtype=X_BINNED_DTYPE,
        )
        X_binned = np.asfortranarray(X_binned, dtype=X_BINNED_DTYPE)

        # Make feature 1 very important
        all_gradients = (10 * X_binned[:, 1] + rng.randn(n_samples)).astype(G_H_DTYPE)
        sum_gradients = all_gradients.sum()

        builder = HistogramBuilder(
            X_binned,
            n_bins,
            all_gradients,
            all_hessians,
            hessians_are_constant,
            n_threads,
        )
        n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32)
        has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8)
        monotonic_cst = np.array(
            [MonotonicConstraint.NO_CST] * X_binned.shape[1], dtype=np.int8
        )
        is_categorical = np.zeros_like(monotonic_cst, dtype=np.uint8)
        missing_values_bin_idx = n_bins - 1
        splitter = Splitter(
            X_binned,
            n_bins_non_missing,
            missing_values_bin_idx,
            has_missing_values,
            is_categorical,
            monotonic_cst,
            l2_regularization,
            min_hessian_to_split,
            min_samples_leaf,
            min_gain_to_split,
            hessians_are_constant,
        )

        assert np.all(sample_indices == splitter.partition)

        histograms = builder.compute_histograms_brute(sample_indices)
        value = compute_node_value(
            sum_gradients, sum_hessians, -np.inf, np.inf, l2_regularization
        )

        # with all features allowed, feature 1 should be split on as it is the most
        # important one by construction of the gradients
        si_root = splitter.find_node_split(
            n_samples,
            histograms,
            sum_gradients,
            sum_hessians,
            value,
            allowed_features=None,
        )
        assert si_root.feature_idx == 1

        # only features 0 and 3 are allowed to be split on
        si_root = splitter.find_node_split(
            n_samples,
            histograms,
            sum_gradients,
            sum_hessians,
            value,
            allowed_features=allowed_features,
        )
        split_features.append(si_root.feature_idx)
        assert si_root.feature_idx in allowed_features

    # make sure feature 0 and feature 3 are split on in the constraint setting
    assert set(allowed_features) == set(split_features)