projektAI/venv/Lib/site-packages/sklearn/tree/tests/test_tree.py
2021-06-06 22:13:05 +02:00

2117 lines
77 KiB
Python

"""
Testing for the tree module (sklearn.tree).
"""
import copy
import pickle
from itertools import product
import struct
import pytest
import numpy as np
from numpy.testing import assert_allclose
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from sklearn.random_projection import _sparse_random_matrix
from sklearn.dummy import DummyRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_poisson_deviance
from sklearn.model_selection import train_test_split
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import create_memmap_backed_data
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import skip_if_32bit
from sklearn.utils.estimator_checks import check_sample_weights_invariance
from sklearn.utils.validation import check_random_state
from sklearn.exceptions import NotFittedError
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import ExtraTreeRegressor
from sklearn import tree
from sklearn.tree._tree import TREE_LEAF, TREE_UNDEFINED
from sklearn.tree._classes import CRITERIA_CLF
from sklearn.tree._classes import CRITERIA_REG
from sklearn import datasets
from sklearn.utils import compute_sample_weight
CLF_CRITERIONS = ("gini", "entropy")
REG_CRITERIONS = ("mse", "mae", "friedman_mse", "poisson")
CLF_TREES = {
"DecisionTreeClassifier": DecisionTreeClassifier,
"ExtraTreeClassifier": ExtraTreeClassifier,
}
REG_TREES = {
"DecisionTreeRegressor": DecisionTreeRegressor,
"ExtraTreeRegressor": ExtraTreeRegressor,
}
ALL_TREES = dict()
ALL_TREES.update(CLF_TREES)
ALL_TREES.update(REG_TREES)
SPARSE_TREES = ["DecisionTreeClassifier", "DecisionTreeRegressor",
"ExtraTreeClassifier", "ExtraTreeRegressor"]
X_small = np.array([
[0, 0, 4, 0, 0, 0, 1, -14, 0, -4, 0, 0, 0, 0, ],
[0, 0, 5, 3, 0, -4, 0, 0, 1, -5, 0.2, 0, 4, 1, ],
[-1, -1, 0, 0, -4.5, 0, 0, 2.1, 1, 0, 0, -4.5, 0, 1, ],
[-1, -1, 0, -1.2, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 1, ],
[-1, -1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, ],
[-1, -2, 0, 4, -3, 10, 4, 0, -3.2, 0, 4, 3, -4, 1, ],
[2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -3, 1, ],
[2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1, ],
[2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1, ],
[2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -1, 0, ],
[2, 8, 5, 1, 0.5, -4, 10, 0, 1, -5, 3, 0, 2, 0, ],
[2, 0, 1, 1, 1, -1, 1, 0, 0, -2, 3, 0, 1, 0, ],
[2, 0, 1, 2, 3, -1, 10, 2, 0, -1, 1, 2, 2, 0, ],
[1, 1, 0, 2, 2, -1, 1, 2, 0, -5, 1, 2, 3, 0, ],
[3, 1, 0, 3, 0, -4, 10, 0, 1, -5, 3, 0, 3, 1, ],
[2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 0.5, 0, -3, 1, ],
[2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 1.5, 1, -1, -1, ],
[2.11, 8, -6, -0.5, 0, 10, 0, 0, -3.2, 6, 0.5, 0, -1, -1, ],
[2, 0, 5, 1, 0.5, -2, 10, 0, 1, -5, 3, 1, 0, -1, ],
[2, 0, 1, 1, 1, -2, 1, 0, 0, -2, 0, 0, 0, 1, ],
[2, 1, 1, 1, 2, -1, 10, 2, 0, -1, 0, 2, 1, 1, ],
[1, 1, 0, 0, 1, -3, 1, 2, 0, -5, 1, 2, 1, 1, ],
[3, 1, 0, 1, 0, -4, 1, 0, 1, -2, 0, 0, 1, 0, ]])
y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
0, 0]
y_small_reg = [1.0, 2.1, 1.2, 0.05, 10, 2.4, 3.1, 1.01, 0.01, 2.98, 3.1, 1.1,
0.0, 1.2, 2, 11, 0, 0, 4.5, 0.201, 1.06, 0.9, 0]
# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]
# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = np.random.RandomState(1)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]
# also load the diabetes dataset
# and randomly permute it
diabetes = datasets.load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]
digits = datasets.load_digits()
perm = rng.permutation(digits.target.size)
digits.data = digits.data[perm]
digits.target = digits.target[perm]
random_state = check_random_state(0)
X_multilabel, y_multilabel = datasets.make_multilabel_classification(
random_state=0, n_samples=30, n_features=10)
# NB: despite their names X_sparse_* are numpy arrays (and not sparse matrices)
X_sparse_pos = random_state.uniform(size=(20, 5))
X_sparse_pos[X_sparse_pos <= 0.8] = 0.
y_random = random_state.randint(0, 4, size=(20, ))
X_sparse_mix = _sparse_random_matrix(20, 10, density=0.25,
random_state=0).toarray()
DATASETS = {
"iris": {"X": iris.data, "y": iris.target},
"diabetes": {"X": diabetes.data, "y": diabetes.target},
"digits": {"X": digits.data, "y": digits.target},
"toy": {"X": X, "y": y},
"clf_small": {"X": X_small, "y": y_small},
"reg_small": {"X": X_small, "y": y_small_reg},
"multilabel": {"X": X_multilabel, "y": y_multilabel},
"sparse-pos": {"X": X_sparse_pos, "y": y_random},
"sparse-neg": {"X": - X_sparse_pos, "y": y_random},
"sparse-mix": {"X": X_sparse_mix, "y": y_random},
"zeros": {"X": np.zeros((20, 3)), "y": y_random}
}
for name in DATASETS:
DATASETS[name]["X_sparse"] = csc_matrix(DATASETS[name]["X"])
def assert_tree_equal(d, s, message):
assert s.node_count == d.node_count, (
"{0}: inequal number of node ({1} != {2})"
"".format(message, s.node_count, d.node_count))
assert_array_equal(d.children_right, s.children_right,
message + ": inequal children_right")
assert_array_equal(d.children_left, s.children_left,
message + ": inequal children_left")
external = d.children_right == TREE_LEAF
internal = np.logical_not(external)
assert_array_equal(d.feature[internal], s.feature[internal],
message + ": inequal features")
assert_array_equal(d.threshold[internal], s.threshold[internal],
message + ": inequal threshold")
assert_array_equal(d.n_node_samples.sum(), s.n_node_samples.sum(),
message + ": inequal sum(n_node_samples)")
assert_array_equal(d.n_node_samples, s.n_node_samples,
message + ": inequal n_node_samples")
assert_almost_equal(d.impurity, s.impurity,
err_msg=message + ": inequal impurity")
assert_array_almost_equal(d.value[external], s.value[external],
err_msg=message + ": inequal value")
def test_classification_toy():
# Check classification on a toy dataset.
for name, Tree in CLF_TREES.items():
clf = Tree(random_state=0)
clf.fit(X, y)
assert_array_equal(clf.predict(T), true_result,
"Failed with {0}".format(name))
clf = Tree(max_features=1, random_state=1)
clf.fit(X, y)
assert_array_equal(clf.predict(T), true_result,
"Failed with {0}".format(name))
def test_weighted_classification_toy():
# Check classification on a weighted toy dataset.
for name, Tree in CLF_TREES.items():
clf = Tree(random_state=0)
clf.fit(X, y, sample_weight=np.ones(len(X)))
assert_array_equal(clf.predict(T), true_result,
"Failed with {0}".format(name))
clf.fit(X, y, sample_weight=np.full(len(X), 0.5))
assert_array_equal(clf.predict(T), true_result,
"Failed with {0}".format(name))
@pytest.mark.parametrize("Tree", REG_TREES.values())
@pytest.mark.parametrize("criterion", REG_CRITERIONS)
def test_regression_toy(Tree, criterion):
# Check regression on a toy dataset.
if criterion == "poisson":
# make target positive while not touching the original y and
# true_result
a = np.abs(np.min(y)) + 1
y_train = np.array(y) + a
y_test = np.array(true_result) + a
else:
y_train = y
y_test = true_result
reg = Tree(criterion=criterion, random_state=1)
reg.fit(X, y_train)
assert_allclose(reg.predict(T), y_test)
clf = Tree(criterion=criterion, max_features=1, random_state=1)
clf.fit(X, y_train)
assert_allclose(reg.predict(T), y_test)
def test_xor():
# Check on a XOR problem
y = np.zeros((10, 10))
y[:5, :5] = 1
y[5:, 5:] = 1
gridx, gridy = np.indices(y.shape)
X = np.vstack([gridx.ravel(), gridy.ravel()]).T
y = y.ravel()
for name, Tree in CLF_TREES.items():
clf = Tree(random_state=0)
clf.fit(X, y)
assert clf.score(X, y) == 1.0, "Failed with {0}".format(name)
clf = Tree(random_state=0, max_features=1)
clf.fit(X, y)
assert clf.score(X, y) == 1.0, "Failed with {0}".format(name)
def test_iris():
# Check consistency on dataset iris.
for (name, Tree), criterion in product(CLF_TREES.items(), CLF_CRITERIONS):
clf = Tree(criterion=criterion, random_state=0)
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
assert score > 0.9, (
"Failed with {0}, criterion = {1} and score = {2}"
"".format(name, criterion, score))
clf = Tree(criterion=criterion, max_features=2, random_state=0)
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
assert score > 0.5, (
"Failed with {0}, criterion = {1} and score = {2}"
"".format(name, criterion, score))
@pytest.mark.parametrize("name, Tree", REG_TREES.items())
@pytest.mark.parametrize("criterion", REG_CRITERIONS)
def test_diabetes_overfit(name, Tree, criterion):
# check consistency of overfitted trees on the diabetes dataset
# since the trees will overfit, we expect an MSE of 0
reg = Tree(criterion=criterion, random_state=0)
reg.fit(diabetes.data, diabetes.target)
score = mean_squared_error(diabetes.target, reg.predict(diabetes.data))
assert score == pytest.approx(0), (
f"Failed with {name}, criterion = {criterion} and score = {score}"
)
@skip_if_32bit
@pytest.mark.parametrize("name, Tree", REG_TREES.items())
@pytest.mark.parametrize(
"criterion, max_depth, metric, max_loss",
[("mse", 15, mean_squared_error, 60),
("mae", 20, mean_squared_error, 60),
("friedman_mse", 15, mean_squared_error, 60),
("poisson", 15, mean_poisson_deviance, 30)]
)
def test_diabetes_underfit(name, Tree, criterion, max_depth, metric, max_loss):
# check consistency of trees when the depth and the number of features are
# limited
reg = Tree(
criterion=criterion, max_depth=max_depth,
max_features=6, random_state=0
)
reg.fit(diabetes.data, diabetes.target)
loss = metric(diabetes.target, reg.predict(diabetes.data))
assert 0 < loss < max_loss
def test_probability():
# Predict probabilities using DecisionTreeClassifier.
for name, Tree in CLF_TREES.items():
clf = Tree(max_depth=1, max_features=1, random_state=42)
clf.fit(iris.data, iris.target)
prob_predict = clf.predict_proba(iris.data)
assert_array_almost_equal(np.sum(prob_predict, 1),
np.ones(iris.data.shape[0]),
err_msg="Failed with {0}".format(name))
assert_array_equal(np.argmax(prob_predict, 1),
clf.predict(iris.data),
err_msg="Failed with {0}".format(name))
assert_almost_equal(clf.predict_proba(iris.data),
np.exp(clf.predict_log_proba(iris.data)), 8,
err_msg="Failed with {0}".format(name))
def test_arrayrepr():
# Check the array representation.
# Check resize
X = np.arange(10000)[:, np.newaxis]
y = np.arange(10000)
for name, Tree in REG_TREES.items():
reg = Tree(max_depth=None, random_state=0)
reg.fit(X, y)
def test_pure_set():
# Check when y is pure.
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [1, 1, 1, 1, 1, 1]
for name, TreeClassifier in CLF_TREES.items():
clf = TreeClassifier(random_state=0)
clf.fit(X, y)
assert_array_equal(clf.predict(X), y,
err_msg="Failed with {0}".format(name))
for name, TreeRegressor in REG_TREES.items():
reg = TreeRegressor(random_state=0)
reg.fit(X, y)
assert_almost_equal(reg.predict(X), y,
err_msg="Failed with {0}".format(name))
def test_numerical_stability():
# Check numerical stability.
X = np.array([
[152.08097839, 140.40744019, 129.75102234, 159.90493774],
[142.50700378, 135.81935120, 117.82884979, 162.75781250],
[127.28772736, 140.40744019, 129.75102234, 159.90493774],
[132.37025452, 143.71923828, 138.35694885, 157.84558105],
[103.10237122, 143.71928406, 138.35696411, 157.84559631],
[127.71276855, 143.71923828, 138.35694885, 157.84558105],
[120.91514587, 140.40744019, 129.75102234, 159.90493774]])
y = np.array(
[1., 0.70209277, 0.53896582, 0., 0.90914464, 0.48026916, 0.49622521])
with np.errstate(all="raise"):
for name, Tree in REG_TREES.items():
reg = Tree(random_state=0)
reg.fit(X, y)
reg.fit(X, -y)
reg.fit(-X, y)
reg.fit(-X, -y)
def test_importances():
# Check variable importances.
X, y = datasets.make_classification(n_samples=5000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=0)
for name, Tree in CLF_TREES.items():
clf = Tree(random_state=0)
clf.fit(X, y)
importances = clf.feature_importances_
n_important = np.sum(importances > 0.1)
assert importances.shape[0] == 10, "Failed with {0}".format(name)
assert n_important == 3, "Failed with {0}".format(name)
# Check on iris that importances are the same for all builders
clf = DecisionTreeClassifier(random_state=0)
clf.fit(iris.data, iris.target)
clf2 = DecisionTreeClassifier(random_state=0,
max_leaf_nodes=len(iris.data))
clf2.fit(iris.data, iris.target)
assert_array_equal(clf.feature_importances_,
clf2.feature_importances_)
def test_importances_raises():
# Check if variable importance before fit raises ValueError.
clf = DecisionTreeClassifier()
with pytest.raises(ValueError):
getattr(clf, 'feature_importances_')
def test_importances_gini_equal_mse():
# Check that gini is equivalent to mse for binary output variable
X, y = datasets.make_classification(n_samples=2000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=0)
# The gini index and the mean square error (variance) might differ due
# to numerical instability. Since those instabilities mainly occurs at
# high tree depth, we restrict this maximal depth.
clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
random_state=0).fit(X, y)
reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
random_state=0).fit(X, y)
assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
assert_array_equal(clf.tree_.feature, reg.tree_.feature)
assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
def test_max_features():
# Check max_features.
for name, TreeRegressor in REG_TREES.items():
reg = TreeRegressor(max_features="auto")
reg.fit(diabetes.data, diabetes.target)
assert reg.max_features_ == diabetes.data.shape[1]
for name, TreeClassifier in CLF_TREES.items():
clf = TreeClassifier(max_features="auto")
clf.fit(iris.data, iris.target)
assert clf.max_features_ == 2
for name, TreeEstimator in ALL_TREES.items():
est = TreeEstimator(max_features="sqrt")
est.fit(iris.data, iris.target)
assert (est.max_features_ ==
int(np.sqrt(iris.data.shape[1])))
est = TreeEstimator(max_features="log2")
est.fit(iris.data, iris.target)
assert (est.max_features_ ==
int(np.log2(iris.data.shape[1])))
est = TreeEstimator(max_features=1)
est.fit(iris.data, iris.target)
assert est.max_features_ == 1
est = TreeEstimator(max_features=3)
est.fit(iris.data, iris.target)
assert est.max_features_ == 3
est = TreeEstimator(max_features=0.01)
est.fit(iris.data, iris.target)
assert est.max_features_ == 1
est = TreeEstimator(max_features=0.5)
est.fit(iris.data, iris.target)
assert (est.max_features_ ==
int(0.5 * iris.data.shape[1]))
est = TreeEstimator(max_features=1.0)
est.fit(iris.data, iris.target)
assert est.max_features_ == iris.data.shape[1]
est = TreeEstimator(max_features=None)
est.fit(iris.data, iris.target)
assert est.max_features_ == iris.data.shape[1]
# use values of max_features that are invalid
est = TreeEstimator(max_features=10)
with pytest.raises(ValueError):
est.fit(X, y)
est = TreeEstimator(max_features=-1)
with pytest.raises(ValueError):
est.fit(X, y)
est = TreeEstimator(max_features=0.0)
with pytest.raises(ValueError):
est.fit(X, y)
est = TreeEstimator(max_features=1.5)
with pytest.raises(ValueError):
est.fit(X, y)
est = TreeEstimator(max_features="foobar")
with pytest.raises(ValueError):
est.fit(X, y)
def test_error():
# Test that it gives proper exception on deficient input.
for name, TreeEstimator in CLF_TREES.items():
# predict before fit
est = TreeEstimator()
with pytest.raises(NotFittedError):
est.predict_proba(X)
est.fit(X, y)
X2 = [[-2, -1, 1]] # wrong feature shape for sample
with pytest.raises(ValueError):
est.predict_proba(X2)
for name, TreeEstimator in ALL_TREES.items():
with pytest.raises(ValueError):
TreeEstimator(min_samples_leaf=-1).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_samples_leaf=.6).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_samples_leaf=0.).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_samples_leaf=3.).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_weight_fraction_leaf=-1).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_weight_fraction_leaf=0.51).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_samples_split=-1).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_samples_split=0.0).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_samples_split=1.1).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_samples_split=2.5).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(max_depth=-1).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(max_features=42).fit(X, y)
# min_impurity_split warning
with ignore_warnings(category=FutureWarning):
with pytest.raises(ValueError):
TreeEstimator(min_impurity_split=-1.0).fit(X, y)
with pytest.raises(ValueError):
TreeEstimator(min_impurity_decrease=-1.0).fit(X, y)
# Wrong dimensions
est = TreeEstimator()
y2 = y[:-1]
with pytest.raises(ValueError):
est.fit(X, y2)
# Test with arrays that are non-contiguous.
Xf = np.asfortranarray(X)
est = TreeEstimator()
est.fit(Xf, y)
assert_almost_equal(est.predict(T), true_result)
# predict before fitting
est = TreeEstimator()
with pytest.raises(NotFittedError):
est.predict(T)
# predict on vector with different dims
est.fit(X, y)
t = np.asarray(T)
with pytest.raises(ValueError):
est.predict(t[:, 1:])
# wrong sample shape
Xt = np.array(X).T
est = TreeEstimator()
est.fit(np.dot(X, Xt), y)
with pytest.raises(ValueError):
est.predict(X)
with pytest.raises(ValueError):
est.apply(X)
clf = TreeEstimator()
clf.fit(X, y)
with pytest.raises(ValueError):
clf.predict(Xt)
with pytest.raises(ValueError):
clf.apply(Xt)
# apply before fitting
est = TreeEstimator()
with pytest.raises(NotFittedError):
est.apply(T)
# non positive target for Poisson splitting Criterion
est = DecisionTreeRegressor(criterion="poisson")
with pytest.raises(ValueError, match="y is not positive.*Poisson"):
est.fit([[0, 1, 2]], [0, 0, 0])
with pytest.raises(ValueError, match="Some.*y are negative.*Poisson"):
est.fit([[0, 1, 2]], [5, -0.1, 2])
def test_min_samples_split():
"""Test min_samples_split parameter"""
X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE)
y = iris.target
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
# by setting max_leaf_nodes
for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
TreeEstimator = ALL_TREES[name]
# test for integer parameter
est = TreeEstimator(min_samples_split=10,
max_leaf_nodes=max_leaf_nodes,
random_state=0)
est.fit(X, y)
# count samples on nodes, -1 means it is a leaf
node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1]
assert np.min(node_samples) > 9, "Failed with {0}".format(name)
# test for float parameter
est = TreeEstimator(min_samples_split=0.2,
max_leaf_nodes=max_leaf_nodes,
random_state=0)
est.fit(X, y)
# count samples on nodes, -1 means it is a leaf
node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1]
assert np.min(node_samples) > 9, "Failed with {0}".format(name)
def test_min_samples_leaf():
# Test if leaves contain more than leaf_count training examples
X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE)
y = iris.target
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
# by setting max_leaf_nodes
for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
TreeEstimator = ALL_TREES[name]
# test integer parameter
est = TreeEstimator(min_samples_leaf=5,
max_leaf_nodes=max_leaf_nodes,
random_state=0)
est.fit(X, y)
out = est.tree_.apply(X)
node_counts = np.bincount(out)
# drop inner nodes
leaf_count = node_counts[node_counts != 0]
assert np.min(leaf_count) > 4, "Failed with {0}".format(name)
# test float parameter
est = TreeEstimator(min_samples_leaf=0.1,
max_leaf_nodes=max_leaf_nodes,
random_state=0)
est.fit(X, y)
out = est.tree_.apply(X)
node_counts = np.bincount(out)
# drop inner nodes
leaf_count = node_counts[node_counts != 0]
assert np.min(leaf_count) > 4, "Failed with {0}".format(name)
def check_min_weight_fraction_leaf(name, datasets, sparse=False):
"""Test if leaves contain at least min_weight_fraction_leaf of the
training set"""
if sparse:
X = DATASETS[datasets]["X_sparse"].astype(np.float32)
else:
X = DATASETS[datasets]["X"].astype(np.float32)
y = DATASETS[datasets]["y"]
weights = rng.rand(X.shape[0])
total_weight = np.sum(weights)
TreeEstimator = ALL_TREES[name]
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
# by setting max_leaf_nodes
for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)):
est = TreeEstimator(min_weight_fraction_leaf=frac,
max_leaf_nodes=max_leaf_nodes,
random_state=0)
est.fit(X, y, sample_weight=weights)
if sparse:
out = est.tree_.apply(X.tocsr())
else:
out = est.tree_.apply(X)
node_weights = np.bincount(out, weights=weights)
# drop inner nodes
leaf_weights = node_weights[node_weights != 0]
assert (
np.min(leaf_weights) >=
total_weight * est.min_weight_fraction_leaf), (
"Failed with {0} min_weight_fraction_leaf={1}".format(
name, est.min_weight_fraction_leaf))
# test case with no weights passed in
total_weight = X.shape[0]
for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)):
est = TreeEstimator(min_weight_fraction_leaf=frac,
max_leaf_nodes=max_leaf_nodes,
random_state=0)
est.fit(X, y)
if sparse:
out = est.tree_.apply(X.tocsr())
else:
out = est.tree_.apply(X)
node_weights = np.bincount(out)
# drop inner nodes
leaf_weights = node_weights[node_weights != 0]
assert (
np.min(leaf_weights) >=
total_weight * est.min_weight_fraction_leaf), (
"Failed with {0} min_weight_fraction_leaf={1}".format(
name, est.min_weight_fraction_leaf))
@pytest.mark.parametrize("name", ALL_TREES)
def test_min_weight_fraction_leaf_on_dense_input(name):
check_min_weight_fraction_leaf(name, "iris")
@pytest.mark.parametrize("name", SPARSE_TREES)
def test_min_weight_fraction_leaf_on_sparse_input(name):
check_min_weight_fraction_leaf(name, "multilabel", True)
def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets,
sparse=False):
"""Test the interaction between min_weight_fraction_leaf and
min_samples_leaf when sample_weights is not provided in fit."""
if sparse:
X = DATASETS[datasets]["X_sparse"].astype(np.float32)
else:
X = DATASETS[datasets]["X"].astype(np.float32)
y = DATASETS[datasets]["y"]
total_weight = X.shape[0]
TreeEstimator = ALL_TREES[name]
for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)):
# test integer min_samples_leaf
est = TreeEstimator(min_weight_fraction_leaf=frac,
max_leaf_nodes=max_leaf_nodes,
min_samples_leaf=5,
random_state=0)
est.fit(X, y)
if sparse:
out = est.tree_.apply(X.tocsr())
else:
out = est.tree_.apply(X)
node_weights = np.bincount(out)
# drop inner nodes
leaf_weights = node_weights[node_weights != 0]
assert (
np.min(leaf_weights) >=
max((total_weight *
est.min_weight_fraction_leaf), 5)), (
"Failed with {0} min_weight_fraction_leaf={1}, "
"min_samples_leaf={2}".format(
name, est.min_weight_fraction_leaf,
est.min_samples_leaf))
for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)):
# test float min_samples_leaf
est = TreeEstimator(min_weight_fraction_leaf=frac,
max_leaf_nodes=max_leaf_nodes,
min_samples_leaf=.1,
random_state=0)
est.fit(X, y)
if sparse:
out = est.tree_.apply(X.tocsr())
else:
out = est.tree_.apply(X)
node_weights = np.bincount(out)
# drop inner nodes
leaf_weights = node_weights[node_weights != 0]
assert (
np.min(leaf_weights) >=
max((total_weight * est.min_weight_fraction_leaf),
(total_weight * est.min_samples_leaf))), (
"Failed with {0} min_weight_fraction_leaf={1}, "
"min_samples_leaf={2}".format(name,
est.min_weight_fraction_leaf,
est.min_samples_leaf))
@pytest.mark.parametrize("name", ALL_TREES)
def test_min_weight_fraction_leaf_with_min_samples_leaf_on_dense_input(name):
check_min_weight_fraction_leaf_with_min_samples_leaf(name, "iris")
@pytest.mark.parametrize("name", SPARSE_TREES)
def test_min_weight_fraction_leaf_with_min_samples_leaf_on_sparse_input(name):
check_min_weight_fraction_leaf_with_min_samples_leaf(
name, "multilabel", True)
def test_min_impurity_split():
# test if min_impurity_split creates leaves with impurity
# [0, min_impurity_split) when min_samples_leaf = 1 and
# min_samples_split = 2.
X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE)
y = iris.target
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
# by setting max_leaf_nodes
for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
TreeEstimator = ALL_TREES[name]
min_impurity_split = .5
# verify leaf nodes without min_impurity_split less than
# impurity 1e-7
est = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
random_state=0)
assert est.min_impurity_split is None, (
"Failed, min_impurity_split = {0} != None".format(
est.min_impurity_split))
try:
assert_warns(FutureWarning, est.fit, X, y)
except AssertionError:
pass
for node in range(est.tree_.node_count):
if (est.tree_.children_left[node] == TREE_LEAF or
est.tree_.children_right[node] == TREE_LEAF):
assert est.tree_.impurity[node] == 0., (
"Failed with {0} min_impurity_split={1}".format(
est.tree_.impurity[node],
est.min_impurity_split))
# verify leaf nodes have impurity [0,min_impurity_split] when using
# min_impurity_split
est = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
min_impurity_split=min_impurity_split,
random_state=0)
assert_warns_message(FutureWarning,
"Use the min_impurity_decrease",
est.fit, X, y)
for node in range(est.tree_.node_count):
if (est.tree_.children_left[node] == TREE_LEAF or
est.tree_.children_right[node] == TREE_LEAF):
assert est.tree_.impurity[node] >= 0, (
"Failed with {0}, min_impurity_split={1}".format(
est.tree_.impurity[node],
est.min_impurity_split))
assert est.tree_.impurity[node] <= min_impurity_split, (
"Failed with {0}, min_impurity_split={1}".format(
est.tree_.impurity[node],
est.min_impurity_split))
def test_min_impurity_decrease():
# test if min_impurity_decrease ensure that a split is made only if
# if the impurity decrease is atleast that value
X, y = datasets.make_classification(n_samples=10000, random_state=42)
# test both DepthFirstTreeBuilder and BestFirstTreeBuilder
# by setting max_leaf_nodes
for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
TreeEstimator = ALL_TREES[name]
# Check default value of min_impurity_decrease, 1e-7
est1 = TreeEstimator(max_leaf_nodes=max_leaf_nodes, random_state=0)
# Check with explicit value of 0.05
est2 = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
min_impurity_decrease=0.05, random_state=0)
# Check with a much lower value of 0.0001
est3 = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
min_impurity_decrease=0.0001, random_state=0)
# Check with a much lower value of 0.1
est4 = TreeEstimator(max_leaf_nodes=max_leaf_nodes,
min_impurity_decrease=0.1, random_state=0)
for est, expected_decrease in ((est1, 1e-7), (est2, 0.05),
(est3, 0.0001), (est4, 0.1)):
assert est.min_impurity_decrease <= expected_decrease, (
"Failed, min_impurity_decrease = {0} > {1}".format(
est.min_impurity_decrease,
expected_decrease))
est.fit(X, y)
for node in range(est.tree_.node_count):
# If current node is a not leaf node, check if the split was
# justified w.r.t the min_impurity_decrease
if est.tree_.children_left[node] != TREE_LEAF:
imp_parent = est.tree_.impurity[node]
wtd_n_node = est.tree_.weighted_n_node_samples[node]
left = est.tree_.children_left[node]
wtd_n_left = est.tree_.weighted_n_node_samples[left]
imp_left = est.tree_.impurity[left]
wtd_imp_left = wtd_n_left * imp_left
right = est.tree_.children_right[node]
wtd_n_right = est.tree_.weighted_n_node_samples[right]
imp_right = est.tree_.impurity[right]
wtd_imp_right = wtd_n_right * imp_right
wtd_avg_left_right_imp = wtd_imp_right + wtd_imp_left
wtd_avg_left_right_imp /= wtd_n_node
fractional_node_weight = (
est.tree_.weighted_n_node_samples[node] / X.shape[0])
actual_decrease = fractional_node_weight * (
imp_parent - wtd_avg_left_right_imp)
assert actual_decrease >= expected_decrease, (
"Failed with {0} expected min_impurity_decrease={1}"
.format(actual_decrease,
expected_decrease))
for name, TreeEstimator in ALL_TREES.items():
if "Classifier" in name:
X, y = iris.data, iris.target
else:
X, y = diabetes.data, diabetes.target
est = TreeEstimator(random_state=0)
est.fit(X, y)
score = est.score(X, y)
fitted_attribute = dict()
for attribute in ["max_depth", "node_count", "capacity"]:
fitted_attribute[attribute] = getattr(est.tree_, attribute)
serialized_object = pickle.dumps(est)
est2 = pickle.loads(serialized_object)
assert type(est2) == est.__class__
score2 = est2.score(X, y)
assert score == score2, (
"Failed to generate same score after pickling "
"with {0}".format(name))
for attribute in fitted_attribute:
assert (getattr(est2.tree_, attribute) ==
fitted_attribute[attribute]), (
"Failed to generate same attribute {0} after "
"pickling with {1}".format(attribute, name))
def test_multioutput():
# Check estimators on multi-output problems.
X = [[-2, -1],
[-1, -1],
[-1, -2],
[1, 1],
[1, 2],
[2, 1],
[-2, 1],
[-1, 1],
[-1, 2],
[2, -1],
[1, -1],
[1, -2]]
y = [[-1, 0],
[-1, 0],
[-1, 0],
[1, 1],
[1, 1],
[1, 1],
[-1, 2],
[-1, 2],
[-1, 2],
[1, 3],
[1, 3],
[1, 3]]
T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]
# toy classification problem
for name, TreeClassifier in CLF_TREES.items():
clf = TreeClassifier(random_state=0)
y_hat = clf.fit(X, y).predict(T)
assert_array_equal(y_hat, y_true)
assert y_hat.shape == (4, 2)
proba = clf.predict_proba(T)
assert len(proba) == 2
assert proba[0].shape == (4, 2)
assert proba[1].shape == (4, 4)
log_proba = clf.predict_log_proba(T)
assert len(log_proba) == 2
assert log_proba[0].shape == (4, 2)
assert log_proba[1].shape == (4, 4)
# toy regression problem
for name, TreeRegressor in REG_TREES.items():
reg = TreeRegressor(random_state=0)
y_hat = reg.fit(X, y).predict(T)
assert_almost_equal(y_hat, y_true)
assert y_hat.shape == (4, 2)
def test_classes_shape():
# Test that n_classes_ and classes_ have proper shape.
for name, TreeClassifier in CLF_TREES.items():
# Classification, single output
clf = TreeClassifier(random_state=0)
clf.fit(X, y)
assert clf.n_classes_ == 2
assert_array_equal(clf.classes_, [-1, 1])
# Classification, multi-output
_y = np.vstack((y, np.array(y) * 2)).T
clf = TreeClassifier(random_state=0)
clf.fit(X, _y)
assert len(clf.n_classes_) == 2
assert len(clf.classes_) == 2
assert_array_equal(clf.n_classes_, [2, 2])
assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]])
def test_unbalanced_iris():
# Check class rebalancing.
unbalanced_X = iris.data[:125]
unbalanced_y = iris.target[:125]
sample_weight = compute_sample_weight("balanced", unbalanced_y)
for name, TreeClassifier in CLF_TREES.items():
clf = TreeClassifier(random_state=0)
clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)
def test_memory_layout():
# Check that it works no matter the memory layout
for (name, TreeEstimator), dtype in product(ALL_TREES.items(),
[np.float64, np.float32]):
est = TreeEstimator(random_state=0)
# Nothing
X = np.asarray(iris.data, dtype=dtype)
y = iris.target
assert_array_equal(est.fit(X, y).predict(X), y)
# C-order
X = np.asarray(iris.data, order="C", dtype=dtype)
y = iris.target
assert_array_equal(est.fit(X, y).predict(X), y)
# F-order
X = np.asarray(iris.data, order="F", dtype=dtype)
y = iris.target
assert_array_equal(est.fit(X, y).predict(X), y)
# Contiguous
X = np.ascontiguousarray(iris.data, dtype=dtype)
y = iris.target
assert_array_equal(est.fit(X, y).predict(X), y)
# csr matrix
X = csr_matrix(iris.data, dtype=dtype)
y = iris.target
assert_array_equal(est.fit(X, y).predict(X), y)
# csc_matrix
X = csc_matrix(iris.data, dtype=dtype)
y = iris.target
assert_array_equal(est.fit(X, y).predict(X), y)
# Strided
X = np.asarray(iris.data[::3], dtype=dtype)
y = iris.target[::3]
assert_array_equal(est.fit(X, y).predict(X), y)
def test_sample_weight():
# Check sample weighting.
# Test that zero-weighted samples are not taken into account
X = np.arange(100)[:, np.newaxis]
y = np.ones(100)
y[:50] = 0.0
sample_weight = np.ones(100)
sample_weight[y == 0] = 0.0
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X, y, sample_weight=sample_weight)
assert_array_equal(clf.predict(X), np.ones(100))
# Test that low weighted samples are not taken into account at low depth
X = np.arange(200)[:, np.newaxis]
y = np.zeros(200)
y[50:100] = 1
y[100:200] = 2
X[100:200, 0] = 200
sample_weight = np.ones(200)
sample_weight[y == 2] = .51 # Samples of class '2' are still weightier
clf = DecisionTreeClassifier(max_depth=1, random_state=0)
clf.fit(X, y, sample_weight=sample_weight)
assert clf.tree_.threshold[0] == 149.5
sample_weight[y == 2] = .5 # Samples of class '2' are no longer weightier
clf = DecisionTreeClassifier(max_depth=1, random_state=0)
clf.fit(X, y, sample_weight=sample_weight)
assert clf.tree_.threshold[0] == 49.5 # Threshold should have moved
# Test that sample weighting is the same as having duplicates
X = iris.data
y = iris.target
duplicates = rng.randint(0, X.shape[0], 100)
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X[duplicates], y[duplicates])
sample_weight = np.bincount(duplicates, minlength=X.shape[0])
clf2 = DecisionTreeClassifier(random_state=1)
clf2.fit(X, y, sample_weight=sample_weight)
internal = clf.tree_.children_left != tree._tree.TREE_LEAF
assert_array_almost_equal(clf.tree_.threshold[internal],
clf2.tree_.threshold[internal])
def test_sample_weight_invalid():
# Check sample weighting raises errors.
X = np.arange(100)[:, np.newaxis]
y = np.ones(100)
y[:50] = 0.0
clf = DecisionTreeClassifier(random_state=0)
sample_weight = np.random.rand(100, 1)
with pytest.raises(ValueError):
clf.fit(X, y, sample_weight=sample_weight)
sample_weight = np.array(0)
expected_err = r"Singleton.* cannot be considered a valid collection"
with pytest.raises(TypeError, match=expected_err):
clf.fit(X, y, sample_weight=sample_weight)
def check_class_weights(name):
"""Check class_weights resemble sample_weights behavior."""
TreeClassifier = CLF_TREES[name]
# Iris is balanced, so no effect expected for using 'balanced' weights
clf1 = TreeClassifier(random_state=0)
clf1.fit(iris.data, iris.target)
clf2 = TreeClassifier(class_weight='balanced', random_state=0)
clf2.fit(iris.data, iris.target)
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
# Make a multi-output problem with three copies of Iris
iris_multi = np.vstack((iris.target, iris.target, iris.target)).T
# Create user-defined weights that should balance over the outputs
clf3 = TreeClassifier(class_weight=[{0: 2., 1: 2., 2: 1.},
{0: 2., 1: 1., 2: 2.},
{0: 1., 1: 2., 2: 2.}],
random_state=0)
clf3.fit(iris.data, iris_multi)
assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_)
# Check against multi-output "auto" which should also have no effect
clf4 = TreeClassifier(class_weight='balanced', random_state=0)
clf4.fit(iris.data, iris_multi)
assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_)
# Inflate importance of class 1, check against user-defined weights
sample_weight = np.ones(iris.target.shape)
sample_weight[iris.target == 1] *= 100
class_weight = {0: 1., 1: 100., 2: 1.}
clf1 = TreeClassifier(random_state=0)
clf1.fit(iris.data, iris.target, sample_weight)
clf2 = TreeClassifier(class_weight=class_weight, random_state=0)
clf2.fit(iris.data, iris.target)
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
# Check that sample_weight and class_weight are multiplicative
clf1 = TreeClassifier(random_state=0)
clf1.fit(iris.data, iris.target, sample_weight ** 2)
clf2 = TreeClassifier(class_weight=class_weight, random_state=0)
clf2.fit(iris.data, iris.target, sample_weight)
assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
@pytest.mark.parametrize("name", CLF_TREES)
def test_class_weights(name):
check_class_weights(name)
def check_class_weight_errors(name):
# Test if class_weight raises errors and warnings when expected.
TreeClassifier = CLF_TREES[name]
_y = np.vstack((y, np.array(y) * 2)).T
# Invalid preset string
clf = TreeClassifier(class_weight='the larch', random_state=0)
with pytest.raises(ValueError):
clf.fit(X, y)
with pytest.raises(ValueError):
clf.fit(X, _y)
# Not a list or preset for multi-output
clf = TreeClassifier(class_weight=1, random_state=0)
with pytest.raises(ValueError):
clf.fit(X, _y)
# Incorrect length list for multi-output
clf = TreeClassifier(class_weight=[{-1: 0.5, 1: 1.}], random_state=0)
with pytest.raises(ValueError):
clf.fit(X, _y)
@pytest.mark.parametrize("name", CLF_TREES)
def test_class_weight_errors(name):
check_class_weight_errors(name)
def test_max_leaf_nodes():
# Test greedy trees with max_depth + 1 leafs.
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
k = 4
for name, TreeEstimator in ALL_TREES.items():
est = TreeEstimator(max_depth=None, max_leaf_nodes=k + 1).fit(X, y)
assert est.get_n_leaves() == k + 1
# max_leaf_nodes in (0, 1) should raise ValueError
est = TreeEstimator(max_depth=None, max_leaf_nodes=0)
with pytest.raises(ValueError):
est.fit(X, y)
est = TreeEstimator(max_depth=None, max_leaf_nodes=1)
with pytest.raises(ValueError):
est.fit(X, y)
est = TreeEstimator(max_depth=None, max_leaf_nodes=0.1)
with pytest.raises(ValueError):
est.fit(X, y)
def test_max_leaf_nodes_max_depth():
# Test precedence of max_leaf_nodes over max_depth.
X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
k = 4
for name, TreeEstimator in ALL_TREES.items():
est = TreeEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y)
assert est.get_depth() == 1
def test_arrays_persist():
# Ensure property arrays' memory stays alive when tree disappears
# non-regression for #2726
for attr in ['n_classes', 'value', 'children_left', 'children_right',
'threshold', 'impurity', 'feature', 'n_node_samples']:
value = getattr(DecisionTreeClassifier().fit([[0], [1]],
[0, 1]).tree_, attr)
# if pointing to freed memory, contents may be arbitrary
assert -3 <= value.flat[0] < 3, \
'Array points to arbitrary memory'
def test_only_constant_features():
random_state = check_random_state(0)
X = np.zeros((10, 20))
y = random_state.randint(0, 2, (10, ))
for name, TreeEstimator in ALL_TREES.items():
est = TreeEstimator(random_state=0)
est.fit(X, y)
assert est.tree_.max_depth == 0
def test_behaviour_constant_feature_after_splits():
X = np.transpose(np.vstack(([[0, 0, 0, 0, 0, 1, 2, 4, 5, 6, 7]],
np.zeros((4, 11)))))
y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3]
for name, TreeEstimator in ALL_TREES.items():
# do not check extra random trees
if "ExtraTree" not in name:
est = TreeEstimator(random_state=0, max_features=1)
est.fit(X, y)
assert est.tree_.max_depth == 2
assert est.tree_.node_count == 5
def test_with_only_one_non_constant_features():
X = np.hstack([np.array([[1.], [1.], [0.], [0.]]),
np.zeros((4, 1000))])
y = np.array([0., 1., 0., 1.0])
for name, TreeEstimator in CLF_TREES.items():
est = TreeEstimator(random_state=0, max_features=1)
est.fit(X, y)
assert est.tree_.max_depth == 1
assert_array_equal(est.predict_proba(X), np.full((4, 2), 0.5))
for name, TreeEstimator in REG_TREES.items():
est = TreeEstimator(random_state=0, max_features=1)
est.fit(X, y)
assert est.tree_.max_depth == 1
assert_array_equal(est.predict(X), np.full((4, ), 0.5))
def test_big_input():
# Test if the warning for too large inputs is appropriate.
X = np.repeat(10 ** 40., 4).astype(np.float64).reshape(-1, 1)
clf = DecisionTreeClassifier()
try:
clf.fit(X, [0, 1, 0, 1])
except ValueError as e:
assert "float32" in str(e)
def test_realloc():
from sklearn.tree._utils import _realloc_test
with pytest.raises(MemoryError):
_realloc_test()
def test_huge_allocations():
n_bits = 8 * struct.calcsize("P")
X = np.random.randn(10, 2)
y = np.random.randint(0, 2, 10)
# Sanity check: we cannot request more memory than the size of the address
# space. Currently raises OverflowError.
huge = 2 ** (n_bits + 1)
clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
with pytest.raises(Exception):
clf.fit(X, y)
# Non-regression test: MemoryError used to be dropped by Cython
# because of missing "except *".
huge = 2 ** (n_bits - 1) - 1
clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
with pytest.raises(MemoryError):
clf.fit(X, y)
def check_sparse_input(tree, dataset, max_depth=None):
TreeEstimator = ALL_TREES[tree]
X = DATASETS[dataset]["X"]
X_sparse = DATASETS[dataset]["X_sparse"]
y = DATASETS[dataset]["y"]
# Gain testing time
if dataset in ["digits", "diabetes"]:
n_samples = X.shape[0] // 5
X = X[:n_samples]
X_sparse = X_sparse[:n_samples]
y = y[:n_samples]
for sparse_format in (csr_matrix, csc_matrix, coo_matrix):
X_sparse = sparse_format(X_sparse)
# Check the default (depth first search)
d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)
assert_tree_equal(d.tree_, s.tree_,
"{0} with dense and sparse format gave different "
"trees".format(tree))
y_pred = d.predict(X)
if tree in CLF_TREES:
y_proba = d.predict_proba(X)
y_log_proba = d.predict_log_proba(X)
for sparse_matrix in (csr_matrix, csc_matrix, coo_matrix):
X_sparse_test = sparse_matrix(X_sparse, dtype=np.float32)
assert_array_almost_equal(s.predict(X_sparse_test), y_pred)
if tree in CLF_TREES:
assert_array_almost_equal(s.predict_proba(X_sparse_test),
y_proba)
assert_array_almost_equal(s.predict_log_proba(X_sparse_test),
y_log_proba)
@pytest.mark.parametrize("tree_type", SPARSE_TREES)
@pytest.mark.parametrize(
"dataset",
("clf_small", "toy", "digits", "multilabel",
"sparse-pos", "sparse-neg", "sparse-mix",
"zeros")
)
def test_sparse_input(tree_type, dataset):
max_depth = 3 if dataset == "digits" else None
check_sparse_input(tree_type, dataset, max_depth)
@pytest.mark.parametrize("tree_type",
sorted(set(SPARSE_TREES).intersection(REG_TREES)))
@pytest.mark.parametrize("dataset", ["diabetes", "reg_small"])
def test_sparse_input_reg_trees(tree_type, dataset):
# Due to numerical instability of MSE and too strict test, we limit the
# maximal depth
check_sparse_input(tree_type, dataset, 2)
def check_sparse_parameters(tree, dataset):
TreeEstimator = ALL_TREES[tree]
X = DATASETS[dataset]["X"]
X_sparse = DATASETS[dataset]["X_sparse"]
y = DATASETS[dataset]["y"]
# Check max_features
d = TreeEstimator(random_state=0, max_features=1, max_depth=2).fit(X, y)
s = TreeEstimator(random_state=0, max_features=1,
max_depth=2).fit(X_sparse, y)
assert_tree_equal(d.tree_, s.tree_,
"{0} with dense and sparse format gave different "
"trees".format(tree))
assert_array_almost_equal(s.predict(X), d.predict(X))
# Check min_samples_split
d = TreeEstimator(random_state=0, max_features=1,
min_samples_split=10).fit(X, y)
s = TreeEstimator(random_state=0, max_features=1,
min_samples_split=10).fit(X_sparse, y)
assert_tree_equal(d.tree_, s.tree_,
"{0} with dense and sparse format gave different "
"trees".format(tree))
assert_array_almost_equal(s.predict(X), d.predict(X))
# Check min_samples_leaf
d = TreeEstimator(random_state=0,
min_samples_leaf=X_sparse.shape[0] // 2).fit(X, y)
s = TreeEstimator(random_state=0,
min_samples_leaf=X_sparse.shape[0] // 2).fit(X_sparse, y)
assert_tree_equal(d.tree_, s.tree_,
"{0} with dense and sparse format gave different "
"trees".format(tree))
assert_array_almost_equal(s.predict(X), d.predict(X))
# Check best-first search
d = TreeEstimator(random_state=0, max_leaf_nodes=3).fit(X, y)
s = TreeEstimator(random_state=0, max_leaf_nodes=3).fit(X_sparse, y)
assert_tree_equal(d.tree_, s.tree_,
"{0} with dense and sparse format gave different "
"trees".format(tree))
assert_array_almost_equal(s.predict(X), d.predict(X))
def check_sparse_criterion(tree, dataset):
TreeEstimator = ALL_TREES[tree]
X = DATASETS[dataset]["X"]
X_sparse = DATASETS[dataset]["X_sparse"]
y = DATASETS[dataset]["y"]
# Check various criterion
CRITERIONS = REG_CRITERIONS if tree in REG_TREES else CLF_CRITERIONS
for criterion in CRITERIONS:
d = TreeEstimator(random_state=0, max_depth=3,
criterion=criterion).fit(X, y)
s = TreeEstimator(random_state=0, max_depth=3,
criterion=criterion).fit(X_sparse, y)
assert_tree_equal(d.tree_, s.tree_,
"{0} with dense and sparse format gave different "
"trees".format(tree))
assert_array_almost_equal(s.predict(X), d.predict(X))
@pytest.mark.parametrize("tree_type", SPARSE_TREES)
@pytest.mark.parametrize("dataset",
["sparse-pos", "sparse-neg", "sparse-mix", "zeros"])
@pytest.mark.parametrize("check",
[check_sparse_parameters, check_sparse_criterion])
def test_sparse(tree_type, dataset, check):
check(tree_type, dataset)
def check_explicit_sparse_zeros(tree, max_depth=3,
n_features=10):
TreeEstimator = ALL_TREES[tree]
# n_samples set n_feature to ease construction of a simultaneous
# construction of a csr and csc matrix
n_samples = n_features
samples = np.arange(n_samples)
# Generate X, y
random_state = check_random_state(0)
indices = []
data = []
offset = 0
indptr = [offset]
for i in range(n_features):
n_nonzero_i = random_state.binomial(n_samples, 0.5)
indices_i = random_state.permutation(samples)[:n_nonzero_i]
indices.append(indices_i)
data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1
data.append(data_i)
offset += n_nonzero_i
indptr.append(offset)
indices = np.concatenate(indices)
data = np.array(np.concatenate(data), dtype=np.float32)
X_sparse = csc_matrix((data, indices, indptr),
shape=(n_samples, n_features))
X = X_sparse.toarray()
X_sparse_test = csr_matrix((data, indices, indptr),
shape=(n_samples, n_features))
X_test = X_sparse_test.toarray()
y = random_state.randint(0, 3, size=(n_samples, ))
# Ensure that X_sparse_test owns its data, indices and indptr array
X_sparse_test = X_sparse_test.copy()
# Ensure that we have explicit zeros
assert (X_sparse.data == 0.).sum() > 0
assert (X_sparse_test.data == 0.).sum() > 0
# Perform the comparison
d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)
assert_tree_equal(d.tree_, s.tree_,
"{0} with dense and sparse format gave different "
"trees".format(tree))
Xs = (X_test, X_sparse_test)
for X1, X2 in product(Xs, Xs):
assert_array_almost_equal(s.tree_.apply(X1), d.tree_.apply(X2))
assert_array_almost_equal(s.apply(X1), d.apply(X2))
assert_array_almost_equal(s.apply(X1), s.tree_.apply(X1))
assert_array_almost_equal(s.tree_.decision_path(X1).toarray(),
d.tree_.decision_path(X2).toarray())
assert_array_almost_equal(s.decision_path(X1).toarray(),
d.decision_path(X2).toarray())
assert_array_almost_equal(s.decision_path(X1).toarray(),
s.tree_.decision_path(X1).toarray())
assert_array_almost_equal(s.predict(X1), d.predict(X2))
if tree in CLF_TREES:
assert_array_almost_equal(s.predict_proba(X1),
d.predict_proba(X2))
@pytest.mark.parametrize("tree_type", SPARSE_TREES)
def test_explicit_sparse_zeros(tree_type):
check_explicit_sparse_zeros(tree_type)
@ignore_warnings
def check_raise_error_on_1d_input(name):
TreeEstimator = ALL_TREES[name]
X = iris.data[:, 0].ravel()
X_2d = iris.data[:, 0].reshape((-1, 1))
y = iris.target
with pytest.raises(ValueError):
TreeEstimator(random_state=0).fit(X, y)
est = TreeEstimator(random_state=0)
est.fit(X_2d, y)
with pytest.raises(ValueError):
est.predict([X])
@pytest.mark.parametrize("name", ALL_TREES)
def test_1d_input(name):
with ignore_warnings():
check_raise_error_on_1d_input(name)
def _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight):
est = TreeEstimator(random_state=0)
est.fit(X, y, sample_weight=sample_weight)
assert est.tree_.max_depth == 1
est = TreeEstimator(random_state=0, min_weight_fraction_leaf=0.4)
est.fit(X, y, sample_weight=sample_weight)
assert est.tree_.max_depth == 0
def check_min_weight_leaf_split_level(name):
TreeEstimator = ALL_TREES[name]
X = np.array([[0], [0], [0], [0], [1]])
y = [0, 0, 0, 0, 1]
sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2]
_check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight)
_check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y,
sample_weight)
@pytest.mark.parametrize("name", ALL_TREES)
def test_min_weight_leaf_split_level(name):
check_min_weight_leaf_split_level(name)
def check_public_apply(name):
X_small32 = X_small.astype(tree._tree.DTYPE, copy=False)
est = ALL_TREES[name]()
est.fit(X_small, y_small)
assert_array_equal(est.apply(X_small),
est.tree_.apply(X_small32))
def check_public_apply_sparse(name):
X_small32 = csr_matrix(X_small.astype(tree._tree.DTYPE, copy=False))
est = ALL_TREES[name]()
est.fit(X_small, y_small)
assert_array_equal(est.apply(X_small),
est.tree_.apply(X_small32))
@pytest.mark.parametrize("name", ALL_TREES)
def test_public_apply_all_trees(name):
check_public_apply(name)
@pytest.mark.parametrize("name", SPARSE_TREES)
def test_public_apply_sparse_trees(name):
check_public_apply_sparse(name)
def test_decision_path_hardcoded():
X = iris.data
y = iris.target
est = DecisionTreeClassifier(random_state=0, max_depth=1).fit(X, y)
node_indicator = est.decision_path(X[:2]).toarray()
assert_array_equal(node_indicator, [[1, 1, 0], [1, 0, 1]])
def check_decision_path(name):
X = iris.data
y = iris.target
n_samples = X.shape[0]
TreeEstimator = ALL_TREES[name]
est = TreeEstimator(random_state=0, max_depth=2)
est.fit(X, y)
node_indicator_csr = est.decision_path(X)
node_indicator = node_indicator_csr.toarray()
assert node_indicator.shape == (n_samples, est.tree_.node_count)
# Assert that leaves index are correct
leaves = est.apply(X)
leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)]
assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))
# Ensure only one leave node per sample
all_leaves = est.tree_.children_left == TREE_LEAF
assert_array_almost_equal(np.dot(node_indicator, all_leaves),
np.ones(shape=n_samples))
# Ensure max depth is consistent with sum of indicator
max_depth = node_indicator.sum(axis=1).max()
assert est.tree_.max_depth <= max_depth
@pytest.mark.parametrize("name", ALL_TREES)
def test_decision_path(name):
check_decision_path(name)
def check_no_sparse_y_support(name):
X, y = X_multilabel, csr_matrix(y_multilabel)
TreeEstimator = ALL_TREES[name]
with pytest.raises(TypeError):
TreeEstimator(random_state=0).fit(X, y)
@pytest.mark.parametrize("name", ALL_TREES)
def test_no_sparse_y_support(name):
# Currently we don't support sparse y
check_no_sparse_y_support(name)
def test_mae():
"""Check MAE criterion produces correct results on small toy dataset:
------------------
| X | y | weight |
------------------
| 3 | 3 | 0.1 |
| 5 | 3 | 0.3 |
| 8 | 4 | 1.0 |
| 3 | 6 | 0.6 |
| 5 | 7 | 0.3 |
------------------
|sum wt:| 2.3 |
------------------
Because we are dealing with sample weights, we cannot find the median by
simply choosing/averaging the centre value(s), instead we consider the
median where 50% of the cumulative weight is found (in a y sorted data set)
. Therefore with regards to this test data, the cumulative weight is >= 50%
when y = 4. Therefore:
Median = 4
For all the samples, we can get the total error by summing:
Absolute(Median - y) * weight
I.e., total error = (Absolute(4 - 3) * 0.1)
+ (Absolute(4 - 3) * 0.3)
+ (Absolute(4 - 4) * 1.0)
+ (Absolute(4 - 6) * 0.6)
+ (Absolute(4 - 7) * 0.3)
= 2.5
Impurity = Total error / total weight
= 2.5 / 2.3
= 1.08695652173913
------------------
From this root node, the next best split is between X values of 3 and 5.
Thus, we have left and right child nodes:
LEFT RIGHT
------------------ ------------------
| X | y | weight | | X | y | weight |
------------------ ------------------
| 3 | 3 | 0.1 | | 5 | 3 | 0.3 |
| 3 | 6 | 0.6 | | 8 | 4 | 1.0 |
------------------ | 5 | 7 | 0.3 |
|sum wt:| 0.7 | ------------------
------------------ |sum wt:| 1.6 |
------------------
Impurity is found in the same way:
Left node Median = 6
Total error = (Absolute(6 - 3) * 0.1)
+ (Absolute(6 - 6) * 0.6)
= 0.3
Left Impurity = Total error / total weight
= 0.3 / 0.7
= 0.428571428571429
-------------------
Likewise for Right node:
Right node Median = 4
Total error = (Absolute(4 - 3) * 0.3)
+ (Absolute(4 - 4) * 1.0)
+ (Absolute(4 - 7) * 0.3)
= 1.2
Right Impurity = Total error / total weight
= 1.2 / 1.6
= 0.75
------
"""
dt_mae = DecisionTreeRegressor(random_state=0, criterion="mae",
max_leaf_nodes=2)
# Test MAE where sample weights are non-uniform (as illustrated above):
dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3],
sample_weight=[0.6, 0.3, 0.1, 1.0, 0.3])
assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6])
assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0])
# Test MAE where all sample weights are uniform:
dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3],
sample_weight=np.ones(5))
assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])
# Test MAE where a `sample_weight` is not explicitly provided.
# This is equivalent to providing uniform sample weights, though
# the internal logic is different:
dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3])
assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])
def test_criterion_copy():
# Let's check whether copy of our criterion has the same type
# and properties as original
n_outputs = 3
n_classes = np.arange(3, dtype=np.intp)
n_samples = 100
def _pickle_copy(obj):
return pickle.loads(pickle.dumps(obj))
for copy_func in [copy.copy, copy.deepcopy, _pickle_copy]:
for _, typename in CRITERIA_CLF.items():
criteria = typename(n_outputs, n_classes)
result = copy_func(criteria).__reduce__()
typename_, (n_outputs_, n_classes_), _ = result
assert typename == typename_
assert n_outputs == n_outputs_
assert_array_equal(n_classes, n_classes_)
for _, typename in CRITERIA_REG.items():
criteria = typename(n_outputs, n_samples)
result = copy_func(criteria).__reduce__()
typename_, (n_outputs_, n_samples_), _ = result
assert typename == typename_
assert n_outputs == n_outputs_
assert n_samples == n_samples_
def test_empty_leaf_infinite_threshold():
# try to make empty leaf by using near infinite value.
data = np.random.RandomState(0).randn(100, 11) * 2e38
data = np.nan_to_num(data.astype('float32'))
X_full = data[:, :-1]
X_sparse = csc_matrix(X_full)
y = data[:, -1]
for X in [X_full, X_sparse]:
tree = DecisionTreeRegressor(random_state=0).fit(X, y)
terminal_regions = tree.apply(X)
left_leaf = set(np.where(tree.tree_.children_left == TREE_LEAF)[0])
empty_leaf = left_leaf.difference(terminal_regions)
infinite_threshold = np.where(~np.isfinite(tree.tree_.threshold))[0]
assert len(infinite_threshold) == 0
assert len(empty_leaf) == 0
@pytest.mark.parametrize("criterion", CLF_CRITERIONS)
@pytest.mark.parametrize(
"dataset", sorted(set(DATASETS.keys()) - {"reg_small", "diabetes"}))
@pytest.mark.parametrize(
"tree_cls", [DecisionTreeClassifier, ExtraTreeClassifier])
def test_prune_tree_classifier_are_subtrees(criterion, dataset, tree_cls):
dataset = DATASETS[dataset]
X, y = dataset["X"], dataset["y"]
est = tree_cls(max_leaf_nodes=20, random_state=0)
info = est.cost_complexity_pruning_path(X, y)
pruning_path = info.ccp_alphas
impurities = info.impurities
assert np.all(np.diff(pruning_path) >= 0)
assert np.all(np.diff(impurities) >= 0)
assert_pruning_creates_subtree(tree_cls, X, y, pruning_path)
@pytest.mark.parametrize("criterion", REG_CRITERIONS)
@pytest.mark.parametrize("dataset", DATASETS.keys())
@pytest.mark.parametrize(
"tree_cls", [DecisionTreeRegressor, ExtraTreeRegressor])
def test_prune_tree_regression_are_subtrees(criterion, dataset, tree_cls):
dataset = DATASETS[dataset]
X, y = dataset["X"], dataset["y"]
est = tree_cls(max_leaf_nodes=20, random_state=0)
info = est.cost_complexity_pruning_path(X, y)
pruning_path = info.ccp_alphas
impurities = info.impurities
assert np.all(np.diff(pruning_path) >= 0)
assert np.all(np.diff(impurities) >= 0)
assert_pruning_creates_subtree(tree_cls, X, y, pruning_path)
def test_prune_single_node_tree():
# single node tree
clf1 = DecisionTreeClassifier(random_state=0)
clf1.fit([[0], [1]], [0, 0])
# pruned single node tree
clf2 = DecisionTreeClassifier(random_state=0, ccp_alpha=10)
clf2.fit([[0], [1]], [0, 0])
assert_is_subtree(clf1.tree_, clf2.tree_)
def assert_pruning_creates_subtree(estimator_cls, X, y, pruning_path):
# generate trees with increasing alphas
estimators = []
for ccp_alpha in pruning_path:
est = estimator_cls(
max_leaf_nodes=20, ccp_alpha=ccp_alpha, random_state=0).fit(X, y)
estimators.append(est)
# A pruned tree must be a subtree of the previous tree (which had a
# smaller ccp_alpha)
for prev_est, next_est in zip(estimators, estimators[1:]):
assert_is_subtree(prev_est.tree_, next_est.tree_)
def assert_is_subtree(tree, subtree):
assert tree.node_count >= subtree.node_count
assert tree.max_depth >= subtree.max_depth
tree_c_left = tree.children_left
tree_c_right = tree.children_right
subtree_c_left = subtree.children_left
subtree_c_right = subtree.children_right
stack = [(0, 0)]
while stack:
tree_node_idx, subtree_node_idx = stack.pop()
assert_array_almost_equal(tree.value[tree_node_idx],
subtree.value[subtree_node_idx])
assert_almost_equal(tree.impurity[tree_node_idx],
subtree.impurity[subtree_node_idx])
assert_almost_equal(tree.n_node_samples[tree_node_idx],
subtree.n_node_samples[subtree_node_idx])
assert_almost_equal(tree.weighted_n_node_samples[tree_node_idx],
subtree.weighted_n_node_samples[subtree_node_idx])
if (subtree_c_left[subtree_node_idx] ==
subtree_c_right[subtree_node_idx]):
# is a leaf
assert_almost_equal(TREE_UNDEFINED,
subtree.threshold[subtree_node_idx])
else:
# not a leaf
assert_almost_equal(tree.threshold[tree_node_idx],
subtree.threshold[subtree_node_idx])
stack.append((tree_c_left[tree_node_idx],
subtree_c_left[subtree_node_idx]))
stack.append((tree_c_right[tree_node_idx],
subtree_c_right[subtree_node_idx]))
def test_prune_tree_raises_negative_ccp_alpha():
clf = DecisionTreeClassifier()
msg = "ccp_alpha must be greater than or equal to 0"
with pytest.raises(ValueError, match=msg):
clf.set_params(ccp_alpha=-1.0)
clf.fit(X, y)
clf.set_params(ccp_alpha=0.0)
clf.fit(X, y)
with pytest.raises(ValueError, match=msg):
clf.set_params(ccp_alpha=-1.0)
clf._prune_tree()
def check_apply_path_readonly(name):
X_readonly = create_memmap_backed_data(X_small.astype(tree._tree.DTYPE,
copy=False))
y_readonly = create_memmap_backed_data(np.array(y_small,
dtype=tree._tree.DTYPE))
est = ALL_TREES[name]()
est.fit(X_readonly, y_readonly)
assert_array_equal(est.predict(X_readonly),
est.predict(X_small))
assert_array_equal(est.decision_path(X_readonly).todense(),
est.decision_path(X_small).todense())
@pytest.mark.parametrize("name", ALL_TREES)
def test_apply_path_readonly_all_trees(name):
check_apply_path_readonly(name)
@pytest.mark.parametrize("criterion", ["mse", "friedman_mse", "poisson"])
@pytest.mark.parametrize("Tree", REG_TREES.values())
def test_balance_property(criterion, Tree):
# Test that sum(y_pred)=sum(y_true) on training set.
# This works if the mean is predicted (should even be true for each leaf).
# MAE predicts the median and is therefore excluded from this test.
# Choose a training set with non-negative targets (for poisson)
X, y = diabetes.data, diabetes.target
reg = Tree(criterion=criterion)
reg.fit(X, y)
assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))
@pytest.mark.parametrize("seed", range(3))
def test_poisson_zero_nodes(seed):
# Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes.
X = [[0, 0], [0, 1], [0, 2], [0, 3],
[1, 0], [1, 2], [1, 2], [1, 3]]
y = [0, 0, 0, 0, 1, 2, 3, 4]
# Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can
# easily learn that:
reg = DecisionTreeRegressor(criterion="mse", random_state=seed)
reg.fit(X, y)
assert np.amin(reg.predict(X)) == 0
# whereas Poisson must predict strictly positive numbers
reg = DecisionTreeRegressor(criterion="poisson", random_state=seed)
reg.fit(X, y)
assert np.all(reg.predict(X) > 0)
# Test additional dataset where something could go wrong.
n_features = 10
X, y = datasets.make_regression(
effective_rank=n_features * 2 // 3, tail_strength=0.6,
n_samples=1_000,
n_features=n_features,
n_informative=n_features * 2 // 3,
random_state=seed,
)
# some excess zeros
y[(-1 < y) & (y < 0)] = 0
# make sure the target is positive
y = np.abs(y)
reg = DecisionTreeRegressor(criterion='poisson', random_state=seed)
reg.fit(X, y)
assert np.all(reg.predict(X) > 0)
def test_poisson_vs_mse():
# For a Poisson distributed target, Poisson loss should give better results
# than least squares measured in Poisson deviance as metric.
# We have a similar test, test_poisson(), in
# sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
# Note: Some fine tuning was needed to have metric_poi < metric_dummy on
# the test set!
rng = np.random.RandomState(42)
n_train, n_test, n_features = 500, 500, 10
X = datasets.make_low_rank_matrix(n_samples=n_train + n_test,
n_features=n_features, random_state=rng)
# We create a log-linear Poisson model and downscale coef as it will get
# exponentiated.
coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
y = rng.poisson(lam=np.exp(X @ coef))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test,
random_state=rng)
# We prevent some overfitting by setting min_samples_split=10.
tree_poi = DecisionTreeRegressor(criterion="poisson",
min_samples_split=10,
random_state=rng)
tree_mse = DecisionTreeRegressor(criterion="mse",
min_samples_split=10,
random_state=rng)
tree_poi.fit(X_train, y_train)
tree_mse.fit(X_train, y_train)
dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]:
metric_poi = mean_poisson_deviance(y, tree_poi.predict(X))
# mse might produce non-positive predictions => clip
metric_mse = mean_poisson_deviance(y, np.clip(tree_mse.predict(X),
1e-15, None))
metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
# As MSE might correctly predict 0 in train set, its train score can
# be better than Poisson. This is no longer the case for the test set.
if val == "test":
assert metric_poi < metric_mse
assert metric_poi < metric_dummy
@pytest.mark.parametrize('criterion', REG_CRITERIONS)
def test_decision_tree_regressor_sample_weight_consistentcy(
criterion):
"""Test that the impact of sample_weight is consistent."""
tree_params = dict(criterion=criterion)
tree = DecisionTreeRegressor(**tree_params, random_state=42)
for kind in ['zeros', 'ones']:
check_sample_weights_invariance("DecisionTreeRegressor_" + criterion,
tree, kind='zeros')
rng = np.random.RandomState(0)
n_samples, n_features = 10, 5
X = rng.rand(n_samples, n_features)
y = np.mean(X, axis=1) + rng.rand(n_samples)
# make it positive in order to work also for poisson criterion
y += np.min(y) + 0.1
# check that multiplying sample_weight by 2 is equivalent
# to repeating corresponding samples twice
X2 = np.concatenate([X, X[:n_samples//2]], axis=0)
y2 = np.concatenate([y, y[:n_samples//2]])
sample_weight_1 = np.ones(len(y))
sample_weight_1[:n_samples//2] = 2
tree1 = DecisionTreeRegressor(**tree_params).fit(
X, y, sample_weight=sample_weight_1
)
tree2 = DecisionTreeRegressor(**tree_params).fit(
X2, y2, sample_weight=None
)
assert tree1.tree_.node_count == tree2.tree_.node_count
# Thresholds, tree.tree_.threshold, and values, tree.tree_.value, are not
# exactly the same, but on the training set, those differences do not
# matter and thus predictions are the same.
assert_allclose(tree1.predict(X), tree2.predict(X))
# TODO: Remove in v1.1
@pytest.mark.parametrize("TreeEstimator", [DecisionTreeClassifier,
DecisionTreeRegressor])
def test_X_idx_sorted_deprecated(TreeEstimator):
X_idx_sorted = np.argsort(X, axis=0)
tree = TreeEstimator()
with pytest.warns(FutureWarning,
match="The parameter 'X_idx_sorted' is deprecated"):
tree.fit(X, y, X_idx_sorted=X_idx_sorted)