""" Testing for the tree module (sklearn.tree). """ import copy import copyreg import io import pickle import struct from itertools import chain, product import joblib import numpy as np import pytest from joblib.numpy_pickle import NumpyPickler from numpy.testing import assert_allclose from sklearn import clone, datasets, tree from sklearn.dummy import DummyRegressor from sklearn.exceptions import NotFittedError from sklearn.impute import SimpleImputer from sklearn.metrics import accuracy_score, mean_poisson_deviance, mean_squared_error from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.random_projection import _sparse_random_matrix from sklearn.tree import ( DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor, ) from sklearn.tree._classes import ( CRITERIA_CLF, CRITERIA_REG, DENSE_SPLITTERS, SPARSE_SPLITTERS, ) from sklearn.tree._tree import ( NODE_DTYPE, TREE_LEAF, TREE_UNDEFINED, _check_n_classes, _check_node_ndarray, _check_value_ndarray, ) from sklearn.tree._tree import Tree as CythonTree from sklearn.utils import compute_sample_weight from sklearn.utils._testing import ( assert_almost_equal, assert_array_almost_equal, assert_array_equal, create_memmap_backed_data, ignore_warnings, skip_if_32bit, ) from sklearn.utils.estimator_checks import check_sample_weights_invariance from sklearn.utils.fixes import ( _IS_32BIT, COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS, ) from sklearn.utils.validation import check_random_state CLF_CRITERIONS = ("gini", "log_loss") REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, "ExtraTreeClassifier": ExtraTreeClassifier, } REG_TREES = { "DecisionTreeRegressor": DecisionTreeRegressor, "ExtraTreeRegressor": ExtraTreeRegressor, } ALL_TREES: dict = dict() ALL_TREES.update(CLF_TREES) ALL_TREES.update(REG_TREES) SPARSE_TREES = [ "DecisionTreeClassifier", "DecisionTreeRegressor", "ExtraTreeClassifier", "ExtraTreeRegressor", ] X_small = np.array( [ [0, 0, 4, 0, 0, 0, 1, -14, 0, -4, 0, 0, 0, 0], [0, 0, 5, 3, 0, -4, 0, 0, 1, -5, 0.2, 0, 4, 1], [-1, -1, 0, 0, -4.5, 0, 0, 2.1, 1, 0, 0, -4.5, 0, 1], [-1, -1, 0, -1.2, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 1], [-1, -1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1], [-1, -2, 0, 4, -3, 10, 4, 0, -3.2, 0, 4, 3, -4, 1], [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -3, 1], [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -1, 0], [2, 8, 5, 1, 0.5, -4, 10, 0, 1, -5, 3, 0, 2, 0], [2, 0, 1, 1, 1, -1, 1, 0, 0, -2, 3, 0, 1, 0], [2, 0, 1, 2, 3, -1, 10, 2, 0, -1, 1, 2, 2, 0], [1, 1, 0, 2, 2, -1, 1, 2, 0, -5, 1, 2, 3, 0], [3, 1, 0, 3, 0, -4, 10, 0, 1, -5, 3, 0, 3, 1], [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 0.5, 0, -3, 1], [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 1.5, 1, -1, -1], [2.11, 8, -6, -0.5, 0, 10, 0, 0, -3.2, 6, 0.5, 0, -1, -1], [2, 0, 5, 1, 0.5, -2, 10, 0, 1, -5, 3, 1, 0, -1], [2, 0, 1, 1, 1, -2, 1, 0, 0, -2, 0, 0, 0, 1], [2, 1, 1, 1, 2, -1, 10, 2, 0, -1, 0, 2, 1, 1], [1, 1, 0, 0, 1, -3, 1, 2, 0, -5, 1, 2, 1, 1], [3, 1, 0, 1, 0, -4, 1, 0, 1, -2, 0, 0, 1, 0], ] ) y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0] y_small_reg = [ 1.0, 2.1, 1.2, 0.05, 10, 2.4, 3.1, 1.01, 0.01, 2.98, 3.1, 1.1, 0.0, 1.2, 2, 11, 0, 0, 4.5, 0.201, 1.06, 0.9, 0, ] # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] T = [[-1, -1], [2, 2], [3, 2]] true_result = [-1, 1, 1] # also load the iris dataset # and randomly permute it iris = datasets.load_iris() rng = np.random.RandomState(1) perm = rng.permutation(iris.target.size) iris.data = iris.data[perm] iris.target = iris.target[perm] # also load the diabetes dataset # and randomly permute it diabetes = datasets.load_diabetes() perm = rng.permutation(diabetes.target.size) diabetes.data = diabetes.data[perm] diabetes.target = diabetes.target[perm] digits = datasets.load_digits() perm = rng.permutation(digits.target.size) digits.data = digits.data[perm] digits.target = digits.target[perm] random_state = check_random_state(0) X_multilabel, y_multilabel = datasets.make_multilabel_classification( random_state=0, n_samples=30, n_features=10 ) # NB: despite their names X_sparse_* are numpy arrays (and not sparse matrices) X_sparse_pos = random_state.uniform(size=(20, 5)) X_sparse_pos[X_sparse_pos <= 0.8] = 0.0 y_random = random_state.randint(0, 4, size=(20,)) X_sparse_mix = _sparse_random_matrix(20, 10, density=0.25, random_state=0).toarray() DATASETS = { "iris": {"X": iris.data, "y": iris.target}, "diabetes": {"X": diabetes.data, "y": diabetes.target}, "digits": {"X": digits.data, "y": digits.target}, "toy": {"X": X, "y": y}, "clf_small": {"X": X_small, "y": y_small}, "reg_small": {"X": X_small, "y": y_small_reg}, "multilabel": {"X": X_multilabel, "y": y_multilabel}, "sparse-pos": {"X": X_sparse_pos, "y": y_random}, "sparse-neg": {"X": -X_sparse_pos, "y": y_random}, "sparse-mix": {"X": X_sparse_mix, "y": y_random}, "zeros": {"X": np.zeros((20, 3)), "y": y_random}, } def assert_tree_equal(d, s, message): assert ( s.node_count == d.node_count ), "{0}: inequal number of node ({1} != {2})".format( message, s.node_count, d.node_count ) assert_array_equal( d.children_right, s.children_right, message + ": inequal children_right" ) assert_array_equal( d.children_left, s.children_left, message + ": inequal children_left" ) external = d.children_right == TREE_LEAF internal = np.logical_not(external) assert_array_equal( d.feature[internal], s.feature[internal], message + ": inequal features" ) assert_array_equal( d.threshold[internal], s.threshold[internal], message + ": inequal threshold" ) assert_array_equal( d.n_node_samples.sum(), s.n_node_samples.sum(), message + ": inequal sum(n_node_samples)", ) assert_array_equal( d.n_node_samples, s.n_node_samples, message + ": inequal n_node_samples" ) assert_almost_equal(d.impurity, s.impurity, err_msg=message + ": inequal impurity") assert_array_almost_equal( d.value[external], s.value[external], err_msg=message + ": inequal value" ) def test_classification_toy(): # Check classification on a toy dataset. for name, Tree in CLF_TREES.items(): clf = Tree(random_state=0) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result, "Failed with {0}".format(name)) clf = Tree(max_features=1, random_state=1) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result, "Failed with {0}".format(name)) def test_weighted_classification_toy(): # Check classification on a weighted toy dataset. for name, Tree in CLF_TREES.items(): clf = Tree(random_state=0) clf.fit(X, y, sample_weight=np.ones(len(X))) assert_array_equal(clf.predict(T), true_result, "Failed with {0}".format(name)) clf.fit(X, y, sample_weight=np.full(len(X), 0.5)) assert_array_equal(clf.predict(T), true_result, "Failed with {0}".format(name)) @pytest.mark.parametrize("Tree", REG_TREES.values()) @pytest.mark.parametrize("criterion", REG_CRITERIONS) def test_regression_toy(Tree, criterion): # Check regression on a toy dataset. if criterion == "poisson": # make target positive while not touching the original y and # true_result a = np.abs(np.min(y)) + 1 y_train = np.array(y) + a y_test = np.array(true_result) + a else: y_train = y y_test = true_result reg = Tree(criterion=criterion, random_state=1) reg.fit(X, y_train) assert_allclose(reg.predict(T), y_test) clf = Tree(criterion=criterion, max_features=1, random_state=1) clf.fit(X, y_train) assert_allclose(reg.predict(T), y_test) def test_xor(): # Check on a XOR problem y = np.zeros((10, 10)) y[:5, :5] = 1 y[5:, 5:] = 1 gridx, gridy = np.indices(y.shape) X = np.vstack([gridx.ravel(), gridy.ravel()]).T y = y.ravel() for name, Tree in CLF_TREES.items(): clf = Tree(random_state=0) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) clf = Tree(random_state=0, max_features=1) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) def test_iris(): # Check consistency on dataset iris. for (name, Tree), criterion in product(CLF_TREES.items(), CLF_CRITERIONS): clf = Tree(criterion=criterion, random_state=0) clf.fit(iris.data, iris.target) score = accuracy_score(clf.predict(iris.data), iris.target) assert score > 0.9, "Failed with {0}, criterion = {1} and score = {2}".format( name, criterion, score ) clf = Tree(criterion=criterion, max_features=2, random_state=0) clf.fit(iris.data, iris.target) score = accuracy_score(clf.predict(iris.data), iris.target) assert score > 0.5, "Failed with {0}, criterion = {1} and score = {2}".format( name, criterion, score ) @pytest.mark.parametrize("name, Tree", REG_TREES.items()) @pytest.mark.parametrize("criterion", REG_CRITERIONS) def test_diabetes_overfit(name, Tree, criterion): # check consistency of overfitted trees on the diabetes dataset # since the trees will overfit, we expect an MSE of 0 reg = Tree(criterion=criterion, random_state=0) reg.fit(diabetes.data, diabetes.target) score = mean_squared_error(diabetes.target, reg.predict(diabetes.data)) assert score == pytest.approx( 0 ), f"Failed with {name}, criterion = {criterion} and score = {score}" @skip_if_32bit @pytest.mark.parametrize("name, Tree", REG_TREES.items()) @pytest.mark.parametrize( "criterion, max_depth, metric, max_loss", [ ("squared_error", 15, mean_squared_error, 60), ("absolute_error", 20, mean_squared_error, 60), ("friedman_mse", 15, mean_squared_error, 60), ("poisson", 15, mean_poisson_deviance, 30), ], ) def test_diabetes_underfit(name, Tree, criterion, max_depth, metric, max_loss): # check consistency of trees when the depth and the number of features are # limited reg = Tree(criterion=criterion, max_depth=max_depth, max_features=6, random_state=0) reg.fit(diabetes.data, diabetes.target) loss = metric(diabetes.target, reg.predict(diabetes.data)) assert 0 < loss < max_loss def test_probability(): # Predict probabilities using DecisionTreeClassifier. for name, Tree in CLF_TREES.items(): clf = Tree(max_depth=1, max_features=1, random_state=42) clf.fit(iris.data, iris.target) prob_predict = clf.predict_proba(iris.data) assert_array_almost_equal( np.sum(prob_predict, 1), np.ones(iris.data.shape[0]), err_msg="Failed with {0}".format(name), ) assert_array_equal( np.argmax(prob_predict, 1), clf.predict(iris.data), err_msg="Failed with {0}".format(name), ) assert_almost_equal( clf.predict_proba(iris.data), np.exp(clf.predict_log_proba(iris.data)), 8, err_msg="Failed with {0}".format(name), ) def test_arrayrepr(): # Check the array representation. # Check resize X = np.arange(10000)[:, np.newaxis] y = np.arange(10000) for name, Tree in REG_TREES.items(): reg = Tree(max_depth=None, random_state=0) reg.fit(X, y) def test_pure_set(): # Check when y is pure. X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [1, 1, 1, 1, 1, 1] for name, TreeClassifier in CLF_TREES.items(): clf = TreeClassifier(random_state=0) clf.fit(X, y) assert_array_equal(clf.predict(X), y, err_msg="Failed with {0}".format(name)) for name, TreeRegressor in REG_TREES.items(): reg = TreeRegressor(random_state=0) reg.fit(X, y) assert_almost_equal(reg.predict(X), y, err_msg="Failed with {0}".format(name)) def test_numerical_stability(): # Check numerical stability. X = np.array( [ [152.08097839, 140.40744019, 129.75102234, 159.90493774], [142.50700378, 135.81935120, 117.82884979, 162.75781250], [127.28772736, 140.40744019, 129.75102234, 159.90493774], [132.37025452, 143.71923828, 138.35694885, 157.84558105], [103.10237122, 143.71928406, 138.35696411, 157.84559631], [127.71276855, 143.71923828, 138.35694885, 157.84558105], [120.91514587, 140.40744019, 129.75102234, 159.90493774], ] ) y = np.array([1.0, 0.70209277, 0.53896582, 0.0, 0.90914464, 0.48026916, 0.49622521]) with np.errstate(all="raise"): for name, Tree in REG_TREES.items(): reg = Tree(random_state=0) reg.fit(X, y) reg.fit(X, -y) reg.fit(-X, y) reg.fit(-X, -y) def test_importances(): # Check variable importances. X, y = datasets.make_classification( n_samples=5000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0, ) for name, Tree in CLF_TREES.items(): clf = Tree(random_state=0) clf.fit(X, y) importances = clf.feature_importances_ n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) assert n_important == 3, "Failed with {0}".format(name) # Check on iris that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) clf.fit(iris.data, iris.target) clf2 = DecisionTreeClassifier(random_state=0, max_leaf_nodes=len(iris.data)) clf2.fit(iris.data, iris.target) assert_array_equal(clf.feature_importances_, clf2.feature_importances_) def test_importances_raises(): # Check if variable importance before fit raises ValueError. clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") def test_importances_gini_equal_squared_error(): # Check that gini is equivalent to squared_error for binary output variable X, y = datasets.make_classification( n_samples=2000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0, ) # The gini index and the mean square error (variance) might differ due # to numerical instability. Since those instabilities mainly occurs at # high tree depth, we restrict this maximal depth. clf = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=0).fit( X, y ) reg = DecisionTreeRegressor( criterion="squared_error", max_depth=5, random_state=0 ).fit(X, y) assert_almost_equal(clf.feature_importances_, reg.feature_importances_) assert_array_equal(clf.tree_.feature, reg.tree_.feature) assert_array_equal(clf.tree_.children_left, reg.tree_.children_left) assert_array_equal(clf.tree_.children_right, reg.tree_.children_right) assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples) def test_max_features(): # Check max_features. for name, TreeEstimator in ALL_TREES.items(): est = TreeEstimator(max_features="sqrt") est.fit(iris.data, iris.target) assert est.max_features_ == int(np.sqrt(iris.data.shape[1])) est = TreeEstimator(max_features="log2") est.fit(iris.data, iris.target) assert est.max_features_ == int(np.log2(iris.data.shape[1])) est = TreeEstimator(max_features=1) est.fit(iris.data, iris.target) assert est.max_features_ == 1 est = TreeEstimator(max_features=3) est.fit(iris.data, iris.target) assert est.max_features_ == 3 est = TreeEstimator(max_features=0.01) est.fit(iris.data, iris.target) assert est.max_features_ == 1 est = TreeEstimator(max_features=0.5) est.fit(iris.data, iris.target) assert est.max_features_ == int(0.5 * iris.data.shape[1]) est = TreeEstimator(max_features=1.0) est.fit(iris.data, iris.target) assert est.max_features_ == iris.data.shape[1] est = TreeEstimator(max_features=None) est.fit(iris.data, iris.target) assert est.max_features_ == iris.data.shape[1] def test_error(): # Test that it gives proper exception on deficient input. for name, TreeEstimator in CLF_TREES.items(): # predict before fit est = TreeEstimator() with pytest.raises(NotFittedError): est.predict_proba(X) est.fit(X, y) X2 = [[-2, -1, 1]] # wrong feature shape for sample with pytest.raises(ValueError): est.predict_proba(X2) # Wrong dimensions est = TreeEstimator() y2 = y[:-1] with pytest.raises(ValueError): est.fit(X, y2) # Test with arrays that are non-contiguous. Xf = np.asfortranarray(X) est = TreeEstimator() est.fit(Xf, y) assert_almost_equal(est.predict(T), true_result) # predict before fitting est = TreeEstimator() with pytest.raises(NotFittedError): est.predict(T) # predict on vector with different dims est.fit(X, y) t = np.asarray(T) with pytest.raises(ValueError): est.predict(t[:, 1:]) # wrong sample shape Xt = np.array(X).T est = TreeEstimator() est.fit(np.dot(X, Xt), y) with pytest.raises(ValueError): est.predict(X) with pytest.raises(ValueError): est.apply(X) clf = TreeEstimator() clf.fit(X, y) with pytest.raises(ValueError): clf.predict(Xt) with pytest.raises(ValueError): clf.apply(Xt) # apply before fitting est = TreeEstimator() with pytest.raises(NotFittedError): est.apply(T) # non positive target for Poisson splitting Criterion est = DecisionTreeRegressor(criterion="poisson") with pytest.raises(ValueError, match="y is not positive.*Poisson"): est.fit([[0, 1, 2]], [0, 0, 0]) with pytest.raises(ValueError, match="Some.*y are negative.*Poisson"): est.fit([[0, 1, 2]], [5, -0.1, 2]) def test_min_samples_split(): """Test min_samples_split parameter""" X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE) y = iris.target # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()): TreeEstimator = ALL_TREES[name] # test for integer parameter est = TreeEstimator( min_samples_split=10, max_leaf_nodes=max_leaf_nodes, random_state=0 ) est.fit(X, y) # count samples on nodes, -1 means it is a leaf node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1] assert np.min(node_samples) > 9, "Failed with {0}".format(name) # test for float parameter est = TreeEstimator( min_samples_split=0.2, max_leaf_nodes=max_leaf_nodes, random_state=0 ) est.fit(X, y) # count samples on nodes, -1 means it is a leaf node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1] assert np.min(node_samples) > 9, "Failed with {0}".format(name) def test_min_samples_leaf(): # Test if leaves contain more than leaf_count training examples X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE) y = iris.target # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()): TreeEstimator = ALL_TREES[name] # test integer parameter est = TreeEstimator( min_samples_leaf=5, max_leaf_nodes=max_leaf_nodes, random_state=0 ) est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) # test float parameter est = TreeEstimator( min_samples_leaf=0.1, max_leaf_nodes=max_leaf_nodes, random_state=0 ) est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) def check_min_weight_fraction_leaf(name, datasets, sparse_container=None): """Test if leaves contain at least min_weight_fraction_leaf of the training set""" X = DATASETS[datasets]["X"].astype(np.float32) if sparse_container is not None: X = sparse_container(X) y = DATASETS[datasets]["y"] weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) TreeEstimator = ALL_TREES[name] # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)): est = TreeEstimator( min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0 ) est.fit(X, y, sample_weight=weights) if sparse_container is not None: out = est.tree_.apply(X.tocsr()) else: out = est.tree_.apply(X) node_weights = np.bincount(out, weights=weights) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert ( np.min(leaf_weights) >= total_weight * est.min_weight_fraction_leaf ), "Failed with {0} min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf ) # test case with no weights passed in total_weight = X.shape[0] for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)): est = TreeEstimator( min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0 ) est.fit(X, y) if sparse_container is not None: out = est.tree_.apply(X.tocsr()) else: out = est.tree_.apply(X) node_weights = np.bincount(out) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert ( np.min(leaf_weights) >= total_weight * est.min_weight_fraction_leaf ), "Failed with {0} min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf ) @pytest.mark.parametrize("name", ALL_TREES) def test_min_weight_fraction_leaf_on_dense_input(name): check_min_weight_fraction_leaf(name, "iris") @pytest.mark.parametrize("name", SPARSE_TREES) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_min_weight_fraction_leaf_on_sparse_input(name, csc_container): check_min_weight_fraction_leaf(name, "multilabel", sparse_container=csc_container) def check_min_weight_fraction_leaf_with_min_samples_leaf( name, datasets, sparse_container=None ): """Test the interaction between min_weight_fraction_leaf and min_samples_leaf when sample_weights is not provided in fit.""" X = DATASETS[datasets]["X"].astype(np.float32) if sparse_container is not None: X = sparse_container(X) y = DATASETS[datasets]["y"] total_weight = X.shape[0] TreeEstimator = ALL_TREES[name] for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)): # test integer min_samples_leaf est = TreeEstimator( min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, min_samples_leaf=5, random_state=0, ) est.fit(X, y) if sparse_container is not None: out = est.tree_.apply(X.tocsr()) else: out = est.tree_.apply(X) node_weights = np.bincount(out) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert np.min(leaf_weights) >= max( (total_weight * est.min_weight_fraction_leaf), 5 ), "Failed with {0} min_weight_fraction_leaf={1}, min_samples_leaf={2}".format( name, est.min_weight_fraction_leaf, est.min_samples_leaf ) for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)): # test float min_samples_leaf est = TreeEstimator( min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, min_samples_leaf=0.1, random_state=0, ) est.fit(X, y) if sparse_container is not None: out = est.tree_.apply(X.tocsr()) else: out = est.tree_.apply(X) node_weights = np.bincount(out) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert np.min(leaf_weights) >= max( (total_weight * est.min_weight_fraction_leaf), (total_weight * est.min_samples_leaf), ), "Failed with {0} min_weight_fraction_leaf={1}, min_samples_leaf={2}".format( name, est.min_weight_fraction_leaf, est.min_samples_leaf ) @pytest.mark.parametrize("name", ALL_TREES) def test_min_weight_fraction_leaf_with_min_samples_leaf_on_dense_input(name): check_min_weight_fraction_leaf_with_min_samples_leaf(name, "iris") @pytest.mark.parametrize("name", SPARSE_TREES) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_min_weight_fraction_leaf_with_min_samples_leaf_on_sparse_input( name, csc_container ): check_min_weight_fraction_leaf_with_min_samples_leaf( name, "multilabel", sparse_container=csc_container ) def test_min_impurity_decrease(global_random_seed): # test if min_impurity_decrease ensure that a split is made only if # if the impurity decrease is at least that value X, y = datasets.make_classification(n_samples=100, random_state=global_random_seed) # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()): TreeEstimator = ALL_TREES[name] # Check default value of min_impurity_decrease, 1e-7 est1 = TreeEstimator(max_leaf_nodes=max_leaf_nodes, random_state=0) # Check with explicit value of 0.05 est2 = TreeEstimator( max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.05, random_state=0 ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.1, random_state=0 ) for est, expected_decrease in ( (est1, 1e-7), (est2, 0.05), (est3, 0.0001), (est4, 0.1), ): assert ( est.min_impurity_decrease <= expected_decrease ), "Failed, min_impurity_decrease = {0} > {1}".format( est.min_impurity_decrease, expected_decrease ) est.fit(X, y) for node in range(est.tree_.node_count): # If current node is a not leaf node, check if the split was # justified w.r.t the min_impurity_decrease if est.tree_.children_left[node] != TREE_LEAF: imp_parent = est.tree_.impurity[node] wtd_n_node = est.tree_.weighted_n_node_samples[node] left = est.tree_.children_left[node] wtd_n_left = est.tree_.weighted_n_node_samples[left] imp_left = est.tree_.impurity[left] wtd_imp_left = wtd_n_left * imp_left right = est.tree_.children_right[node] wtd_n_right = est.tree_.weighted_n_node_samples[right] imp_right = est.tree_.impurity[right] wtd_imp_right = wtd_n_right * imp_right wtd_avg_left_right_imp = wtd_imp_right + wtd_imp_left wtd_avg_left_right_imp /= wtd_n_node fractional_node_weight = ( est.tree_.weighted_n_node_samples[node] / X.shape[0] ) actual_decrease = fractional_node_weight * ( imp_parent - wtd_avg_left_right_imp ) assert ( actual_decrease >= expected_decrease ), "Failed with {0} expected min_impurity_decrease={1}".format( actual_decrease, expected_decrease ) def test_pickle(): """Test pickling preserves Tree properties and performance.""" for name, TreeEstimator in ALL_TREES.items(): if "Classifier" in name: X, y = iris.data, iris.target else: X, y = diabetes.data, diabetes.target est = TreeEstimator(random_state=0) est.fit(X, y) score = est.score(X, y) # test that all class properties are maintained attributes = [ "max_depth", "node_count", "capacity", "n_classes", "children_left", "children_right", "n_leaves", "feature", "threshold", "impurity", "n_node_samples", "weighted_n_node_samples", "value", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes } serialized_object = pickle.dumps(est) est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ score2 = est2.score(X, y) assert ( score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for attribute in fitted_attribute: assert_array_equal( getattr(est2.tree_, attribute), fitted_attribute[attribute], err_msg=( f"Failed to generate same attribute {attribute} after pickling with" f" {name}" ), ) def test_multioutput(): # Check estimators on multi-output problems. X = [ [-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-2, 1], [-1, 1], [-1, 2], [2, -1], [1, -1], [1, -2], ] y = [ [-1, 0], [-1, 0], [-1, 0], [1, 1], [1, 1], [1, 1], [-1, 2], [-1, 2], [-1, 2], [1, 3], [1, 3], [1, 3], ] T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] # toy classification problem for name, TreeClassifier in CLF_TREES.items(): clf = TreeClassifier(random_state=0) y_hat = clf.fit(X, y).predict(T) assert_array_equal(y_hat, y_true) assert y_hat.shape == (4, 2) proba = clf.predict_proba(T) assert len(proba) == 2 assert proba[0].shape == (4, 2) assert proba[1].shape == (4, 4) log_proba = clf.predict_log_proba(T) assert len(log_proba) == 2 assert log_proba[0].shape == (4, 2) assert log_proba[1].shape == (4, 4) # toy regression problem for name, TreeRegressor in REG_TREES.items(): reg = TreeRegressor(random_state=0) y_hat = reg.fit(X, y).predict(T) assert_almost_equal(y_hat, y_true) assert y_hat.shape == (4, 2) def test_classes_shape(): # Test that n_classes_ and classes_ have proper shape. for name, TreeClassifier in CLF_TREES.items(): # Classification, single output clf = TreeClassifier(random_state=0) clf.fit(X, y) assert clf.n_classes_ == 2 assert_array_equal(clf.classes_, [-1, 1]) # Classification, multi-output _y = np.vstack((y, np.array(y) * 2)).T clf = TreeClassifier(random_state=0) clf.fit(X, _y) assert len(clf.n_classes_) == 2 assert len(clf.classes_) == 2 assert_array_equal(clf.n_classes_, [2, 2]) assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]]) def test_unbalanced_iris(): # Check class rebalancing. unbalanced_X = iris.data[:125] unbalanced_y = iris.target[:125] sample_weight = compute_sample_weight("balanced", unbalanced_y) for name, TreeClassifier in CLF_TREES.items(): clf = TreeClassifier(random_state=0) clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight) assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y) def test_memory_layout(): # Check that it works no matter the memory layout for (name, TreeEstimator), dtype in product( ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # C-order X = np.asarray(iris.data, order="C", dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # F-order X = np.asarray(iris.data, order="F", dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # Contiguous X = np.ascontiguousarray(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # csr for csr_container in CSR_CONTAINERS: X = csr_container(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # csc for csc_container in CSC_CONTAINERS: X = csc_container(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # Strided X = np.asarray(iris.data[::3], dtype=dtype) y = iris.target[::3] assert_array_equal(est.fit(X, y).predict(X), y) def test_sample_weight(): # Check sample weighting. # Test that zero-weighted samples are not taken into account X = np.arange(100)[:, np.newaxis] y = np.ones(100) y[:50] = 0.0 sample_weight = np.ones(100) sample_weight[y == 0] = 0.0 clf = DecisionTreeClassifier(random_state=0) clf.fit(X, y, sample_weight=sample_weight) assert_array_equal(clf.predict(X), np.ones(100)) # Test that low weighted samples are not taken into account at low depth X = np.arange(200)[:, np.newaxis] y = np.zeros(200) y[50:100] = 1 y[100:200] = 2 X[100:200, 0] = 200 sample_weight = np.ones(200) sample_weight[y == 2] = 0.51 # Samples of class '2' are still weightier clf = DecisionTreeClassifier(max_depth=1, random_state=0) clf.fit(X, y, sample_weight=sample_weight) assert clf.tree_.threshold[0] == 149.5 sample_weight[y == 2] = 0.5 # Samples of class '2' are no longer weightier clf = DecisionTreeClassifier(max_depth=1, random_state=0) clf.fit(X, y, sample_weight=sample_weight) assert clf.tree_.threshold[0] == 49.5 # Threshold should have moved # Test that sample weighting is the same as having duplicates X = iris.data y = iris.target duplicates = rng.randint(0, X.shape[0], 100) clf = DecisionTreeClassifier(random_state=1) clf.fit(X[duplicates], y[duplicates]) sample_weight = np.bincount(duplicates, minlength=X.shape[0]) clf2 = DecisionTreeClassifier(random_state=1) clf2.fit(X, y, sample_weight=sample_weight) internal = clf.tree_.children_left != tree._tree.TREE_LEAF assert_array_almost_equal( clf.tree_.threshold[internal], clf2.tree_.threshold[internal] ) def test_sample_weight_invalid(): # Check sample weighting raises errors. X = np.arange(100)[:, np.newaxis] y = np.ones(100) y[:50] = 0.0 clf = DecisionTreeClassifier(random_state=0) sample_weight = np.random.rand(100, 1) with pytest.raises(ValueError): clf.fit(X, y, sample_weight=sample_weight) sample_weight = np.array(0) expected_err = r"Singleton.* cannot be considered a valid collection" with pytest.raises(TypeError, match=expected_err): clf.fit(X, y, sample_weight=sample_weight) @pytest.mark.parametrize("name", CLF_TREES) def test_class_weights(name): # Test that class_weights resemble sample_weights behavior. TreeClassifier = CLF_TREES[name] # Iris is balanced, so no effect expected for using 'balanced' weights clf1 = TreeClassifier(random_state=0) clf1.fit(iris.data, iris.target) clf2 = TreeClassifier(class_weight="balanced", random_state=0) clf2.fit(iris.data, iris.target) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) # Make a multi-output problem with three copies of Iris iris_multi = np.vstack((iris.target, iris.target, iris.target)).T # Create user-defined weights that should balance over the outputs clf3 = TreeClassifier( class_weight=[ {0: 2.0, 1: 2.0, 2: 1.0}, {0: 2.0, 1: 1.0, 2: 2.0}, {0: 1.0, 1: 2.0, 2: 2.0}, ], random_state=0, ) clf3.fit(iris.data, iris_multi) assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) # Check against multi-output "auto" which should also have no effect clf4 = TreeClassifier(class_weight="balanced", random_state=0) clf4.fit(iris.data, iris_multi) assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) # Inflate importance of class 1, check against user-defined weights sample_weight = np.ones(iris.target.shape) sample_weight[iris.target == 1] *= 100 class_weight = {0: 1.0, 1: 100.0, 2: 1.0} clf1 = TreeClassifier(random_state=0) clf1.fit(iris.data, iris.target, sample_weight) clf2 = TreeClassifier(class_weight=class_weight, random_state=0) clf2.fit(iris.data, iris.target) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) # Check that sample_weight and class_weight are multiplicative clf1 = TreeClassifier(random_state=0) clf1.fit(iris.data, iris.target, sample_weight**2) clf2 = TreeClassifier(class_weight=class_weight, random_state=0) clf2.fit(iris.data, iris.target, sample_weight) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) @pytest.mark.parametrize("name", CLF_TREES) def test_class_weight_errors(name): # Test if class_weight raises errors and warnings when expected. TreeClassifier = CLF_TREES[name] _y = np.vstack((y, np.array(y) * 2)).T # Incorrect length list for multi-output clf = TreeClassifier(class_weight=[{-1: 0.5, 1: 1.0}], random_state=0) err_msg = "number of elements in class_weight should match number of outputs." with pytest.raises(ValueError, match=err_msg): clf.fit(X, _y) def test_max_leaf_nodes(): # Test greedy trees with max_depth + 1 leafs. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) k = 4 for name, TreeEstimator in ALL_TREES.items(): est = TreeEstimator(max_depth=None, max_leaf_nodes=k + 1).fit(X, y) assert est.get_n_leaves() == k + 1 def test_max_leaf_nodes_max_depth(): # Test precedence of max_leaf_nodes over max_depth. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) k = 4 for name, TreeEstimator in ALL_TREES.items(): est = TreeEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y) assert est.get_depth() == 1 def test_arrays_persist(): # Ensure property arrays' memory stays alive when tree disappears # non-regression for #2726 for attr in [ "n_classes", "value", "children_left", "children_right", "threshold", "impurity", "feature", "n_node_samples", ]: value = getattr(DecisionTreeClassifier().fit([[0], [1]], [0, 1]).tree_, attr) # if pointing to freed memory, contents may be arbitrary assert -3 <= value.flat[0] < 3, "Array points to arbitrary memory" def test_only_constant_features(): random_state = check_random_state(0) X = np.zeros((10, 20)) y = random_state.randint(0, 2, (10,)) for name, TreeEstimator in ALL_TREES.items(): est = TreeEstimator(random_state=0) est.fit(X, y) assert est.tree_.max_depth == 0 def test_behaviour_constant_feature_after_splits(): X = np.transpose( np.vstack(([[0, 0, 0, 0, 0, 1, 2, 4, 5, 6, 7]], np.zeros((4, 11)))) ) y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees if "ExtraTree" not in name: est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 assert est.tree_.node_count == 5 def test_with_only_one_non_constant_features(): X = np.hstack([np.array([[1.0], [1.0], [0.0], [0.0]]), np.zeros((4, 1000))]) y = np.array([0.0, 1.0, 0.0, 1.0]) for name, TreeEstimator in CLF_TREES.items(): est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 1 assert_array_equal(est.predict_proba(X), np.full((4, 2), 0.5)) for name, TreeEstimator in REG_TREES.items(): est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 1 assert_array_equal(est.predict(X), np.full((4,), 0.5)) def test_big_input(): # Test if the warning for too large inputs is appropriate. X = np.repeat(10**40.0, 4).astype(np.float64).reshape(-1, 1) clf = DecisionTreeClassifier() with pytest.raises(ValueError, match="float32"): clf.fit(X, [0, 1, 0, 1]) def test_realloc(): from sklearn.tree._utils import _realloc_test with pytest.raises(MemoryError): _realloc_test() def test_huge_allocations(): n_bits = 8 * struct.calcsize("P") X = np.random.randn(10, 2) y = np.random.randint(0, 2, 10) # Sanity check: we cannot request more memory than the size of the address # space. Currently raises OverflowError. huge = 2 ** (n_bits + 1) clf = DecisionTreeClassifier(splitter="best", max_leaf_nodes=huge) with pytest.raises(Exception): clf.fit(X, y) # Non-regression test: MemoryError used to be dropped by Cython # because of missing "except *". huge = 2 ** (n_bits - 1) - 1 clf = DecisionTreeClassifier(splitter="best", max_leaf_nodes=huge) with pytest.raises(MemoryError): clf.fit(X, y) def check_sparse_input(tree, dataset, max_depth=None): TreeEstimator = ALL_TREES[tree] X = DATASETS[dataset]["X"] y = DATASETS[dataset]["y"] # Gain testing time if dataset in ["digits", "diabetes"]: n_samples = X.shape[0] // 5 X = X[:n_samples] y = y[:n_samples] for sparse_container in COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS: X_sparse = sparse_container(X) # Check the default (depth first search) d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y) s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y) assert_tree_equal( d.tree_, s.tree_, "{0} with dense and sparse format gave different trees".format(tree), ) y_pred = d.predict(X) if tree in CLF_TREES: y_proba = d.predict_proba(X) y_log_proba = d.predict_log_proba(X) for sparse_container_test in COO_CONTAINERS + CSR_CONTAINERS + CSC_CONTAINERS: X_sparse_test = sparse_container_test(X_sparse, dtype=np.float32) assert_array_almost_equal(s.predict(X_sparse_test), y_pred) if tree in CLF_TREES: assert_array_almost_equal(s.predict_proba(X_sparse_test), y_proba) assert_array_almost_equal( s.predict_log_proba(X_sparse_test), y_log_proba ) @pytest.mark.parametrize("tree_type", SPARSE_TREES) @pytest.mark.parametrize( "dataset", ( "clf_small", "toy", "digits", "multilabel", "sparse-pos", "sparse-neg", "sparse-mix", "zeros", ), ) def test_sparse_input(tree_type, dataset): max_depth = 3 if dataset == "digits" else None check_sparse_input(tree_type, dataset, max_depth) @pytest.mark.parametrize("tree_type", sorted(set(SPARSE_TREES).intersection(REG_TREES))) @pytest.mark.parametrize("dataset", ["diabetes", "reg_small"]) def test_sparse_input_reg_trees(tree_type, dataset): # Due to numerical instability of MSE and too strict test, we limit the # maximal depth check_sparse_input(tree_type, dataset, 2) @pytest.mark.parametrize("tree_type", SPARSE_TREES) @pytest.mark.parametrize("dataset", ["sparse-pos", "sparse-neg", "sparse-mix", "zeros"]) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_sparse_parameters(tree_type, dataset, csc_container): TreeEstimator = ALL_TREES[tree_type] X = DATASETS[dataset]["X"] X_sparse = csc_container(X) y = DATASETS[dataset]["y"] # Check max_features d = TreeEstimator(random_state=0, max_features=1, max_depth=2).fit(X, y) s = TreeEstimator(random_state=0, max_features=1, max_depth=2).fit(X_sparse, y) assert_tree_equal( d.tree_, s.tree_, "{0} with dense and sparse format gave different trees".format(tree_type), ) assert_array_almost_equal(s.predict(X), d.predict(X)) # Check min_samples_split d = TreeEstimator(random_state=0, max_features=1, min_samples_split=10).fit(X, y) s = TreeEstimator(random_state=0, max_features=1, min_samples_split=10).fit( X_sparse, y ) assert_tree_equal( d.tree_, s.tree_, "{0} with dense and sparse format gave different trees".format(tree_type), ) assert_array_almost_equal(s.predict(X), d.predict(X)) # Check min_samples_leaf d = TreeEstimator(random_state=0, min_samples_leaf=X_sparse.shape[0] // 2).fit(X, y) s = TreeEstimator(random_state=0, min_samples_leaf=X_sparse.shape[0] // 2).fit( X_sparse, y ) assert_tree_equal( d.tree_, s.tree_, "{0} with dense and sparse format gave different trees".format(tree_type), ) assert_array_almost_equal(s.predict(X), d.predict(X)) # Check best-first search d = TreeEstimator(random_state=0, max_leaf_nodes=3).fit(X, y) s = TreeEstimator(random_state=0, max_leaf_nodes=3).fit(X_sparse, y) assert_tree_equal( d.tree_, s.tree_, "{0} with dense and sparse format gave different trees".format(tree_type), ) assert_array_almost_equal(s.predict(X), d.predict(X)) @pytest.mark.parametrize( "tree_type, criterion", list(product([tree for tree in SPARSE_TREES if tree in REG_TREES], REG_CRITERIONS)) + list( product([tree for tree in SPARSE_TREES if tree in CLF_TREES], CLF_CRITERIONS) ), ) @pytest.mark.parametrize("dataset", ["sparse-pos", "sparse-neg", "sparse-mix", "zeros"]) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_sparse_criteria(tree_type, dataset, csc_container, criterion): TreeEstimator = ALL_TREES[tree_type] X = DATASETS[dataset]["X"] X_sparse = csc_container(X) y = DATASETS[dataset]["y"] d = TreeEstimator(random_state=0, max_depth=3, criterion=criterion).fit(X, y) s = TreeEstimator(random_state=0, max_depth=3, criterion=criterion).fit(X_sparse, y) assert_tree_equal( d.tree_, s.tree_, "{0} with dense and sparse format gave different trees".format(tree_type), ) assert_array_almost_equal(s.predict(X), d.predict(X)) @pytest.mark.parametrize("tree_type", SPARSE_TREES) @pytest.mark.parametrize( "csc_container,csr_container", zip(CSC_CONTAINERS, CSR_CONTAINERS) ) def test_explicit_sparse_zeros(tree_type, csc_container, csr_container): TreeEstimator = ALL_TREES[tree_type] max_depth = 3 n_features = 10 # n_samples set n_feature to ease construction of a simultaneous # construction of a csr and csc matrix n_samples = n_features samples = np.arange(n_samples) # Generate X, y random_state = check_random_state(0) indices = [] data = [] offset = 0 indptr = [offset] for i in range(n_features): n_nonzero_i = random_state.binomial(n_samples, 0.5) indices_i = random_state.permutation(samples)[:n_nonzero_i] indices.append(indices_i) data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i,)) - 1 data.append(data_i) offset += n_nonzero_i indptr.append(offset) indices = np.concatenate(indices).astype(np.int32) indptr = np.array(indptr, dtype=np.int32) data = np.array(np.concatenate(data), dtype=np.float32) X_sparse = csc_container((data, indices, indptr), shape=(n_samples, n_features)) X = X_sparse.toarray() X_sparse_test = csr_container( (data, indices, indptr), shape=(n_samples, n_features) ) X_test = X_sparse_test.toarray() y = random_state.randint(0, 3, size=(n_samples,)) # Ensure that X_sparse_test owns its data, indices and indptr array X_sparse_test = X_sparse_test.copy() # Ensure that we have explicit zeros assert (X_sparse.data == 0.0).sum() > 0 assert (X_sparse_test.data == 0.0).sum() > 0 # Perform the comparison d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y) s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y) assert_tree_equal( d.tree_, s.tree_, "{0} with dense and sparse format gave different trees".format(tree), ) Xs = (X_test, X_sparse_test) for X1, X2 in product(Xs, Xs): assert_array_almost_equal(s.tree_.apply(X1), d.tree_.apply(X2)) assert_array_almost_equal(s.apply(X1), d.apply(X2)) assert_array_almost_equal(s.apply(X1), s.tree_.apply(X1)) assert_array_almost_equal( s.tree_.decision_path(X1).toarray(), d.tree_.decision_path(X2).toarray() ) assert_array_almost_equal( s.decision_path(X1).toarray(), d.decision_path(X2).toarray() ) assert_array_almost_equal( s.decision_path(X1).toarray(), s.tree_.decision_path(X1).toarray() ) assert_array_almost_equal(s.predict(X1), d.predict(X2)) if tree in CLF_TREES: assert_array_almost_equal(s.predict_proba(X1), d.predict_proba(X2)) @ignore_warnings def check_raise_error_on_1d_input(name): TreeEstimator = ALL_TREES[name] X = iris.data[:, 0].ravel() X_2d = iris.data[:, 0].reshape((-1, 1)) y = iris.target with pytest.raises(ValueError): TreeEstimator(random_state=0).fit(X, y) est = TreeEstimator(random_state=0) est.fit(X_2d, y) with pytest.raises(ValueError): est.predict([X]) @pytest.mark.parametrize("name", ALL_TREES) def test_1d_input(name): with ignore_warnings(): check_raise_error_on_1d_input(name) @pytest.mark.parametrize("name", ALL_TREES) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS) def test_min_weight_leaf_split_level(name, sparse_container): TreeEstimator = ALL_TREES[name] X = np.array([[0], [0], [0], [0], [1]]) y = [0, 0, 0, 0, 1] sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] if sparse_container is not None: X = sparse_container(X) est = TreeEstimator(random_state=0) est.fit(X, y, sample_weight=sample_weight) assert est.tree_.max_depth == 1 est = TreeEstimator(random_state=0, min_weight_fraction_leaf=0.4) est.fit(X, y, sample_weight=sample_weight) assert est.tree_.max_depth == 0 @pytest.mark.parametrize("name", ALL_TREES) def test_public_apply_all_trees(name): X_small32 = X_small.astype(tree._tree.DTYPE, copy=False) est = ALL_TREES[name]() est.fit(X_small, y_small) assert_array_equal(est.apply(X_small), est.tree_.apply(X_small32)) @pytest.mark.parametrize("name", SPARSE_TREES) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_public_apply_sparse_trees(name, csr_container): X_small32 = csr_container(X_small.astype(tree._tree.DTYPE, copy=False)) est = ALL_TREES[name]() est.fit(X_small, y_small) assert_array_equal(est.apply(X_small), est.tree_.apply(X_small32)) def test_decision_path_hardcoded(): X = iris.data y = iris.target est = DecisionTreeClassifier(random_state=0, max_depth=1).fit(X, y) node_indicator = est.decision_path(X[:2]).toarray() assert_array_equal(node_indicator, [[1, 1, 0], [1, 0, 1]]) @pytest.mark.parametrize("name", ALL_TREES) def test_decision_path(name): X = iris.data y = iris.target n_samples = X.shape[0] TreeEstimator = ALL_TREES[name] est = TreeEstimator(random_state=0, max_depth=2) est.fit(X, y) node_indicator_csr = est.decision_path(X) node_indicator = node_indicator_csr.toarray() assert node_indicator.shape == (n_samples, est.tree_.node_count) # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample all_leaves = est.tree_.children_left == TREE_LEAF assert_array_almost_equal( np.dot(node_indicator, all_leaves), np.ones(shape=n_samples) ) # Ensure max depth is consistent with sum of indicator max_depth = node_indicator.sum(axis=1).max() assert est.tree_.max_depth <= max_depth @pytest.mark.parametrize("name", ALL_TREES) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_no_sparse_y_support(name, csr_container): # Currently we don't support sparse y X, y = X_multilabel, csr_container(y_multilabel) TreeEstimator = ALL_TREES[name] with pytest.raises(TypeError): TreeEstimator(random_state=0).fit(X, y) def test_mae(): """Check MAE criterion produces correct results on small toy dataset: ------------------ | X | y | weight | ------------------ | 3 | 3 | 0.1 | | 5 | 3 | 0.3 | | 8 | 4 | 1.0 | | 3 | 6 | 0.6 | | 5 | 7 | 0.3 | ------------------ |sum wt:| 2.3 | ------------------ Because we are dealing with sample weights, we cannot find the median by simply choosing/averaging the centre value(s), instead we consider the median where 50% of the cumulative weight is found (in a y sorted data set) . Therefore with regards to this test data, the cumulative weight is >= 50% when y = 4. Therefore: Median = 4 For all the samples, we can get the total error by summing: Absolute(Median - y) * weight I.e., total error = (Absolute(4 - 3) * 0.1) + (Absolute(4 - 3) * 0.3) + (Absolute(4 - 4) * 1.0) + (Absolute(4 - 6) * 0.6) + (Absolute(4 - 7) * 0.3) = 2.5 Impurity = Total error / total weight = 2.5 / 2.3 = 1.08695652173913 ------------------ From this root node, the next best split is between X values of 3 and 5. Thus, we have left and right child nodes: LEFT RIGHT ------------------ ------------------ | X | y | weight | | X | y | weight | ------------------ ------------------ | 3 | 3 | 0.1 | | 5 | 3 | 0.3 | | 3 | 6 | 0.6 | | 8 | 4 | 1.0 | ------------------ | 5 | 7 | 0.3 | |sum wt:| 0.7 | ------------------ ------------------ |sum wt:| 1.6 | ------------------ Impurity is found in the same way: Left node Median = 6 Total error = (Absolute(6 - 3) * 0.1) + (Absolute(6 - 6) * 0.6) = 0.3 Left Impurity = Total error / total weight = 0.3 / 0.7 = 0.428571428571429 ------------------- Likewise for Right node: Right node Median = 4 Total error = (Absolute(4 - 3) * 0.3) + (Absolute(4 - 4) * 1.0) + (Absolute(4 - 7) * 0.3) = 1.2 Right Impurity = Total error / total weight = 1.2 / 1.6 = 0.75 ------ """ dt_mae = DecisionTreeRegressor( random_state=0, criterion="absolute_error", max_leaf_nodes=2 ) # Test MAE where sample weights are non-uniform (as illustrated above): dt_mae.fit( X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3], sample_weight=[0.6, 0.3, 0.1, 1.0, 0.3], ) assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6]) assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0]) # Test MAE where all sample weights are uniform: dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3], sample_weight=np.ones(5)) assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) # Test MAE where a `sample_weight` is not explicitly provided. # This is equivalent to providing uniform sample weights, though # the internal logic is different: dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3]) assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0]) assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) def test_criterion_copy(): # Let's check whether copy of our criterion has the same type # and properties as original n_outputs = 3 n_classes = np.arange(3, dtype=np.intp) n_samples = 100 def _pickle_copy(obj): return pickle.loads(pickle.dumps(obj)) for copy_func in [copy.copy, copy.deepcopy, _pickle_copy]: for _, typename in CRITERIA_CLF.items(): criteria = typename(n_outputs, n_classes) result = copy_func(criteria).__reduce__() typename_, (n_outputs_, n_classes_), _ = result assert typename == typename_ assert n_outputs == n_outputs_ assert_array_equal(n_classes, n_classes_) for _, typename in CRITERIA_REG.items(): criteria = typename(n_outputs, n_samples) result = copy_func(criteria).__reduce__() typename_, (n_outputs_, n_samples_), _ = result assert typename == typename_ assert n_outputs == n_outputs_ assert n_samples == n_samples_ @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS) def test_empty_leaf_infinite_threshold(sparse_container): # try to make empty leaf by using near infinite value. data = np.random.RandomState(0).randn(100, 11) * 2e38 data = np.nan_to_num(data.astype("float32")) X = data[:, :-1] if sparse_container is not None: X = sparse_container(X) y = data[:, -1] tree = DecisionTreeRegressor(random_state=0).fit(X, y) terminal_regions = tree.apply(X) left_leaf = set(np.where(tree.tree_.children_left == TREE_LEAF)[0]) empty_leaf = left_leaf.difference(terminal_regions) infinite_threshold = np.where(~np.isfinite(tree.tree_.threshold))[0] assert len(infinite_threshold) == 0 assert len(empty_leaf) == 0 @pytest.mark.parametrize( "dataset", sorted(set(DATASETS.keys()) - {"reg_small", "diabetes"}) ) @pytest.mark.parametrize("tree_cls", [DecisionTreeClassifier, ExtraTreeClassifier]) def test_prune_tree_classifier_are_subtrees(dataset, tree_cls): dataset = DATASETS[dataset] X, y = dataset["X"], dataset["y"] est = tree_cls(max_leaf_nodes=20, random_state=0) info = est.cost_complexity_pruning_path(X, y) pruning_path = info.ccp_alphas impurities = info.impurities assert np.all(np.diff(pruning_path) >= 0) assert np.all(np.diff(impurities) >= 0) assert_pruning_creates_subtree(tree_cls, X, y, pruning_path) @pytest.mark.parametrize("dataset", DATASETS.keys()) @pytest.mark.parametrize("tree_cls", [DecisionTreeRegressor, ExtraTreeRegressor]) def test_prune_tree_regression_are_subtrees(dataset, tree_cls): dataset = DATASETS[dataset] X, y = dataset["X"], dataset["y"] est = tree_cls(max_leaf_nodes=20, random_state=0) info = est.cost_complexity_pruning_path(X, y) pruning_path = info.ccp_alphas impurities = info.impurities assert np.all(np.diff(pruning_path) >= 0) assert np.all(np.diff(impurities) >= 0) assert_pruning_creates_subtree(tree_cls, X, y, pruning_path) def test_prune_single_node_tree(): # single node tree clf1 = DecisionTreeClassifier(random_state=0) clf1.fit([[0], [1]], [0, 0]) # pruned single node tree clf2 = DecisionTreeClassifier(random_state=0, ccp_alpha=10) clf2.fit([[0], [1]], [0, 0]) assert_is_subtree(clf1.tree_, clf2.tree_) def assert_pruning_creates_subtree(estimator_cls, X, y, pruning_path): # generate trees with increasing alphas estimators = [] for ccp_alpha in pruning_path: est = estimator_cls(max_leaf_nodes=20, ccp_alpha=ccp_alpha, random_state=0).fit( X, y ) estimators.append(est) # A pruned tree must be a subtree of the previous tree (which had a # smaller ccp_alpha) for prev_est, next_est in zip(estimators, estimators[1:]): assert_is_subtree(prev_est.tree_, next_est.tree_) def assert_is_subtree(tree, subtree): assert tree.node_count >= subtree.node_count assert tree.max_depth >= subtree.max_depth tree_c_left = tree.children_left tree_c_right = tree.children_right subtree_c_left = subtree.children_left subtree_c_right = subtree.children_right stack = [(0, 0)] while stack: tree_node_idx, subtree_node_idx = stack.pop() assert_array_almost_equal( tree.value[tree_node_idx], subtree.value[subtree_node_idx] ) assert_almost_equal( tree.impurity[tree_node_idx], subtree.impurity[subtree_node_idx] ) assert_almost_equal( tree.n_node_samples[tree_node_idx], subtree.n_node_samples[subtree_node_idx] ) assert_almost_equal( tree.weighted_n_node_samples[tree_node_idx], subtree.weighted_n_node_samples[subtree_node_idx], ) if subtree_c_left[subtree_node_idx] == subtree_c_right[subtree_node_idx]: # is a leaf assert_almost_equal(TREE_UNDEFINED, subtree.threshold[subtree_node_idx]) else: # not a leaf assert_almost_equal( tree.threshold[tree_node_idx], subtree.threshold[subtree_node_idx] ) stack.append((tree_c_left[tree_node_idx], subtree_c_left[subtree_node_idx])) stack.append( (tree_c_right[tree_node_idx], subtree_c_right[subtree_node_idx]) ) @pytest.mark.parametrize("name", ALL_TREES) @pytest.mark.parametrize("splitter", ["best", "random"]) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) def test_apply_path_readonly_all_trees(name, splitter, sparse_container): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) if sparse_container is None: X_readonly = create_memmap_backed_data(X_small) else: X_readonly = sparse_container(dataset["X"]) X_readonly.data = np.array(X_readonly.data, dtype=tree._tree.DTYPE) ( X_readonly.data, X_readonly.indices, X_readonly.indptr, ) = create_memmap_backed_data( (X_readonly.data, X_readonly.indices, X_readonly.indptr) ) y_readonly = create_memmap_backed_data(np.array(y_small, dtype=tree._tree.DTYPE)) est = ALL_TREES[name](splitter=splitter) est.fit(X_readonly, y_readonly) assert_array_equal(est.predict(X_readonly), est.predict(X_small)) assert_array_equal( est.decision_path(X_readonly).todense(), est.decision_path(X_small).todense() ) @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse", "poisson"]) @pytest.mark.parametrize("Tree", REG_TREES.values()) def test_balance_property(criterion, Tree): # Test that sum(y_pred)=sum(y_true) on training set. # This works if the mean is predicted (should even be true for each leaf). # MAE predicts the median and is therefore excluded from this test. # Choose a training set with non-negative targets (for poisson) X, y = diabetes.data, diabetes.target reg = Tree(criterion=criterion) reg.fit(X, y) assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) @pytest.mark.parametrize("seed", range(3)) def test_poisson_zero_nodes(seed): # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes. X = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 2], [1, 3]] y = [0, 0, 0, 0, 1, 2, 3, 4] # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can # easily learn that: reg = DecisionTreeRegressor(criterion="squared_error", random_state=seed) reg.fit(X, y) assert np.amin(reg.predict(X)) == 0 # whereas Poisson must predict strictly positive numbers reg = DecisionTreeRegressor(criterion="poisson", random_state=seed) reg.fit(X, y) assert np.all(reg.predict(X) > 0) # Test additional dataset where something could go wrong. n_features = 10 X, y = datasets.make_regression( effective_rank=n_features * 2 // 3, tail_strength=0.6, n_samples=1_000, n_features=n_features, n_informative=n_features * 2 // 3, random_state=seed, ) # some excess zeros y[(-1 < y) & (y < 0)] = 0 # make sure the target is positive y = np.abs(y) reg = DecisionTreeRegressor(criterion="poisson", random_state=seed) reg.fit(X, y) assert np.all(reg.predict(X) > 0) def test_poisson_vs_mse(): # For a Poisson distributed target, Poisson loss should give better results # than squared error measured in Poisson deviance as metric. # We have a similar test, test_poisson(), in # sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py rng = np.random.RandomState(42) n_train, n_test, n_features = 500, 500, 10 X = datasets.make_low_rank_matrix( n_samples=n_train + n_test, n_features=n_features, random_state=rng ) # We create a log-linear Poisson model and downscale coef as it will get # exponentiated. coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) y = rng.poisson(lam=np.exp(X @ coef)) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=n_test, random_state=rng ) # We prevent some overfitting by setting min_samples_split=10. tree_poi = DecisionTreeRegressor( criterion="poisson", min_samples_split=10, random_state=rng ) tree_mse = DecisionTreeRegressor( criterion="squared_error", min_samples_split=10, random_state=rng ) tree_poi.fit(X_train, y_train) tree_mse.fit(X_train, y_train) dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]: metric_poi = mean_poisson_deviance(y, tree_poi.predict(X)) # squared_error might produce non-positive predictions => clip metric_mse = mean_poisson_deviance(y, np.clip(tree_mse.predict(X), 1e-15, None)) metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) # As squared_error might correctly predict 0 in train set, its train # score can be better than Poisson. This is no longer the case for the # test set. if val == "test": assert metric_poi < 0.5 * metric_mse assert metric_poi < 0.75 * metric_dummy @pytest.mark.parametrize("criterion", REG_CRITERIONS) def test_decision_tree_regressor_sample_weight_consistency(criterion): """Test that the impact of sample_weight is consistent.""" tree_params = dict(criterion=criterion) tree = DecisionTreeRegressor(**tree_params, random_state=42) for kind in ["zeros", "ones"]: check_sample_weights_invariance( "DecisionTreeRegressor_" + criterion, tree, kind="zeros" ) rng = np.random.RandomState(0) n_samples, n_features = 10, 5 X = rng.rand(n_samples, n_features) y = np.mean(X, axis=1) + rng.rand(n_samples) # make it positive in order to work also for poisson criterion y += np.min(y) + 0.1 # check that multiplying sample_weight by 2 is equivalent # to repeating corresponding samples twice X2 = np.concatenate([X, X[: n_samples // 2]], axis=0) y2 = np.concatenate([y, y[: n_samples // 2]]) sample_weight_1 = np.ones(len(y)) sample_weight_1[: n_samples // 2] = 2 tree1 = DecisionTreeRegressor(**tree_params).fit( X, y, sample_weight=sample_weight_1 ) tree2 = DecisionTreeRegressor(**tree_params).fit(X2, y2, sample_weight=None) assert tree1.tree_.node_count == tree2.tree_.node_count # Thresholds, tree.tree_.threshold, and values, tree.tree_.value, are not # exactly the same, but on the training set, those differences do not # matter and thus predictions are the same. assert_allclose(tree1.predict(X), tree2.predict(X)) @pytest.mark.parametrize("Tree", [DecisionTreeClassifier, ExtraTreeClassifier]) @pytest.mark.parametrize("n_classes", [2, 4]) def test_criterion_entropy_same_as_log_loss(Tree, n_classes): """Test that criterion=entropy gives same as log_loss.""" n_samples, n_features = 50, 5 X, y = datasets.make_classification( n_classes=n_classes, n_samples=n_samples, n_features=n_features, n_informative=n_features, n_redundant=0, random_state=42, ) tree_log_loss = Tree(criterion="log_loss", random_state=43).fit(X, y) tree_entropy = Tree(criterion="entropy", random_state=43).fit(X, y) assert_tree_equal( tree_log_loss.tree_, tree_entropy.tree_, f"{Tree!r} with criterion 'entropy' and 'log_loss' gave different trees.", ) assert_allclose(tree_log_loss.predict(X), tree_entropy.predict(X)) def test_different_endianness_pickle(): X, y = datasets.make_classification(random_state=0) clf = DecisionTreeClassifier(random_state=0, max_depth=3) clf.fit(X, y) score = clf.score(X, y) def reduce_ndarray(arr): return arr.byteswap().view(arr.dtype.newbyteorder()).__reduce__() def get_pickle_non_native_endianness(): f = io.BytesIO() p = pickle.Pickler(f) p.dispatch_table = copyreg.dispatch_table.copy() p.dispatch_table[np.ndarray] = reduce_ndarray p.dump(clf) f.seek(0) return f new_clf = pickle.load(get_pickle_non_native_endianness()) new_score = new_clf.score(X, y) assert np.isclose(score, new_score) def test_different_endianness_joblib_pickle(): X, y = datasets.make_classification(random_state=0) clf = DecisionTreeClassifier(random_state=0, max_depth=3) clf.fit(X, y) score = clf.score(X, y) class NonNativeEndiannessNumpyPickler(NumpyPickler): def save(self, obj): if isinstance(obj, np.ndarray): obj = obj.byteswap().view(obj.dtype.newbyteorder()) super().save(obj) def get_joblib_pickle_non_native_endianness(): f = io.BytesIO() p = NonNativeEndiannessNumpyPickler(f) p.dump(clf) f.seek(0) return f new_clf = joblib.load(get_joblib_pickle_non_native_endianness()) new_score = new_clf.score(X, y) assert np.isclose(score, new_score) def get_different_bitness_node_ndarray(node_ndarray): new_dtype_for_indexing_fields = np.int64 if _IS_32BIT else np.int32 # field names in Node struct with SIZE_t types (see sklearn/tree/_tree.pxd) indexing_field_names = ["left_child", "right_child", "feature", "n_node_samples"] new_dtype_dict = { name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items() } for name in indexing_field_names: new_dtype_dict[name] = new_dtype_for_indexing_fields new_dtype = np.dtype( {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())} ) return node_ndarray.astype(new_dtype, casting="same_kind") def get_different_alignment_node_ndarray(node_ndarray): new_dtype_dict = { name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items() } offsets = [offset for dtype, offset in node_ndarray.dtype.fields.values()] shifted_offsets = [8 + offset for offset in offsets] new_dtype = np.dtype( { "names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values()), "offsets": shifted_offsets, } ) return node_ndarray.astype(new_dtype, casting="same_kind") def reduce_tree_with_different_bitness(tree): new_dtype = np.int64 if _IS_32BIT else np.int32 tree_cls, (n_features, n_classes, n_outputs), state = tree.__reduce__() new_n_classes = n_classes.astype(new_dtype, casting="same_kind") new_state = state.copy() new_state["nodes"] = get_different_bitness_node_ndarray(new_state["nodes"]) return (tree_cls, (n_features, new_n_classes, n_outputs), new_state) def test_different_bitness_pickle(): X, y = datasets.make_classification(random_state=0) clf = DecisionTreeClassifier(random_state=0, max_depth=3) clf.fit(X, y) score = clf.score(X, y) def pickle_dump_with_different_bitness(): f = io.BytesIO() p = pickle.Pickler(f) p.dispatch_table = copyreg.dispatch_table.copy() p.dispatch_table[CythonTree] = reduce_tree_with_different_bitness p.dump(clf) f.seek(0) return f new_clf = pickle.load(pickle_dump_with_different_bitness()) new_score = new_clf.score(X, y) assert score == pytest.approx(new_score) def test_different_bitness_joblib_pickle(): # Make sure that a platform specific pickle generated on a 64 bit # platform can be converted at pickle load time into an estimator # with Cython code that works with the host's native integer precision # to index nodes in the tree data structure when the host is a 32 bit # platform (and vice versa). X, y = datasets.make_classification(random_state=0) clf = DecisionTreeClassifier(random_state=0, max_depth=3) clf.fit(X, y) score = clf.score(X, y) def joblib_dump_with_different_bitness(): f = io.BytesIO() p = NumpyPickler(f) p.dispatch_table = copyreg.dispatch_table.copy() p.dispatch_table[CythonTree] = reduce_tree_with_different_bitness p.dump(clf) f.seek(0) return f new_clf = joblib.load(joblib_dump_with_different_bitness()) new_score = new_clf.score(X, y) assert score == pytest.approx(new_score) def test_check_n_classes(): expected_dtype = np.dtype(np.int32) if _IS_32BIT else np.dtype(np.int64) allowed_dtypes = [np.dtype(np.int32), np.dtype(np.int64)] allowed_dtypes += [dt.newbyteorder() for dt in allowed_dtypes] n_classes = np.array([0, 1], dtype=expected_dtype) for dt in allowed_dtypes: _check_n_classes(n_classes.astype(dt), expected_dtype) with pytest.raises(ValueError, match="Wrong dimensions.+n_classes"): wrong_dim_n_classes = np.array([[0, 1]], dtype=expected_dtype) _check_n_classes(wrong_dim_n_classes, expected_dtype) with pytest.raises(ValueError, match="n_classes.+incompatible dtype"): wrong_dtype_n_classes = n_classes.astype(np.float64) _check_n_classes(wrong_dtype_n_classes, expected_dtype) def test_check_value_ndarray(): expected_dtype = np.dtype(np.float64) expected_shape = (5, 1, 2) value_ndarray = np.zeros(expected_shape, dtype=expected_dtype) allowed_dtypes = [expected_dtype, expected_dtype.newbyteorder()] for dt in allowed_dtypes: _check_value_ndarray( value_ndarray, expected_dtype=dt, expected_shape=expected_shape ) with pytest.raises(ValueError, match="Wrong shape.+value array"): _check_value_ndarray( value_ndarray, expected_dtype=expected_dtype, expected_shape=(1, 2) ) for problematic_arr in [value_ndarray[:, :, :1], np.asfortranarray(value_ndarray)]: with pytest.raises(ValueError, match="value array.+C-contiguous"): _check_value_ndarray( problematic_arr, expected_dtype=expected_dtype, expected_shape=problematic_arr.shape, ) with pytest.raises(ValueError, match="value array.+incompatible dtype"): _check_value_ndarray( value_ndarray.astype(np.float32), expected_dtype=expected_dtype, expected_shape=expected_shape, ) def test_check_node_ndarray(): expected_dtype = NODE_DTYPE node_ndarray = np.zeros((5,), dtype=expected_dtype) valid_node_ndarrays = [ node_ndarray, get_different_bitness_node_ndarray(node_ndarray), get_different_alignment_node_ndarray(node_ndarray), ] valid_node_ndarrays += [ arr.astype(arr.dtype.newbyteorder()) for arr in valid_node_ndarrays ] for arr in valid_node_ndarrays: _check_node_ndarray(node_ndarray, expected_dtype=expected_dtype) with pytest.raises(ValueError, match="Wrong dimensions.+node array"): problematic_node_ndarray = np.zeros((5, 2), dtype=expected_dtype) _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype) with pytest.raises(ValueError, match="node array.+C-contiguous"): problematic_node_ndarray = node_ndarray[::2] _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype) dtype_dict = {name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items()} # array with wrong 'threshold' field dtype (int64 rather than float64) new_dtype_dict = dtype_dict.copy() new_dtype_dict["threshold"] = np.int64 new_dtype = np.dtype( {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())} ) problematic_node_ndarray = node_ndarray.astype(new_dtype) with pytest.raises(ValueError, match="node array.+incompatible dtype"): _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype) # array with wrong 'left_child' field dtype (float64 rather than int64 or int32) new_dtype_dict = dtype_dict.copy() new_dtype_dict["left_child"] = np.float64 new_dtype = np.dtype( {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())} ) problematic_node_ndarray = node_ndarray.astype(new_dtype) with pytest.raises(ValueError, match="node array.+incompatible dtype"): _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype) @pytest.mark.parametrize( "Splitter", chain(DENSE_SPLITTERS.values(), SPARSE_SPLITTERS.values()) ) def test_splitter_serializable(Splitter): """Check that splitters are serializable.""" rng = np.random.RandomState(42) max_features = 10 n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp) criterion = CRITERIA_CLF["gini"](n_outputs, n_classes) splitter = Splitter(criterion, max_features, 5, 0.5, rng, monotonic_cst=None) splitter_serialize = pickle.dumps(splitter) splitter_back = pickle.loads(splitter_serialize) assert splitter_back.max_features == max_features assert isinstance(splitter_back, Splitter) def test_tree_deserialization_from_read_only_buffer(tmpdir): """Check that Trees can be deserialized with read only buffers. Non-regression test for gh-25584. """ pickle_path = str(tmpdir.join("clf.joblib")) clf = DecisionTreeClassifier(random_state=0) clf.fit(X_small, y_small) joblib.dump(clf, pickle_path) loaded_clf = joblib.load(pickle_path, mmap_mode="r") assert_tree_equal( loaded_clf.tree_, clf.tree_, "The trees of the original and loaded classifiers are not equal.", ) @pytest.mark.parametrize("Tree", ALL_TREES.values()) def test_min_sample_split_1_error(Tree): """Check that an error is raised when min_sample_split=1. non-regression test for issue gh-25481. """ X = np.array([[0, 0], [1, 1]]) y = np.array([0, 1]) # min_samples_split=1.0 is valid Tree(min_samples_split=1.0).fit(X, y) # min_samples_split=1 is invalid tree = Tree(min_samples_split=1) msg = ( r"'min_samples_split' .* must be an int in the range \[2, inf\) " r"or a float in the range \(0.0, 1.0\]" ) with pytest.raises(ValueError, match=msg): tree.fit(X, y) @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"]) def test_missing_values_on_equal_nodes_no_missing(criterion): """Check missing values goes to correct node during predictions""" X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) dtc.fit(X, y) # Goes to right node because it has the most data points y_pred = dtc.predict([[np.nan]]) assert_allclose(y_pred, [np.mean(y[-5:])]) # equal number of elements in both nodes X_equal = X[:-1] y_equal = y[:-1] dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) dtc.fit(X_equal, y_equal) # Goes to right node because the implementation sets: # missing_go_to_left = n_left > n_right, which is False y_pred = dtc.predict([[np.nan]]) assert_allclose(y_pred, [np.mean(y_equal[-4:])]) @pytest.mark.parametrize("criterion", ["entropy", "gini"]) def test_missing_values_best_splitter_three_classes(criterion): """Test when missing values are uniquely present in a class among 3 classes.""" missing_values_class = 0 X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 8, 9, 11, 12]]).T y = np.array([missing_values_class] * 4 + [1] * 4 + [2] * 4) dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion) dtc.fit(X, y) X_test = np.array([[np.nan, 3, 12]]).T y_nan_pred = dtc.predict(X_test) # Missing values necessarily are associated to the observed class. assert_array_equal(y_nan_pred, [missing_values_class, 1, 2]) @pytest.mark.parametrize("criterion", ["entropy", "gini"]) def test_missing_values_best_splitter_to_left(criterion): """Missing values spanning only one class at fit-time must make missing values at predict-time be classified has belonging to this class.""" X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 4, 5]]).T y = np.array([0] * 4 + [1] * 6) dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion) dtc.fit(X, y) X_test = np.array([[np.nan, 5, np.nan]]).T y_pred = dtc.predict(X_test) assert_array_equal(y_pred, [0, 1, 0]) @pytest.mark.parametrize("criterion", ["entropy", "gini"]) def test_missing_values_best_splitter_to_right(criterion): """Missing values and non-missing values sharing one class at fit-time must make missing values at predict-time be classified has belonging to this class.""" X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 4, 5]]).T y = np.array([1] * 4 + [0] * 4 + [1] * 2) dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion) dtc.fit(X, y) X_test = np.array([[np.nan, 1.2, 4.8]]).T y_pred = dtc.predict(X_test) assert_array_equal(y_pred, [1, 0, 1]) @pytest.mark.parametrize("criterion", ["entropy", "gini"]) def test_missing_values_missing_both_classes_has_nan(criterion): """Check behavior of missing value when there is one missing value in each class.""" X = np.array([[1, 2, 3, 5, np.nan, 10, 20, 30, 60, np.nan]]).T y = np.array([0] * 5 + [1] * 5) dtc = DecisionTreeClassifier(random_state=42, max_depth=1, criterion=criterion) dtc.fit(X, y) X_test = np.array([[np.nan, 2.3, 34.2]]).T y_pred = dtc.predict(X_test) # Missing value goes to the class at the right (here 1) because the implementation # searches right first. assert_array_equal(y_pred, [1, 0, 1]) @pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS) @pytest.mark.parametrize( "tree", [ DecisionTreeClassifier(splitter="random"), DecisionTreeRegressor(criterion="absolute_error"), ], ) def test_missing_value_errors(sparse_container, tree): """Check unsupported configurations for missing values.""" X = np.array([[1, 2, 3, 5, np.nan, 10, 20, 30, 60, np.nan]]).T y = np.array([0] * 5 + [1] * 5) if sparse_container is not None: X = sparse_container(X) with pytest.raises(ValueError, match="Input X contains NaN"): tree.fit(X, y) def test_missing_values_poisson(): """Smoke test for poisson regression and missing values.""" X, y = diabetes.data.copy(), diabetes.target # Set some values missing X[::5, 0] = np.nan X[::6, -1] = np.nan reg = DecisionTreeRegressor(criterion="poisson", random_state=42) reg.fit(X, y) y_pred = reg.predict(X) assert (y_pred >= 0.0).all() def make_friedman1_classification(*args, **kwargs): X, y = datasets.make_friedman1(*args, **kwargs) y = y > 14 return X, y @pytest.mark.parametrize( "make_data,Tree", [ (datasets.make_friedman1, DecisionTreeRegressor), (make_friedman1_classification, DecisionTreeClassifier), ], ) @pytest.mark.parametrize("sample_weight_train", [None, "ones"]) def test_missing_values_is_resilience( make_data, Tree, sample_weight_train, global_random_seed ): """Check that trees can deal with missing values have decent performance.""" n_samples, n_features = 5_000, 10 X, y = make_data( n_samples=n_samples, n_features=n_features, random_state=global_random_seed ) X_missing = X.copy() rng = np.random.RandomState(global_random_seed) X_missing[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan X_missing_train, X_missing_test, y_train, y_test = train_test_split( X_missing, y, random_state=global_random_seed ) if sample_weight_train == "ones": sample_weight = np.ones(X_missing_train.shape[0]) else: sample_weight = None native_tree = Tree(max_depth=10, random_state=global_random_seed) native_tree.fit(X_missing_train, y_train, sample_weight=sample_weight) score_native_tree = native_tree.score(X_missing_test, y_test) tree_with_imputer = make_pipeline( SimpleImputer(), Tree(max_depth=10, random_state=global_random_seed) ) tree_with_imputer.fit(X_missing_train, y_train) score_tree_with_imputer = tree_with_imputer.score(X_missing_test, y_test) assert ( score_native_tree > score_tree_with_imputer ), f"{score_native_tree=} should be strictly greater than {score_tree_with_imputer}" def test_missing_value_is_predictive(): """Check the tree learns when only the missing value is predictive.""" rng = np.random.RandomState(0) n_samples = 1000 X = rng.standard_normal(size=(n_samples, 10)) y = rng.randint(0, high=2, size=n_samples) # Create a predictive feature using `y` and with some noise X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05]) y_mask = y.copy().astype(bool) y_mask[X_random_mask] = ~y_mask[X_random_mask] X_predictive = rng.standard_normal(size=n_samples) X_predictive[y_mask] = np.nan X[:, 5] = X_predictive X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) tree = DecisionTreeClassifier(random_state=rng).fit(X_train, y_train) assert tree.score(X_train, y_train) >= 0.85 assert tree.score(X_test, y_test) >= 0.85 @pytest.mark.parametrize( "make_data, Tree", [ (datasets.make_regression, DecisionTreeRegressor), (datasets.make_classification, DecisionTreeClassifier), ], ) def test_sample_weight_non_uniform(make_data, Tree): """Check sample weight is correctly handled with missing values.""" rng = np.random.RandomState(0) n_samples, n_features = 1000, 10 X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng) # Create dataset with missing values X[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan # Zero sample weight is the same as removing the sample sample_weight = np.ones(X.shape[0]) sample_weight[::2] = 0.0 tree_with_sw = Tree(random_state=0) tree_with_sw.fit(X, y, sample_weight=sample_weight) tree_samples_removed = Tree(random_state=0) tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) def test_deterministic_pickle(): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/27268 # Uninitialised memory would lead to the two pickle strings being different. tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) tree2 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) pickle1 = pickle.dumps(tree1) pickle2 = pickle.dumps(tree2) assert pickle1 == pickle2 @pytest.mark.parametrize( "X", [ # missing values will go left for greedy splits np.array([np.nan, 2, np.nan, 4, 5, 6]), np.array([np.nan, np.nan, 3, 4, 5, 6]), # missing values will go right for greedy splits np.array([1, 2, 3, 4, np.nan, np.nan]), np.array([1, 2, 3, np.nan, 6, np.nan]), ], ) @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"]) def test_regression_tree_missing_values_toy(X, criterion): """Check that we properly handle missing values in regression trees using a toy dataset. The regression targeted by this test was that we were not reinitializing the criterion when it comes to the number of missing values. Therefore, the value of the critetion (i.e. MSE) was completely wrong. This test check that the MSE is null when there is a single sample in the leaf. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/28254 https://github.com/scikit-learn/scikit-learn/issues/28316 """ X = X.reshape(-1, 1) y = np.arange(6) tree = DecisionTreeRegressor(criterion=criterion, random_state=0).fit(X, y) tree_ref = clone(tree).fit(y.reshape(-1, 1), y) assert all(tree.tree_.impurity >= 0) # MSE should always be positive # Check the impurity match after the first split assert_allclose(tree.tree_.impurity[:2], tree_ref.tree_.impurity[:2]) # Find the leaves with a single sample where the MSE should be 0 leaves_idx = np.flatnonzero( (tree.tree_.children_left == -1) & (tree.tree_.n_node_samples == 1) ) assert_allclose(tree.tree_.impurity[leaves_idx], 0.0) def test_classification_tree_missing_values_toy(): """Check that we properly handle missing values in clasification trees using a toy dataset. The test is more involved because we use a case where we detected a regression in a random forest. We therefore define the seed and bootstrap indices to detect one of the non-frequent regression. Here, we check that the impurity is null or positive in the leaves. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/28254 """ X, y = datasets.load_iris(return_X_y=True) rng = np.random.RandomState(42) X_missing = X.copy() mask = rng.binomial( n=np.ones(shape=(1, 4), dtype=np.int32), p=X[:, [2]] / 8 ).astype(bool) X_missing[mask] = np.nan X_train, _, y_train, _ = train_test_split(X_missing, y, random_state=13) # fmt: off # no black reformatting for this specific array indices = np.array([ 2, 81, 39, 97, 91, 38, 46, 31, 101, 13, 89, 82, 100, 42, 69, 27, 81, 16, 73, 74, 51, 47, 107, 17, 75, 110, 20, 15, 104, 57, 26, 15, 75, 79, 35, 77, 90, 51, 46, 13, 94, 91, 23, 8, 93, 93, 73, 77, 12, 13, 74, 109, 110, 24, 10, 23, 104, 27, 92, 52, 20, 109, 8, 8, 28, 27, 35, 12, 12, 7, 43, 0, 30, 31, 78, 12, 24, 105, 50, 0, 73, 12, 102, 105, 13, 31, 1, 69, 11, 32, 75, 90, 106, 94, 60, 56, 35, 17, 62, 85, 81, 39, 80, 16, 63, 6, 80, 84, 3, 3, 76, 78 ], dtype=np.int32) # fmt: on tree = DecisionTreeClassifier( max_depth=3, max_features="sqrt", random_state=1857819720 ) tree.fit(X_train[indices], y_train[indices]) assert all(tree.tree_.impurity >= 0) leaves_idx = np.flatnonzero( (tree.tree_.children_left == -1) & (tree.tree_.n_node_samples == 1) ) assert_allclose(tree.tree_.impurity[leaves_idx], 0.0)