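"""Tests for warm starting HistGradientBoostingClassifier and
HistGradientBoostingRegressor."""
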
import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_allclose

import pytest

from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression

# To use this experimental feature, we need to explicitly ask for it:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import check_scoring


X_classification, y_classification = make_classification(random_state=0)
X_regression, y_regression = make_regression(random_state=0)


def _assert_predictor_equal(gb_1, gb_2, X):
    """Assert that two HistGBM instances are identical."""
    # Check identical nodes for each tree
    for (pred_ith_1, pred_ith_2) in zip(gb_1._predictors, gb_2._predictors):
        for (predictor_1, predictor_2) in zip(pred_ith_1, pred_ith_2):
            assert_array_equal(predictor_1.nodes, predictor_2.nodes)

    # Check identical predictions
    assert_allclose(gb_1.predict(X), gb_2.predict(X))


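# Each test below is parametrized to run against both the classifier and the
# regressor, on the corresponding toy dataset generated above.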
@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_max_iter_with_warm_start_validation(GradientBoosting, X, y):
    # Check that a ValueError is raised when the maximum number of iterations
    # is smaller than the number of iterations from the previous fit when warm
    # start is True.

    estimator = GradientBoosting(max_iter=10, early_stopping=False,
                                 warm_start=True)
    estimator.fit(X, y)
    estimator.set_params(max_iter=5)
    err_msg = ('max_iter=5 must be larger than or equal to n_iter_=10 '
               'when warm_start==True')
    with pytest.raises(ValueError, match=err_msg):
        estimator.fit(X, y)


@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_yields_identical_results(GradientBoosting, X, y):
    # Make sure that fitting 50 iterations and then 25 with warm start is
    # equivalent to fitting 75 iterations.

    rng = 42
    gb_warm_start = GradientBoosting(
        n_iter_no_change=100, max_iter=50, random_state=rng, warm_start=True
    )
    gb_warm_start.fit(X, y).set_params(max_iter=75).fit(X, y)

    gb_no_warm_start = GradientBoosting(
        n_iter_no_change=100, max_iter=75, random_state=rng, warm_start=False
    )
    gb_no_warm_start.fit(X, y)

    # Check that both predictors are equal
    _assert_predictor_equal(gb_warm_start, gb_no_warm_start, X)


@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_max_depth(GradientBoosting, X, y):
    # Check that trees of different depths can be fitted in the same ensemble.
    gb = GradientBoosting(max_iter=20, min_samples_leaf=1,
                          warm_start=True, max_depth=2, early_stopping=False)
    gb.fit(X, y)
    gb.set_params(max_iter=30, max_depth=3, n_iter_no_change=110)
    gb.fit(X, y)

    # First 20 trees have max_depth == 2
    for i in range(20):
        assert gb._predictors[i][0].get_max_depth() == 2
    # Last 10 trees have max_depth == 3
    for i in range(1, 11):
        assert gb._predictors[-i][0].get_max_depth() == 3


@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('scoring', (None, 'loss'))
def test_warm_start_early_stopping(GradientBoosting, X, y, scoring):
    # Make sure that early stopping occurs after a small number of iterations
    # when fitting a second time with warm starting.

    n_iter_no_change = 5
    gb = GradientBoosting(
        n_iter_no_change=n_iter_no_change, max_iter=10000, early_stopping=True,
        random_state=42, warm_start=True, tol=1e-3, scoring=scoring,
    )
    gb.fit(X, y)
    n_iter_first_fit = gb.n_iter_
    gb.fit(X, y)
    n_iter_second_fit = gb.n_iter_
    assert 0 < n_iter_second_fit - n_iter_first_fit < n_iter_no_change


@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_equal_n_estimators(GradientBoosting, X, y):
    # Check that warm starting with an unchanged max_iter does nothing.
    gb_1 = GradientBoosting(max_depth=2, early_stopping=False)
    gb_1.fit(X, y)

    gb_2 = clone(gb_1)
    gb_2.set_params(max_iter=gb_1.max_iter, warm_start=True,
                    n_iter_no_change=5)
    gb_2.fit(X, y)

    # Check that both predictors are equal
    _assert_predictor_equal(gb_1, gb_2, X)


@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
def test_warm_start_clear(GradientBoosting, X, y):
    # Check that fit clears the previous state when warm_start is turned off.
    gb_1 = GradientBoosting(n_iter_no_change=5, random_state=42)
    gb_1.fit(X, y)

    gb_2 = GradientBoosting(n_iter_no_change=5, random_state=42,
                            warm_start=True)
    gb_2.fit(X, y)  # inits state
    gb_2.set_params(warm_start=False)
    gb_2.fit(X, y)  # clears old state, so gb_2 should equal gb_1

    # Check that both predictors have the same train_score_ and
    # validation_score_ attributes
    assert_allclose(gb_1.train_score_, gb_2.train_score_)
    assert_allclose(gb_1.validation_score_, gb_2.validation_score_)

    # Check that both predictors are equal
    _assert_predictor_equal(gb_1, gb_2, X)


@pytest.mark.parametrize('GradientBoosting, X, y', [
    (HistGradientBoostingClassifier, X_classification, y_classification),
    (HistGradientBoostingRegressor, X_regression, y_regression)
])
@pytest.mark.parametrize('rng_type', ('none', 'int', 'instance'))
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
    # Make sure the seeds for train/val split and small trainset subsampling
    # are correctly set in a warm start context.
    def _get_rng(rng_type):
        # Helper to avoid consuming rngs
        if rng_type == 'none':
            return None
        elif rng_type == 'int':
            return 42
        else:
            return np.random.RandomState(0)

    random_state = _get_rng(rng_type)
    gb_1 = GradientBoosting(early_stopping=True, max_iter=2,
                            random_state=random_state)
    gb_1.set_params(scoring=check_scoring(gb_1))
    gb_1.fit(X, y)
    random_seed_1_1 = gb_1._random_seed

    gb_1.fit(X, y)
    random_seed_1_2 = gb_1._random_seed  # refit from scratch clears old state

    random_state = _get_rng(rng_type)
    gb_2 = GradientBoosting(early_stopping=True, max_iter=2,
                            random_state=random_state, warm_start=True)
    gb_2.set_params(scoring=check_scoring(gb_2))
    gb_2.fit(X, y)  # inits state
    random_seed_2_1 = gb_2._random_seed
    gb_2.fit(X, y)  # warm start: the previous state and seed are kept
    random_seed_2_2 = gb_2._random_seed

    # Without warm starting, the seeds should be
    # * all different if random state is None
    # * all equal if random state is an integer
    # * different when refitting and equal with a new estimator (because
    #   the random state is mutated)
    if rng_type == 'none':
        assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1
    elif rng_type == 'int':
        assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
    else:
        assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2

    # With warm starting, the seeds must be equal
    assert random_seed_2_1 == random_seed_2_2

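
# Minimal usage sketch (not a test) of the warm-start workflow the tests above
# exercise: fit, raise max_iter, and fit again to grow more trees on top of
# the existing ones. `_warm_start_usage_sketch` is an illustrative helper; its
# leading underscore keeps pytest from collecting it.
def _warm_start_usage_sketch():
    est = HistGradientBoostingRegressor(max_iter=50, warm_start=True,
                                        early_stopping=False, random_state=0)
    est.fit(X_regression, y_regression)  # grow the first 50 trees
    est.set_params(max_iter=75)
    est.fit(X_regression, y_regression)  # add 25 more trees, keep the rest
    return est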