from math import ceil
import pytest
from scipy.stats import norm, randint
import numpy as np
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.model_selection._search_successive_halving import (
_SubsampleMetaSplitter, _top_k, _refit_callable)
class FastClassifier(DummyClassifier):
"""Dummy classifier that accepts parameters a, b, ... z.
    These parameters don't affect the predictions and are useful for fast
grid searching."""
def __init__(self, strategy='stratified', random_state=None,
constant=None, **kwargs):
super().__init__(strategy=strategy, random_state=random_state,
constant=constant)
def get_params(self, deep=False):
params = super().get_params(deep=deep)
for char in range(ord('a'), ord('z') + 1):
params[chr(char)] = 'whatever'
return params
@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV))
@pytest.mark.parametrize(
('aggressive_elimination,'
'max_resources,'
'expected_n_iterations,'
'expected_n_required_iterations,'
'expected_n_possible_iterations,'
'expected_n_remaining_candidates,'
'expected_n_candidates,'
'expected_n_resources,'), [
        # notice how the search repeats the smallest resource value at the
        # beginning; also, the number of candidates evaluated at the last
        # iteration is <= factor
(True, 'limited', 4, 4, 3, 1, [60, 20, 7, 3], [20, 20, 60, 180]),
        # no aggressive elimination: we end up with fewer iterations, and
# the number of candidates at the last iter is > factor, which isn't
# ideal
(False, 'limited', 3, 4, 3, 3, [60, 20, 7], [20, 60, 180]),
        # When the amount of resource isn't limited, aggressive_elimination
        # has no effect. Here the default min_resources='exhaust' will take
        # over.
(True, 'unlimited', 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]),
(False, 'unlimited', 4, 4, 4, 1, [60, 20, 7, 3], [37, 111, 333, 999]),
]
)
def test_aggressive_elimination(
Est, aggressive_elimination, max_resources, expected_n_iterations,
expected_n_required_iterations, expected_n_possible_iterations,
expected_n_remaining_candidates, expected_n_candidates,
expected_n_resources):
# Test the aggressive_elimination parameter.
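    # Rough sketch of the expected n_resources above (based on the documented
    # 'smallest'/'exhaust' heuristics): with the default cv=5 and 2 classes,
    # the smallest budget is 2 * n_classes * n_splits = 20 samples, and each
    # iteration multiplies it by factor=3. With max_resources=180 the search
    # cannot start higher than 20, and aggressive elimination replays the
    # first iteration at that budget, hence [20, 20, 60, 180]; with
    # max_resources=1000, 'exhaust' bumps the start to 1000 // 3**3 = 37,
    # hence [37, 111, 333, 999].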
n_samples = 1000
X, y = make_classification(n_samples=n_samples, random_state=0)
param_grid = {'a': ('l1', 'l2'), 'b': list(range(30))}
base_estimator = FastClassifier()
if max_resources == 'limited':
max_resources = 180
else:
max_resources = n_samples
sh = Est(base_estimator, param_grid,
aggressive_elimination=aggressive_elimination,
max_resources=max_resources, factor=3)
sh.set_params(verbose=True) # just for test coverage
if Est is HalvingRandomSearchCV:
# same number of candidates as with the grid
sh.set_params(n_candidates=2 * 30, min_resources='exhaust')
sh.fit(X, y)
assert sh.n_iterations_ == expected_n_iterations
assert sh.n_required_iterations_ == expected_n_required_iterations
assert sh.n_possible_iterations_ == expected_n_possible_iterations
assert sh.n_resources_ == expected_n_resources
assert sh.n_candidates_ == expected_n_candidates
assert sh.n_remaining_candidates_ == expected_n_remaining_candidates
assert ceil(sh.n_candidates_[-1] / sh.factor) == sh.n_remaining_candidates_
@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV))
@pytest.mark.parametrize(
('min_resources,'
'max_resources,'
'expected_n_iterations,'
'expected_n_possible_iterations,'
'expected_n_resources,'), [
# with enough resources
('smallest', 'auto', 2, 4, [20, 60]),
# with enough resources but min_resources set manually
(50, 'auto', 2, 3, [50, 150]),
# without enough resources, only one iteration can be done
('smallest', 30, 1, 1, [20]),
        # with exhaust: use as many resources as possible at the last iter
('exhaust', 'auto', 2, 2, [333, 999]),
('exhaust', 1000, 2, 2, [333, 999]),
('exhaust', 999, 2, 2, [333, 999]),
('exhaust', 600, 2, 2, [200, 600]),
('exhaust', 599, 2, 2, [199, 597]),
('exhaust', 300, 2, 2, [100, 300]),
('exhaust', 60, 2, 2, [20, 60]),
('exhaust', 50, 1, 1, [20]),
('exhaust', 20, 1, 1, [20]),
]
)
def test_min_max_resources(
Est, min_resources, max_resources, expected_n_iterations,
expected_n_possible_iterations,
expected_n_resources):
# Test the min_resources and max_resources parameters, and how they affect
# the number of resources used at each iteration
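    # Sketch of the 'exhaust' rows above: the starting budget should be
    # max_resources // factor**(n_required_iterations - 1), but never below
    # the 'smallest' budget of 20. With 6 candidates and factor=3 there are
    # 2 required iterations, so max_resources=1000 gives [333, 999],
    # max_resources=600 gives [200, 600], and max_resources=50 still starts
    # (and stops) at 20.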
n_samples = 1000
X, y = make_classification(n_samples=n_samples, random_state=0)
param_grid = {'a': [1, 2], 'b': [1, 2, 3]}
base_estimator = FastClassifier()
sh = Est(base_estimator, param_grid, factor=3, min_resources=min_resources,
max_resources=max_resources)
if Est is HalvingRandomSearchCV:
sh.set_params(n_candidates=6) # same number as with the grid
sh.fit(X, y)
expected_n_required_iterations = 2 # given 6 combinations and factor = 3
assert sh.n_iterations_ == expected_n_iterations
assert sh.n_required_iterations_ == expected_n_required_iterations
assert sh.n_possible_iterations_ == expected_n_possible_iterations
assert sh.n_resources_ == expected_n_resources
if min_resources == 'exhaust':
assert (sh.n_possible_iterations_ == sh.n_iterations_ ==
len(sh.n_resources_))
@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV))
@pytest.mark.parametrize(
'max_resources, n_iterations, n_possible_iterations', [
('auto', 5, 9), # all resources are used
(1024, 5, 9),
(700, 5, 8),
(512, 5, 8),
(511, 5, 7),
(32, 4, 4),
(31, 3, 3),
(16, 3, 3),
(4, 1, 1), # max_resources == min_resources, only one iteration is
# possible
])
def test_n_iterations(Est, max_resources, n_iterations, n_possible_iterations):
# test the number of actual iterations that were run depending on
# max_resources
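    # Sketch of the expected values above: n_possible_iterations is the
    # number of times the budget can grow from min_resources=4 by factor=2
    # without exceeding max_resources (4, 8, ..., 1024 gives 9), and with the
    # default aggressive_elimination=False the search should run
    # min(n_required_iterations_, n_possible_iterations_) iterations.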
n_samples = 1024
X, y = make_classification(n_samples=n_samples, random_state=1)
param_grid = {'a': [1, 2], 'b': list(range(10))}
base_estimator = FastClassifier()
factor = 2
sh = Est(base_estimator, param_grid, cv=2, factor=factor,
max_resources=max_resources, min_resources=4)
if Est is HalvingRandomSearchCV:
sh.set_params(n_candidates=20) # same as for HalvingGridSearchCV
sh.fit(X, y)
assert sh.n_required_iterations_ == 5
assert sh.n_iterations_ == n_iterations
assert sh.n_possible_iterations_ == n_possible_iterations
@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV))
def test_resource_parameter(Est):
# Test the resource parameter
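    # When the resource is an estimator parameter ('c') instead of
    # 'n_samples', the smallest budget should be 1, so with factor=3 and
    # max_resources=10 the iterations should use c=1, 3 and 9.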
n_samples = 1000
X, y = make_classification(n_samples=n_samples, random_state=0)
param_grid = {'a': [1, 2], 'b': list(range(10))}
base_estimator = FastClassifier()
sh = Est(base_estimator, param_grid, cv=2, resource='c',
max_resources=10, factor=3)
sh.fit(X, y)
assert set(sh.n_resources_) == set([1, 3, 9])
for r_i, params, param_c in zip(sh.cv_results_['n_resources'],
sh.cv_results_['params'],
sh.cv_results_['param_c']):
assert r_i == params['c'] == param_c
with pytest.raises(
ValueError,
match='Cannot use resource=1234 which is not supported '):
sh = HalvingGridSearchCV(base_estimator, param_grid, cv=2,
resource='1234', max_resources=10)
sh.fit(X, y)
with pytest.raises(
ValueError,
match='Cannot use parameter c as the resource since it is part '
'of the searched parameters.'):
param_grid = {'a': [1, 2], 'b': [1, 2], 'c': [1, 3]}
sh = HalvingGridSearchCV(base_estimator, param_grid, cv=2,
resource='c', max_resources=10)
sh.fit(X, y)
@pytest.mark.parametrize(
'max_resources, n_candidates, expected_n_candidates', [
(512, 'exhaust', 128), # generate exactly as much as needed
(32, 'exhaust', 8),
(32, 8, 8),
(32, 7, 7), # ask for less than what we could
(32, 9, 9), # ask for more than 'reasonable'
])
def test_random_search(max_resources, n_candidates, expected_n_candidates):
# Test random search and make sure the number of generated candidates is
# as expected
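    # For n_candidates='exhaust', the search should generate roughly
    # max_resources // min_resources candidates so that the last iteration
    # can use the full budget: 512 // 4 = 128 and 32 // 4 = 8 above.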
n_samples = 1024
X, y = make_classification(n_samples=n_samples, random_state=0)
param_grid = {'a': norm, 'b': norm}
base_estimator = FastClassifier()
sh = HalvingRandomSearchCV(base_estimator, param_grid,
n_candidates=n_candidates, cv=2,
max_resources=max_resources, factor=2,
min_resources=4)
sh.fit(X, y)
assert sh.n_candidates_[0] == expected_n_candidates
if n_candidates == 'exhaust':
        # Make sure 'exhaust' makes the last iteration use as many resources as
# we can
assert sh.n_resources_[-1] == max_resources
@pytest.mark.parametrize('param_distributions, expected_n_candidates', [
({'a': [1, 2]}, 2), # all lists, sample less than n_candidates
({'a': randint(1, 3)}, 10), # not all list, respect n_candidates
])
def test_random_search_discrete_distributions(param_distributions,
expected_n_candidates):
    # Make sure random search samples the appropriate number of candidates
    # when we ask for more than what's possible. How many parameters are
    # sampled depends on whether the distributions are 'all lists' or not
    # (see ParameterSampler for details). This is somewhat redundant with the
    # checks in ParameterSampler, but interaction bugs were discovered during
    # development of SH.
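    # When every distribution is a list, ParameterSampler samples from the
    # grid without replacement, so at most len(grid) candidates can be drawn
    # (2 above); with at least one true distribution it should draw exactly
    # n_candidates.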
n_samples = 1024
X, y = make_classification(n_samples=n_samples, random_state=0)
base_estimator = FastClassifier()
sh = HalvingRandomSearchCV(base_estimator, param_distributions,
n_candidates=10)
sh.fit(X, y)
assert sh.n_candidates_[0] == expected_n_candidates
@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV))
@pytest.mark.parametrize('params, expected_error_message', [
({'scoring': {'accuracy', 'accuracy'}},
'Multimetric scoring is not supported'),
({'resource': 'not_a_parameter'},
'Cannot use resource=not_a_parameter which is not supported'),
({'resource': 'a', 'max_resources': 100},
'Cannot use parameter a as the resource since it is part of'),
({'max_resources': 'not_auto'},
'max_resources must be either'),
({'max_resources': 100.5},
'max_resources must be either'),
({'max_resources': -10},
'max_resources must be either'),
({'min_resources': 'bad str'},
'min_resources must be either'),
({'min_resources': 0.5},
'min_resources must be either'),
({'min_resources': -10},
'min_resources must be either'),
({'max_resources': 'auto', 'resource': 'b'},
"max_resources can only be 'auto' if resource='n_samples'"),
({'min_resources': 15, 'max_resources': 14},
"min_resources_=15 is greater than max_resources_=14"),
({'cv': KFold(shuffle=True)}, "must yield consistent folds"),
({'cv': ShuffleSplit()}, "must yield consistent folds"),
])
def test_input_errors(Est, params, expected_error_message):
base_estimator = FastClassifier()
param_grid = {'a': [1]}
X, y = make_classification(100)
sh = Est(base_estimator, param_grid, **params)
with pytest.raises(ValueError, match=expected_error_message):
sh.fit(X, y)
@pytest.mark.parametrize('params, expected_error_message', [
({'n_candidates': 'exhaust', 'min_resources': 'exhaust'},
"cannot be both set to 'exhaust'"),
({'n_candidates': 'bad'}, "either 'exhaust' or a positive integer"),
({'n_candidates': 0}, "either 'exhaust' or a positive integer"),
])
def test_input_errors_randomized(params, expected_error_message):
# tests specific to HalvingRandomSearchCV
base_estimator = FastClassifier()
param_grid = {'a': [1]}
X, y = make_classification(100)
sh = HalvingRandomSearchCV(base_estimator, param_grid, **params)
with pytest.raises(ValueError, match=expected_error_message):
sh.fit(X, y)
@pytest.mark.parametrize(
'fraction, subsample_test, expected_train_size, expected_test_size', [
(.5, True, 40, 10),
(.5, False, 40, 20),
(.2, True, 16, 4),
(.2, False, 16, 20)])
def test_subsample_splitter_shapes(fraction, subsample_test,
expected_train_size, expected_test_size):
    # Make sure splits returned by _SubsampleMetaSplitter are of appropriate
    # size
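    # The base KFold(5) on 100 samples yields 80 train / 20 test samples per
    # split; the meta-splitter should subsample the train fold by `fraction`,
    # and the test fold too only when subsample_test=True, which is where the
    # expected sizes above come from.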
n_samples = 100
X, y = make_classification(n_samples)
cv = _SubsampleMetaSplitter(base_cv=KFold(5), fraction=fraction,
subsample_test=subsample_test,
random_state=None)
for train, test in cv.split(X, y):
assert train.shape[0] == expected_train_size
assert test.shape[0] == expected_test_size
if subsample_test:
assert train.shape[0] + test.shape[0] == int(n_samples * fraction)
else:
assert test.shape[0] == n_samples // cv.base_cv.get_n_splits()
@pytest.mark.parametrize('subsample_test', (True, False))
def test_subsample_splitter_determinism(subsample_test):
# Make sure _SubsampleMetaSplitter is consistent across calls to split():
# - we're OK having training sets differ (they're always sampled with a
# different fraction anyway)
# - when we don't subsample the test set, we want it to be always the same.
# This check is the most important. This is ensured by the determinism
# of the base_cv.
# Note: we could force both train and test splits to be always the same if
# we drew an int seed in _SubsampleMetaSplitter.__init__
n_samples = 100
X, y = make_classification(n_samples)
cv = _SubsampleMetaSplitter(base_cv=KFold(5), fraction=.5,
subsample_test=subsample_test,
random_state=None)
folds_a = list(cv.split(X, y, groups=None))
folds_b = list(cv.split(X, y, groups=None))
for (train_a, test_a), (train_b, test_b) in zip(folds_a, folds_b):
assert not np.all(train_a == train_b)
if subsample_test:
assert not np.all(test_a == test_b)
else:
assert np.all(test_a == test_b)
assert np.all(X[test_a] == X[test_b])
@pytest.mark.parametrize('k, itr, expected', [
(1, 0, ['c']),
(2, 0, ['a', 'c']),
(4, 0, ['d', 'b', 'a', 'c']),
(10, 0, ['d', 'b', 'a', 'c']),
(1, 1, ['e']),
(2, 1, ['f', 'e']),
(10, 1, ['f', 'e']),
(1, 2, ['i']),
(10, 2, ['g', 'h', 'i']),
])
def test_top_k(k, itr, expected):
results = { # this isn't a 'real world' result dict
'iter': [0, 0, 0, 0, 1, 1, 2, 2, 2],
'mean_test_score': [4, 3, 5, 1, 11, 10, 5, 6, 9],
'params': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'],
}
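    # _top_k should return the params of the k best-scoring candidates among
    # those evaluated at iteration `itr`, e.g. for itr=0 and k=2 that is
    # 'a' (score 4) and 'c' (score 5).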
got = _top_k(results, k=k, itr=itr)
assert np.all(got == expected)
def test_refit_callable():
results = { # this isn't a 'real world' result dict
'iter': np.array([0, 0, 0, 0, 1, 1, 2, 2, 2]),
'mean_test_score': np.array([4, 3, 5, 1, 11, 10, 5, 6, 9]),
'params': np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']),
}
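    # _refit_callable should return the index of the best candidate restricted
    # to the *last* iteration: here 'i' (overall index 8, score 9 at iteration
    # 2), even though iteration 1 holds higher scores (11 and 10).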
assert _refit_callable(results) == 8 # index of 'i'
@pytest.mark.parametrize('Est', (HalvingRandomSearchCV, HalvingGridSearchCV))
def test_cv_results(Est):
    # Test that cv_results_ correctly matches the logic of the tournament:
    # in particular, that the candidates continued into each successive
    # iteration are those that scored best in the previous iteration.
pd = pytest.importorskip('pandas')
rng = np.random.RandomState(0)
n_samples = 1000
X, y = make_classification(n_samples=n_samples, random_state=0)
param_grid = {'a': ('l1', 'l2'), 'b': list(range(30))}
base_estimator = FastClassifier()
# generate random scores: we want to avoid ties, which would otherwise
# mess with the ordering and make testing harder
def scorer(est, X, y):
return rng.rand()
sh = Est(base_estimator, param_grid, factor=2, scoring=scorer)
if Est is HalvingRandomSearchCV:
# same number of candidates as with the grid
sh.set_params(n_candidates=2 * 30, min_resources='exhaust')
sh.fit(X, y)
# non-regression check for
# https://github.com/scikit-learn/scikit-learn/issues/19203
assert isinstance(sh.cv_results_['iter'], np.ndarray)
assert isinstance(sh.cv_results_['n_resources'], np.ndarray)
cv_results_df = pd.DataFrame(sh.cv_results_)
# just make sure we don't have ties
assert len(cv_results_df['mean_test_score'].unique()) == len(cv_results_df)
cv_results_df['params_str'] = cv_results_df['params'].apply(str)
table = cv_results_df.pivot(index='params_str', columns='iter',
values='mean_test_score')
# table looks like something like this:
# iter 0 1 2 3 4 5
# params_str
# {'a': 'l2', 'b': 23} 0.75 NaN NaN NaN NaN NaN
# {'a': 'l1', 'b': 30} 0.90 0.875 NaN NaN NaN NaN
# {'a': 'l1', 'b': 0} 0.75 NaN NaN NaN NaN NaN
# {'a': 'l2', 'b': 3} 0.85 0.925 0.9125 0.90625 NaN NaN
# {'a': 'l1', 'b': 5} 0.80 NaN NaN NaN NaN NaN
# ...
# where a NaN indicates that the candidate wasn't evaluated at a given
# iteration, because it wasn't part of the top-K at some previous
# iteration. We here make sure that candidates that aren't in the top-k at
# any given iteration are indeed not evaluated at the subsequent
# iterations.
nan_mask = pd.isna(table)
n_iter = sh.n_iterations_
for it in range(n_iter - 1):
already_discarded_mask = nan_mask[it]
# make sure that if a candidate is already discarded, we don't evaluate
# it later
assert (already_discarded_mask & nan_mask[it + 1] ==
already_discarded_mask).all()
        # make sure that the number of discarded candidates is correct
discarded_now_mask = ~already_discarded_mask & nan_mask[it + 1]
kept_mask = ~already_discarded_mask & ~discarded_now_mask
assert kept_mask.sum() == sh.n_candidates_[it + 1]
# make sure that all discarded candidates have a lower score than the
# kept candidates
discarded_max_score = table[it].where(discarded_now_mask).max()
kept_min_score = table[it].where(kept_mask).min()
assert discarded_max_score < kept_min_score
# We now make sure that the best candidate is chosen only from the last
# iteration.
# We also make sure this is true even if there were higher scores in
# earlier rounds (this isn't generally the case, but worth ensuring it's
# possible).
last_iter = cv_results_df['iter'].max()
idx_best_last_iter = (
cv_results_df[cv_results_df['iter'] == last_iter]
['mean_test_score'].idxmax()
)
idx_best_all_iters = cv_results_df['mean_test_score'].idxmax()
assert sh.best_params_ == cv_results_df.iloc[idx_best_last_iter]['params']
assert (cv_results_df.iloc[idx_best_last_iter]['mean_test_score'] <
cv_results_df.iloc[idx_best_all_iters]['mean_test_score'])
assert (cv_results_df.iloc[idx_best_last_iter]['params'] !=
cv_results_df.iloc[idx_best_all_iters]['params'])
@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV))
def test_base_estimator_inputs(Est):
# make sure that the base estimators are passed the correct parameters and
# number of samples at each iteration.
pd = pytest.importorskip('pandas')
passed_n_samples_fit = []
passed_n_samples_predict = []
passed_params = []
class FastClassifierBookKeeping(FastClassifier):
def fit(self, X, y):
passed_n_samples_fit.append(X.shape[0])
return super().fit(X, y)
def predict(self, X):
passed_n_samples_predict.append(X.shape[0])
return super().predict(X)
def set_params(self, **params):
passed_params.append(params)
return super().set_params(**params)
n_samples = 1024
n_splits = 2
X, y = make_classification(n_samples=n_samples, random_state=0)
param_grid = {'a': ('l1', 'l2'), 'b': list(range(30))}
base_estimator = FastClassifierBookKeeping()
sh = Est(base_estimator, param_grid, factor=2, cv=n_splits,
return_train_score=False, refit=False)
if Est is HalvingRandomSearchCV:
# same number of candidates as with the grid
sh.set_params(n_candidates=2 * 30, min_resources='exhaust')
sh.fit(X, y)
assert len(passed_n_samples_fit) == len(passed_n_samples_predict)
passed_n_samples = [x + y for (x, y) in zip(passed_n_samples_fit,
passed_n_samples_predict)]
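    # For each (candidate, split), fit and predict sample counts should sum to
    # that iteration's resource budget, since train and test folds are
    # subsampled with the same fraction (see test_subsample_splitter_shapes).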
# Lists are of length n_splits * n_iter * n_candidates_at_i.
# Each chunk of size n_splits corresponds to the n_splits folds for the
# same candidate at the same iteration, so they contain equal values. We
    # subsample such that the lists are of length n_iter * n_candidates_at_i
passed_n_samples = passed_n_samples[::n_splits]
passed_params = passed_params[::n_splits]
cv_results_df = pd.DataFrame(sh.cv_results_)
assert len(passed_params) == len(passed_n_samples) == len(cv_results_df)
uniques, counts = np.unique(passed_n_samples, return_counts=True)
assert (sh.n_resources_ == uniques).all()
assert (sh.n_candidates_ == counts).all()
assert (cv_results_df['params'] == passed_params).all()
assert (cv_results_df['n_resources'] == passed_n_samples).all()
@pytest.mark.parametrize('Est', (HalvingGridSearchCV, HalvingRandomSearchCV))
def test_groups_support(Est):
    # Check that the ValueError raised when groups is None propagates to
    # HalvingGridSearchCV and HalvingRandomSearchCV,
    # and that groups is correctly passed to the cv object.
rng = np.random.RandomState(0)
X, y = make_classification(n_samples=50, n_classes=2, random_state=0)
groups = rng.randint(0, 3, 50)
clf = LinearSVC(random_state=0)
grid = {'C': [1]}
group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2),
GroupKFold(n_splits=3), GroupShuffleSplit(random_state=0)]
error_msg = "The 'groups' parameter should not be None."
for cv in group_cvs:
gs = Est(clf, grid, cv=cv)
with pytest.raises(ValueError, match=error_msg):
gs.fit(X, y)
gs.fit(X, y, groups=groups)
non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit(random_state=0)]
for cv in non_group_cvs:
gs = Est(clf, grid, cv=cv)
# Should not raise an error
gs.fit(X, y)