projektAI/venv/Lib/site-packages/sklearn/feature_selection/tests/test_sequential.py
2021-06-06 22:13:05 +02:00

135 lines
5.0 KiB
Python

import pytest
import scipy
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
@pytest.mark.parametrize('n_features_to_select', (0, 5, 0., -1, 1.1))
def test_bad_n_features_to_select(n_features_to_select):
    # Invalid values (0, too many, float 0., negative, float > 1) must all be
    # rejected with a ValueError when fit is called.
    X, y = make_regression(n_features=5)
    selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select)
    with pytest.raises(ValueError, match="must be either None"):
        selector.fit(X, y)
def test_bad_direction():
    # Any direction other than 'forward'/'backward' is rejected at fit time.
    X, y = make_regression(n_features=5)
    selector = SequentialFeatureSelector(LinearRegression(), direction='bad')
    with pytest.raises(ValueError, match="must be either 'forward' or"):
        selector.fit(X, y)
@pytest.mark.parametrize('direction', ('forward', 'backward'))
@pytest.mark.parametrize('n_features_to_select', (1, 5, 9, None))
def test_n_features_to_select(direction, n_features_to_select):
    # The fitted selector must expose exactly the requested number of
    # features through get_support, n_features_to_select_ and transform.
    X, y = make_regression(n_features=10)
    selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select,
        direction=direction, cv=2)
    selector.fit(X, y)
    # None defaults to half of the features: n_features // 2
    expected = 5 if n_features_to_select is None else n_features_to_select
    assert selector.get_support(indices=True).shape[0] == expected
    assert selector.n_features_to_select_ == expected
    assert selector.transform(X).shape[1] == expected
@pytest.mark.parametrize('direction', ('forward', 'backward'))
@pytest.mark.parametrize('n_features_to_select, expected', (
    (.1, 1),
    (1., 10),
    (.5, 5),
    (None, 5),  # just to make sure .5 is equivalent to passing None
))
def test_n_features_to_select_float(direction, n_features_to_select, expected):
    # A float n_features_to_select is interpreted as a fraction of the
    # total number of features.
    X, y = make_regression(n_features=10)
    selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select,
        direction=direction, cv=2)
    selector.fit(X, y)

    assert selector.n_features_to_select_ == expected
@pytest.mark.parametrize('seed', range(10))
@pytest.mark.parametrize('direction', ('forward', 'backward'))
@pytest.mark.parametrize('n_features_to_select, expected_selected_features', [
    (2, [0, 2]),  # f1 is dropped since it has no predictive power
    (1, [2]),  # f2 is more predictive than f0 so it's kept
])
def test_sanity(seed, direction, n_features_to_select,
                expected_selected_features):
    # Basic sanity check: 3 features, only f0 and f2 are correlated with the
    # target, f2 having a stronger correlation than f0. We expect f1 to be
    # dropped, and f2 to always be selected.
    rng = np.random.RandomState(seed)
    n_samples = 100
    X = rng.randn(n_samples, 3)
    # y depends only on columns 0 and 2; column 2 has the larger weight.
    y = 3 * X[:, 0] - 10 * X[:, 2]

    selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select,
        direction=direction, cv=2)
    selector.fit(X, y)
    assert_array_equal(selector.get_support(indices=True),
                       expected_selected_features)
def test_sparse_support():
    # SFS should accept scipy sparse input for both fit and transform.
    X, y = make_regression(n_features=10)
    X_sparse = scipy.sparse.csr_matrix(X)
    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    selector.fit(X_sparse, y)
    selector.transform(X_sparse)
def test_nan_support():
    # NaNs in X are fine as long as the wrapped estimator handles them;
    # an estimator that does not (LinearRegression) must raise at fit time.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X, y = make_regression(n_samples, n_features, random_state=0)
    nan_mask = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool)
    X[nan_mask] = np.nan

    selector = SequentialFeatureSelector(HistGradientBoostingRegressor(), cv=2)
    selector.fit(X, y)
    selector.transform(X)

    with pytest.raises(ValueError, match='Input contains NaN'):
        # LinearRegression does not support nans
        SequentialFeatureSelector(LinearRegression(), cv=2).fit(X, y)
def test_pipeline_support():
    # Make sure that pipelines can be passed into SFS and that SFS can be
    # passed into a pipeline
    n_samples, n_features = 50, 3
    X, y = make_regression(n_samples, n_features, random_state=0)

    # pipeline used as the estimator inside SFS
    inner_pipe = make_pipeline(StandardScaler(), LinearRegression())
    selector = SequentialFeatureSelector(inner_pipe, cv=2)
    selector.fit(X, y)
    selector.transform(X)

    # SFS used as a step of a pipeline
    selector = SequentialFeatureSelector(LinearRegression(), cv=2)
    outer_pipe = make_pipeline(StandardScaler(), selector)
    outer_pipe.fit(X, y)
    outer_pipe.transform(X)