135 lines
5.0 KiB
Python
135 lines
5.0 KiB
Python
|
import pytest
|
||
|
import scipy
|
||
|
import numpy as np
|
||
|
from numpy.testing import assert_array_equal
|
||
|
|
||
|
from sklearn.preprocessing import StandardScaler
|
||
|
from sklearn.pipeline import make_pipeline
|
||
|
from sklearn.feature_selection import SequentialFeatureSelector
|
||
|
from sklearn.datasets import make_regression
|
||
|
from sklearn.linear_model import LinearRegression
|
||
|
from sklearn.experimental import enable_hist_gradient_boosting # noqa
|
||
|
from sklearn.ensemble import HistGradientBoostingRegressor
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('n_features_to_select', (0, 5, 0., -1, 1.1))
def test_bad_n_features_to_select(n_features_to_select):
    # Invalid values (zero, all features, non-positive, fraction > 1) must
    # raise a ValueError at fit time.
    X, y = make_regression(n_features=5)
    selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select)
    with pytest.raises(ValueError, match="must be either None"):
        selector.fit(X, y)
|
||
|
|
||
|
|
||
|
def test_bad_direction():
    # Any direction other than 'forward'/'backward' must raise at fit time.
    X, y = make_regression(n_features=5)
    selector = SequentialFeatureSelector(LinearRegression(), direction='bad')
    with pytest.raises(ValueError, match="must be either 'forward' or"):
        selector.fit(X, y)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('direction', ('forward', 'backward'))
@pytest.mark.parametrize('n_features_to_select', (1, 5, 9, None))
def test_n_features_to_select(direction, n_features_to_select):
    # The fitted selector must report exactly n_features_to_select features
    # through get_support, n_features_to_select_ and transform.
    X, y = make_regression(n_features=10)
    selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select,
        direction=direction, cv=2)
    selector.fit(X, y)

    # None defaults to half of the available features (n_features // 2).
    expected = 5 if n_features_to_select is None else n_features_to_select
    assert selector.get_support(indices=True).shape[0] == expected
    assert selector.n_features_to_select_ == expected
    assert selector.transform(X).shape[1] == expected
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('direction', ('forward', 'backward'))
@pytest.mark.parametrize('n_features_to_select, expected', (
    (.1, 1),
    (1., 10),
    (.5, 5),
    (None, 5),  # just to make sure .5 is equivalent to passing None
))
def test_n_features_to_select_float(direction, n_features_to_select, expected):
    # A float n_features_to_select is interpreted as a fraction of the
    # total number of features.
    X, y = make_regression(n_features=10)
    selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select,
        direction=direction, cv=2)
    selector.fit(X, y)
    assert selector.n_features_to_select_ == expected
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('seed', range(10))
@pytest.mark.parametrize('direction', ('forward', 'backward'))
@pytest.mark.parametrize('n_features_to_select, expected_selected_features', [
    (2, [0, 2]),  # f1 is dropped since it has no predictive power
    (1, [2]),  # f2 is more predictive than f0 so it's kept
])
def test_sanity(seed, direction, n_features_to_select,
                expected_selected_features):
    # Basic sanity check: 3 features, only f0 and f2 are correlated with the
    # target, f2 having a stronger correlation than f0. We expect f1 to be
    # dropped, and f2 to always be selected.
    rng = np.random.RandomState(seed)
    X = rng.randn(100, 3)
    y = 3 * X[:, 0] - 10 * X[:, 2]

    selector = SequentialFeatureSelector(
        LinearRegression(), n_features_to_select=n_features_to_select,
        direction=direction, cv=2)
    selector.fit(X, y)
    assert_array_equal(selector.get_support(indices=True),
                       expected_selected_features)
|
||
|
|
||
|
|
||
|
def test_sparse_support():
    # Make sure sparse data is supported.

    # Seed the generated dataset so any failure is reproducible across runs;
    # the other tests in this module seed their data the same way.
    X, y = make_regression(n_features=10, random_state=0)
    X = scipy.sparse.csr_matrix(X)
    sfs = SequentialFeatureSelector(LinearRegression(), cv=2)
    # fit + transform must both accept a CSR matrix without raising.
    sfs.fit(X, y)
    sfs.transform(X)
|
||
|
|
||
|
|
||
|
def test_nan_support():
    # NaNs are accepted when the wrapped estimator supports them natively,
    # and rejected when it does not.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 10
    X, y = make_regression(n_samples, n_features, random_state=0)
    nan_mask = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool)
    X[nan_mask] = np.nan

    selector = SequentialFeatureSelector(HistGradientBoostingRegressor(),
                                         cv=2)
    selector.fit(X, y)
    selector.transform(X)

    # LinearRegression does not support nans
    with pytest.raises(ValueError, match='Input contains NaN'):
        SequentialFeatureSelector(LinearRegression(), cv=2).fit(X, y)
|
||
|
|
||
|
|
||
|
def test_pipeline_support():
    # Make sure that pipelines can be passed into SFS and that SFS can be
    # passed into a pipeline
    X, y = make_regression(50, 3, random_state=0)

    # pipeline in SFS
    inner_pipe = make_pipeline(StandardScaler(), LinearRegression())
    selector = SequentialFeatureSelector(inner_pipe, cv=2)
    selector.fit(X, y)
    selector.transform(X)

    # SFS in pipeline
    outer_pipe = make_pipeline(
        StandardScaler(),
        SequentialFeatureSelector(LinearRegression(), cv=2))
    outer_pipe.fit(X, y)
    outer_pipe.transform(X)
|