541 lines
20 KiB
Python
541 lines
20 KiB
Python
import numpy as np
|
|
import pytest
|
|
from joblib import parallel_backend
|
|
from numpy.testing import assert_allclose
|
|
|
|
from sklearn.compose import ColumnTransformer
|
|
from sklearn.datasets import (
|
|
load_diabetes,
|
|
load_iris,
|
|
make_classification,
|
|
make_regression,
|
|
)
|
|
from sklearn.dummy import DummyClassifier
|
|
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
|
from sklearn.impute import SimpleImputer
|
|
from sklearn.inspection import permutation_importance
|
|
from sklearn.linear_model import LinearRegression, LogisticRegression
|
|
from sklearn.metrics import (
|
|
get_scorer,
|
|
mean_squared_error,
|
|
r2_score,
|
|
)
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.pipeline import make_pipeline
|
|
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, scale
|
|
from sklearn.utils._testing import _convert_container
|
|
|
|
|
|
@pytest.mark.parametrize("n_jobs", [1, 2])
|
|
@pytest.mark.parametrize("max_samples", [0.5, 1.0])
|
|
@pytest.mark.parametrize("sample_weight", [None, "ones"])
|
|
def test_permutation_importance_correlated_feature_regression(
|
|
n_jobs, max_samples, sample_weight
|
|
):
|
|
# Make sure that feature highly correlated to the target have a higher
|
|
# importance
|
|
rng = np.random.RandomState(42)
|
|
n_repeats = 5
|
|
|
|
X, y = load_diabetes(return_X_y=True)
|
|
y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)
|
|
|
|
X = np.hstack([X, y_with_little_noise])
|
|
|
|
weights = np.ones_like(y) if sample_weight == "ones" else sample_weight
|
|
clf = RandomForestRegressor(n_estimators=10, random_state=42)
|
|
clf.fit(X, y)
|
|
|
|
result = permutation_importance(
|
|
clf,
|
|
X,
|
|
y,
|
|
sample_weight=weights,
|
|
n_repeats=n_repeats,
|
|
random_state=rng,
|
|
n_jobs=n_jobs,
|
|
max_samples=max_samples,
|
|
)
|
|
|
|
assert result.importances.shape == (X.shape[1], n_repeats)
|
|
|
|
# the correlated feature with y was added as the last column and should
|
|
# have the highest importance
|
|
assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
|
|
|
|
|
|
@pytest.mark.parametrize("n_jobs", [1, 2])
|
|
@pytest.mark.parametrize("max_samples", [0.5, 1.0])
|
|
def test_permutation_importance_correlated_feature_regression_pandas(
|
|
n_jobs, max_samples
|
|
):
|
|
pd = pytest.importorskip("pandas")
|
|
|
|
# Make sure that feature highly correlated to the target have a higher
|
|
# importance
|
|
rng = np.random.RandomState(42)
|
|
n_repeats = 5
|
|
|
|
dataset = load_iris()
|
|
X, y = dataset.data, dataset.target
|
|
y_with_little_noise = (y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1)
|
|
|
|
# Adds feature correlated with y as the last column
|
|
X = pd.DataFrame(X, columns=dataset.feature_names)
|
|
X["correlated_feature"] = y_with_little_noise
|
|
|
|
clf = RandomForestClassifier(n_estimators=10, random_state=42)
|
|
clf.fit(X, y)
|
|
|
|
result = permutation_importance(
|
|
clf,
|
|
X,
|
|
y,
|
|
n_repeats=n_repeats,
|
|
random_state=rng,
|
|
n_jobs=n_jobs,
|
|
max_samples=max_samples,
|
|
)
|
|
|
|
assert result.importances.shape == (X.shape[1], n_repeats)
|
|
|
|
# the correlated feature with y was added as the last column and should
|
|
# have the highest importance
|
|
assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
|
|
|
|
|
|
@pytest.mark.parametrize("n_jobs", [1, 2])
|
|
@pytest.mark.parametrize("max_samples", [0.5, 1.0])
|
|
def test_robustness_to_high_cardinality_noisy_feature(n_jobs, max_samples, seed=42):
|
|
# Permutation variable importance should not be affected by the high
|
|
# cardinality bias of traditional feature importances, especially when
|
|
# computed on a held-out test set:
|
|
rng = np.random.RandomState(seed)
|
|
n_repeats = 5
|
|
n_samples = 1000
|
|
n_classes = 5
|
|
n_informative_features = 2
|
|
n_noise_features = 1
|
|
n_features = n_informative_features + n_noise_features
|
|
|
|
# Generate a multiclass classification dataset and a set of informative
|
|
# binary features that can be used to predict some classes of y exactly
|
|
# while leaving some classes unexplained to make the problem harder.
|
|
classes = np.arange(n_classes)
|
|
y = rng.choice(classes, size=n_samples)
|
|
X = np.hstack([(y == c).reshape(-1, 1) for c in classes[:n_informative_features]])
|
|
X = X.astype(np.float32)
|
|
|
|
# Not all target classes are explained by the binary class indicator
|
|
# features:
|
|
assert n_informative_features < n_classes
|
|
|
|
# Add 10 other noisy features with high cardinality (numerical) values
|
|
# that can be used to overfit the training data.
|
|
X = np.concatenate([X, rng.randn(n_samples, n_noise_features)], axis=1)
|
|
assert X.shape == (n_samples, n_features)
|
|
|
|
# Split the dataset to be able to evaluate on a held-out test set. The
|
|
# Test size should be large enough for importance measurements to be
|
|
# stable:
|
|
X_train, X_test, y_train, y_test = train_test_split(
|
|
X, y, test_size=0.5, random_state=rng
|
|
)
|
|
clf = RandomForestClassifier(n_estimators=5, random_state=rng)
|
|
clf.fit(X_train, y_train)
|
|
|
|
# Variable importances computed by impurity decrease on the tree node
|
|
# splits often use the noisy features in splits. This can give misleading
|
|
# impression that high cardinality noisy variables are the most important:
|
|
tree_importances = clf.feature_importances_
|
|
informative_tree_importances = tree_importances[:n_informative_features]
|
|
noisy_tree_importances = tree_importances[n_informative_features:]
|
|
assert informative_tree_importances.max() < noisy_tree_importances.min()
|
|
|
|
# Let's check that permutation-based feature importances do not have this
|
|
# problem.
|
|
r = permutation_importance(
|
|
clf,
|
|
X_test,
|
|
y_test,
|
|
n_repeats=n_repeats,
|
|
random_state=rng,
|
|
n_jobs=n_jobs,
|
|
max_samples=max_samples,
|
|
)
|
|
|
|
assert r.importances.shape == (X.shape[1], n_repeats)
|
|
|
|
# Split the importances between informative and noisy features
|
|
informative_importances = r.importances_mean[:n_informative_features]
|
|
noisy_importances = r.importances_mean[n_informative_features:]
|
|
|
|
# Because we do not have a binary variable explaining each target classes,
|
|
# the RF model will have to use the random variable to make some
|
|
# (overfitting) splits (as max_depth is not set). Therefore the noisy
|
|
# variables will be non-zero but with small values oscillating around
|
|
# zero:
|
|
assert max(np.abs(noisy_importances)) > 1e-7
|
|
assert noisy_importances.max() < 0.05
|
|
|
|
# The binary features correlated with y should have a higher importance
|
|
# than the high cardinality noisy features.
|
|
# The maximum test accuracy is 2 / 5 == 0.4, each informative feature
|
|
# contributing approximately a bit more than 0.2 of accuracy.
|
|
assert informative_importances.min() > 0.15
|
|
|
|
|
|
def test_permutation_importance_mixed_types():
|
|
rng = np.random.RandomState(42)
|
|
n_repeats = 4
|
|
|
|
# Last column is correlated with y
|
|
X = np.array([[1.0, 2.0, 3.0, np.nan], [2, 1, 2, 1]]).T
|
|
y = np.array([0, 1, 0, 1])
|
|
|
|
clf = make_pipeline(SimpleImputer(), LogisticRegression(solver="lbfgs"))
|
|
clf.fit(X, y)
|
|
result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng)
|
|
|
|
assert result.importances.shape == (X.shape[1], n_repeats)
|
|
|
|
# the correlated feature with y is the last column and should
|
|
# have the highest importance
|
|
assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
|
|
|
|
# use another random state
|
|
rng = np.random.RandomState(0)
|
|
result2 = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng)
|
|
assert result2.importances.shape == (X.shape[1], n_repeats)
|
|
|
|
assert not np.allclose(result.importances, result2.importances)
|
|
|
|
# the correlated feature with y is the last column and should
|
|
# have the highest importance
|
|
assert np.all(result2.importances_mean[-1] > result2.importances_mean[:-1])
|
|
|
|
|
|
def test_permutation_importance_mixed_types_pandas():
|
|
pd = pytest.importorskip("pandas")
|
|
rng = np.random.RandomState(42)
|
|
n_repeats = 5
|
|
|
|
# Last column is correlated with y
|
|
X = pd.DataFrame({"col1": [1.0, 2.0, 3.0, np.nan], "col2": ["a", "b", "a", "b"]})
|
|
y = np.array([0, 1, 0, 1])
|
|
|
|
num_preprocess = make_pipeline(SimpleImputer(), StandardScaler())
|
|
preprocess = ColumnTransformer(
|
|
[("num", num_preprocess, ["col1"]), ("cat", OneHotEncoder(), ["col2"])]
|
|
)
|
|
clf = make_pipeline(preprocess, LogisticRegression(solver="lbfgs"))
|
|
clf.fit(X, y)
|
|
|
|
result = permutation_importance(clf, X, y, n_repeats=n_repeats, random_state=rng)
|
|
|
|
assert result.importances.shape == (X.shape[1], n_repeats)
|
|
# the correlated feature with y is the last column and should
|
|
# have the highest importance
|
|
assert np.all(result.importances_mean[-1] > result.importances_mean[:-1])
|
|
|
|
|
|
def test_permutation_importance_linear_regresssion():
|
|
X, y = make_regression(n_samples=500, n_features=10, random_state=0)
|
|
|
|
X = scale(X)
|
|
y = scale(y)
|
|
|
|
lr = LinearRegression().fit(X, y)
|
|
|
|
# this relationship can be computed in closed form
|
|
expected_importances = 2 * lr.coef_**2
|
|
results = permutation_importance(
|
|
lr, X, y, n_repeats=50, scoring="neg_mean_squared_error"
|
|
)
|
|
assert_allclose(
|
|
expected_importances, results.importances_mean, rtol=1e-1, atol=1e-6
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize("max_samples", [500, 1.0])
|
|
def test_permutation_importance_equivalence_sequential_parallel(max_samples):
|
|
# regression test to make sure that sequential and parallel calls will
|
|
# output the same results.
|
|
# Also tests that max_samples equal to number of samples is equivalent to 1.0
|
|
X, y = make_regression(n_samples=500, n_features=10, random_state=0)
|
|
lr = LinearRegression().fit(X, y)
|
|
|
|
importance_sequential = permutation_importance(
|
|
lr, X, y, n_repeats=5, random_state=0, n_jobs=1, max_samples=max_samples
|
|
)
|
|
|
|
# First check that the problem is structured enough and that the model is
|
|
# complex enough to not yield trivial, constant importances:
|
|
imp_min = importance_sequential["importances"].min()
|
|
imp_max = importance_sequential["importances"].max()
|
|
assert imp_max - imp_min > 0.3
|
|
|
|
# The actually check that parallelism does not impact the results
|
|
# either with shared memory (threading) or without isolated memory
|
|
# via process-based parallelism using the default backend
|
|
# ('loky' or 'multiprocessing') depending on the joblib version:
|
|
|
|
# process-based parallelism (by default):
|
|
importance_processes = permutation_importance(
|
|
lr, X, y, n_repeats=5, random_state=0, n_jobs=2
|
|
)
|
|
assert_allclose(
|
|
importance_processes["importances"], importance_sequential["importances"]
|
|
)
|
|
|
|
# thread-based parallelism:
|
|
with parallel_backend("threading"):
|
|
importance_threading = permutation_importance(
|
|
lr, X, y, n_repeats=5, random_state=0, n_jobs=2
|
|
)
|
|
assert_allclose(
|
|
importance_threading["importances"], importance_sequential["importances"]
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize("n_jobs", [None, 1, 2])
|
|
@pytest.mark.parametrize("max_samples", [0.5, 1.0])
|
|
def test_permutation_importance_equivalence_array_dataframe(n_jobs, max_samples):
|
|
# This test checks that the column shuffling logic has the same behavior
|
|
# both a dataframe and a simple numpy array.
|
|
pd = pytest.importorskip("pandas")
|
|
|
|
# regression test to make sure that sequential and parallel calls will
|
|
# output the same results.
|
|
X, y = make_regression(n_samples=100, n_features=5, random_state=0)
|
|
X_df = pd.DataFrame(X)
|
|
|
|
# Add a categorical feature that is statistically linked to y:
|
|
binner = KBinsDiscretizer(n_bins=3, encode="ordinal")
|
|
cat_column = binner.fit_transform(y.reshape(-1, 1))
|
|
|
|
# Concatenate the extra column to the numpy array: integers will be
|
|
# cast to float values
|
|
X = np.hstack([X, cat_column])
|
|
assert X.dtype.kind == "f"
|
|
|
|
# Insert extra column as a non-numpy-native dtype (while keeping backward
|
|
# compat for old pandas versions):
|
|
if hasattr(pd, "Categorical"):
|
|
cat_column = pd.Categorical(cat_column.ravel())
|
|
else:
|
|
cat_column = cat_column.ravel()
|
|
new_col_idx = len(X_df.columns)
|
|
X_df[new_col_idx] = cat_column
|
|
assert X_df[new_col_idx].dtype == cat_column.dtype
|
|
|
|
# Stich an arbitrary index to the dataframe:
|
|
X_df.index = np.arange(len(X_df)).astype(str)
|
|
|
|
rf = RandomForestRegressor(n_estimators=5, max_depth=3, random_state=0)
|
|
rf.fit(X, y)
|
|
|
|
n_repeats = 3
|
|
importance_array = permutation_importance(
|
|
rf,
|
|
X,
|
|
y,
|
|
n_repeats=n_repeats,
|
|
random_state=0,
|
|
n_jobs=n_jobs,
|
|
max_samples=max_samples,
|
|
)
|
|
|
|
# First check that the problem is structured enough and that the model is
|
|
# complex enough to not yield trivial, constant importances:
|
|
imp_min = importance_array["importances"].min()
|
|
imp_max = importance_array["importances"].max()
|
|
assert imp_max - imp_min > 0.3
|
|
|
|
# Now check that importances computed on dataframe matche the values
|
|
# of those computed on the array with the same data.
|
|
importance_dataframe = permutation_importance(
|
|
rf,
|
|
X_df,
|
|
y,
|
|
n_repeats=n_repeats,
|
|
random_state=0,
|
|
n_jobs=n_jobs,
|
|
max_samples=max_samples,
|
|
)
|
|
assert_allclose(
|
|
importance_array["importances"], importance_dataframe["importances"]
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize("input_type", ["array", "dataframe"])
|
|
def test_permutation_importance_large_memmaped_data(input_type):
|
|
# Smoke, non-regression test for:
|
|
# https://github.com/scikit-learn/scikit-learn/issues/15810
|
|
n_samples, n_features = int(5e4), 4
|
|
X, y = make_classification(
|
|
n_samples=n_samples, n_features=n_features, random_state=0
|
|
)
|
|
assert X.nbytes > 1e6 # trigger joblib memmaping
|
|
|
|
X = _convert_container(X, input_type)
|
|
clf = DummyClassifier(strategy="prior").fit(X, y)
|
|
|
|
# Actual smoke test: should not raise any error:
|
|
n_repeats = 5
|
|
r = permutation_importance(clf, X, y, n_repeats=n_repeats, n_jobs=2)
|
|
|
|
# Auxiliary check: DummyClassifier is feature independent:
|
|
# permutating feature should not change the predictions
|
|
expected_importances = np.zeros((n_features, n_repeats))
|
|
assert_allclose(expected_importances, r.importances)
|
|
|
|
|
|
def test_permutation_importance_sample_weight():
|
|
# Creating data with 2 features and 1000 samples, where the target
|
|
# variable is a linear combination of the two features, such that
|
|
# in half of the samples the impact of feature 1 is twice the impact of
|
|
# feature 2, and vice versa on the other half of the samples.
|
|
rng = np.random.RandomState(1)
|
|
n_samples = 1000
|
|
n_features = 2
|
|
n_half_samples = n_samples // 2
|
|
x = rng.normal(0.0, 0.001, (n_samples, n_features))
|
|
y = np.zeros(n_samples)
|
|
y[:n_half_samples] = 2 * x[:n_half_samples, 0] + x[:n_half_samples, 1]
|
|
y[n_half_samples:] = x[n_half_samples:, 0] + 2 * x[n_half_samples:, 1]
|
|
|
|
# Fitting linear regression with perfect prediction
|
|
lr = LinearRegression(fit_intercept=False)
|
|
lr.fit(x, y)
|
|
|
|
# When all samples are weighted with the same weights, the ratio of
|
|
# the two features importance should equal to 1 on expectation (when using
|
|
# mean absolutes error as the loss function).
|
|
pi = permutation_importance(
|
|
lr, x, y, random_state=1, scoring="neg_mean_absolute_error", n_repeats=200
|
|
)
|
|
x1_x2_imp_ratio_w_none = pi.importances_mean[0] / pi.importances_mean[1]
|
|
assert x1_x2_imp_ratio_w_none == pytest.approx(1, 0.01)
|
|
|
|
# When passing a vector of ones as the sample_weight, results should be
|
|
# the same as in the case that sample_weight=None.
|
|
w = np.ones(n_samples)
|
|
pi = permutation_importance(
|
|
lr,
|
|
x,
|
|
y,
|
|
random_state=1,
|
|
scoring="neg_mean_absolute_error",
|
|
n_repeats=200,
|
|
sample_weight=w,
|
|
)
|
|
x1_x2_imp_ratio_w_ones = pi.importances_mean[0] / pi.importances_mean[1]
|
|
assert x1_x2_imp_ratio_w_ones == pytest.approx(x1_x2_imp_ratio_w_none, 0.01)
|
|
|
|
# When the ratio between the weights of the first half of the samples and
|
|
# the second half of the samples approaches to infinity, the ratio of
|
|
# the two features importance should equal to 2 on expectation (when using
|
|
# mean absolutes error as the loss function).
|
|
w = np.hstack([np.repeat(10.0**10, n_half_samples), np.repeat(1.0, n_half_samples)])
|
|
lr.fit(x, y, w)
|
|
pi = permutation_importance(
|
|
lr,
|
|
x,
|
|
y,
|
|
random_state=1,
|
|
scoring="neg_mean_absolute_error",
|
|
n_repeats=200,
|
|
sample_weight=w,
|
|
)
|
|
x1_x2_imp_ratio_w = pi.importances_mean[0] / pi.importances_mean[1]
|
|
assert x1_x2_imp_ratio_w / x1_x2_imp_ratio_w_none == pytest.approx(2, 0.01)
|
|
|
|
|
|
def test_permutation_importance_no_weights_scoring_function():
|
|
# Creating a scorer function that does not takes sample_weight
|
|
def my_scorer(estimator, X, y):
|
|
return 1
|
|
|
|
# Creating some data and estimator for the permutation test
|
|
x = np.array([[1, 2], [3, 4]])
|
|
y = np.array([1, 2])
|
|
w = np.array([1, 1])
|
|
lr = LinearRegression()
|
|
lr.fit(x, y)
|
|
|
|
# test that permutation_importance does not return error when
|
|
# sample_weight is None
|
|
try:
|
|
permutation_importance(lr, x, y, random_state=1, scoring=my_scorer, n_repeats=1)
|
|
except TypeError:
|
|
pytest.fail(
|
|
"permutation_test raised an error when using a scorer "
|
|
"function that does not accept sample_weight even though "
|
|
"sample_weight was None"
|
|
)
|
|
|
|
# test that permutation_importance raise exception when sample_weight is
|
|
# not None
|
|
with pytest.raises(TypeError):
|
|
permutation_importance(
|
|
lr, x, y, random_state=1, scoring=my_scorer, n_repeats=1, sample_weight=w
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"list_single_scorer, multi_scorer",
|
|
[
|
|
(["r2", "neg_mean_squared_error"], ["r2", "neg_mean_squared_error"]),
|
|
(
|
|
["r2", "neg_mean_squared_error"],
|
|
{
|
|
"r2": get_scorer("r2"),
|
|
"neg_mean_squared_error": get_scorer("neg_mean_squared_error"),
|
|
},
|
|
),
|
|
(
|
|
["r2", "neg_mean_squared_error"],
|
|
lambda estimator, X, y: {
|
|
"r2": r2_score(y, estimator.predict(X)),
|
|
"neg_mean_squared_error": -mean_squared_error(y, estimator.predict(X)),
|
|
},
|
|
),
|
|
],
|
|
)
|
|
def test_permutation_importance_multi_metric(list_single_scorer, multi_scorer):
|
|
# Test permutation importance when scoring contains multiple scorers
|
|
|
|
# Creating some data and estimator for the permutation test
|
|
x, y = make_regression(n_samples=500, n_features=10, random_state=0)
|
|
lr = LinearRegression().fit(x, y)
|
|
|
|
multi_importance = permutation_importance(
|
|
lr, x, y, random_state=1, scoring=multi_scorer, n_repeats=2
|
|
)
|
|
assert set(multi_importance.keys()) == set(list_single_scorer)
|
|
|
|
for scorer in list_single_scorer:
|
|
multi_result = multi_importance[scorer]
|
|
single_result = permutation_importance(
|
|
lr, x, y, random_state=1, scoring=scorer, n_repeats=2
|
|
)
|
|
|
|
assert_allclose(multi_result.importances, single_result.importances)
|
|
|
|
|
|
def test_permutation_importance_max_samples_error():
|
|
"""Check that a proper error message is raised when `max_samples` is not
|
|
set to a valid input value.
|
|
"""
|
|
X = np.array([(1.0, 2.0, 3.0, 4.0)]).T
|
|
y = np.array([0, 1, 0, 1])
|
|
|
|
clf = LogisticRegression()
|
|
clf.fit(X, y)
|
|
|
|
err_msg = r"max_samples must be <= n_samples"
|
|
|
|
with pytest.raises(ValueError, match=err_msg):
|
|
permutation_importance(clf, X, y, max_samples=5)
|