3RNN/Lib/site-packages/sklearn/tests/test_multioutput.py
2024-05-26 19:49:15 +02:00

868 lines
29 KiB
Python

import re
import numpy as np
import pytest
from joblib import cpu_count
from sklearn import datasets
from sklearn.base import ClassifierMixin, clone
from sklearn.datasets import (
load_linnerud,
make_classification,
make_multilabel_classification,
make_regression,
)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import (
GradientBoostingRegressor,
RandomForestClassifier,
StackingRegressor,
)
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
Lasso,
LinearRegression,
LogisticRegression,
OrthogonalMatchingPursuit,
PassiveAggressiveClassifier,
Ridge,
SGDClassifier,
SGDRegressor,
)
from sklearn.metrics import jaccard_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import (
ClassifierChain,
MultiOutputClassifier,
MultiOutputRegressor,
RegressorChain,
)
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import (
BSR_CONTAINERS,
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
def test_multi_target_regression():
    """Compare MultiOutputRegressor with one independent regressor per target.

    A GradientBoostingRegressor is fitted separately on each target column and
    its test predictions must match those of the wrapped meta-estimator.
    """
    n_targets = 3
    n_train = 50
    X, y = datasets.make_regression(n_targets=n_targets, random_state=0)
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    # Reference predictions: one single-output model per target column.
    references = np.zeros_like(y_test)
    for target_idx in range(n_targets):
        single = GradientBoostingRegressor(random_state=0)
        single.fit(X_train, y_train[:, target_idx])
        references[:, target_idx] = single.predict(X_test)

    multi = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    multi.fit(X_train, y_train)
    assert_almost_equal(references, multi.predict(X_test))
def test_multi_target_regression_partial_fit():
    """Check that MultiOutputRegressor.partial_fit matches per-target SGD.

    Each target column is learned by its own SGDRegressor in two
    ``partial_fit`` calls; the meta-estimator fed with the same two batches
    must produce the same predictions. Finally, ``partial_fit`` must not be
    exposed when the wrapped estimator does not provide it.
    """
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    half_index = 25
    for n in range(3):
        sgr = SGDRegressor(random_state=0, max_iter=5)
        sgr.partial_fit(X_train[:half_index], y_train[:half_index, n])
        sgr.partial_fit(X_train[half_index:], y_train[half_index:, n])
        references[:, n] = sgr.predict(X_test)

    sgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    sgr.partial_fit(X_train[:half_index], y_train[:half_index])
    sgr.partial_fit(X_train[half_index:], y_train[half_index:])
    y_pred = sgr.predict(X_test)
    assert_almost_equal(references, y_pred)

    # Wrap an estimator *instance* (the original passed the ``Lasso`` class
    # itself), so that the check exercises the documented usage.
    assert not hasattr(MultiOutputRegressor(Lasso()), "partial_fit")
def test_multi_target_regression_one_target():
    """MultiOutputRegressor.fit must reject a single-target ``y``."""
    X, y = datasets.make_regression(n_targets=1, random_state=0)
    meta_regressor = MultiOutputRegressor(
        GradientBoostingRegressor(random_state=0)
    )
    with pytest.raises(ValueError, match="at least two dimensions"):
        meta_regressor.fit(X, y)
@pytest.mark.parametrize(
    "sparse_container",
    CSR_CONTAINERS
    + CSC_CONTAINERS
    + COO_CONTAINERS
    + LIL_CONTAINERS
    + DOK_CONTAINERS
    + BSR_CONTAINERS,
)
def test_multi_target_sparse_regression(sparse_container):
    """Dense and sparse inputs must lead to the same multi-target predictions."""
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    n_train = 50
    X_train, X_test = X[:n_train], X[n_train:]
    y_train = y[:n_train]

    dense_model = MultiOutputRegressor(Lasso(random_state=0))
    dense_model.fit(X_train, y_train)

    sparse_model = MultiOutputRegressor(Lasso(random_state=0))
    sparse_model.fit(sparse_container(X_train), y_train)

    dense_pred = dense_model.predict(X_test)
    sparse_pred = sparse_model.predict(sparse_container(X_test))
    assert_almost_equal(dense_pred, sparse_pred)
def test_multi_target_sample_weights_api():
    """Sample weights are rejected unless the base estimator supports them."""
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]
    w = [0.8, 0.6]

    # OrthogonalMatchingPursuit is expected to trigger the error here.
    unsupported = MultiOutputRegressor(OrthogonalMatchingPursuit())
    with pytest.raises(ValueError, match="does not support sample weights"):
        unsupported.fit(X, y, w)

    # no exception should be raised if the base estimator supports weights
    supported = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    supported.fit(X, y, w)
def test_multi_target_sample_weight_partial_fit():
    """Different sample weights passed to partial_fit must change predictions."""
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]

    # first model: unequal weights
    unequal_weights = [2.0, 1.0]
    rgr_w = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    rgr_w.partial_fit(X, y, unequal_weights)

    # second model: same data, different (equal) weights
    equal_weights = [2.0, 2.0]
    rgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    rgr.partial_fit(X, y, equal_weights)

    first_pred_equal = rgr.predict(X)[0][0]
    first_pred_unequal = rgr_w.predict(X)[0][0]
    assert first_pred_equal != first_pred_unequal
def test_multi_target_sample_weights():
    """A sample weight of 2 must be equivalent to repeating that sample."""
    # weighted regressor: the first sample carries twice the weight
    X_weighted = [[1, 2, 3], [4, 5, 6]]
    y_weighted = [[3.141, 2.718], [2.718, 3.141]]
    weights = [2.0, 1.0]
    rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr_w.fit(X_weighted, y_weighted, weights)

    # unweighted regressor: the first sample is duplicated instead
    X_repeated = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y_repeated = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X_repeated, y_repeated)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
# Module-level fixture shared by the classification tests below.
iris = datasets.load_iris()
X = iris.data
n_samples, n_features = X.shape

# Build a three-column target: the original labels plus two shuffled copies.
y1 = iris.target
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
y = np.column_stack([y1, y2, y3])

n_outputs = y.shape[1]
n_classes = len(np.unique(y1))
# Unique labels of each target column, in column order.
classes = [np.unique(column) for column in (y1, y2, y3)]
def test_multi_output_classification_partial_fit_parallelism():
    """With n_jobs=4, successive partial_fit calls yield new estimator objects.

    The identity check only runs when more than one CPU is reported.
    """
    base_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    mor = MultiOutputClassifier(base_clf, n_jobs=4)

    mor.partial_fit(X, y, classes)
    first_estimator = mor.estimators_[0]

    mor.partial_fit(X, y)
    second_estimator = mor.estimators_[0]

    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert first_estimator is not second_estimator
# check multioutput has predict_proba
def test_hasattr_multi_output_predict_proba():
    """predict_proba is exposed only when the fitted base estimator has it."""
    # default SGDClassifier has loss='hinge'
    # which does not expose a predict_proba method
    hinge_clf = SGDClassifier(random_state=1, max_iter=5)
    without_proba = MultiOutputClassifier(hinge_clf)
    without_proba.fit(X, y)
    assert not hasattr(without_proba, "predict_proba")

    # case where predict_proba attribute exists
    log_loss_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    with_proba = MultiOutputClassifier(log_loss_clf)
    with_proba.fit(X, y)
    assert hasattr(with_proba, "predict_proba")
# check predict_proba passes
def test_multi_output_predict_proba():
    """predict_proba works through GridSearchCV and fails clearly for hinge.

    The first part wraps a grid search whose scorer rewards candidates that
    expose ``predict_proba``. The second part checks the chain of
    AttributeErrors raised for a plain hinge-loss SGDClassifier.
    """

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        return 1.0 if hasattr(estimator, "predict_proba") else 0.0

    searched_clf = GridSearchCV(
        SGDClassifier(random_state=1, max_iter=5),
        param_grid={"loss": ("hinge", "log_loss", "modified_huber")},
        scoring=custom_scorer,
        cv=3,
        error_score="raise",
    )
    multi_target_linear = MultiOutputClassifier(searched_clf)
    multi_target_linear.fit(X, y)
    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    hinge_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(hinge_clf)
    multi_target_linear.fit(X, y)

    outer_msg = "'MultiOutputClassifier' has no attribute 'predict_proba'"
    inner1_msg = "'SGDClassifier' has no attribute 'predict_proba'"
    inner2_msg = "probability estimates are not available for loss='hinge'"
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        multi_target_linear.predict_proba(X)

    # The outer error chains to the SGDClassifier error ...
    first_cause = exec_info.value.__cause__
    assert isinstance(first_cause, AttributeError)
    assert inner1_msg in str(first_cause)

    # ... which in turn chains to the loss-specific error.
    second_cause = exec_info.value.__cause__.__cause__
    assert isinstance(second_cause, AttributeError)
    assert inner2_msg in str(second_cause)
def test_multi_output_classification_partial_fit():
    """MultiOutputClassifier.partial_fit must match per-column partial_fit.

    The data is fed in two halves. After each half, the meta-estimator's
    predictions are compared column by column with a cloned SGDClassifier
    trained on the same batches.
    """
    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    half_index = X.shape[0] // 2
    expected_shape = (n_samples, n_outputs)

    # first half: classes must be supplied on the first call
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes
    )
    first_predictions = multi_target_linear.predict(X)
    assert expected_shape == first_predictions.shape

    # second half
    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert expected_shape == second_predictions.shape

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for col in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)

        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, col], classes=classes[col]
        )
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, col])

        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, col])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, col])
def test_multi_output_classification_partial_fit_no_first_classes_exception():
    """The first partial_fit call without ``classes`` must raise ValueError."""
    multi_target_linear = MultiOutputClassifier(
        SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    )
    expected_msg = "classes must be passed on the first call to partial_fit."
    with pytest.raises(ValueError, match=expected_msg):
        multi_target_linear.partial_fit(X, y)
def test_multi_output_classification():
    """Check predict and predict_proba of a fitted MultiOutputClassifier.

    Output shapes are verified, the arg-max of the probabilities must agree
    with the predictions, and every column must match a forest cloned from the
    same base estimator and fitted on that column alone.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert (n_samples, n_outputs) == predictions.shape

    predict_proba = multi_target_forest.predict_proba(X)
    assert len(predict_proba) == n_outputs
    for per_output_proba in predict_proba:
        assert (n_samples, n_classes) == per_output_proba.shape

    # Stack the per-output probabilities along a third axis and take the
    # arg-max over the class axis.
    stacked_proba = np.dstack(predict_proba)
    assert_array_equal(np.argmax(stacked_proba, axis=1), predictions)

    # train the forest with each column and assert that predictions are equal
    for col in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, col])
        assert list(forest_.predict(X)) == list(predictions[:, col])
        assert_array_equal(
            list(forest_.predict_proba(X)), list(predict_proba[col])
        )
def test_multiclass_multioutput_estimator():
    """Check a meta-estimator of a meta-estimator.

    A MultiOutputClassifier wrapping OneVsRestClassifier(LinearSVC) must
    predict, per column, what a cloned one-vs-rest model fitted on that column
    predicts.
    """
    multi_class_svc = OneVsRestClassifier(LinearSVC(random_state=0))
    multi_target_svc = MultiOutputClassifier(multi_class_svc)
    multi_target_svc.fit(X, y)

    predictions = multi_target_svc.predict(X)
    assert (n_samples, n_outputs) == predictions.shape

    # train the one-vs-rest SVC with each column and assert that predictions
    # are equal
    for col in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, col])
        assert list(multi_class_svc_.predict(X)) == list(predictions[:, col])
def test_multiclass_multioutput_estimator_predict_proba():
    """Pin predict_proba values for outputs with different numbers of classes."""
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    features = rng.normal(size=(5, 5))

    # fixed string labels: one binary output and one three-class output
    labels_binary = np.array(["b", "a", "a", "b", "a"])  # 2 classes
    labels_ternary = np.array(["d", "e", "f", "e", "d"])  # 3 classes
    Y = np.column_stack((labels_binary, labels_ternary))

    clf = MultiOutputClassifier(
        LogisticRegression(solver="liblinear", random_state=seed)
    )
    clf.fit(features, Y)

    y_result = clf.predict_proba(features)
    y_actual = [
        np.array(
            [
                [0.23481764, 0.76518236],
                [0.67196072, 0.32803928],
                [0.54681448, 0.45318552],
                [0.34883923, 0.65116077],
                [0.73687069, 0.26312931],
            ]
        ),
        np.array(
            [
                [0.5171785, 0.23878628, 0.24403522],
                [0.22141451, 0.64102704, 0.13755846],
                [0.16751315, 0.18256843, 0.64991843],
                [0.27357372, 0.55201592, 0.17441036],
                [0.65745193, 0.26062899, 0.08191907],
            ]
        ),
    ]

    for output_idx, expected_proba in enumerate(y_actual):
        assert_almost_equal(y_result[output_idx], expected_proba)
def test_multi_output_classification_sample_weights():
    """Weighting a sample by 2 must be equivalent to duplicating it."""
    # weighted classifier
    X_weighted = [[1, 2, 3], [4, 5, 6]]
    y_weighted = [[3, 2], [2, 3]]
    weights = np.asarray([2.0, 1.0])
    clf_w = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=10, random_state=1)
    )
    clf_w.fit(X_weighted, y_weighted, weights)

    # unweighted, but with repeated samples
    X_repeated = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y_repeated = [[3, 2], [3, 2], [2, 3]]
    clf = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=10, random_state=1)
    )
    clf.fit(X_repeated, y_repeated)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def test_multi_output_classification_partial_fit_sample_weights():
    """Weighted SGD fit must agree with an unweighted fit on repeated samples.

    NOTE(review): despite the test name, both models are trained with ``fit``
    rather than ``partial_fit`` -- confirm whether that is intended.
    """
    # weighted classifier
    X_weighted = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y_weighted = [[3, 2], [2, 3], [3, 2]]
    weights = np.asarray([2.0, 1.0, 1.0])
    clf_w = MultiOutputClassifier(SGDClassifier(random_state=1, max_iter=20))
    clf_w.fit(X_weighted, y_weighted, weights)

    # unweighted, but with repeated samples
    X_repeated = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y_repeated = [[3, 2], [3, 2], [2, 3], [3, 2]]
    clf = MultiOutputClassifier(SGDClassifier(random_state=1, max_iter=20))
    clf.fit(X_repeated, y_repeated)

    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
def test_multi_output_exceptions():
    """Check the errors raised by MultiOutputClassifier on invalid usage."""
    moc = MultiOutputClassifier(LinearSVC(random_state=0))

    # NotFittedError when score is called before fit
    with pytest.raises(NotFittedError):
        moc.score(X, y)

    # ValueError when number of outputs is different
    # for fit and score
    moc.fit(X, y)
    y_two_outputs = np.column_stack((y1, y2))
    with pytest.raises(ValueError):
        moc.score(X, y_two_outputs)

    # ValueError when y is continuous
    continuous_target = X[:, 1]
    with pytest.raises(ValueError, match="Unknown label type"):
        moc.fit(X, continuous_target)
@pytest.mark.parametrize("response_method", ["predict_proba", "predict"])
def test_multi_output_not_fitted_error(response_method):
    """Check that we raise the proper error when the estimator is not fitted"""
    unfitted = MultiOutputClassifier(LogisticRegression())
    method = getattr(unfitted, response_method)
    with pytest.raises(NotFittedError):
        method(X)
def test_multi_output_delegate_predict_proba():
    """Check the behavior for the delegation of predict_proba to the underlying
    estimator"""

    # A base estimator with `predict_proba` should expose the method even
    # before fit
    moc = MultiOutputClassifier(LogisticRegression())
    assert hasattr(moc, "predict_proba")
    moc.fit(X, y)
    assert hasattr(moc, "predict_proba")

    # A base estimator without `predict_proba` should raise an AttributeError
    outer_msg = "'MultiOutputClassifier' has no attribute 'predict_proba'"
    inner_msg = "'LinearSVC' object has no attribute 'predict_proba'"
    moc = MultiOutputClassifier(LinearSVC())

    # ... before fitting
    assert not hasattr(moc, "predict_proba")
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        moc.predict_proba(X)
    cause = exec_info.value.__cause__
    assert isinstance(cause, AttributeError)
    assert inner_msg == str(cause)

    # ... and after fitting
    moc.fit(X, y)
    assert not hasattr(moc, "predict_proba")
    with pytest.raises(AttributeError, match=outer_msg) as exec_info:
        moc.predict_proba(X)
    cause = exec_info.value.__cause__
    assert isinstance(cause, AttributeError)
    assert inner_msg == str(cause)
def generate_multilabel_dataset_with_correlations():
    """Build a multilabel dataset from a 16-class classification problem.

    Each integer class label is turned into the four digits of its binary
    representation, giving a label matrix with four columns.

    Returns
    -------
    X : array
        Features as returned by ``make_classification``.
    Y_multi : ndarray
        One row of four 0/1 values per sample.
    """
    X, y = make_classification(
        n_samples=1000,
        n_features=100,
        n_classes=16,
        n_informative=10,
        random_state=0,
    )

    label_rows = []
    for class_label in y:
        # "#06b" yields e.g. "0b0101"; drop the "0b" prefix to keep 4 digits.
        bits = format(class_label, "#06b")[2:]
        label_rows.append([int(bit) for bit in bits])

    Y_multi = np.array(label_rows)
    return X, Y_multi
@pytest.mark.parametrize("chain_method", ["predict", "decision_function"])
def test_classifier_chain_fit_and_predict_with_linear_svc(chain_method):
    """Fit a LinearSVC classifier chain and check predict/decision_function.

    Predictions must have the shape of ``Y`` and equal the thresholded
    decision function. ``predict_proba`` must not be exposed.
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain = ClassifierChain(LinearSVC(), chain_method=chain_method)
    classifier_chain.fit(X, Y)

    Y_pred = classifier_chain.predict(X)
    assert Y_pred.shape == Y.shape

    # Thresholding the decision values at 0 must reproduce the predictions.
    Y_binary = classifier_chain.decision_function(X) >= 0
    assert_array_equal(Y_binary, Y_pred)

    assert not hasattr(classifier_chain, "predict_proba")
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_classifier_chain_fit_and_predict_with_sparse_data(csr_container):
    """A classifier chain must predict the same on sparse and dense input."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = csr_container(X)

    sparse_chain = ClassifierChain(LogisticRegression()).fit(X_sparse, Y)
    Y_pred_sparse = sparse_chain.predict(X_sparse)

    dense_chain = ClassifierChain(LogisticRegression()).fit(X, Y)
    Y_pred_dense = dense_chain.predict(X)

    assert_array_equal(Y_pred_sparse, Y_pred_dense)
def test_classifier_chain_vs_independent_models():
    """A classifier chain must beat independent models on Jaccard score.

    The chain and a one-vs-rest baseline are fitted on the first 600 samples
    and compared on the remaining ones with the sample-averaged Jaccard score.
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    n_train = 600
    X_train, X_test = X[:n_train, :], X[n_train:, :]
    Y_train, Y_test = Y[:n_train, :], Y[n_train:, :]

    # baseline: one independent model per label
    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    chain_score = jaccard_score(Y_test, Y_pred_chain, average="samples")
    ovr_score = jaccard_score(Y_test, Y_pred_ovr, average="samples")
    assert chain_score > ovr_score
@pytest.mark.parametrize(
    "chain_method",
    ["predict", "predict_proba", "predict_log_proba", "decision_function"],
)
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
def test_classifier_chain_fit_and_predict(chain_method, response_method):
    """Fit a classifier chain and check predictions and coefficient sizes.

    The i-th estimator of the chain must have ``n_features + i`` coefficients,
    and probabilities thresholded at 0.5 must reproduce ``predict``.
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    n_features_, n_labels = X.shape[1], Y.shape[1]

    chain = ClassifierChain(LogisticRegression(), chain_method=chain_method)
    chain.fit(X, Y)

    Y_pred = chain.predict(X)
    assert Y_pred.shape == Y.shape

    coef_sizes = [est.coef_.size for est in chain.estimators_]
    assert coef_sizes == list(range(n_features_, n_features_ + n_labels))

    Y_prob = getattr(chain, response_method)(X)
    if response_method == "predict_log_proba":
        # map log-probabilities back to probabilities before thresholding
        Y_prob = np.exp(Y_prob)
    assert_array_equal(Y_prob >= 0.5, Y_pred)

    assert isinstance(chain, ClassifierMixin)
def test_regressor_chain_fit_and_predict():
    """Fit a regressor chain and check prediction and coefficient shapes."""
    X, Y = generate_multilabel_dataset_with_correlations()
    n_features_, n_targets = X.shape[1], Y.shape[1]

    chain = RegressorChain(Ridge())
    chain.fit(X, Y)

    assert chain.predict(X).shape == Y.shape

    # The i-th estimator of the chain must have n_features + i coefficients.
    coef_sizes = [est.coef_.size for est in chain.estimators_]
    assert coef_sizes == list(range(n_features_, n_features_ + n_targets))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_base_chain_fit_and_predict_with_sparse_data_and_cv(csr_container):
    """Chains configured with cv=3 must fit and predict on sparse input."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = csr_container(X)

    classifier_chain = ClassifierChain(LogisticRegression(), cv=3)
    regressor_chain = RegressorChain(Ridge(), cv=3)
    for chain in (classifier_chain, regressor_chain):
        chain.fit(X_sparse, Y)
        assert chain.predict(X_sparse).shape == Y.shape
def test_base_chain_random_order():
    """Check chains fitted with ``order="random"``.

    For both ClassifierChain and RegressorChain the fitted ``order_`` must be a
    permutation of the outputs that differs from the natural order, and a
    chain given that same order explicitly must learn the same coefficients.
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    n_labels = Y.shape[1]
    for chain in [ClassifierChain(LogisticRegression()), RegressorChain(Ridge())]:
        chain_random = clone(chain).set_params(order="random", random_state=42)
        chain_random.fit(X, Y)
        chain_fixed = clone(chain).set_params(order=chain_random.order_)
        chain_fixed.fit(X, Y)
        assert_array_equal(chain_fixed.order_, chain_random.order_)

        # Inspect the fitted permutation ``order_``. The ``order`` parameter is
        # still the string "random", so comparing it with range(...) would be
        # trivially true.
        assert list(chain_random.order_) != list(range(n_labels))
        assert len(chain_random.order_) == n_labels
        assert len(set(chain_random.order_)) == n_labels

        # Randomly ordered chain should behave identically to a fixed order
        # chain with the same order.
        for est1, est2 in zip(chain_random.estimators_, chain_fixed.estimators_):
            assert_array_almost_equal(est1.coef_, est2.coef_)
@pytest.mark.parametrize(
    "chain_type, chain_method",
    [
        ("classifier", "predict"),
        ("classifier", "predict_proba"),
        ("classifier", "predict_log_proba"),
        ("classifier", "decision_function"),
        ("regressor", ""),
    ],
)
def test_base_chain_crossval_fit_and_predict(chain_type, chain_method):
    """Fit a chain with and without cv=3 and compare their predictions.

    Both variants must predict arrays of the same shape but not identical
    values, and the cross-validated chain must reach a minimum quality.
    """
    X, Y = generate_multilabel_dataset_with_correlations()

    is_classifier = chain_type == "classifier"
    chain = (
        ClassifierChain(LogisticRegression(), chain_method=chain_method)
        if is_classifier
        else RegressorChain(Ridge())
    )
    chain.fit(X, Y)

    chain_cv = clone(chain).set_params(cv=3)
    chain_cv.fit(X, Y)

    Y_pred_cv = chain_cv.predict(X)
    Y_pred = chain.predict(X)
    assert Y_pred_cv.shape == Y_pred.shape
    assert not np.all(Y_pred == Y_pred_cv)

    if not isinstance(chain, ClassifierChain):
        assert mean_squared_error(Y, Y_pred_cv) < 0.25
    else:
        assert jaccard_score(Y, Y_pred_cv, average="samples") > 0.4
@pytest.mark.parametrize(
    "estimator",
    [
        RandomForestClassifier(n_estimators=2),
        MultiOutputClassifier(RandomForestClassifier(n_estimators=2)),
        ClassifierChain(RandomForestClassifier(n_estimators=2)),
    ],
)
def test_multi_output_classes_(estimator):
    """Check the ``classes_`` attribute of multioutput classifiers.

    After fitting, ``classes_`` must be a list with one entry per output whose
    content equals the unique labels of the corresponding target column.
    """
    # RandomForestClassifier supports multioutput out-of-the-box
    estimator.fit(X, y)

    fitted_classes = estimator.classes_
    assert isinstance(fitted_classes, list)
    assert len(fitted_classes) == n_outputs

    for expected, actual in zip(classes, fitted_classes):
        assert_array_equal(expected, actual)
class DummyRegressorWithFitParams(DummyRegressor):
    """DummyRegressor that records extra keyword arguments passed to ``fit``."""

    def fit(self, X, y, sample_weight=None, **fit_params):
        """Store ``fit_params`` on ``_fit_params`` and defer to the parent fit."""
        fitted = super().fit(X, y, sample_weight)
        self._fit_params = fit_params
        return fitted
class DummyClassifierWithFitParams(DummyClassifier):
    """DummyClassifier that records extra keyword arguments passed to ``fit``."""

    def fit(self, X, y, sample_weight=None, **fit_params):
        """Store ``fit_params`` on ``_fit_params`` and defer to the parent fit."""
        fitted = super().fit(X, y, sample_weight)
        self._fit_params = fit_params
        return fitted
@pytest.mark.filterwarnings("ignore:`n_features_in_` is deprecated")
@pytest.mark.parametrize(
    "estimator, dataset",
    [
        (
            MultiOutputClassifier(DummyClassifierWithFitParams(strategy="prior")),
            datasets.make_multilabel_classification(),
        ),
        (
            MultiOutputRegressor(DummyRegressorWithFitParams()),
            datasets.make_regression(n_targets=3, random_state=0),
        ),
    ],
)
def test_multioutput_estimator_with_fit_params(estimator, dataset):
    """Extra fit keyword arguments must reach every sub-estimator."""
    X, y = dataset
    estimator.fit(X, y, some_param=np.zeros_like(X))

    for sub_estimator in estimator.estimators_:
        received = sub_estimator._fit_params
        assert "some_param" in received
def test_regressor_chain_w_fit_params():
    """Make sure fit_params are properly propagated to the sub-estimators.

    A SGDRegressor subclass records the ``sample_weight`` it receives; after
    fitting the chain, every sub-estimator must hold the very same array
    object that was passed to ``RegressorChain.fit``.
    """
    rng = np.random.RandomState(0)
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    weight = rng.rand(y.shape[0])

    class MySGD(SGDRegressor):
        def fit(self, X, y, **fit_params):
            # Raises KeyError if "sample_weight" was not forwarded.
            self.sample_weight_ = fit_params["sample_weight"]
            # Return the parent's result so that ``fit`` keeps returning the
            # fitted estimator, as the scikit-learn estimator API expects.
            return super().fit(X, y, **fit_params)

    model = RegressorChain(MySGD())

    # Fitting with params
    fit_param = {"sample_weight": weight}
    model.fit(X, y, **fit_param)

    for est in model.estimators_:
        assert est.sample_weight_ is weight
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
@pytest.mark.parametrize(
    "MultiOutputEstimator, Estimator",
    [(MultiOutputClassifier, LogisticRegression), (MultiOutputRegressor, Ridge)],
)
def test_support_missing_values(MultiOutputEstimator, Estimator):
    """Smoke test: multioutput estimators leave NaN validation to the pipeline.

    The wrapped estimator is a pipeline starting with a SimpleImputer, so
    fitting and scoring on data containing NaN must not raise.
    """
    rng = np.random.RandomState(42)
    X = rng.randn(50, 2)
    y = rng.binomial(1, 0.5, (50, 3))

    # Mark roughly 1% of the entries as missing.
    missing = rng.choice([1, 0], X.shape, p=[0.01, 0.99]).astype(bool)
    X[missing] = np.nan

    pipe = make_pipeline(SimpleImputer(), Estimator())
    fitted = MultiOutputEstimator(pipe).fit(X, y)
    fitted.score(X, y)
@pytest.mark.parametrize("order_type", [list, np.array, tuple])
def test_classifier_chain_tuple_order(order_type):
    """ClassifierChain must accept ``order`` as a list, an array or a tuple."""
    X_train = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y_train = [[3, 2], [2, 3], [3, 2]]

    base = RandomForestClassifier(n_estimators=2, random_state=0)
    chain = ClassifierChain(base, order=order_type([1, 0]))
    chain.fit(X_train, y_train)

    X_test = [[1.5, 2.5, 3.5]]
    expected = [[3, 2]]
    assert_array_almost_equal(chain.predict(X_test), expected)
def test_classifier_chain_tuple_invalid_order():
    """An ``order`` that is not a permutation of the outputs must be rejected."""
    X_train = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y_train = [[3, 2], [2, 3], [3, 2]]

    # Only two outputs exist, so index 2 is out of range.
    bad_order = (1, 2)
    chain = ClassifierChain(RandomForestClassifier(), order=bad_order)

    with pytest.raises(ValueError, match="invalid order"):
        chain.fit(X_train, y_train)
def test_classifier_chain_verbose(capsys):
    """With verbose=True, ClassifierChain.fit prints one line per chain step."""
    X, y = make_multilabel_classification(
        n_samples=100, n_features=5, n_classes=3, n_labels=3, random_state=0
    )
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    classifier = ClassifierChain(
        DecisionTreeClassifier(),
        order=[0, 1, 2],
        random_state=0,
        verbose=True,
    )
    classifier.fit(X_train, y_train)

    # One progress line per output, in the order given above.
    pattern = (
        r"\[Chain\].*\(1 of 3\) Processing order 0, total=.*\n"
        r"\[Chain\].*\(2 of 3\) Processing order 1, total=.*\n"
        r"\[Chain\].*\(3 of 3\) Processing order 2, total=.*\n$"
    )
    captured_stdout = capsys.readouterr()[0]
    assert re.match(pattern, captured_stdout)
def test_regressor_chain_verbose(capsys):
    """With verbose=True, RegressorChain.fit prints one line per chain step."""
    X, y = make_regression(n_samples=125, n_targets=3, random_state=0)
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    regressor = RegressorChain(
        LinearRegression(),
        order=[1, 0, 2],
        random_state=0,
        verbose=True,
    )
    regressor.fit(X_train, y_train)

    # The progress lines must follow the custom order [1, 0, 2].
    pattern = (
        r"\[Chain\].*\(1 of 3\) Processing order 1, total=.*\n"
        r"\[Chain\].*\(2 of 3\) Processing order 0, total=.*\n"
        r"\[Chain\].*\(3 of 3\) Processing order 2, total=.*\n$"
    )
    captured_stdout = capsys.readouterr()[0]
    assert re.match(pattern, captured_stdout)
def test_multioutputregressor_ducktypes_fitted_estimator():
    """Test that MultiOutputRegressor checks the fitted estimator for
    predict. Non-regression test for #16549."""
    X, y = load_linnerud(return_X_y=True)

    base_learners = [("sgd", SGDRegressor(random_state=1))]
    stacker = StackingRegressor(
        estimators=base_learners,
        final_estimator=Ridge(),
        cv=2,
    )

    reg = MultiOutputRegressor(estimator=stacker)
    reg.fit(X, y)

    # Does not raise
    reg.predict(X)
@pytest.mark.parametrize(
    "Cls, method", [(ClassifierChain, "fit"), (MultiOutputClassifier, "partial_fit")]
)
def test_fit_params_no_routing(Cls, method):
    """Check that we raise an error when passing metadata not requested by the
    underlying classifier.
    """
    X, y = make_classification(n_samples=50)
    meta_clf = Cls(PassiveAggressiveClassifier())
    fit_method = getattr(meta_clf, method)

    with pytest.raises(ValueError, match="is only supported if"):
        fit_method(X, y, test=1)
def test_multioutput_regressor_has_partial_fit():
    """An unfitted MultiOutputRegressor must not expose partial_fit when the
    wrapped estimator (here LinearRegression) is expected to lack it.

    Accessing the attribute must raise an AttributeError with a dedicated
    message.
    """
    unfitted = MultiOutputRegressor(LinearRegression())
    expected_msg = "This 'MultiOutputRegressor' has no attribute 'partial_fit'"
    with pytest.raises(AttributeError, match=expected_msg):
        getattr(unfitted, "partial_fit")