# Authors: Alexandre Gramfort
# License: BSD 3 clause

import pytest
import numpy as np
from numpy.testing import assert_allclose
from scipy import sparse

from sklearn.base import BaseEstimator
from sklearn.model_selection import LeaveOneOut, train_test_split

from sklearn.utils._testing import (assert_array_almost_equal,
                                    assert_almost_equal,
                                    assert_array_equal,
                                    assert_raises, ignore_warnings)
from sklearn.utils.extmath import softmax
from sklearn.exceptions import NotFittedError
from sklearn.datasets import make_classification, make_blobs
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.isotonic import IsotonicRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import _sigmoid_calibration, _SigmoidCalibration
from sklearn.calibration import calibration_curve


@pytest.fixture(scope="module")
def data():
    X, y = make_classification(
        n_samples=200, n_features=6, random_state=42
    )
    return X, y


@pytest.mark.parametrize('method', ['sigmoid', 'isotonic'])
@pytest.mark.parametrize('ensemble', [True, False])
def test_calibration(data, method, ensemble):
    # Test calibration objects with isotonic and sigmoid
    n_samples = 100
    X, y = data
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    cal_clf = CalibratedClassifierCV(clf, cv=y.size + 1, ensemble=ensemble)
    assert_raises(ValueError, cal_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        cal_clf = CalibratedClassifierCV(
            clf, method=method, cv=5, ensemble=ensemble
        )
        # Note that this fit overwrites the fit on the entire training
        # set
        cal_clf.fit(this_X_train, y_train, sample_weight=sw_train)
        prob_pos_cal_clf = cal_clf.predict_proba(this_X_test)[:, 1]

        # Check that brier score has improved after calibration
        assert (brier_score_loss(y_test, prob_pos_clf) >
                brier_score_loss(y_test, prob_pos_cal_clf))

        # Check invariance against relabeling [0, 1] -> [1, 2]
        cal_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        assert_array_almost_equal(prob_pos_cal_clf,
                                  prob_pos_cal_clf_relabeled)

        # Check invariance against relabeling [0, 1] -> [-1, 1]
        cal_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        assert_array_almost_equal(prob_pos_cal_clf,
                                  prob_pos_cal_clf_relabeled)

        # Check invariance against relabeling [0, 1] -> [1, 0]
        cal_clf.fit(this_X_train, (y_train + 1) % 2, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        if method == "sigmoid":
            assert_array_almost_equal(prob_pos_cal_clf,
                                      1 - prob_pos_cal_clf_relabeled)
        else:
            # Isotonic calibration is not invariant against relabeling
            # but should improve in both cases
            assert (brier_score_loss(y_test, prob_pos_clf) >
                    brier_score_loss((y_test + 1) % 2,
                                     prob_pos_cal_clf_relabeled))
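

# Note on the Brier comparisons used throughout this module:
# ``brier_score_loss`` is the mean squared difference between the predicted
# probability of the positive class and the binary outcome, so lower is
# better. A minimal worked example (illustrative only):
#
#     >>> from sklearn.metrics import brier_score_loss
#     >>> round(brier_score_loss([0, 1], [0.1, 0.9]), 3)
#     0.01
#
# i.e. ((0.1 - 0) ** 2 + (0.9 - 1) ** 2) / 2 == 0.01.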


@pytest.mark.parametrize('ensemble', [True, False])
def test_calibration_bad_method(data, ensemble):
    # Check only "isotonic" and "sigmoid" are accepted as methods
    X, y = data
    clf = LinearSVC()
    clf_invalid_method = CalibratedClassifierCV(
        clf, method="foo", ensemble=ensemble
    )
    with pytest.raises(ValueError):
        clf_invalid_method.fit(X, y)


@pytest.mark.parametrize('ensemble', [True, False])
def test_calibration_regressor(data, ensemble):
    # `base_estimator` should provide either decision_function or
    # predict_proba (most regressors, for instance, should fail)
    X, y = data
    clf_base_regressor = \
        CalibratedClassifierCV(RandomForestRegressor(), ensemble=ensemble)
    with pytest.raises(RuntimeError):
        clf_base_regressor.fit(X, y)


def test_calibration_default_estimator(data):
    # Check base_estimator default is LinearSVC
    X, y = data
    calib_clf = CalibratedClassifierCV(cv=2)
    calib_clf.fit(X, y)

    base_est = calib_clf.calibrated_classifiers_[0].base_estimator
    assert isinstance(base_est, LinearSVC)


@pytest.mark.parametrize('ensemble', [True, False])
def test_calibration_cv_splitter(data, ensemble):
    # Check when `cv` is a CV splitter
    X, y = data

    splits = 5
    kfold = KFold(n_splits=splits)

    calib_clf = CalibratedClassifierCV(cv=kfold, ensemble=ensemble)
    assert isinstance(calib_clf.cv, KFold)
    assert calib_clf.cv.n_splits == splits

    calib_clf.fit(X, y)
    expected_n_clf = splits if ensemble else 1
    assert len(calib_clf.calibrated_classifiers_) == expected_n_clf


@pytest.mark.parametrize('method', ['sigmoid', 'isotonic'])
@pytest.mark.parametrize('ensemble', [True, False])
def test_sample_weight(data, method, ensemble):
    n_samples = 100
    X, y = data

    sample_weight = np.random.RandomState(seed=42).uniform(size=len(y))
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test = X[n_samples:]

    base_estimator = LinearSVC(random_state=42)
    calibrated_clf = CalibratedClassifierCV(
        base_estimator, method=method, ensemble=ensemble
    )
    calibrated_clf.fit(X_train, y_train, sample_weight=sw_train)
    probs_with_sw = calibrated_clf.predict_proba(X_test)

    # As the weights are used for the calibration, they should still yield
    # different predictions
    calibrated_clf.fit(X_train, y_train)
    probs_without_sw = calibrated_clf.predict_proba(X_test)

    diff = np.linalg.norm(probs_with_sw - probs_without_sw)
    assert diff > 0.1


@pytest.mark.parametrize('method', ['sigmoid', 'isotonic'])
@pytest.mark.parametrize('ensemble', [True, False])
def test_parallel_execution(data, method, ensemble):
    """Test parallel calibration"""
    X, y = data
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)

    base_estimator = LinearSVC(random_state=42)

    cal_clf_parallel = CalibratedClassifierCV(
        base_estimator, method=method, n_jobs=2, ensemble=ensemble
    )
    cal_clf_parallel.fit(X_train, y_train)
    probs_parallel = cal_clf_parallel.predict_proba(X_test)

    cal_clf_sequential = CalibratedClassifierCV(
        base_estimator, method=method, n_jobs=1, ensemble=ensemble
    )
    cal_clf_sequential.fit(X_train, y_train)
    probs_sequential = cal_clf_sequential.predict_proba(X_test)

    assert_allclose(probs_parallel, probs_sequential)


@pytest.mark.parametrize('method', ['sigmoid', 'isotonic'])
@pytest.mark.parametrize('ensemble', [True, False])
# increase the number of RNG seeds to assess the statistical stability of
# this test:
@pytest.mark.parametrize('seed', range(2))
def test_calibration_multiclass(method, ensemble, seed):

    def multiclass_brier(y_true, proba_pred, n_classes):
        Y_onehot = np.eye(n_classes)[y_true]
        return np.sum((Y_onehot - proba_pred) ** 2) / Y_onehot.shape[0]

    # Test calibration for multiclass with classifier that implements
    # only decision function.
    clf = LinearSVC(random_state=7)
    X, y = make_blobs(n_samples=500, n_features=100, random_state=seed,
                      centers=10, cluster_std=15.0)

    # Use an unbalanced dataset by collapsing 8 clusters into one class
    # to make the naive calibration based on a softmax more unlikely
    # to work.
    y[y > 2] = 2
    n_classes = np.unique(y).shape[0]
    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf.fit(X_train, y_train)

    cal_clf = CalibratedClassifierCV(
        clf, method=method, cv=5, ensemble=ensemble
    )
    cal_clf.fit(X_train, y_train)
    probas = cal_clf.predict_proba(X_test)
    # Check probabilities sum to 1
    assert_allclose(np.sum(probas, axis=1), np.ones(len(X_test)))

    # Check that the dataset is not too trivial, otherwise it's hard
    # to get interesting calibration data during the internal
    # cross-validation loop.
    assert 0.65 < clf.score(X_test, y_test) < 0.95

    # Check that the accuracy of the calibrated model is never degraded
    # too much compared to the original classifier.
    assert cal_clf.score(X_test, y_test) > 0.95 * clf.score(X_test, y_test)

    # Check that Brier loss of calibrated classifier is smaller than
    # loss obtained by naively turning OvR decision function to
    # probabilities via a softmax
    uncalibrated_brier = \
        multiclass_brier(y_test, softmax(clf.decision_function(X_test)),
                         n_classes=n_classes)
    calibrated_brier = multiclass_brier(y_test, probas,
                                        n_classes=n_classes)

    assert calibrated_brier < 1.1 * uncalibrated_brier

    # Test that calibration of a multiclass classifier also keeps the Brier
    # loss in check for RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_train, y_train)
    clf_probs = clf.predict_proba(X_test)
    uncalibrated_brier = multiclass_brier(y_test, clf_probs,
                                          n_classes=n_classes)

    cal_clf = CalibratedClassifierCV(
        clf, method=method, cv=5, ensemble=ensemble
    )
    cal_clf.fit(X_train, y_train)
    cal_clf_probs = cal_clf.predict_proba(X_test)
    calibrated_brier = multiclass_brier(y_test, cal_clf_probs,
                                        n_classes=n_classes)
    assert calibrated_brier < 1.1 * uncalibrated_brier
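

# For reference, the ``multiclass_brier`` helper above computes the
# multi-class Brier score
#
#     (1 / n_samples) * sum_i sum_k (1[y_i == k] - p_ik) ** 2
#
# by one-hot encoding ``y_true`` with ``np.eye(n_classes)[y_true]``. For
# instance, with y_true = [0, 2] and predicted rows [1, 0, 0] and
# [0, 0.5, 0.5], the score is (0 + 0.5) / 2 = 0.25; lower values indicate
# better calibrated probabilities.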


def test_calibration_prefit():
    """Test calibration for prefitted classifiers"""
    n_samples = 50
    X, y = make_classification(n_samples=3 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_calib, y_calib, sw_calib = \
        X[n_samples:2 * n_samples], y[n_samples:2 * n_samples], \
        sample_weight[n_samples:2 * n_samples]
    X_test, y_test = X[2 * n_samples:], y[2 * n_samples:]

    # Naive-Bayes
    clf = MultinomialNB()
    # Check error if clf not prefit
    unfit_clf = CalibratedClassifierCV(clf, cv="prefit")
    with pytest.raises(NotFittedError):
        unfit_clf.fit(X_calib, y_calib)

    clf.fit(X_train, y_train, sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    # Naive Bayes with calibration
    for this_X_calib, this_X_test in [(X_calib, X_test),
                                      (sparse.csr_matrix(X_calib),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            cal_clf = CalibratedClassifierCV(clf, method=method, cv="prefit")

            for sw in [sw_calib, None]:
                cal_clf.fit(this_X_calib, y_calib, sample_weight=sw)
                y_prob = cal_clf.predict_proba(this_X_test)
                y_pred = cal_clf.predict(this_X_test)
                prob_pos_cal_clf = y_prob[:, 1]
                assert_array_equal(y_pred,
                                   np.array([0, 1])[np.argmax(y_prob,
                                                              axis=1)])

                assert (brier_score_loss(y_test, prob_pos_clf) >
                        brier_score_loss(y_test, prob_pos_cal_clf))


@pytest.mark.parametrize('method', ['sigmoid', 'isotonic'])
def test_calibration_ensemble_false(data, method):
    # Test that `ensemble=False` is the same as using predictions from
    # `cross_val_predict` to train calibrator.
    X, y = data
    clf = LinearSVC(random_state=7)

    cal_clf = CalibratedClassifierCV(clf, method=method, cv=3, ensemble=False)
    cal_clf.fit(X, y)
    cal_probas = cal_clf.predict_proba(X)

    # Get probas manually
    unbiased_preds = cross_val_predict(
        clf, X, y, cv=3, method='decision_function'
    )
    if method == 'isotonic':
        calibrator = IsotonicRegression(out_of_bounds='clip')
    else:
        calibrator = _SigmoidCalibration()
    calibrator.fit(unbiased_preds, y)
    # Use `clf` fit on all data
    clf.fit(X, y)
    clf_df = clf.decision_function(X)
    manual_probas = calibrator.predict(clf_df)
    assert_allclose(cal_probas[:, 1], manual_probas)


def test_sigmoid_calibration():
    """Test calibration values with Platt sigmoid model"""
    exF = np.array([5, -4, 1.0])
    exY = np.array([1, -1, -1])
    # computed from my python port of the C++ code in LibSVM
    AB_lin_libsvm = np.array([-0.20261354391187855, 0.65236314980010512])
    assert_array_almost_equal(AB_lin_libsvm,
                              _sigmoid_calibration(exF, exY), 3)
    lin_prob = 1. / (1. + np.exp(AB_lin_libsvm[0] * exF + AB_lin_libsvm[1]))
    sk_prob = _SigmoidCalibration().fit(exF, exY).predict(exF)
    assert_array_almost_equal(lin_prob, sk_prob, 6)

    # check that _SigmoidCalibration().fit only accepts 1d array or 2d column
    # arrays
    assert_raises(ValueError, _SigmoidCalibration().fit,
                  np.vstack((exF, exF)), exY)
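

# A minimal sketch of the Platt sigmoid exercised above, assuming the (A, B)
# convention returned by ``_sigmoid_calibration`` (the helper name is
# illustrative, not part of scikit-learn's public API):
#
#     def platt_sigmoid(decision_values, a, b):
#         # Map raw decision values f to probabilities 1 / (1 + exp(a*f + b))
#         return 1. / (1. + np.exp(a * np.asarray(decision_values) + b))
#
# Because the fitted slope A is negative here (about -0.203), larger decision
# values map to larger probabilities of the positive class.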


def test_calibration_curve():
    """Check calibration_curve function"""
    y_true = np.array([0, 0, 0, 1, 1, 1])
    y_pred = np.array([0., 0.1, 0.2, 0.8, 0.9, 1.])
    prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=2)
    prob_true_unnormalized, prob_pred_unnormalized = \
        calibration_curve(y_true, y_pred * 2, n_bins=2, normalize=True)
    assert len(prob_true) == len(prob_pred)
    assert len(prob_true) == 2
    assert_almost_equal(prob_true, [0, 1])
    assert_almost_equal(prob_pred, [0.1, 0.9])
    assert_almost_equal(prob_true, prob_true_unnormalized)
    assert_almost_equal(prob_pred, prob_pred_unnormalized)

    # probabilities outside [0, 1] should not be accepted when normalize
    # is set to False
    assert_raises(ValueError, calibration_curve, [1.1], [-0.1],
                  normalize=False)

    # test that quantiles work as expected
    y_true2 = np.array([0, 0, 0, 0, 1, 1])
    y_pred2 = np.array([0., 0.1, 0.2, 0.5, 0.9, 1.])
    prob_true_quantile, prob_pred_quantile = calibration_curve(
        y_true2, y_pred2, n_bins=2, strategy='quantile')

    assert len(prob_true_quantile) == len(prob_pred_quantile)
    assert len(prob_true_quantile) == 2
    assert_almost_equal(prob_true_quantile, [0, 2 / 3])
    assert_almost_equal(prob_pred_quantile, [0.1, 0.8])

    # Check that error is raised when invalid strategy is selected
    assert_raises(ValueError, calibration_curve, y_true2, y_pred2,
                  strategy='percentile')
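

# A short worked example for the assertions above. Assuming the 'uniform'
# strategy with n_bins=2 splits [0, 1] at 0.5, the predictions {0.0, 0.1,
# 0.2} land in the first bin (mean prediction 0.1, fraction of positives 0)
# and {0.8, 0.9, 1.0} in the second (mean 0.9, fraction 1), giving
# prob_pred == [0.1, 0.9] and prob_true == [0, 1]. For the 'quantile'
# strategy on ``y_pred2``, each bin holds three samples: {0.0, 0.1, 0.2}
# (mean 0.1, no positives) and {0.5, 0.9, 1.0} (mean 0.8, two of three
# positive), matching [0.1, 0.8] and [0, 2 / 3].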


@pytest.mark.parametrize('ensemble', [True, False])
def test_calibration_nan_imputer(ensemble):
    """Test that calibration can accept nan"""
    X, y = make_classification(n_samples=10, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=42)
    X[0, 0] = np.nan
    clf = Pipeline(
        [('imputer', SimpleImputer()),
         ('rf', RandomForestClassifier(n_estimators=1))])
    clf_c = CalibratedClassifierCV(
        clf, cv=2, method='isotonic', ensemble=ensemble
    )
    clf_c.fit(X, y)
    clf_c.predict(X)


@pytest.mark.parametrize('ensemble', [True, False])
def test_calibration_prob_sum(ensemble):
    # Test that sum of probabilities is 1. A non-regression test for
    # issue #7796
    num_classes = 2
    X, y = make_classification(n_samples=10, n_features=5,
                               n_classes=num_classes)
    clf = LinearSVC(C=1.0, random_state=7)
    clf_prob = CalibratedClassifierCV(
        clf, method="sigmoid", cv=LeaveOneOut(), ensemble=ensemble
    )
    clf_prob.fit(X, y)

    probs = clf_prob.predict_proba(X)
    assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))


@pytest.mark.parametrize('ensemble', [True, False])
def test_calibration_less_classes(ensemble):
    # Test to check calibration works fine when train set in a test-train
    # split does not contain all classes
    # Since this test uses LOO, at each iteration train set will not contain a
    # class label
    X = np.random.randn(10, 5)
    y = np.arange(10)
    clf = LinearSVC(C=1.0, random_state=7)
    cal_clf = CalibratedClassifierCV(
        clf, method="sigmoid", cv=LeaveOneOut(), ensemble=ensemble
    )
    cal_clf.fit(X, y)

    for i, calibrated_classifier in \
            enumerate(cal_clf.calibrated_classifiers_):
        proba = calibrated_classifier.predict_proba(X)
        if ensemble:
            # Check that the unobserved class has proba=0
            assert_array_equal(proba[:, i], np.zeros(len(y)))
            # Check for all other classes proba>0
            assert np.all(proba[:, :i] > 0)
            assert np.all(proba[:, i + 1:] > 0)
        else:
            # Check `proba` are all 1/n_classes
            assert np.allclose(proba, 1 / proba.shape[0])


@ignore_warnings(category=FutureWarning)
@pytest.mark.parametrize('X', [np.random.RandomState(42).randn(15, 5, 2),
                               np.random.RandomState(42).randn(15, 5, 2, 6)])
def test_calibration_accepts_ndarray(X):
    """Test that calibration accepts n-dimensional arrays as input"""
    y = [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0]

    class MockTensorClassifier(BaseEstimator):
        """A toy estimator that accepts tensor inputs"""

        def fit(self, X, y):
            self.classes_ = np.unique(y)
            return self

        def decision_function(self, X):
            # toy decision function that just needs to have the right shape:
            return X.reshape(X.shape[0], -1).sum(axis=1)

    calibrated_clf = CalibratedClassifierCV(MockTensorClassifier())
    # we should be able to fit this classifier with no error
    calibrated_clf.fit(X, y)


@pytest.fixture
def text_data():
    text_data = [
        {'state': 'NY', 'age': 'adult'},
        {'state': 'TX', 'age': 'adult'},
        {'state': 'VT', 'age': 'child'},
    ]
    text_labels = [1, 0, 1]
    return text_data, text_labels


@pytest.fixture
def text_data_pipeline(text_data):
    X, y = text_data
    pipeline_prefit = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('clf', RandomForestClassifier())
    ])
    return pipeline_prefit.fit(X, y)


def test_calibration_pipeline(text_data, text_data_pipeline):
    # Test that calibration works in prefit pipeline with transformer,
    # where `X` is not array-like, sparse matrix or dataframe at the start.
    # See https://github.com/scikit-learn/scikit-learn/issues/8710
    X, y = text_data
    clf = text_data_pipeline
    calib_clf = CalibratedClassifierCV(clf, cv='prefit')
    calib_clf.fit(X, y)
    # Check attributes are obtained from fitted estimator
    assert_array_equal(calib_clf.classes_, clf.classes_)
    msg = "'CalibratedClassifierCV' object has no attribute"
    with pytest.raises(AttributeError, match=msg):
        calib_clf.n_features_in_


@pytest.mark.parametrize('clf, cv', [
    pytest.param(LinearSVC(C=1), 2),
    pytest.param(LinearSVC(C=1), 'prefit'),
])
def test_calibration_attributes(clf, cv):
    # Check that `n_features_in_` and `classes_` attributes created properly
    X, y = make_classification(n_samples=10, n_features=5,
                               n_classes=2, random_state=7)
    if cv == 'prefit':
        clf = clf.fit(X, y)
    calib_clf = CalibratedClassifierCV(clf, cv=cv)
    calib_clf.fit(X, y)

    if cv == 'prefit':
        assert_array_equal(calib_clf.classes_, clf.classes_)
        assert calib_clf.n_features_in_ == clf.n_features_in_
    else:
        classes = LabelEncoder().fit(y).classes_
        assert_array_equal(calib_clf.classes_, classes)
        assert calib_clf.n_features_in_ == X.shape[1]


# FIXME: remove in 1.1
def test_calibrated_classifier_cv_deprecation(data):
    # Check that we raise the proper deprecation warning if accessing
    # `calibrators_` from the `_CalibratedClassifier`.
    X, y = data
    calib_clf = CalibratedClassifierCV(cv=2).fit(X, y)

    with pytest.warns(FutureWarning):
        calibrators = calib_clf.calibrated_classifiers_[0].calibrators_

    for clf1, clf2 in zip(
            calibrators, calib_clf.calibrated_classifiers_[0].calibrators
    ):
        assert clf1 is clf2