import re
import warnings

import numpy as np
import pytest
from scipy import stats

from sklearn import datasets, svm
from sklearn.datasets import make_multilabel_classification
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    coverage_error,
    dcg_score,
    det_curve,
    label_ranking_average_precision_score,
    label_ranking_loss,
    ndcg_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
    top_k_accuracy_score,
)
from sklearn.metrics._ranking import _dcg_sample_scores, _ndcg_sample_scores
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.random_projection import _sparse_random_matrix
from sklearn.utils._testing import (
    _convert_container,
    assert_allclose,
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
    ignore_warnings,
)
from sklearn.utils.extmath import softmax
from sklearn.utils.fixes import CSR_CONTAINERS
from sklearn.utils.validation import (
    check_array,
    check_consistent_length,
    check_random_state,
)

###############################################################################
# Utilities for testing

CURVE_FUNCS = [
    det_curve,
    precision_recall_curve,
    roc_curve,
]


def make_prediction(dataset=None, binary=False):
    """Make some classification predictions on a toy dataset using an SVC.

    If binary is True, restrict to a binary classification problem instead of a
    multiclass classification problem.
    """

    if dataset is None:
        # import some data to play with
        dataset = datasets.load_iris()

    X = dataset.data
    y = dataset.target

    if binary:
        # restrict to a binary classification task
        X, y = X[y < 2], y[y < 2]

    n_samples, n_features = X.shape
    p = np.arange(n_samples)

    rng = check_random_state(37)
    rng.shuffle(p)
    X, y = X[p], y[p]
    half = int(n_samples / 2)

    # add noisy features to make the problem harder and avoid perfect results
    rng = np.random.RandomState(0)
    X = np.c_[X, rng.randn(n_samples, 200 * n_features)]

    # run classifier, get class probabilities and label predictions
    clf = svm.SVC(kernel="linear", probability=True, random_state=0)
    y_score = clf.fit(X[:half], y[:half]).predict_proba(X[half:])

    if binary:
        # only interested in probabilities of the positive case
        # XXX: do we really want a special API for the binary case?
        y_score = y_score[:, 1]

    y_pred = clf.predict(X[half:])
    y_true = y[half:]
    return y_true, y_pred, y_score


###############################################################################
# Tests


def _auc(y_true, y_score):
    """Alternative implementation to check for correctness of `roc_auc_score`."""
    pos_label = np.unique(y_true)[1]

    # Count the number of times positive samples are correctly ranked above
    # negative samples.
    pos = y_score[y_true == pos_label]
    neg = y_score[y_true != pos_label]
    diff_matrix = pos.reshape(1, -1) - neg.reshape(-1, 1)
    n_correct = np.sum(diff_matrix > 0)

    return n_correct / float(len(pos) * len(neg))
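
# A minimal illustrative sketch, added as an aside (not part of the original
# suite): `_auc` above counts the fraction of (positive, negative) pairs that are
# ranked correctly, which is the probabilistic interpretation of ROC AUC, so on a
# tiny toy input it should agree with `roc_auc_score`.
def test_auc_pairwise_ranking_interpretation_sketch():
    y_true = np.array([0, 0, 1, 1])
    y_score = np.array([0.1, 0.4, 0.35, 0.8])
    # 3 of the 4 (positive, negative) pairs are ordered correctly -> AUC = 0.75
    assert _auc(y_true, y_score) == pytest.approx(0.75)
    assert _auc(y_true, y_score) == pytest.approx(roc_auc_score(y_true, y_score))
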
""" pos_label = np.unique(y_true)[1] n_pos = np.sum(y_true == pos_label) order = np.argsort(y_score)[::-1] y_score = y_score[order] y_true = y_true[order] score = 0 for i in range(len(y_score)): if y_true[i] == pos_label: # Compute precision up to document i # i.e, percentage of relevant documents up to document i. prec = 0 for j in range(0, i + 1): if y_true[j] == pos_label: prec += 1.0 prec /= i + 1.0 score += prec return score / n_pos def _average_precision_slow(y_true, y_score): """A second alternative implementation of average precision that closely follows the Wikipedia article's definition (see References). This should give identical results as `average_precision_score` for all inputs. References ---------- .. [1] `Wikipedia entry for the Average precision `_ """ precision, recall, threshold = precision_recall_curve(y_true, y_score) precision = list(reversed(precision)) recall = list(reversed(recall)) average_precision = 0 for i in range(1, len(precision)): average_precision += precision[i] * (recall[i] - recall[i - 1]) return average_precision def _partial_roc_auc_score(y_true, y_predict, max_fpr): """Alternative implementation to check for correctness of `roc_auc_score` with `max_fpr` set. """ def _partial_roc(y_true, y_predict, max_fpr): fpr, tpr, _ = roc_curve(y_true, y_predict) new_fpr = fpr[fpr <= max_fpr] new_fpr = np.append(new_fpr, max_fpr) new_tpr = tpr[fpr <= max_fpr] idx_out = np.argmax(fpr > max_fpr) idx_in = idx_out - 1 x_interp = [fpr[idx_in], fpr[idx_out]] y_interp = [tpr[idx_in], tpr[idx_out]] new_tpr = np.append(new_tpr, np.interp(max_fpr, x_interp, y_interp)) return (new_fpr, new_tpr) new_fpr, new_tpr = _partial_roc(y_true, y_predict, max_fpr) partial_auc = auc(new_fpr, new_tpr) # Formula (5) from McClish 1989 fpr1 = 0 fpr2 = max_fpr min_area = 0.5 * (fpr2 - fpr1) * (fpr2 + fpr1) max_area = fpr2 - fpr1 return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) @pytest.mark.parametrize("drop", [True, False]) def test_roc_curve(drop): # Test Area under Receiver Operating Characteristic (ROC) curve y_true, _, y_score = make_prediction(binary=True) expected_auc = _auc(y_true, y_score) fpr, tpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=drop) roc_auc = auc(fpr, tpr) assert_array_almost_equal(roc_auc, expected_auc, decimal=2) assert_almost_equal(roc_auc, roc_auc_score(y_true, y_score)) assert fpr.shape == tpr.shape assert fpr.shape == thresholds.shape def test_roc_curve_end_points(): # Make sure that roc_curve returns a curve start at 0 and ending and # 1 even in corner cases rng = np.random.RandomState(0) y_true = np.array([0] * 50 + [1] * 50) y_pred = rng.randint(3, size=100) fpr, tpr, thr = roc_curve(y_true, y_pred, drop_intermediate=True) assert fpr[0] == 0 assert fpr[-1] == 1 assert fpr.shape == tpr.shape assert fpr.shape == thr.shape def test_roc_returns_consistency(): # Test whether the returned threshold matches up with tpr # make small toy dataset y_true, _, y_score = make_prediction(binary=True) fpr, tpr, thresholds = roc_curve(y_true, y_score) # use the given thresholds to determine the tpr tpr_correct = [] for t in thresholds: tp = np.sum((y_score >= t) & y_true) p = np.sum(y_true) tpr_correct.append(1.0 * tp / p) # compare tpr and tpr_correct to see if the thresholds' order was correct assert_array_almost_equal(tpr, tpr_correct, decimal=2) assert fpr.shape == tpr.shape assert fpr.shape == thresholds.shape def test_roc_curve_multi(): # roc_curve not applicable for multi-class problems y_true, _, y_score = 
def test_roc_curve_multi():
    # roc_curve not applicable for multi-class problems
    y_true, _, y_score = make_prediction(binary=False)

    with pytest.raises(ValueError):
        roc_curve(y_true, y_score)


def test_roc_curve_confidence():
    # roc_curve for confidence scores
    y_true, _, y_score = make_prediction(binary=True)

    fpr, tpr, thresholds = roc_curve(y_true, y_score - 0.5)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.90, decimal=2)
    assert fpr.shape == tpr.shape
    assert fpr.shape == thresholds.shape


def test_roc_curve_hard():
    # roc_curve for hard decisions
    y_true, pred, y_score = make_prediction(binary=True)

    # always predict one
    trivial_pred = np.ones(y_true.shape)
    fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.50, decimal=2)
    assert fpr.shape == tpr.shape
    assert fpr.shape == thresholds.shape

    # always predict zero
    trivial_pred = np.zeros(y_true.shape)
    fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.50, decimal=2)
    assert fpr.shape == tpr.shape
    assert fpr.shape == thresholds.shape

    # hard decisions
    fpr, tpr, thresholds = roc_curve(y_true, pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.78, decimal=2)
    assert fpr.shape == tpr.shape
    assert fpr.shape == thresholds.shape


def test_roc_curve_one_label():
    y_true = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    y_pred = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
    # assert there are warnings
    expected_message = (
        "No negative samples in y_true, false positive value should be meaningless"
    )
    with pytest.warns(UndefinedMetricWarning, match=expected_message):
        fpr, tpr, thresholds = roc_curve(y_true, y_pred)

    # all true labels, all fpr should be nan
    assert_array_equal(fpr, np.full(len(thresholds), np.nan))
    assert fpr.shape == tpr.shape
    assert fpr.shape == thresholds.shape

    # assert there are warnings
    expected_message = (
        "No positive samples in y_true, true positive value should be meaningless"
    )
    with pytest.warns(UndefinedMetricWarning, match=expected_message):
        fpr, tpr, thresholds = roc_curve([1 - x for x in y_true], y_pred)

    # all negative labels, all tpr should be nan
    assert_array_equal(tpr, np.full(len(thresholds), np.nan))
    assert fpr.shape == tpr.shape
    assert fpr.shape == thresholds.shape


def test_roc_curve_toydata():
    # Binary classification
    y_true = [0, 1]
    y_score = [0, 1]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 0, 1])
    assert_array_almost_equal(fpr, [0, 1, 1])
    assert_almost_equal(roc_auc, 1.0)

    y_true = [0, 1]
    y_score = [1, 0]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1, 1])
    assert_array_almost_equal(fpr, [0, 0, 1])
    assert_almost_equal(roc_auc, 0.0)

    y_true = [1, 0]
    y_score = [1, 1]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [0, 1])
    assert_almost_equal(roc_auc, 0.5)

    y_true = [1, 0]
    y_score = [1, 0]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 0, 1])
    assert_array_almost_equal(fpr, [0, 1, 1])
    assert_almost_equal(roc_auc, 1.0)

    y_true = [1, 0]
    y_score = [0.5, 0.5]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [0, 1])
    assert_almost_equal(roc_auc, 0.5)

    y_true = [0, 0]
    y_score = [0.25, 0.75]
    # assert UndefinedMetricWarning because of no positive sample in y_true
    expected_message = (
        "No positive samples in y_true, true positive value should be meaningless"
    )
    with pytest.warns(UndefinedMetricWarning, match=expected_message):
        tpr, fpr, _ = roc_curve(y_true, y_score)
    with pytest.raises(ValueError):
        roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0.0, 0.5, 1.0])
    assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan])

    y_true = [1, 1]
    y_score = [0.25, 0.75]
    # assert UndefinedMetricWarning because of no negative sample in y_true
    expected_message = (
        "No negative samples in y_true, false positive value should be meaningless"
    )
    with pytest.warns(UndefinedMetricWarning, match=expected_message):
        tpr, fpr, _ = roc_curve(y_true, y_score)
    with pytest.raises(ValueError):
        roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [np.nan, np.nan, np.nan])
    assert_array_almost_equal(fpr, [0.0, 0.5, 1.0])

    # Multi-label classification task
    y_true = np.array([[0, 1], [0, 1]])
    y_score = np.array([[0, 1], [0, 1]])
    with pytest.raises(ValueError):
        roc_auc_score(y_true, y_score, average="macro")
    with pytest.raises(ValueError):
        roc_auc_score(y_true, y_score, average="weighted")
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.0)

    y_true = np.array([[0, 1], [0, 1]])
    y_score = np.array([[0, 1], [1, 0]])
    with pytest.raises(ValueError):
        roc_auc_score(y_true, y_score, average="macro")
    with pytest.raises(ValueError):
        roc_auc_score(y_true, y_score, average="weighted")
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5)

    y_true = np.array([[1, 0], [0, 1]])
    y_score = np.array([[0, 1], [1, 0]])
    assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0)

    y_true = np.array([[1, 0], [0, 1]])
    y_score = np.array([[0.5, 0.5], [0.5, 0.5]])
    assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0.5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0.5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5)


def test_roc_curve_drop_intermediate():
    # Test that drop_intermediate drops the correct thresholds
    y_true = [0, 0, 0, 0, 1, 1]
    y_score = [0.0, 0.2, 0.5, 0.6, 0.7, 1.0]
    tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True)
    assert_array_almost_equal(thresholds, [np.inf, 1.0, 0.7, 0.0])

    # Test dropping thresholds with repeating scores
    y_true = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    y_score = [0.0, 0.1, 0.6, 0.6, 0.7, 0.8, 0.9, 0.6, 0.7, 0.8, 0.9, 0.9, 1.0]
    tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True)
    assert_array_almost_equal(thresholds, [np.inf, 1.0, 0.9, 0.7, 0.6, 0.0])
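
# A minimal illustrative sketch, added as an aside (not from the original suite):
# with drop_intermediate=False, roc_curve keeps one threshold per distinct score
# value (preceded by the +inf sentinel), so the thresholds retained above are a
# subset of this full list.
def test_roc_curve_no_drop_keeps_all_thresholds_sketch():
    y_true = [0, 0, 0, 0, 1, 1]
    y_score = [0.0, 0.2, 0.5, 0.6, 0.7, 1.0]
    _, _, thresholds = roc_curve(y_true, y_score, drop_intermediate=False)
    assert_array_almost_equal(thresholds, [np.inf, 1.0, 0.7, 0.6, 0.5, 0.2, 0.0])
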
def test_roc_curve_fpr_tpr_increasing():
    # Ensure that fpr and tpr returned by roc_curve are increasing.
    # Construct an edge case with float y_score and sample_weight
    # when some adjacent values of fpr and tpr are actually the same.
    y_true = [0, 0, 1, 1, 1]
    y_score = [0.1, 0.7, 0.3, 0.4, 0.5]
    sample_weight = np.repeat(0.2, 5)
    fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight)
    assert (np.diff(fpr) < 0).sum() == 0
    assert (np.diff(tpr) < 0).sum() == 0


def test_auc():
    # Test Area Under Curve (AUC) computation
    x = [0, 1]
    y = [0, 1]
    assert_array_almost_equal(auc(x, y), 0.5)
    x = [1, 0]
    y = [0, 1]
    assert_array_almost_equal(auc(x, y), 0.5)
    x = [1, 0, 0]
    y = [0, 1, 1]
    assert_array_almost_equal(auc(x, y), 0.5)
    x = [0, 1]
    y = [1, 1]
    assert_array_almost_equal(auc(x, y), 1)
    x = [0, 0.5, 1]
    y = [0, 0.5, 1]
    assert_array_almost_equal(auc(x, y), 0.5)


def test_auc_errors():
    # Incompatible shapes
    with pytest.raises(ValueError):
        auc([0.0, 0.5, 1.0], [0.1, 0.2])

    # Too few x values
    with pytest.raises(ValueError):
        auc([0.0], [0.1])

    # x is not in order
    x = [2, 1, 3, 4]
    y = [5, 6, 7, 8]
    error_message = "x is neither increasing nor decreasing : {}".format(np.array(x))
    with pytest.raises(ValueError, match=re.escape(error_message)):
        auc(x, y)


@pytest.mark.parametrize(
    "y_true, labels",
    [
        (np.array([0, 1, 0, 2]), [0, 1, 2]),
        (np.array([0, 1, 0, 2]), None),
        (["a", "b", "a", "c"], ["a", "b", "c"]),
        (["a", "b", "a", "c"], None),
    ],
)
def test_multiclass_ovo_roc_auc_toydata(y_true, labels):
    # Tests the one-vs-one multiclass ROC AUC algorithm
    # on a small example, representative of an expected use case.
    y_scores = np.array(
        [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]
    )

    # Used to compute the expected output.
    # Consider labels 0 and 1:
    # positive label is 0, negative label is 1
    score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35])
    # positive label is 1, negative label is 0
    score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5])
    average_score_01 = (score_01 + score_10) / 2

    # Consider labels 0 and 2:
    score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0])
    score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8])
    average_score_02 = (score_02 + score_20) / 2

    # Consider labels 1 and 2:
    score_12 = roc_auc_score([1, 0], [0.4, 0.2])
    score_21 = roc_auc_score([0, 1], [0.3, 0.8])
    average_score_12 = (score_12 + score_21) / 2

    # Unweighted, one-vs-one multiclass ROC AUC algorithm
    ovo_unweighted_score = (average_score_01 + average_score_02 + average_score_12) / 3
    assert_almost_equal(
        roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"),
        ovo_unweighted_score,
    )

    # Weighted, one-vs-one multiclass ROC AUC algorithm
    # Each term is weighted by the prevalence for the positive label.
    pair_scores = [average_score_01, average_score_02, average_score_12]
    prevalence = [0.75, 0.75, 0.50]
    ovo_weighted_score = np.average(pair_scores, weights=prevalence)
    assert_almost_equal(
        roc_auc_score(
            y_true, y_scores, labels=labels, multi_class="ovo", average="weighted"
        ),
        ovo_weighted_score,
    )

    # Check that average=None raises NotImplementedError
    error_message = "average=None is not implemented for multi_class='ovo'."
    with pytest.raises(NotImplementedError, match=error_message):
        roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo", average=None)
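
# A minimal illustrative sketch, added as an aside (not from the original suite):
# `_partial_roc_auc_score` defined above applies the McClish (1989) correction to
# the area under the truncated ROC curve, which is what `roc_auc_score` does when
# `max_fpr` is set, so the two should agree on a small toy example.
def test_partial_roc_auc_score_mcclish_sketch():
    y_true = np.array([0, 0, 1, 1])
    y_score = np.array([0.1, 0.4, 0.35, 0.8])
    max_fpr = 0.5
    expected = _partial_roc_auc_score(y_true, y_score, max_fpr)
    assert roc_auc_score(y_true, y_score, max_fpr=max_fpr) == pytest.approx(expected)
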
@pytest.mark.parametrize(
    "y_true, labels",
    [
        (np.array([0, 2, 0, 2]), [0, 1, 2]),
        (np.array(["a", "d", "a", "d"]), ["a", "b", "d"]),
    ],
)
def test_multiclass_ovo_roc_auc_toydata_binary(y_true, labels):
    # Tests the one-vs-one multiclass ROC AUC algorithm for binary y_true
    # on a small example, representative of an expected use case.
    y_scores = np.array(
        [[0.2, 0.0, 0.8], [0.6, 0.0, 0.4], [0.55, 0.0, 0.45], [0.4, 0.0, 0.6]]
    )

    # Used to compute the expected output.
    # Consider labels 0 and 1:
    # positive label is 0, negative label is 1
    score_01 = roc_auc_score([1, 0, 1, 0], [0.2, 0.6, 0.55, 0.4])
    # positive label is 1, negative label is 0
    score_10 = roc_auc_score([0, 1, 0, 1], [0.8, 0.4, 0.45, 0.6])
    ovo_score = (score_01 + score_10) / 2

    assert_almost_equal(
        roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), ovo_score
    )

    # Weighted, one-vs-one multiclass ROC AUC algorithm
    assert_almost_equal(
        roc_auc_score(
            y_true, y_scores, labels=labels, multi_class="ovo", average="weighted"
        ),
        ovo_score,
    )


@pytest.mark.parametrize(
    "y_true, labels",
    [
        (np.array([0, 1, 2, 2]), None),
        (["a", "b", "c", "c"], None),
        ([0, 1, 2, 2], [0, 1, 2]),
        (["a", "b", "c", "c"], ["a", "b", "c"]),
    ],
)
def test_multiclass_ovr_roc_auc_toydata(y_true, labels):
    # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm
    # on a small example, representative of an expected use case.
    y_scores = np.array(
        [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]
    )

    # Compute the expected result by individually computing the 'one-vs-rest'
    # ROC AUC scores for classes 0, 1, and 2.
    out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0])
    out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1])
    out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2])
    assert_almost_equal(
        roc_auc_score(
            y_true, y_scores, multi_class="ovr", labels=labels, average=None
        ),
        [out_0, out_1, out_2],
    )

    # Compute unweighted results (default behaviour is average="macro")
    result_unweighted = (out_0 + out_1 + out_2) / 3.0
    assert_almost_equal(
        roc_auc_score(y_true, y_scores, multi_class="ovr", labels=labels),
        result_unweighted,
    )

    # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm
    # on the same input (Provost & Domingos, 2000)
    result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5
    assert_almost_equal(
        roc_auc_score(
            y_true, y_scores, multi_class="ovr", labels=labels, average="weighted"
        ),
        result_weighted,
    )


@pytest.mark.parametrize(
    "multi_class, average",
    [
        ("ovr", "macro"),
        ("ovr", "micro"),
        ("ovo", "macro"),
    ],
)
def test_perfect_imperfect_chance_multiclass_roc_auc(multi_class, average):
    y_true = np.array([3, 1, 2, 0])

    # Perfect classifier (from a ranking point of view) has roc_auc_score = 1.0
    y_perfect = [
        [0.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.75, 0.05, 0.05, 0.15],
    ]
    assert_almost_equal(
        roc_auc_score(y_true, y_perfect, multi_class=multi_class, average=average),
        1.0,
    )

    # Imperfect classifier has roc_auc_score < 1.0
    y_imperfect = [
        [0.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ]
    assert (
        roc_auc_score(y_true, y_imperfect, multi_class=multi_class, average=average)
        < 1.0
    )

    # Chance level classifier has roc_auc_score = 0.5
    y_chance = 0.25 * np.ones((4, 4))
    assert roc_auc_score(
        y_true, y_chance, multi_class=multi_class, average=average
    ) == pytest.approx(0.5)


def test_micro_averaged_ovr_roc_auc(global_random_seed):
    seed = global_random_seed
    # Let's generate a set of random predictions and matching true labels such
    # that the predictions are not perfect. To make the problem more interesting,
    # we use an imbalanced class distribution (by using different parameters in
    # the Dirichlet prior, the conjugate prior of the multinomial distribution).
    y_pred = stats.dirichlet.rvs([2.0, 1.0, 0.5], size=1000, random_state=seed)
    y_true = np.asarray(
        [
            stats.multinomial.rvs(n=1, p=y_pred_i, random_state=seed).argmax()
            for y_pred_i in y_pred
        ]
    )
    y_onehot = label_binarize(y_true, classes=[0, 1, 2])
    fpr, tpr, _ = roc_curve(y_onehot.ravel(), y_pred.ravel())
    roc_auc_by_hand = auc(fpr, tpr)
    roc_auc_auto = roc_auc_score(y_true, y_pred, multi_class="ovr", average="micro")
    assert roc_auc_by_hand == pytest.approx(roc_auc_auto)


@pytest.mark.parametrize(
    "msg, y_true, labels",
    [
        ("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]),
        (
            "Parameter 'labels' must be unique",
            np.array(["a", "b", "c", "c"]),
            ["a", "a", "b"],
        ),
        (
            (
                "Number of classes in y_true not equal to the number of columns "
                "in 'y_score'"
            ),
            np.array([0, 2, 0, 2]),
            None,
        ),
        (
            "Parameter 'labels' must be ordered",
            np.array(["a", "b", "c", "c"]),
            ["a", "c", "b"],
        ),
        (
            (
                "Number of given labels, 2, not equal to the number of columns in "
                "'y_score', 3"
            ),
            np.array([0, 1, 2, 2]),
            [0, 1],
        ),
        (
            (
                "Number of given labels, 2, not equal to the number of columns in "
                "'y_score', 3"
            ),
            np.array(["a", "b", "c", "c"]),
            ["a", "b"],
        ),
        (
            (
                "Number of given labels, 4, not equal to the number of columns in "
                "'y_score', 3"
            ),
            np.array([0, 1, 2, 2]),
            [0, 1, 2, 3],
        ),
        (
            (
                "Number of given labels, 4, not equal to the number of columns in "
                "'y_score', 3"
            ),
            np.array(["a", "b", "c", "c"]),
            ["a", "b", "c", "d"],
        ),
        (
            "'y_true' contains labels not in parameter 'labels'",
            np.array(["a", "b", "c", "e"]),
            ["a", "b", "c"],
        ),
        (
            "'y_true' contains labels not in parameter 'labels'",
            np.array(["a", "b", "c", "d"]),
            ["a", "b", "c"],
        ),
        (
            "'y_true' contains labels not in parameter 'labels'",
            np.array([0, 1, 2, 3]),
            [0, 1, 2],
        ),
    ],
)
@pytest.mark.parametrize("multi_class", ["ovo", "ovr"])
def test_roc_auc_score_multiclass_labels_error(msg, y_true, labels, multi_class):
    y_scores = np.array(
        [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]
    )

    with pytest.raises(ValueError, match=msg):
        roc_auc_score(y_true, y_scores, labels=labels, multi_class=multi_class)


@pytest.mark.parametrize(
    "msg, kwargs",
    [
        (
            (
                r"average must be one of \('macro', 'weighted', None\) for "
                r"multiclass problems"
            ),
            {"average": "samples", "multi_class": "ovo"},
        ),
        (
            (
                r"average must be one of \('micro', 'macro', 'weighted', None\) for "
                r"multiclass problems"
            ),
            {"average": "samples", "multi_class": "ovr"},
        ),
        (
            (
                r"sample_weight is not supported for multiclass one-vs-one "
                r"ROC AUC, 'sample_weight' must be None in this case"
            ),
            {"multi_class": "ovo", "sample_weight": []},
        ),
        (
            (
                r"Partial AUC computation not available in multiclass setting, "
                r"'max_fpr' must be set to `None`, received `max_fpr=0.5` "
                r"instead"
            ),
            {"multi_class": "ovo", "max_fpr": 0.5},
        ),
        (r"multi_class must be in \('ovo', 'ovr'\)", {}),
    ],
)
def test_roc_auc_score_multiclass_error(msg, kwargs):
    # Test that roc_auc_score function returns an error when trying
    # to compute multiclass AUC for parameters where an output
    # is not defined.
    rng = check_random_state(404)
    y_score = rng.rand(20, 3)
    y_prob = softmax(y_score)
    y_true = rng.randint(0, 3, size=20)
    with pytest.raises(ValueError, match=msg):
        roc_auc_score(y_true, y_prob, **kwargs)


def test_auc_score_non_binary_class():
    # Test that roc_auc_score function returns an error when trying
    # to compute AUC for non-binary class values.
    rng = check_random_state(404)
    y_pred = rng.rand(10)
    # y_true contains only one class value
    y_true = np.zeros(10, dtype="int")
    err_msg = "ROC AUC score is not defined"
    with pytest.raises(ValueError, match=err_msg):
        roc_auc_score(y_true, y_pred)
    y_true = np.ones(10, dtype="int")
    with pytest.raises(ValueError, match=err_msg):
        roc_auc_score(y_true, y_pred)
    y_true = np.full(10, -1, dtype="int")
    with pytest.raises(ValueError, match=err_msg):
        roc_auc_score(y_true, y_pred)

    with warnings.catch_warnings(record=True):
        rng = check_random_state(404)
        y_pred = rng.rand(10)
        # y_true contains only one class value
        y_true = np.zeros(10, dtype="int")
        with pytest.raises(ValueError, match=err_msg):
            roc_auc_score(y_true, y_pred)
        y_true = np.ones(10, dtype="int")
        with pytest.raises(ValueError, match=err_msg):
            roc_auc_score(y_true, y_pred)
        y_true = np.full(10, -1, dtype="int")
        with pytest.raises(ValueError, match=err_msg):
            roc_auc_score(y_true, y_pred)


@pytest.mark.parametrize("curve_func", CURVE_FUNCS)
def test_binary_clf_curve_multiclass_error(curve_func):
    rng = check_random_state(404)
    y_true = rng.randint(0, 3, size=10)
    y_pred = rng.rand(10)
    msg = "multiclass format is not supported"
    with pytest.raises(ValueError, match=msg):
        curve_func(y_true, y_pred)


@pytest.mark.parametrize("curve_func", CURVE_FUNCS)
def test_binary_clf_curve_implicit_pos_label(curve_func):
    # Check that using string class labels raises an informative
    # error for any supported string dtype:
    msg = (
        "y_true takes value in {'a', 'b'} and pos_label is "
        "not specified: either make y_true take "
        "value in {0, 1} or {-1, 1} or pass pos_label "
        "explicitly."
    )
    with pytest.raises(ValueError, match=msg):
        curve_func(np.array(["a", "b"], dtype="<U1"), [0.0, 1.0])


# NOTE: the decorator/parametrization that feeds this test is missing here; the
# signature below is an assumed reconstruction from the variables used in the body.
def test_top_k_accuracy_score_binary(y_true, y_score, k, true_score):
    threshold = 0.5 if y_score.min() >= 0 and y_score.max() <= 1 else 0
    y_pred = (y_score > threshold).astype(np.int64) if k == 1 else y_true
    score = top_k_accuracy_score(y_true, y_score, k=k)
    score_acc = accuracy_score(y_true, y_pred)
    assert score == score_acc == pytest.approx(true_score)


@pytest.mark.parametrize(
    "y_true, true_score, labels",
    [
        (np.array([0, 1, 1, 2]), 0.75, [0, 1, 2, 3]),
        (np.array([0, 1, 1, 1]), 0.5, [0, 1, 2, 3]),
        (np.array([1, 1, 1, 1]), 0.5, [0, 1, 2, 3]),
        (np.array(["a", "e", "e", "a"]), 0.75, ["a", "b", "d", "e"]),
    ],
)
@pytest.mark.parametrize("labels_as_ndarray", [True, False])
def test_top_k_accuracy_score_multiclass_with_labels(
    y_true, true_score, labels, labels_as_ndarray
):
    """Test when labels and y_score are multiclass."""
    if labels_as_ndarray:
        labels = np.asarray(labels)
    y_score = np.array(
        [
            [0.4, 0.3, 0.2, 0.1],
            [0.1, 0.3, 0.4, 0.2],
            [0.4, 0.1, 0.2, 0.3],
            [0.3, 0.2, 0.4, 0.1],
        ]
    )

    score = top_k_accuracy_score(y_true, y_score, k=2, labels=labels)
    assert score == pytest.approx(true_score)


def test_top_k_accuracy_score_increasing():
    # Make sure increasing k leads to a higher score
    X, y = datasets.make_classification(
        n_classes=10, n_samples=1000, n_informative=10, random_state=0
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)

    for X, y in zip((X_train, X_test), (y_train, y_test)):
        scores = [
            top_k_accuracy_score(y, clf.predict_proba(X), k=k) for k in range(2, 10)
        ]

        assert np.all(np.diff(scores) > 0)
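
# A minimal illustrative sketch, added as an aside (not from the original suite):
# with k=1 and no tied scores, top_k_accuracy_score reduces to the plain accuracy
# of the argmax prediction.
def test_top_k_accuracy_score_k1_matches_argmax_accuracy_sketch():
    y_true = np.array([0, 1, 2, 2])
    y_score = np.array(
        [[0.8, 0.1, 0.1], [0.2, 0.5, 0.3], [0.1, 0.2, 0.7], [0.6, 0.3, 0.1]]
    )
    top1 = top_k_accuracy_score(y_true, y_score, k=1)
    assert top1 == pytest.approx(accuracy_score(y_true, y_score.argmax(axis=1)))
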
@pytest.mark.parametrize(
    "y_true, k, true_score",
    [
        ([0, 1, 2, 3], 1, 0.25),
        ([0, 1, 2, 3], 2, 0.5),
        ([0, 1, 2, 3], 3, 1),
    ],
)
def test_top_k_accuracy_score_ties(y_true, k, true_score):
    # Make sure highest indices labels are chosen first in case of ties
    y_score = np.array(
        [
            [5, 5, 7, 0],
            [1, 5, 5, 5],
            [0, 0, 3, 3],
            [1, 1, 1, 1],
        ]
    )
    assert top_k_accuracy_score(y_true, y_score, k=k) == pytest.approx(true_score)


@pytest.mark.parametrize(
    "y_true, k",
    [
        ([0, 1, 2, 3], 4),
        ([0, 1, 2, 3], 5),
    ],
)
def test_top_k_accuracy_score_warning(y_true, k):
    y_score = np.array(
        [
            [0.4, 0.3, 0.2, 0.1],
            [0.1, 0.4, 0.3, 0.2],
            [0.2, 0.1, 0.4, 0.3],
            [0.3, 0.2, 0.1, 0.4],
        ]
    )
    expected_message = (
        r"'k' \(\d+\) greater than or equal to 'n_classes' \(\d+\) will result in a "
        "perfect score and is therefore meaningless."
    )
    with pytest.warns(UndefinedMetricWarning, match=expected_message):
        score = top_k_accuracy_score(y_true, y_score, k=k)
    assert score == 1


@pytest.mark.parametrize(
    "y_true, y_score, labels, msg",
    [
        (
            [0, 0.57, 1, 2],
            [
                [0.2, 0.1, 0.7],
                [0.4, 0.3, 0.3],
                [0.3, 0.4, 0.3],
                [0.4, 0.5, 0.1],
            ],
            None,
            "y type must be 'binary' or 'multiclass', got 'continuous'",
        ),
        (
            [0, 1, 2, 3],
            [
                [0.2, 0.1, 0.7],
                [0.4, 0.3, 0.3],
                [0.3, 0.4, 0.3],
                [0.4, 0.5, 0.1],
            ],
            None,
            r"Number of classes in 'y_true' \(4\) not equal to the number of "
            r"classes in 'y_score' \(3\).",
        ),
        (
            ["c", "c", "a", "b"],
            [
                [0.2, 0.1, 0.7],
                [0.4, 0.3, 0.3],
                [0.3, 0.4, 0.3],
                [0.4, 0.5, 0.1],
            ],
            ["a", "b", "c", "c"],
            "Parameter 'labels' must be unique.",
        ),
        (
            ["c", "c", "a", "b"],
            [
                [0.2, 0.1, 0.7],
                [0.4, 0.3, 0.3],
                [0.3, 0.4, 0.3],
                [0.4, 0.5, 0.1],
            ],
            ["a", "c", "b"],
            "Parameter 'labels' must be ordered.",
        ),
        (
            [0, 0, 1, 2],
            [
                [0.2, 0.1, 0.7],
                [0.4, 0.3, 0.3],
                [0.3, 0.4, 0.3],
                [0.4, 0.5, 0.1],
            ],
            [0, 1, 2, 3],
            r"Number of given labels \(4\) not equal to the number of classes in "
            r"'y_score' \(3\).",
        ),
        (
            [0, 0, 1, 2],
            [
                [0.2, 0.1, 0.7],
                [0.4, 0.3, 0.3],
                [0.3, 0.4, 0.3],
                [0.4, 0.5, 0.1],
            ],
            [0, 1, 3],
            "'y_true' contains labels not in parameter 'labels'.",
        ),
        (
            [0, 1],
            [[0.5, 0.2, 0.2], [0.3, 0.4, 0.2]],
            None,
            (
                "`y_true` is binary while y_score is 2d with 3 classes. If"
                " `y_true` does not contain all the labels, `labels` must be provided"
            ),
        ),
    ],
)
def test_top_k_accuracy_score_error(y_true, y_score, labels, msg):
    with pytest.raises(ValueError, match=msg):
        top_k_accuracy_score(y_true, y_score, k=2, labels=labels)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_label_ranking_avg_precision_score_should_allow_csr_matrix_for_y_true_input(
    csr_container,
):
    # Test that label_ranking_average_precision_score accepts sparse y_true.
    # Non-regression test for #22575
    y_true = csr_container([[1, 0, 0], [0, 0, 1]])
    y_score = np.array([[0.5, 0.9, 0.6], [0, 0, 1]])
    result = label_ranking_average_precision_score(y_true, y_score)
    assert result == pytest.approx(2 / 3)


@pytest.mark.parametrize(
    "metric", [average_precision_score, det_curve, precision_recall_curve, roc_curve]
)
@pytest.mark.parametrize(
    "classes", [(False, True), (0, 1), (0.0, 1.0), ("zero", "one")]
)
def test_ranking_metric_pos_label_types(metric, classes):
    """Check that the metric works with different types of `pos_label`.

    We can expect `pos_label` to be a bool, an integer, a float, or a string.
    No error should be raised for those types.
""" rng = np.random.RandomState(42) n_samples, pos_label = 10, classes[-1] y_true = rng.choice(classes, size=n_samples, replace=True) y_proba = rng.rand(n_samples) result = metric(y_true, y_proba, pos_label=pos_label) if isinstance(result, float): assert not np.isnan(result) else: metric_1, metric_2, thresholds = result assert not np.isnan(metric_1).any() assert not np.isnan(metric_2).any() assert not np.isnan(thresholds).any() def test_roc_curve_with_probablity_estimates(global_random_seed): """Check that thresholds do not exceed 1.0 when `y_score` is a probability estimate. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/26193 """ rng = np.random.RandomState(global_random_seed) y_true = rng.randint(0, 2, size=10) y_score = rng.rand(10) _, _, thresholds = roc_curve(y_true, y_score) assert np.isinf(thresholds[0]) # TODO(1.7): remove def test_precision_recall_curve_deprecation_warning(): """Check the message for future deprecation.""" # Check precision_recall_curve function y_true, _, y_score = make_prediction(binary=True) warn_msg = "probas_pred was deprecated in version 1.5" with pytest.warns(FutureWarning, match=warn_msg): precision_recall_curve( y_true, probas_pred=y_score, ) error_msg = "`probas_pred` and `y_score` cannot be both specified" with pytest.raises(ValueError, match=error_msg): precision_recall_curve( y_true, probas_pred=y_score, y_score=y_score, )