from functools import partial
from inspect import signature
from itertools import product
from itertools import chain
from itertools import permutations

import numpy as np
import scipy.sparse as sp

import pytest

from sklearn.datasets import make_multilabel_classification
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _num_samples
from sklearn.utils.validation import check_random_state
from sklearn.utils import shuffle

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_less
from sklearn.utils._testing import ignore_warnings

from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import coverage_error
from sklearn.metrics import d2_tweedie_score
from sklearn.metrics import d2_pinball_score
from sklearn.metrics import d2_absolute_error_score
from sklearn.metrics import det_curve
from sklearn.metrics import explained_variance_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import hinge_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import log_loss
from sklearn.metrics import max_error
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_tweedie_deviance
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import median_absolute_error
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import mean_pinball_loss
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import r2_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import zero_one_loss
from sklearn.metrics import ndcg_score
from sklearn.metrics import dcg_score
from sklearn.metrics import top_k_accuracy_score

from sklearn.metrics._base import _average_binary_score


# Note toward developers about metric testing
# -------------------------------------------
# It is often possible to write one general test for several metrics:
#
# - invariance properties, e.g. invariance to sample order
# - common behavior for an argument, e.g. "normalize": with value True the
#   metric returns the mean of the per-sample scores, with value False it
#   returns their sum.
#
# In order to improve the overall metric testing, it is a good idea to write
# first a specific test for the given metric and then add a general test for
# all metrics that have the same behavior.
#
# Two types of data structures are used in order to implement this system:
# dictionaries of metrics and lists of metrics with common properties.
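# Illustrative sketch (not part of the test suite): the "normalize" convention
# mentioned above, shown with accuracy_score on a toy binary problem.
# >>> accuracy_score([0, 1, 1, 1], [0, 1, 0, 0])
# 0.5
# >>> accuracy_score([0, 1, 1, 1], [0, 1, 0, 0], normalize=False)
# 2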
#
# Dictionaries of metrics
# ------------------------
# The goal of having those dictionaries is to have an easy way to call a
# particular metric and associate a name to each function:
#
# - REGRESSION_METRICS: all regression metrics.
# - CLASSIFICATION_METRICS: all classification metrics which compare a ground
#   truth and the estimated targets as returned by a classifier.
# - THRESHOLDED_METRICS: all classification metrics which compare a ground
#   truth and a score, e.g. estimated probabilities or decision function
#   (format might vary).
#
# Those dictionaries will be used to test systematically some invariance
# properties, e.g. invariance toward several input layouts.
#

REGRESSION_METRICS = {
    "max_error": max_error,
    "mean_absolute_error": mean_absolute_error,
    "mean_squared_error": mean_squared_error,
    "mean_pinball_loss": mean_pinball_loss,
    "median_absolute_error": median_absolute_error,
    "mean_absolute_percentage_error": mean_absolute_percentage_error,
    "explained_variance_score": explained_variance_score,
    "r2_score": partial(r2_score, multioutput="variance_weighted"),
    "mean_normal_deviance": partial(mean_tweedie_deviance, power=0),
    "mean_poisson_deviance": mean_poisson_deviance,
    "mean_gamma_deviance": mean_gamma_deviance,
    "mean_compound_poisson_deviance": partial(mean_tweedie_deviance, power=1.4),
    "d2_tweedie_score": partial(d2_tweedie_score, power=1.4),
    "d2_pinball_score": d2_pinball_score,
    "d2_absolute_error_score": d2_absolute_error_score,
}

CLASSIFICATION_METRICS = {
    "accuracy_score": accuracy_score,
    "balanced_accuracy_score": balanced_accuracy_score,
    "adjusted_balanced_accuracy_score": partial(
        balanced_accuracy_score, adjusted=True
    ),
    "unnormalized_accuracy_score": partial(accuracy_score, normalize=False),
    # `confusion_matrix` returns absolute values and hence behaves unnormalized.
    # Naming it with an "unnormalized_" prefix is necessary for this module to
    # skip the sample_weight scaling checks, which would fail for unnormalized
    # metrics.
"unnormalized_confusion_matrix": confusion_matrix, "normalized_confusion_matrix": lambda *args, **kwargs: ( confusion_matrix(*args, **kwargs).astype("float") / confusion_matrix(*args, **kwargs).sum(axis=1)[:, np.newaxis] ), "unnormalized_multilabel_confusion_matrix": multilabel_confusion_matrix, "unnormalized_multilabel_confusion_matrix_sample": partial( multilabel_confusion_matrix, samplewise=True ), "hamming_loss": hamming_loss, "zero_one_loss": zero_one_loss, "unnormalized_zero_one_loss": partial(zero_one_loss, normalize=False), # These are needed to test averaging "jaccard_score": jaccard_score, "precision_score": precision_score, "recall_score": recall_score, "f1_score": f1_score, "f2_score": partial(fbeta_score, beta=2), "f0.5_score": partial(fbeta_score, beta=0.5), "matthews_corrcoef_score": matthews_corrcoef, "weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5), "weighted_f1_score": partial(f1_score, average="weighted"), "weighted_f2_score": partial(fbeta_score, average="weighted", beta=2), "weighted_precision_score": partial(precision_score, average="weighted"), "weighted_recall_score": partial(recall_score, average="weighted"), "weighted_jaccard_score": partial(jaccard_score, average="weighted"), "micro_f0.5_score": partial(fbeta_score, average="micro", beta=0.5), "micro_f1_score": partial(f1_score, average="micro"), "micro_f2_score": partial(fbeta_score, average="micro", beta=2), "micro_precision_score": partial(precision_score, average="micro"), "micro_recall_score": partial(recall_score, average="micro"), "micro_jaccard_score": partial(jaccard_score, average="micro"), "macro_f0.5_score": partial(fbeta_score, average="macro", beta=0.5), "macro_f1_score": partial(f1_score, average="macro"), "macro_f2_score": partial(fbeta_score, average="macro", beta=2), "macro_precision_score": partial(precision_score, average="macro"), "macro_recall_score": partial(recall_score, average="macro"), "macro_jaccard_score": partial(jaccard_score, average="macro"), "samples_f0.5_score": partial(fbeta_score, average="samples", beta=0.5), "samples_f1_score": partial(f1_score, average="samples"), "samples_f2_score": partial(fbeta_score, average="samples", beta=2), "samples_precision_score": partial(precision_score, average="samples"), "samples_recall_score": partial(recall_score, average="samples"), "samples_jaccard_score": partial(jaccard_score, average="samples"), "cohen_kappa_score": cohen_kappa_score, } def precision_recall_curve_padded_thresholds(*args, **kwargs): """ The dimensions of precision-recall pairs and the threshold array as returned by the precision_recall_curve do not match. See func:`sklearn.metrics.precision_recall_curve` This prevents implicit conversion of return value triple to an higher dimensional np.array of dtype('float64') (it will be of dtype('object) instead). This again is needed for assert_array_equal to work correctly. As a workaround we pad the threshold array with NaN values to match the dimension of precision and recall arrays respectively. 
""" precision, recall, thresholds = precision_recall_curve(*args, **kwargs) pad_threshholds = len(precision) - len(thresholds) return np.array( [ precision, recall, np.pad( thresholds.astype(np.float64), pad_width=(0, pad_threshholds), mode="constant", constant_values=[np.nan], ), ] ) CURVE_METRICS = { "roc_curve": roc_curve, "precision_recall_curve": precision_recall_curve_padded_thresholds, "det_curve": det_curve, } THRESHOLDED_METRICS = { "coverage_error": coverage_error, "label_ranking_loss": label_ranking_loss, "log_loss": log_loss, "unnormalized_log_loss": partial(log_loss, normalize=False), "hinge_loss": hinge_loss, "brier_score_loss": brier_score_loss, "roc_auc_score": roc_auc_score, # default: average="macro" "weighted_roc_auc": partial(roc_auc_score, average="weighted"), "samples_roc_auc": partial(roc_auc_score, average="samples"), "micro_roc_auc": partial(roc_auc_score, average="micro"), "ovr_roc_auc": partial(roc_auc_score, average="macro", multi_class="ovr"), "weighted_ovr_roc_auc": partial( roc_auc_score, average="weighted", multi_class="ovr" ), "ovo_roc_auc": partial(roc_auc_score, average="macro", multi_class="ovo"), "weighted_ovo_roc_auc": partial( roc_auc_score, average="weighted", multi_class="ovo" ), "partial_roc_auc": partial(roc_auc_score, max_fpr=0.5), "average_precision_score": average_precision_score, # default: average="macro" "weighted_average_precision_score": partial( average_precision_score, average="weighted" ), "samples_average_precision_score": partial( average_precision_score, average="samples" ), "micro_average_precision_score": partial(average_precision_score, average="micro"), "label_ranking_average_precision_score": label_ranking_average_precision_score, "ndcg_score": ndcg_score, "dcg_score": dcg_score, "top_k_accuracy_score": top_k_accuracy_score, } ALL_METRICS = dict() ALL_METRICS.update(THRESHOLDED_METRICS) ALL_METRICS.update(CLASSIFICATION_METRICS) ALL_METRICS.update(REGRESSION_METRICS) ALL_METRICS.update(CURVE_METRICS) # Lists of metrics with common properties # --------------------------------------- # Lists of metrics with common properties are used to test systematically some # functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics that # are symmetric with respect to their input argument y_true and y_pred. # # When you add a new metric or functionality, check if a general test # is already written. 
# Those metrics don't support binary inputs METRIC_UNDEFINED_BINARY = { "samples_f0.5_score", "samples_f1_score", "samples_f2_score", "samples_precision_score", "samples_recall_score", "samples_jaccard_score", "coverage_error", "unnormalized_multilabel_confusion_matrix_sample", "label_ranking_loss", "label_ranking_average_precision_score", "dcg_score", "ndcg_score", } # Those metrics don't support multiclass inputs METRIC_UNDEFINED_MULTICLASS = { "brier_score_loss", "micro_roc_auc", "samples_roc_auc", "partial_roc_auc", "roc_auc_score", "weighted_roc_auc", "average_precision_score", "weighted_average_precision_score", "micro_average_precision_score", "samples_average_precision_score", "jaccard_score", # with default average='binary', multiclass is prohibited "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", # curves "roc_curve", "precision_recall_curve", "det_curve", } # Metric undefined with "binary" or "multiclass" input METRIC_UNDEFINED_BINARY_MULTICLASS = METRIC_UNDEFINED_BINARY.union( METRIC_UNDEFINED_MULTICLASS ) # Metrics with an "average" argument METRICS_WITH_AVERAGING = { "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", "jaccard_score", } # Threshold-based metrics with an "average" argument THRESHOLDED_METRICS_WITH_AVERAGING = { "roc_auc_score", "average_precision_score", "partial_roc_auc", } # Metrics with a "pos_label" argument METRICS_WITH_POS_LABEL = { "roc_curve", "precision_recall_curve", "det_curve", "brier_score_loss", "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", "jaccard_score", "average_precision_score", "weighted_average_precision_score", "micro_average_precision_score", "samples_average_precision_score", } # Metrics with a "labels" argument # TODO: Handle multi_class metrics that has a labels argument as well as a # decision function argument. 
e.g hinge_loss METRICS_WITH_LABELS = { "unnormalized_confusion_matrix", "normalized_confusion_matrix", "roc_curve", "precision_recall_curve", "det_curve", "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score", "jaccard_score", "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", "weighted_precision_score", "weighted_recall_score", "weighted_jaccard_score", "micro_f0.5_score", "micro_f1_score", "micro_f2_score", "micro_precision_score", "micro_recall_score", "micro_jaccard_score", "macro_f0.5_score", "macro_f1_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", "macro_jaccard_score", "unnormalized_multilabel_confusion_matrix", "unnormalized_multilabel_confusion_matrix_sample", "cohen_kappa_score", } # Metrics with a "normalize" option METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score", "top_k_accuracy_score", "zero_one_loss", } # Threshold-based metrics with "multilabel-indicator" format support THRESHOLDED_MULTILABEL_METRICS = { "log_loss", "unnormalized_log_loss", "roc_auc_score", "weighted_roc_auc", "samples_roc_auc", "micro_roc_auc", "partial_roc_auc", "average_precision_score", "weighted_average_precision_score", "samples_average_precision_score", "micro_average_precision_score", "coverage_error", "label_ranking_loss", "ndcg_score", "dcg_score", "label_ranking_average_precision_score", } # Classification metrics with "multilabel-indicator" format MULTILABELS_METRICS = { "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", "zero_one_loss", "unnormalized_zero_one_loss", "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", "weighted_precision_score", "weighted_recall_score", "weighted_jaccard_score", "macro_f0.5_score", "macro_f1_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", "macro_jaccard_score", "micro_f0.5_score", "micro_f1_score", "micro_f2_score", "micro_precision_score", "micro_recall_score", "micro_jaccard_score", "unnormalized_multilabel_confusion_matrix", "samples_f0.5_score", "samples_f1_score", "samples_f2_score", "samples_precision_score", "samples_recall_score", "samples_jaccard_score", } # Regression metrics with "multioutput-continuous" format support MULTIOUTPUT_METRICS = { "mean_absolute_error", "median_absolute_error", "mean_squared_error", "r2_score", "explained_variance_score", "mean_absolute_percentage_error", "mean_pinball_loss", "d2_pinball_score", "d2_absolute_error_score", } # Symmetric with respect to their input arguments y_true and y_pred # metric(y_true, y_pred) == metric(y_pred, y_true). SYMMETRIC_METRICS = { "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", "zero_one_loss", "unnormalized_zero_one_loss", "micro_jaccard_score", "macro_jaccard_score", "jaccard_score", "samples_jaccard_score", "f1_score", "micro_f1_score", "macro_f1_score", "weighted_recall_score", # P = R = F = accuracy in multiclass case "micro_f0.5_score", "micro_f1_score", "micro_f2_score", "micro_precision_score", "micro_recall_score", "matthews_corrcoef_score", "mean_absolute_error", "mean_squared_error", "median_absolute_error", "max_error", # Pinball loss is only symmetric for alpha=0.5 which is the default. "mean_pinball_loss", "cohen_kappa_score", "mean_normal_deviance", } # Asymmetric with respect to their input arguments y_true and y_pred # metric(y_true, y_pred) != metric(y_pred, y_true). 
NOT_SYMMETRIC_METRICS = { "balanced_accuracy_score", "adjusted_balanced_accuracy_score", "explained_variance_score", "r2_score", "unnormalized_confusion_matrix", "normalized_confusion_matrix", "roc_curve", "precision_recall_curve", "det_curve", "precision_score", "recall_score", "f2_score", "f0.5_score", "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score", "weighted_precision_score", "weighted_jaccard_score", "unnormalized_multilabel_confusion_matrix", "macro_f0.5_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", "hinge_loss", "mean_gamma_deviance", "mean_poisson_deviance", "mean_compound_poisson_deviance", "d2_tweedie_score", "d2_pinball_score", "d2_absolute_error_score", "mean_absolute_percentage_error", } # No Sample weight support METRICS_WITHOUT_SAMPLE_WEIGHT = { "median_absolute_error", "max_error", "ovo_roc_auc", "weighted_ovo_roc_auc", } METRICS_REQUIRE_POSITIVE_Y = { "mean_poisson_deviance", "mean_gamma_deviance", "mean_compound_poisson_deviance", "d2_tweedie_score", } def _require_positive_targets(y1, y2): """Make targets strictly positive""" offset = abs(min(y1.min(), y2.min())) + 1 y1 += offset y2 += offset return y1, y2 def test_symmetry_consistency(): # We shouldn't forget any metrics assert ( SYMMETRIC_METRICS | NOT_SYMMETRIC_METRICS | set(THRESHOLDED_METRICS) | METRIC_UNDEFINED_BINARY_MULTICLASS ) == set(ALL_METRICS) assert (SYMMETRIC_METRICS & NOT_SYMMETRIC_METRICS) == set() @pytest.mark.parametrize("name", sorted(SYMMETRIC_METRICS)) def test_symmetric_metric(name): # Test the symmetry of score and loss functions random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(20,)) y_pred = random_state.randint(0, 2, size=(20,)) if name in METRICS_REQUIRE_POSITIVE_Y: y_true, y_pred = _require_positive_targets(y_true, y_pred) y_true_bin = random_state.randint(0, 2, size=(20, 25)) y_pred_bin = random_state.randint(0, 2, size=(20, 25)) metric = ALL_METRICS[name] if name in METRIC_UNDEFINED_BINARY: if name in MULTILABELS_METRICS: assert_allclose( metric(y_true_bin, y_pred_bin), metric(y_pred_bin, y_true_bin), err_msg="%s is not symmetric" % name, ) else: assert False, "This case is currently unhandled" else: assert_allclose( metric(y_true, y_pred), metric(y_pred, y_true), err_msg="%s is not symmetric" % name, ) @pytest.mark.parametrize("name", sorted(NOT_SYMMETRIC_METRICS)) def test_not_symmetric_metric(name): # Test the symmetry of score and loss functions random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(20,)) y_pred = random_state.randint(0, 2, size=(20,)) if name in METRICS_REQUIRE_POSITIVE_Y: y_true, y_pred = _require_positive_targets(y_true, y_pred) metric = ALL_METRICS[name] # use context manager to supply custom error message with pytest.raises(AssertionError): assert_array_equal(metric(y_true, y_pred), metric(y_pred, y_true)) raise ValueError("%s seems to be symmetric" % name) @pytest.mark.parametrize( "name", sorted(set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) ) def test_sample_order_invariance(name): random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(20,)) y_pred = random_state.randint(0, 2, size=(20,)) if name in METRICS_REQUIRE_POSITIVE_Y: y_true, y_pred = _require_positive_targets(y_true, y_pred) y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0) with ignore_warnings(): metric = ALL_METRICS[name] assert_allclose( metric(y_true, y_pred), metric(y_true_shuffle, y_pred_shuffle), err_msg="%s is not sample order invariant" % name, 
) @ignore_warnings def test_sample_order_invariance_multilabel_and_multioutput(): random_state = check_random_state(0) # Generate some data y_true = random_state.randint(0, 2, size=(20, 25)) y_pred = random_state.randint(0, 2, size=(20, 25)) y_score = random_state.normal(size=y_true.shape) y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle( y_true, y_pred, y_score, random_state=0 ) for name in MULTILABELS_METRICS: metric = ALL_METRICS[name] assert_allclose( metric(y_true, y_pred), metric(y_true_shuffle, y_pred_shuffle), err_msg="%s is not sample order invariant" % name, ) for name in THRESHOLDED_MULTILABEL_METRICS: metric = ALL_METRICS[name] assert_allclose( metric(y_true, y_score), metric(y_true_shuffle, y_score_shuffle), err_msg="%s is not sample order invariant" % name, ) for name in MULTIOUTPUT_METRICS: metric = ALL_METRICS[name] assert_allclose( metric(y_true, y_score), metric(y_true_shuffle, y_score_shuffle), err_msg="%s is not sample order invariant" % name, ) assert_allclose( metric(y_true, y_pred), metric(y_true_shuffle, y_pred_shuffle), err_msg="%s is not sample order invariant" % name, ) @pytest.mark.parametrize( "name", sorted(set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) ) def test_format_invariance_with_1d_vectors(name): random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20,)) y2 = random_state.randint(0, 2, size=(20,)) if name in METRICS_REQUIRE_POSITIVE_Y: y1, y2 = _require_positive_targets(y1, y2) y1_list = list(y1) y2_list = list(y2) y1_1d, y2_1d = np.array(y1), np.array(y2) assert_array_equal(y1_1d.ndim, 1) assert_array_equal(y2_1d.ndim, 1) y1_column = np.reshape(y1_1d, (-1, 1)) y2_column = np.reshape(y2_1d, (-1, 1)) y1_row = np.reshape(y1_1d, (1, -1)) y2_row = np.reshape(y2_1d, (1, -1)) with ignore_warnings(): metric = ALL_METRICS[name] measure = metric(y1, y2) assert_allclose( metric(y1_list, y2_list), measure, err_msg="%s is not representation invariant with list" % name, ) assert_allclose( metric(y1_1d, y2_1d), measure, err_msg="%s is not representation invariant with np-array-1d" % name, ) assert_allclose( metric(y1_column, y2_column), measure, err_msg="%s is not representation invariant with np-array-column" % name, ) # Mix format support assert_allclose( metric(y1_1d, y2_list), measure, err_msg="%s is not representation invariant with mix np-array-1d and list" % name, ) assert_allclose( metric(y1_list, y2_1d), measure, err_msg="%s is not representation invariant with mix np-array-1d and list" % name, ) assert_allclose( metric(y1_1d, y2_column), measure, err_msg=( "%s is not representation invariant with mix " "np-array-1d and np-array-column" ) % name, ) assert_allclose( metric(y1_column, y2_1d), measure, err_msg=( "%s is not representation invariant with mix " "np-array-1d and np-array-column" ) % name, ) assert_allclose( metric(y1_list, y2_column), measure, err_msg=( "%s is not representation invariant with mix list and np-array-column" ) % name, ) assert_allclose( metric(y1_column, y2_list), measure, err_msg=( "%s is not representation invariant with mix list and np-array-column" ) % name, ) # These mix representations aren't allowed with pytest.raises(ValueError): metric(y1_1d, y2_row) with pytest.raises(ValueError): metric(y1_row, y2_1d) with pytest.raises(ValueError): metric(y1_list, y2_row) with pytest.raises(ValueError): metric(y1_row, y2_list) with pytest.raises(ValueError): metric(y1_column, y2_row) with pytest.raises(ValueError): metric(y1_row, y2_column) # NB: We do not test for y1_row, y2_row as these may be # 
interpreted as multilabel or multioutput data. if name not in ( MULTIOUTPUT_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTILABELS_METRICS ): with pytest.raises(ValueError): metric(y1_row, y2_row) @pytest.mark.parametrize( "name", sorted(set(CLASSIFICATION_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) ) def test_classification_invariance_string_vs_numbers_labels(name): # Ensure that classification metrics with string labels are invariant random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20,)) y2 = random_state.randint(0, 2, size=(20,)) y1_str = np.array(["eggs", "spam"])[y1] y2_str = np.array(["eggs", "spam"])[y2] pos_label_str = "spam" labels_str = ["eggs", "spam"] with ignore_warnings(): metric = CLASSIFICATION_METRICS[name] measure_with_number = metric(y1, y2) # Ugly, but handle case with a pos_label and label metric_str = metric if name in METRICS_WITH_POS_LABEL: metric_str = partial(metric_str, pos_label=pos_label_str) measure_with_str = metric_str(y1_str, y2_str) assert_array_equal( measure_with_number, measure_with_str, err_msg="{0} failed string vs number invariance test".format(name), ) measure_with_strobj = metric_str(y1_str.astype("O"), y2_str.astype("O")) assert_array_equal( measure_with_number, measure_with_strobj, err_msg="{0} failed string object vs number invariance test".format(name), ) if name in METRICS_WITH_LABELS: metric_str = partial(metric_str, labels=labels_str) measure_with_str = metric_str(y1_str, y2_str) assert_array_equal( measure_with_number, measure_with_str, err_msg="{0} failed string vs number invariance test".format(name), ) measure_with_strobj = metric_str(y1_str.astype("O"), y2_str.astype("O")) assert_array_equal( measure_with_number, measure_with_strobj, err_msg="{0} failed string vs number invariance test".format(name), ) @pytest.mark.parametrize("name", THRESHOLDED_METRICS) def test_thresholded_invariance_string_vs_numbers_labels(name): # Ensure that thresholded metrics with string labels are invariant random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20,)) y2 = random_state.randint(0, 2, size=(20,)) y1_str = np.array(["eggs", "spam"])[y1] pos_label_str = "spam" with ignore_warnings(): metric = THRESHOLDED_METRICS[name] if name not in METRIC_UNDEFINED_BINARY: # Ugly, but handle case with a pos_label and label metric_str = metric if name in METRICS_WITH_POS_LABEL: metric_str = partial(metric_str, pos_label=pos_label_str) measure_with_number = metric(y1, y2) measure_with_str = metric_str(y1_str, y2) assert_array_equal( measure_with_number, measure_with_str, err_msg="{0} failed string vs number invariance test".format(name), ) measure_with_strobj = metric_str(y1_str.astype("O"), y2) assert_array_equal( measure_with_number, measure_with_strobj, err_msg="{0} failed string object vs number invariance test".format( name ), ) else: # TODO those metrics doesn't support string label yet with pytest.raises(ValueError): metric(y1_str, y2) with pytest.raises(ValueError): metric(y1_str.astype("O"), y2) invalids_nan_inf = [ ([0, 1], [np.inf, np.inf]), ([0, 1], [np.nan, np.nan]), ([0, 1], [np.nan, np.inf]), ([0, 1], [np.inf, 1]), ([0, 1], [np.nan, 1]), ] @pytest.mark.parametrize( "metric", chain(THRESHOLDED_METRICS.values(), REGRESSION_METRICS.values()) ) @pytest.mark.parametrize("y_true, y_score", invalids_nan_inf) def test_regression_thresholded_inf_nan_input(metric, y_true, y_score): # Reshape since coverage_error only accepts 2D arrays. 
    if metric == coverage_error:
        y_true = [y_true]
        y_score = [y_score]

    with pytest.raises(ValueError, match=r"contains (NaN|infinity)"):
        metric(y_true, y_score)


@pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values())
@pytest.mark.parametrize(
    "y_true, y_score",
    invalids_nan_inf +
    # Add an additional case for classification only
    # non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/6809
    [
        ([np.nan, 1, 2], [1, 2, 3]),
        ([np.inf, 1, 2], [1, 2, 3]),
    ],  # type: ignore
)
def test_classification_inf_nan_input(metric, y_true, y_score):
    """Check that classification metrics raise an error message mentioning
    the occurrence of non-finite values in the target vectors."""
    if not np.isfinite(y_true).all():
        input_name = "y_true"
        if np.isnan(y_true).any():
            unexpected_value = "NaN"
        else:
            unexpected_value = "infinity or a value too large"
    else:
        input_name = "y_pred"
        if np.isnan(y_score).any():
            unexpected_value = "NaN"
        else:
            unexpected_value = "infinity or a value too large"

    err_msg = f"Input {input_name} contains {unexpected_value}"
    with pytest.raises(ValueError, match=err_msg):
        metric(y_true, y_score)


@pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values())
def test_classification_binary_continuous_input(metric):
    """Check that classification metrics raise an error message when given
    a mix of binary and continuous targets."""
    y_true, y_score = ["a", "b", "a"], [0.1, 0.2, 0.3]
    err_msg = (
        "Classification metrics can't handle a mix of binary and continuous targets"
    )
    with pytest.raises(ValueError, match=err_msg):
        metric(y_true, y_score)


@ignore_warnings
def check_single_sample(name):
    # Non-regression test: scores should work with a single sample.
    # This is important for leave-one-out cross validation.
    # Score functions tested are those that formerly called np.squeeze,
    # which turns an array of size 1 into a 0-d array (!).
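    # Illustrative aside (assuming plain NumPy semantics), showing the squeeze
    # behavior mentioned above:
    # >>> np.squeeze(np.array([3.0])).ndim
    # 0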
metric = ALL_METRICS[name] # assert that no exception is thrown if name in METRICS_REQUIRE_POSITIVE_Y: values = [1, 2] else: values = [0, 1] for i, j in product(values, repeat=2): metric([i], [j]) @ignore_warnings def check_single_sample_multioutput(name): metric = ALL_METRICS[name] for i, j, k, l in product([0, 1], repeat=4): metric(np.array([[i, j]]), np.array([[k, l]])) @pytest.mark.parametrize( "name", sorted( set(ALL_METRICS) # Those metrics are not always defined with one sample # or in multiclass classification - METRIC_UNDEFINED_BINARY_MULTICLASS - set(THRESHOLDED_METRICS) ), ) def test_single_sample(name): check_single_sample(name) @pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS | MULTILABELS_METRICS)) def test_single_sample_multioutput(name): check_single_sample_multioutput(name) @pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS)) def test_multioutput_number_of_output_differ(name): y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) y_pred = np.array([[0, 0], [1, 0], [0, 0]]) metric = ALL_METRICS[name] with pytest.raises(ValueError): metric(y_true, y_pred) @pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS)) def test_multioutput_regression_invariance_to_dimension_shuffling(name): # test invariance to dimension shuffling random_state = check_random_state(0) y_true = random_state.uniform(0, 2, size=(20, 5)) y_pred = random_state.uniform(0, 2, size=(20, 5)) metric = ALL_METRICS[name] error = metric(y_true, y_pred) for _ in range(3): perm = random_state.permutation(y_true.shape[1]) assert_allclose( metric(y_true[:, perm], y_pred[:, perm]), error, err_msg="%s is not dimension shuffling invariant" % (name), ) @ignore_warnings def test_multilabel_representation_invariance(): # Generate some data n_classes = 4 n_samples = 50 _, y1 = make_multilabel_classification( n_features=1, n_classes=n_classes, random_state=0, n_samples=n_samples, allow_unlabeled=True, ) _, y2 = make_multilabel_classification( n_features=1, n_classes=n_classes, random_state=1, n_samples=n_samples, allow_unlabeled=True, ) # To make sure at least one empty label is present y1 = np.vstack([y1, [[0] * n_classes]]) y2 = np.vstack([y2, [[0] * n_classes]]) y1_sparse_indicator = sp.coo_matrix(y1) y2_sparse_indicator = sp.coo_matrix(y2) y1_list_array_indicator = list(y1) y2_list_array_indicator = list(y2) y1_list_list_indicator = [list(a) for a in y1_list_array_indicator] y2_list_list_indicator = [list(a) for a in y2_list_array_indicator] for name in MULTILABELS_METRICS: metric = ALL_METRICS[name] # XXX cruel hack to work with partial functions if isinstance(metric, partial): metric.__module__ = "tmp" metric.__name__ = name measure = metric(y1, y2) # Check representation invariance assert_allclose( metric(y1_sparse_indicator, y2_sparse_indicator), measure, err_msg=( "%s failed representation invariance between " "dense and sparse indicator formats." ) % name, ) assert_almost_equal( metric(y1_list_list_indicator, y2_list_list_indicator), measure, err_msg=( "%s failed representation invariance " "between dense array and list of list " "indicator formats." ) % name, ) assert_almost_equal( metric(y1_list_array_indicator, y2_list_array_indicator), measure, err_msg=( "%s failed representation invariance " "between dense and list of array " "indicator formats." 
) % name, ) @pytest.mark.parametrize("name", sorted(MULTILABELS_METRICS)) def test_raise_value_error_multilabel_sequences(name): # make sure the multilabel-sequence format raises ValueError multilabel_sequences = [ [[1], [2], [0, 1]], [(), (2), (0, 1)], [[]], [()], np.array([[], [1, 2]], dtype="object"), ] metric = ALL_METRICS[name] for seq in multilabel_sequences: with pytest.raises(ValueError): metric(seq, seq) @pytest.mark.parametrize("name", sorted(METRICS_WITH_NORMALIZE_OPTION)) def test_normalize_option_binary_classification(name): # Test in the binary case n_classes = 2 n_samples = 20 random_state = check_random_state(0) y_true = random_state.randint(0, n_classes, size=(n_samples,)) y_pred = random_state.randint(0, n_classes, size=(n_samples,)) y_score = random_state.normal(size=y_true.shape) metrics = ALL_METRICS[name] pred = y_score if name in THRESHOLDED_METRICS else y_pred measure_normalized = metrics(y_true, pred, normalize=True) measure_not_normalized = metrics(y_true, pred, normalize=False) assert_array_less( -1.0 * measure_normalized, 0, err_msg="We failed to test correctly the normalize option", ) assert_allclose( measure_normalized, measure_not_normalized / n_samples, err_msg=f"Failed with {name}", ) @pytest.mark.parametrize("name", sorted(METRICS_WITH_NORMALIZE_OPTION)) def test_normalize_option_multiclass_classification(name): # Test in the multiclass case n_classes = 4 n_samples = 20 random_state = check_random_state(0) y_true = random_state.randint(0, n_classes, size=(n_samples,)) y_pred = random_state.randint(0, n_classes, size=(n_samples,)) y_score = random_state.uniform(size=(n_samples, n_classes)) metrics = ALL_METRICS[name] pred = y_score if name in THRESHOLDED_METRICS else y_pred measure_normalized = metrics(y_true, pred, normalize=True) measure_not_normalized = metrics(y_true, pred, normalize=False) assert_array_less( -1.0 * measure_normalized, 0, err_msg="We failed to test correctly the normalize option", ) assert_allclose( measure_normalized, measure_not_normalized / n_samples, err_msg=f"Failed with {name}", ) @pytest.mark.parametrize( "name", sorted(METRICS_WITH_NORMALIZE_OPTION.intersection(MULTILABELS_METRICS)) ) def test_normalize_option_multilabel_classification(name): # Test in the multilabel case n_classes = 4 n_samples = 100 random_state = check_random_state(0) # for both random_state 0 and 1, y_true and y_pred has at least one # unlabelled entry _, y_true = make_multilabel_classification( n_features=1, n_classes=n_classes, random_state=0, allow_unlabeled=True, n_samples=n_samples, ) _, y_pred = make_multilabel_classification( n_features=1, n_classes=n_classes, random_state=1, allow_unlabeled=True, n_samples=n_samples, ) y_score = random_state.uniform(size=y_true.shape) # To make sure at least one empty label is present y_true += [0] * n_classes y_pred += [0] * n_classes metrics = ALL_METRICS[name] pred = y_score if name in THRESHOLDED_METRICS else y_pred measure_normalized = metrics(y_true, pred, normalize=True) measure_not_normalized = metrics(y_true, pred, normalize=False) assert_array_less( -1.0 * measure_normalized, 0, err_msg="We failed to test correctly the normalize option", ) assert_allclose( measure_normalized, measure_not_normalized / n_samples, err_msg=f"Failed with {name}", ) @ignore_warnings def _check_averaging( metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel ): n_samples, n_classes = y_true_binarize.shape # No averaging label_measure = metric(y_true, y_pred, average=None) assert_allclose( label_measure, [ 
metric(y_true_binarize[:, i], y_pred_binarize[:, i]) for i in range(n_classes) ], ) # Micro measure micro_measure = metric(y_true, y_pred, average="micro") assert_allclose( micro_measure, metric(y_true_binarize.ravel(), y_pred_binarize.ravel()) ) # Macro measure macro_measure = metric(y_true, y_pred, average="macro") assert_allclose(macro_measure, np.mean(label_measure)) # Weighted measure weights = np.sum(y_true_binarize, axis=0, dtype=int) if np.sum(weights) != 0: weighted_measure = metric(y_true, y_pred, average="weighted") assert_allclose(weighted_measure, np.average(label_measure, weights=weights)) else: weighted_measure = metric(y_true, y_pred, average="weighted") assert_allclose(weighted_measure, 0) # Sample measure if is_multilabel: sample_measure = metric(y_true, y_pred, average="samples") assert_allclose( sample_measure, np.mean( [ metric(y_true_binarize[i], y_pred_binarize[i]) for i in range(n_samples) ] ), ) with pytest.raises(ValueError): metric(y_true, y_pred, average="unknown") with pytest.raises(ValueError): metric(y_true, y_pred, average="garbage") def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score): is_multilabel = type_of_target(y_true).startswith("multilabel") metric = ALL_METRICS[name] if name in METRICS_WITH_AVERAGING: _check_averaging( metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel ) elif name in THRESHOLDED_METRICS_WITH_AVERAGING: _check_averaging( metric, y_true, y_score, y_true_binarize, y_score, is_multilabel ) else: raise ValueError("Metric is not recorded as having an average option") @pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) def test_averaging_multiclass(name): n_samples, n_classes = 50, 3 random_state = check_random_state(0) y_true = random_state.randint(0, n_classes, size=(n_samples,)) y_pred = random_state.randint(0, n_classes, size=(n_samples,)) y_score = random_state.uniform(size=(n_samples, n_classes)) lb = LabelBinarizer().fit(y_true) y_true_binarize = lb.transform(y_true) y_pred_binarize = lb.transform(y_pred) check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) @pytest.mark.parametrize( "name", sorted(METRICS_WITH_AVERAGING | THRESHOLDED_METRICS_WITH_AVERAGING) ) def test_averaging_multilabel(name): n_samples, n_classes = 40, 5 _, y = make_multilabel_classification( n_features=1, n_classes=n_classes, random_state=5, n_samples=n_samples, allow_unlabeled=False, ) y_true = y[:20] y_pred = y[20:] y_score = check_random_state(0).normal(size=(20, n_classes)) y_true_binarize = y_true y_pred_binarize = y_pred check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) @pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) def test_averaging_multilabel_all_zeroes(name): y_true = np.zeros((20, 3)) y_pred = np.zeros((20, 3)) y_score = np.zeros((20, 3)) y_true_binarize = y_true y_pred_binarize = y_pred check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score) def test_averaging_binary_multilabel_all_zeroes(): y_true = np.zeros((20, 3)) y_pred = np.zeros((20, 3)) y_true_binarize = y_true y_pred_binarize = y_pred # Test _average_binary_score for weight.sum() == 0 binary_metric = lambda y_true, y_score, average="macro": _average_binary_score( precision_score, y_true, y_score, average ) _check_averaging( binary_metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel=True, ) @pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING)) def test_averaging_multilabel_all_ones(name): y_true = 
np.ones((20, 3))
    y_pred = np.ones((20, 3))
    y_score = np.ones((20, 3))
    y_true_binarize = y_true
    y_pred_binarize = y_pred

    check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score)


@ignore_warnings
def check_sample_weight_invariance(name, metric, y1, y2):
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 10, size=len(y1))

    # top_k_accuracy_score always leads to a perfect score for k > 1 in the
    # binary case
    metric = partial(metric, k=1) if name == "top_k_accuracy_score" else metric

    # check that unit weights give the same score as no weight
    unweighted_score = metric(y1, y2, sample_weight=None)
    assert_allclose(
        unweighted_score,
        metric(y1, y2, sample_weight=np.ones(shape=len(y1))),
        err_msg="For %s sample_weight=None is not equivalent to sample_weight=ones"
        % name,
    )

    # check that the weighted and unweighted scores are unequal
    weighted_score = metric(y1, y2, sample_weight=sample_weight)

    # use context manager to supply custom error message
    with pytest.raises(AssertionError):
        assert_allclose(unweighted_score, weighted_score)
        raise ValueError(
            "Unweighted and weighted scores are unexpectedly "
            "almost equal (%s) and (%s) "
            "for %s" % (unweighted_score, weighted_score, name)
        )

    # check that sample_weight can be a list
    weighted_score_list = metric(y1, y2, sample_weight=sample_weight.tolist())
    assert_allclose(
        weighted_score,
        weighted_score_list,
        err_msg=(
            "Weighted scores for array and list "
            "sample_weight input are not equal (%s != %s) for %s"
        )
        % (weighted_score, weighted_score_list, name),
    )

    # check that integer weights are the same as repeated samples
    repeat_weighted_score = metric(
        np.repeat(y1, sample_weight, axis=0),
        np.repeat(y2, sample_weight, axis=0),
        sample_weight=None,
    )
    assert_allclose(
        weighted_score,
        repeat_weighted_score,
        err_msg="Weighting %s is not equal to repeating samples" % name,
    )

    # check that ignoring a fraction of the samples is equivalent to setting
    # the corresponding weights to zero
    sample_weight_subset = sample_weight[1::2]
    sample_weight_zeroed = np.copy(sample_weight)
    sample_weight_zeroed[::2] = 0
    y1_subset = y1[1::2]
    y2_subset = y2[1::2]
    weighted_score_subset = metric(
        y1_subset, y2_subset, sample_weight=sample_weight_subset
    )
    weighted_score_zeroed = metric(y1, y2, sample_weight=sample_weight_zeroed)
    assert_allclose(
        weighted_score_subset,
        weighted_score_zeroed,
        err_msg=(
            "Zeroing weights does not give the same result as "
            "removing the corresponding samples (%s != %s) for %s"
        )
        % (weighted_score_zeroed, weighted_score_subset, name),
    )

    if not name.startswith("unnormalized"):
        # check that the score is invariant under scaling of the weights by a
        # common factor
        for scaling in [2, 0.3]:
            assert_allclose(
                weighted_score,
                metric(y1, y2, sample_weight=sample_weight * scaling),
                err_msg="%s sample_weight is not invariant under scaling" % name,
            )

    # Check that if the number of samples in y_true and sample_weight are not
    # equal, a meaningful error is raised.
error_message = ( r"Found input variables with inconsistent numbers of " r"samples: \[{}, {}, {}\]".format( _num_samples(y1), _num_samples(y2), _num_samples(sample_weight) * 2 ) ) with pytest.raises(ValueError, match=error_message): metric(y1, y2, sample_weight=np.hstack([sample_weight, sample_weight])) @pytest.mark.parametrize( "name", sorted( set(ALL_METRICS).intersection(set(REGRESSION_METRICS)) - METRICS_WITHOUT_SAMPLE_WEIGHT ), ) def test_regression_sample_weight_invariance(name): n_samples = 50 random_state = check_random_state(0) # regression y_true = random_state.random_sample(size=(n_samples,)) y_pred = random_state.random_sample(size=(n_samples,)) metric = ALL_METRICS[name] check_sample_weight_invariance(name, metric, y_true, y_pred) @pytest.mark.parametrize( "name", sorted( set(ALL_METRICS) - set(REGRESSION_METRICS) - METRICS_WITHOUT_SAMPLE_WEIGHT - METRIC_UNDEFINED_BINARY ), ) def test_binary_sample_weight_invariance(name): # binary n_samples = 50 random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(n_samples,)) y_pred = random_state.randint(0, 2, size=(n_samples,)) y_score = random_state.random_sample(size=(n_samples,)) metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: check_sample_weight_invariance(name, metric, y_true, y_score) else: check_sample_weight_invariance(name, metric, y_true, y_pred) @pytest.mark.parametrize( "name", sorted( set(ALL_METRICS) - set(REGRESSION_METRICS) - METRICS_WITHOUT_SAMPLE_WEIGHT - METRIC_UNDEFINED_BINARY_MULTICLASS ), ) def test_multiclass_sample_weight_invariance(name): # multiclass n_samples = 50 random_state = check_random_state(0) y_true = random_state.randint(0, 5, size=(n_samples,)) y_pred = random_state.randint(0, 5, size=(n_samples,)) y_score = random_state.random_sample(size=(n_samples, 5)) metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: # softmax temp = np.exp(-y_score) y_score_norm = temp / temp.sum(axis=-1).reshape(-1, 1) check_sample_weight_invariance(name, metric, y_true, y_score_norm) else: check_sample_weight_invariance(name, metric, y_true, y_pred) @pytest.mark.parametrize( "name", sorted( (MULTILABELS_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTIOUTPUT_METRICS) - METRICS_WITHOUT_SAMPLE_WEIGHT ), ) def test_multilabel_sample_weight_invariance(name): # multilabel indicator random_state = check_random_state(0) _, ya = make_multilabel_classification( n_features=1, n_classes=10, random_state=0, n_samples=50, allow_unlabeled=False ) _, yb = make_multilabel_classification( n_features=1, n_classes=10, random_state=1, n_samples=50, allow_unlabeled=False ) y_true = np.vstack([ya, yb]) y_pred = np.vstack([ya, ya]) y_score = random_state.randint(1, 4, size=y_true.shape) metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: check_sample_weight_invariance(name, metric, y_true, y_score) else: check_sample_weight_invariance(name, metric, y_true, y_pred) @ignore_warnings def test_no_averaging_labels(): # test labels argument when not using averaging # in multi-class and multi-label cases y_true_multilabel = np.array([[1, 1, 0, 0], [1, 1, 0, 0]]) y_pred_multilabel = np.array([[0, 0, 1, 1], [0, 1, 1, 0]]) y_true_multiclass = np.array([0, 1, 2]) y_pred_multiclass = np.array([0, 2, 3]) labels = np.array([3, 0, 1, 2]) _, inverse_labels = np.unique(labels, return_inverse=True) for name in METRICS_WITH_AVERAGING: for y_true, y_pred in [ [y_true_multiclass, y_pred_multiclass], [y_true_multilabel, y_pred_multilabel], ]: if name not in MULTILABELS_METRICS and y_pred.ndim > 1: continue metric = 
ALL_METRICS[name]
            score_labels = metric(y_true, y_pred, labels=labels, average=None)
            score = metric(y_true, y_pred, average=None)
            assert_array_equal(score_labels, score[inverse_labels])


@pytest.mark.parametrize(
    "name", sorted(MULTILABELS_METRICS - {"unnormalized_multilabel_confusion_matrix"})
)
def test_multilabel_label_permutations_invariance(name):
    random_state = check_random_state(0)
    n_samples, n_classes = 20, 4

    y_true = random_state.randint(0, 2, size=(n_samples, n_classes))
    y_score = random_state.randint(0, 2, size=(n_samples, n_classes))

    metric = ALL_METRICS[name]
    score = metric(y_true, y_score)

    for perm in permutations(range(n_classes), n_classes):
        y_score_perm = y_score[:, perm]
        y_true_perm = y_true[:, perm]

        current_score = metric(y_true_perm, y_score_perm)
        assert_almost_equal(score, current_score)


@pytest.mark.parametrize(
    "name", sorted(THRESHOLDED_MULTILABEL_METRICS | MULTIOUTPUT_METRICS)
)
def test_thresholded_multilabel_multioutput_permutations_invariance(name):
    random_state = check_random_state(0)
    n_samples, n_classes = 20, 4
    y_true = random_state.randint(0, 2, size=(n_samples, n_classes))
    y_score = random_state.normal(size=y_true.shape)

    # Makes sure all samples have at least one label. This works around errors
    # when running metrics where average="samples"
    y_true[y_true.sum(1) == 4, 0] = 0
    y_true[y_true.sum(1) == 0, 0] = 1

    metric = ALL_METRICS[name]
    score = metric(y_true, y_score)

    for perm in permutations(range(n_classes), n_classes):
        y_score_perm = y_score[:, perm]
        y_true_perm = y_true[:, perm]

        current_score = metric(y_true_perm, y_score_perm)
        if metric == mean_absolute_percentage_error:
            assert np.isfinite(current_score)
            assert current_score > 1e6
            # Here we are not comparing the values in case of MAPE because
            # whenever y_true value is exactly zero, the MAPE value doesn't
            # signify anything. Thus, in this case we are just expecting
            # a very large finite value.
        else:
            assert_almost_equal(score, current_score)


@pytest.mark.parametrize(
    "name", sorted(set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS)
)
def test_thresholded_metric_permutation_invariance(name):
    n_samples, n_classes = 100, 3
    random_state = check_random_state(0)

    y_score = random_state.rand(n_samples, n_classes)
    temp = np.exp(-y_score)
    y_score = temp / temp.sum(axis=-1).reshape(-1, 1)
    y_true = random_state.randint(0, n_classes, size=n_samples)

    metric = ALL_METRICS[name]
    score = metric(y_true, y_score)
    for perm in permutations(range(n_classes), n_classes):
        inverse_perm = np.zeros(n_classes, dtype=int)
        inverse_perm[list(perm)] = np.arange(n_classes)
        y_score_perm = y_score[:, inverse_perm]
        y_true_perm = np.take(perm, y_true)

        current_score = metric(y_true_perm, y_score_perm)
        assert_almost_equal(score, current_score)


@pytest.mark.parametrize("metric_name", CLASSIFICATION_METRICS)
def test_metrics_consistent_type_error(metric_name):
    # check that an understandable message is raised when the types of y_true
    # and y_pred mismatch
    rng = np.random.RandomState(42)
    y1 = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=object)
    y2 = rng.randint(0, 2, size=y1.size)

    err_msg = "Labels in y_true and y_pred should be of the same type."
    with pytest.raises(TypeError, match=err_msg):
        CLASSIFICATION_METRICS[metric_name](y1, y2)


@pytest.mark.parametrize(
    "metric, y_pred_threshold",
    [
        (average_precision_score, True),
        (brier_score_loss, True),
        (f1_score, False),
        (partial(fbeta_score, beta=1), False),
        (jaccard_score, False),
        (precision_recall_curve, True),
        (precision_score, False),
        (recall_score, False),
        (roc_curve, True),
    ],
)
@pytest.mark.parametrize("dtype_y_str", [str, object])
def test_metrics_pos_label_error_str(metric, y_pred_threshold, dtype_y_str):
    # Check the error message raised when `pos_label` is not specified and the
    # targets are strings.
    rng = np.random.RandomState(42)
    y1 = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=dtype_y_str)
    y2 = rng.randint(0, 2, size=y1.size)

    if not y_pred_threshold:
        y2 = np.array(["spam", "eggs"], dtype=dtype_y_str)[y2]

    err_msg_pos_label_None = (
        "y_true takes value in {'eggs', 'spam'} and pos_label is not "
        "specified: either make y_true take value in {0, 1} or {-1, 1} or "
        "pass pos_label explicit"
    )
    err_msg_pos_label_1 = (
        r"pos_label=1 is not a valid label. It should be one of "
        r"\['eggs', 'spam'\]"
    )
    pos_label_default = signature(metric).parameters["pos_label"].default

    err_msg = err_msg_pos_label_1 if pos_label_default == 1 else err_msg_pos_label_None
    with pytest.raises(ValueError, match=err_msg):
        metric(y1, y2)
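# Illustrative sketch (not part of the test suite): with string targets and the
# default pos_label=1, a binary metric such as f1_score raises the
# "pos_label=1 is not a valid label" error matched above, e.g.
# >>> f1_score(["spam", "eggs", "spam"], ["spam", "spam", "eggs"])
# Traceback (most recent call last):
#     ...
# ValueError: pos_label=1 is not a valid label. It should be one of ['eggs', 'spam']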