# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
import scipy.stats
import itertools

from sklearn.model_selection import train_test_split
from sklearn.metrics import get_scorer


def ftest(y_target, *y_model_predictions):
    """
    F-test to compare 2 or more models.

    Parameters
    -----------
    y_target : array-like, shape=[n_samples]
        True class labels as a 1D NumPy array.

    *y_model_predictions : array-likes, shape=[n_samples]
        Variable number of 2 or more arrays that
        contain the predicted class labels
        from models as 1D NumPy arrays.

    Returns
    -----------
    f, p : float, float
        The F-value and the corresponding p-value.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/evaluate/ftest/
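
    A minimal sketch (the toy labels below are illustrative
    assumptions, not an example taken from the mlxtend docs):

    >>> import numpy as np
    >>> y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1])
    >>> y_mod1 = np.array([0, 0, 0, 0, 1, 1, 1, 0])
    >>> y_mod2 = np.array([0, 0, 0, 0, 1, 1, 0, 0])
    >>> y_mod3 = np.array([0, 1, 0, 0, 1, 1, 1, 1])
    >>> f, p = ftest(y_true, y_mod1, y_mod2, y_mod3)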
    """

    num_models = len(y_model_predictions)

    # Checks: all arrays must be 1D and contain the same number of samples
    model_lens = set()
    y_model_predictions = list(y_model_predictions)
    for ary in ([y_target] + y_model_predictions):
        if len(ary.shape) != 1:
            raise ValueError('One or more input arrays are not 1-dimensional.')
        model_lens.add(ary.shape[0])

    if len(model_lens) > 1:
        raise ValueError('Each prediction array must have the '
                         'same number of samples.')

    if num_models < 2:
        raise ValueError('Provide at least 2 model prediction arrays.')

    num_examples = len(y_target)

    # Per-model accuracies and counts of correctly classified examples
    accuracies = []
    correctly_classified_all_models = 0
    correctly_classified_collection = []
    for pred in y_model_predictions:
        correctly_classified = (y_target == pred).sum()
        acc = correctly_classified / num_examples
        accuracies.append(acc)
        correctly_classified_all_models += correctly_classified
        correctly_classified_collection.append(correctly_classified)

    avg_acc = sum(accuracies) / len(accuracies)

    # sum of squares of classifiers (between-classifier variation)
    ssa = (num_examples * sum([acc**2 for acc in accuracies])
           - num_examples*num_models*avg_acc**2)

    # sum of squares of objects (between-example variation):
    # enumerate every possible pattern of correct (1) / incorrect (0)
    # predictions across the num_models classifiers, then count how
    # many examples fall into each pattern
    binary_combin = list(itertools.product([0, 1], repeat=num_models))
    ary = np.hstack([(y_target == mod).reshape(-1, 1) for
                     mod in y_model_predictions]).astype(int)
    correctly_classified_objects = 0
    binary_combin_totals = np.zeros(len(binary_combin))
    for i, c in enumerate(binary_combin):
        binary_combin_totals[i] = ((ary == c).sum(axis=1) == num_models).sum()
        correctly_classified_objects += (sum(c)**2 * binary_combin_totals[i])

    ssb = (1./num_models * correctly_classified_objects
           - num_examples*num_models*avg_acc**2)

    # total sum of squares
    sst = num_examples*num_models*avg_acc*(1 - avg_acc)

    # sum of squares for the classification-object interaction
    ssab = sst - ssa - ssb

    mean_ssa = ssa / (num_models - 1)
    mean_ssab = ssab / ((num_models - 1)*(num_examples - 1))

    f = mean_ssa / mean_ssab

    degrees_of_freedom_1 = num_models - 1
    # matches the (num_models - 1)*(num_examples - 1) denominator of mean_ssab
    degrees_of_freedom_2 = degrees_of_freedom_1 * (num_examples - 1)

    p_value = scipy.stats.f.sf(f, degrees_of_freedom_1, degrees_of_freedom_2)

    return f, p_value


def combined_ftest_5x2cv(estimator1, estimator2, X, y,
                         scoring=None,
                         random_seed=None):
    """
    Implements the 5x2cv combined F test proposed
    by Alpaydin (1999)
    to compare the performance of two models.

    Parameters
    ----------
    estimator1 : scikit-learn classifier or regressor

    estimator2 : scikit-learn classifier or regressor

    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    scoring : str, callable, or None (default: None)
        If None (default), uses 'accuracy' for sklearn classifiers
        and 'r2' for sklearn regressors.
        If str, uses a sklearn scoring metric string identifier, for example
        {accuracy, f1, precision, recall, roc_auc} for classifiers,
        {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
        'median_absolute_error', 'r2'} for regressors.
        If a callable object or function is provided, it has to conform to
        sklearn's signature ``scorer(estimator, X, y)``; see
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
        for more information.
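        For example, a conforming callable could be written as
        ``lambda est, X, y: est.score(X, y)`` (an illustrative
        sketch, not a scorer shipped with scikit-learn).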

    random_seed : int or None (default: None)
        Random seed for creating the test/train splits.

    Returns
    ----------
    f : float
        The F-statistic.

    pvalue : float
        Two-tailed p-value.
        If the chosen significance level is larger
        than the p-value, we reject the null hypothesis
        and conclude that there is a significant difference
        between the two compared models.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/evaluate/combined_ftest_5x2cv/
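
    A minimal sketch (the estimators and synthetic data below are
    illustrative assumptions, not an example taken from the mlxtend docs):

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> X, y = make_classification(n_samples=500, random_state=1)
    >>> clf1 = LogisticRegression(max_iter=1000, random_state=1)
    >>> clf2 = DecisionTreeClassifier(random_state=1)
    >>> f, p = combined_ftest_5x2cv(clf1, clf2, X, y, random_seed=1)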
    """

    rng = np.random.RandomState(random_seed)

    if scoring is None:
        if estimator1._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif estimator1._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        scorer = get_scorer(scoring)
    else:
        scorer = scoring

    variances = []
    differences = []

    def score_diff(X_1, X_2, y_1, y_2):
        # Fit both estimators on the first half of the data and
        # score them on the second half
        estimator1.fit(X_1, y_1)
        estimator2.fit(X_1, y_1)
        est1_score = scorer(estimator1, X_2, y_2)
        est2_score = scorer(estimator2, X_2, y_2)
        score_diff = est1_score - est2_score
        return score_diff

    # 5 iterations of 2-fold cross-validation
    for i in range(5):

        randint = rng.randint(low=0, high=32767)
        X_1, X_2, y_1, y_2 = \
            train_test_split(X, y, test_size=0.5,
                             random_state=randint)

        # score differences on both folds, plus their mean and variance
        score_diff_1 = score_diff(X_1, X_2, y_1, y_2)
        score_diff_2 = score_diff(X_2, X_1, y_2, y_1)
        score_mean = (score_diff_1 + score_diff_2) / 2.
        score_var = ((score_diff_1 - score_mean)**2 +
                     (score_diff_2 - score_mean)**2)

        differences.extend([score_diff_1**2, score_diff_2**2])
        variances.append(score_var)

    # F statistic: sum of the 10 squared differences over twice
    # the sum of the 5 fold variances (Alpaydin 1999)
    numerator = sum(differences)
    denominator = 2*(sum(variances))
    f_stat = numerator / denominator

    # the combined 5x2cv F statistic is approximately F-distributed
    # with 10 and 5 degrees of freedom
    p_value = scipy.stats.f.sf(f_stat, 10, 5)

    return float(f_stat), float(p_value)
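

if __name__ == '__main__':
    # Optional convenience entry point (a sketch): running this module
    # directly executes the doctest examples in the docstrings above.
    import doctest
    doctest.testmod()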