Bootstrap-t-student/bootstrap-t.ipynb

3.7 KiB

import numpy as np
import pandas as pd
from math import sqrt
from scipy.stats import sem
from scipy.stats import t
def generate_bootstraps(data, n_bootstraps=100):
    """Lazily yield bootstrap resamples of ``data``.

    Every resample has as many rows as ``data``. Row positions are drawn
    with ``np.random.choice`` (NumPy's global random state, sampling with
    replacement by default), so seeding ``np.random`` makes the sequence of
    resamples reproducible.

    Args:
        data: Object exposing ``shape``, ``len()`` and positional
            ``.iloc[rows, :]`` indexing; this file passes a pandas
            DataFrame.
        n_bootstraps: How many resamples to produce.

    Yields:
        ``data.iloc[rows, :]`` for each freshly drawn array of row positions.
    """
    n_rows = data.shape[0]
    for _ in range(n_bootstraps):
        row_positions = np.random.choice(len(data), size=n_rows)
        resample = data.iloc[row_positions, :]
        yield resample
def get_t_stat(data1, data2):
    """Return the t statistic for the difference between two sample means.

    The numerator is ``mean(data1) - mean(data2)``. The denominator is the
    standard error of that difference: the square root of the sum of the
    squared standard errors of the mean of each sample, as computed by
    ``scipy.stats.sem``.

    Args:
        data1: First sample of numeric observations.
        data2: Second sample of numeric observations.

    Returns:
        The difference of the means divided by its standard error.
    """
    mean_difference = np.mean(data1) - np.mean(data2)
    # Squared standard errors add up for the difference of two means.
    squared_error_sum = sem(data1) ** 2.0 + sem(data2) ** 2.0
    return mean_difference / sqrt(squared_error_sum)
def independent_t_test(data, columns, alpha=0.05):
    """Run a two-sided t-test on two columns, averaged over bootstraps.

    The t statistic is computed with ``get_t_stat`` on every resample
    produced by ``generate_bootstraps(data)`` and the mean of those
    statistics is used as the test statistic.

    Args:
        data: Table whose ``shape[0]`` is the number of observations per
            column; this file passes a pandas DataFrame.
        columns: Sequence whose first two items name the columns to compare.
        alpha: Significance level of the two-sided test.

    Returns:
        Tuple ``(t_stat, df, cv, p)``: the bootstrap-averaged t statistic,
        the degrees of freedom ``2 * n - 2``, the two-sided critical value
        at level ``alpha``, and the two-sided p-value.
    """
    first, second = columns[0], columns[1]
    t_stats = [
        get_t_stat(sample[first], sample[second])
        for sample in generate_bootstraps(data)
    ]
    # Average over the number of resamples actually drawn. Dividing by the
    # number of rows (as was done before) scales the statistic by
    # n_bootstraps / n_rows instead of averaging it.
    t_stat = sum(t_stats) / len(t_stats)

    data_size = data.shape[0]
    df = 2 * data_size - 2
    # The statistic is compared through abs() and the p-value is two-sided,
    # so the critical value must leave alpha / 2 in each tail.
    cv = t.ppf(1.0 - alpha / 2.0, df)
    # The survival function avoids the cancellation of 1 - cdf for tiny
    # tail probabilities.
    p = t.sf(abs(t_stat), df) * 2.0
    return t_stat, df, cv, p
def make_decision(data, columns, alpha=0.05):
    """Run the bootstrap t-test and print the outcome.

    Prints the statistics returned by ``independent_t_test`` and then two
    verdicts on the null hypothesis of equal means: the first compares the
    absolute t statistic with the critical value, the second compares the
    p-value with ``alpha``.

    Args:
        data: Table passed through to ``independent_t_test``.
        columns: Names of the two columns to compare.
        alpha: Significance level used for both verdicts.

    Returns:
        None. The results are only printed.
    """
    t_stat, df, cv, p = independent_t_test(data, columns, alpha)
    print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\n')

    # Verdict based on the critical value.
    if abs(t_stat) <= cv:
        print('Accept null hypothesis that the means are equal.')
    else:
        print('Reject the null hypothesis that the means are equal.')

    # Verdict based on the p-value.
    if p > alpha:
        print('Accept null hypothesis that the means are equal.')
    else:
        print('Reject the null hypothesis that the means are equal.')
# Load the experiment measurements and test whether the means of the
# 'Weight' and 'Age' columns differ (default significance level).
dataset = pd.read_csv(filepath_or_buffer='experiment_data.csv')
make_decision(data=dataset, columns=['Weight', 'Age'])
t: 6.903407918031469, df: 998, cv: 1.6463818766348755, p: 9.018563673635072e-12

Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.