3.7 KiB
3.7 KiB
import numpy as np
import pandas as pd
from math import sqrt
from scipy.stats import sem
from scipy.stats import t
def generate_bootstraps(data, n_bootstraps=100):
    """Yield bootstrap resamples of ``data``, one at a time.

    Every resample has the same number of rows as ``data``; its rows are
    picked by drawing row positions at random with ``np.random.choice``
    (sampling with replacement, so rows may repeat).

    Args:
        data: Table exposing ``.shape``, ``len()`` and two-axis ``.iloc``
            indexing (the script passes a pandas DataFrame).
        n_bootstraps: How many resamples to generate.

    Yields:
        ``data.iloc[positions, :]`` for each freshly drawn set of positions.
    """
    n_rows = data.shape[0]
    for _ in range(n_bootstraps):
        positions = np.random.choice(len(data), size=n_rows)
        resample = data.iloc[positions, :]
        yield resample
def get_t_stat(data1, data2):
    """Return the t statistic for the difference between two sample means.

    The statistic is ``mean(data1) - mean(data2)`` divided by the standard
    error of that difference, which is the square root of the sum of the
    squared standard errors of the two samples (``scipy.stats.sem``).

    Args:
        data1: First sample of numeric observations.
        data2: Second sample of numeric observations.

    Returns:
        The t statistic; positive when ``data1`` has the larger mean.
    """
    mean_gap = np.mean(data1) - np.mean(data2)
    # Squared standard errors add for the difference of the two means.
    squared_error_sum = sem(data1) ** 2.0 + sem(data2) ** 2.0
    return mean_gap / sqrt(squared_error_sum)
def independent_t_test(data, columns, alpha=0.05):
    """Run a bootstrap-averaged, two-tailed independent-samples t-test.

    A t statistic is computed on each bootstrap resample of ``data`` for the
    two named columns, and the mean of those statistics is tested against a
    Student's t distribution.

    Args:
        data: Table of observations (the script passes a pandas DataFrame).
        columns: Sequence whose first two items name the columns to compare.
        alpha: Significance level of the two-tailed test.

    Returns:
        Tuple ``(t_stat, df, cv, p)``: the mean bootstrap t statistic, the
        degrees of freedom, the two-tailed critical value at ``alpha``, and
        the two-tailed p-value.
    """
    first_col, second_col = columns[0], columns[1]
    t_stats = [
        get_t_stat(sample[first_col], sample[second_col])
        for sample in generate_bootstraps(data)
    ]
    # Divide by the number of bootstrap samples, not the number of rows, so
    # that t_stat is a true mean of the per-sample statistics.
    t_stat = sum(t_stats) / len(t_stats)
    data_size = data.shape[0]
    # Both columns contribute data_size observations each.
    df = 2 * data_size - 2
    # abs(t_stat) is compared against cv and p is two-tailed, so the critical
    # value must put alpha / 2 in each tail to agree with the p-value verdict.
    cv = t.ppf(1.0 - alpha / 2.0, df)
    p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0
    return t_stat, df, cv, p
def make_decision(data, columns, alpha=0.05):
    """Print the t-test summary followed by two accept/reject verdicts.

    The first verdict compares ``abs(t_stat)`` with the critical value; the
    second compares the p-value with ``alpha``.

    Args:
        data: Table of observations handed to ``independent_t_test``.
        columns: Names of the two columns to compare.
        alpha: Significance level used for both verdicts.
    """
    accept_msg = 'Accept null hypothesis that the means are equal.'
    reject_msg = 'Reject the null hypothesis that the means are equal.'

    t_stat, df, cv, p = independent_t_test(data, columns, alpha)
    print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\n')

    # Verdict from the critical value.
    print(accept_msg if abs(t_stat) <= cv else reject_msg)
    # Verdict from the p-value.
    print(accept_msg if p > alpha else reject_msg)
# Load the experiment table; the path is relative to the current working
# directory.
dataset = pd.read_csv('experiment_data.csv')
# Compare the 'Weight' and 'Age' columns and print the test summary and
# verdicts.
make_decision(dataset, ['Weight', 'Age'])
t: 6.903407918031469, df: 998, cv: 1.6463818766348755, p: 9.018563673635072e-12

Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.