Bootstrap-t-student/bootstrap-t.ipynb

356 lines
16 KiB
Plaintext
Raw Normal View History

2022-05-11 15:02:15 +02:00
{
"cells": [
2022-05-13 22:06:56 +02:00
{
"cell_type": "markdown",
"source": [
"Bootstrapowa wersja testu t.\n",
"Implementacja powinna obejmować test dla jednej próby, dla dwóch prób niezależnych oraz dla dwóch prób zależnych.\n",
"W każdej sytuacji oczekiwanym wejście jest zbiór danych w odpowiednim formacie, a wyjściem p-wartość oraz ostateczna decyzja.\n",
"Dodatkowo powinien być rysowany odpowiedni rozkład statystyki testowej."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Zbiór danych - ???\n",
"Hipoteza zerowa - ???\n",
2022-05-13 23:43:00 +02:00
"Hipoteza alternatywna - ???\n",
"\n",
"Dla każdego z 3 testów inne\n",
"https://www.jmp.com/en_ch/statistics-knowledge-portal/t-test.html"
2022-05-13 22:06:56 +02:00
],
"metadata": {
"collapsed": false
}
},
2022-05-11 15:02:15 +02:00
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 239,
2022-05-11 15:02:15 +02:00
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from math import sqrt\n",
"from scipy.stats import sem\n",
2022-05-13 23:43:00 +02:00
"from scipy.stats import t\n",
2022-05-14 15:31:47 +02:00
"import matplotlib.pyplot as plt\n",
"from statistics import mean, stdev\n",
"from scipy.stats import ttest_ind, ttest_1samp, ttest_rel"
2022-05-11 15:02:15 +02:00
]
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 240,
2022-05-11 15:02:15 +02:00
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def generate_bootstraps(data, n_bootstraps=100):\n",
" data_size = data.shape[0]\n",
2022-05-13 22:06:56 +02:00
" for _ in range(n_bootstraps):\n",
" indices = np.random.choice(len(data), size=data_size)\n",
" yield data.iloc[indices, :]"
2022-05-11 15:02:15 +02:00
]
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 241,
2022-05-14 15:31:47 +02:00
"outputs": [],
"source": [
"def t_stat_single(sample, population_mean):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla jednej próbki\"\"\"\n",
" sample_size = len(sample)\n",
" return (mean(sample) - population_mean) / (stdev(sample) / sqrt(sample_size))"
2022-05-14 15:31:47 +02:00
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 242,
2022-05-14 15:31:47 +02:00
"outputs": [],
"source": [
2022-05-14 16:47:42 +02:00
"def t_stat_ind(sample_1, sample_2):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek niezależnych\"\"\"\n",
2022-05-14 16:47:42 +02:00
" sed = sqrt(sem(sample_1)**2 + sem(sample_2)**2)\n",
" return (mean(sample_1) - mean(sample_2)) / sed"
2022-05-14 15:31:47 +02:00
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 243,
2022-05-14 15:31:47 +02:00
"outputs": [],
"source": [
"def t_stat_dep(sample_1, sample_2):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek zależnych\"\"\"\n",
" differences = [x_1 - x_2 for x_1, x_2 in zip(sample_1, sample_2)]\n",
" sample_size = len(sample_1)\n",
2022-05-14 17:09:29 +02:00
" mu = 0 # The constant is zero if we want to test whether the average of the difference is significantly different.\n",
" return (mean(differences) - mu) / (stdev(differences) / sqrt(sample_size))"
2022-05-14 15:31:47 +02:00
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
2022-05-14 17:09:29 +02:00
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 244,
2022-05-14 17:09:29 +02:00
"outputs": [],
"source": [
"def bootstrap_one_sample():\n",
" return"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 245,
2022-05-14 17:09:29 +02:00
"outputs": [],
"source": [
"def bootstrap_independent():\n",
" return"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 246,
2022-05-14 17:09:29 +02:00
"outputs": [],
"source": [
"def bootstrap_dependent():\n",
" return"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
2022-05-14 15:31:47 +02:00
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 247,
2022-05-11 15:02:15 +02:00
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def independent_t_test(data, columns, alpha=0.05):\n",
" t_stat_sum = 0\n",
2022-05-16 18:52:49 +02:00
" t_stat_list = []\n",
2022-05-11 15:02:15 +02:00
" for sample in generate_bootstraps(data):\n",
2022-05-16 18:52:49 +02:00
" stat = t_stat_ind(sample[columns[0]], sample[columns[1]])\n",
" t_stat_list.append(stat)\n",
" t_stat_sum += stat\n",
2022-05-11 15:02:15 +02:00
" data_size = data.shape[0]\n",
" t_stat = t_stat_sum / data_size\n",
" df = 2 * data_size - 2\n",
" cv = t.ppf(1.0 - alpha, df)\n",
" p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0\n",
2022-05-16 18:52:49 +02:00
" return t_stat, df, cv, p, t_stat_list"
2022-05-11 15:02:15 +02:00
]
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 248,
2022-05-11 15:02:15 +02:00
"outputs": [],
"source": [
"def make_decision(data, columns, alpha=0.05):\n",
2022-05-16 18:52:49 +02:00
" t_stat, df, cv, p, stats = independent_t_test(data, columns, alpha)\n",
2022-05-11 15:02:15 +02:00
" print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\\n')\n",
2022-05-16 18:52:49 +02:00
" draw_distribution(stats)\n",
2022-05-11 15:02:15 +02:00
" if abs(t_stat) <= cv:\n",
"\t print('Accept null hypothesis that the means are equal.')\n",
" else:\n",
" print('Reject the null hypothesis that the means are equal.')\n",
" if p > alpha:\n",
" print('Accept null hypothesis that the means are equal.')\n",
" else:\n",
"\t print('Reject the null hypothesis that the means are equal.')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 249,
"outputs": [],
2022-05-13 22:06:56 +02:00
"source": [
2022-05-16 18:52:49 +02:00
"def draw_distribution(stats): # To powinno być zdefiniowane przed make decision w sumie\n",
" \"\"\"\n",
" Funkcja rysuje rozkład statystyki testowej\n",
" stats: lista statystyk testowych\n",
" \"\"\"\n",
" plt.hist(stats)\n",
" plt.xlabel('Test statistic value')\n",
" plt.ylabel('Frequency')\n",
" plt.show()"
2022-05-13 22:06:56 +02:00
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 250,
2022-05-11 15:02:15 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Statystyka testowa dla jednej próby:\n",
"1.414213562373095 - z naszej funkcji\n",
"1.414213562373095 - z gotowej biblioteki\n",
"\n",
"Statystyka testowa dla dwóch prób niezależnych:\n",
"-3.0 - z naszej funkcji\n",
"-3.0 - z gotowej biblioteki\n",
"\n",
"Statystyka testowa dla dwóch prób zależnych:\n",
"-1.6329931618554525 - z naszej funkcji\n",
"-1.632993161855452 - z gotowej biblioteki\n"
]
}
],
"source": [
"# Testy\n",
"dummy = [1, 2, 3, 4, 5]\n",
"dummy2 = [4, 5, 6, 7, 8]\n",
"dummy3 = [1, 3 , 3, 4, 6]\n",
"t_stat_selfmade = t_stat_single(dummy, 2)\n",
"t_stat_lib, _ = ttest_1samp(dummy, 2)\n",
"print('Statystyka testowa dla jednej próby:')\n",
"print(t_stat_selfmade, '- z naszej funkcji')\n",
"print(t_stat_lib, '- z gotowej biblioteki')\n",
"print()\n",
2022-05-14 16:47:42 +02:00
"t_stat_selfmade = t_stat_ind(dummy, dummy2)\n",
"t_stat_lib, _ = ttest_ind(dummy, dummy2)\n",
"print('Statystyka testowa dla dwóch prób niezależnych:')\n",
"print(t_stat_selfmade, '- z naszej funkcji')\n",
"print(t_stat_lib, '- z gotowej biblioteki')\n",
"print()\n",
"t_stat_selfmade = t_stat_dep(dummy, dummy3)\n",
"t_stat_lib, _ = ttest_rel(dummy, dummy3)\n",
"print('Statystyka testowa dla dwóch prób zależnych:')\n",
"print(t_stat_selfmade, '- z naszej funkcji')\n",
"print(t_stat_lib, '- z gotowej biblioteki')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-16 18:52:49 +02:00
"execution_count": 251,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2022-05-16 18:52:49 +02:00
"t: 6.929903381575467, df: 998, cv: 1.6463818766348755, p: 7.544853630747639e-12\n",
"\n"
]
},
{
"data": {
"text/plain": "<Figure size 432x288 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEJCAYAAACT/UyFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAATd0lEQVR4nO3dfbRldX3f8feHh4jIg1CuZKqQawyJUpUJGWgiMWI0lUh9oBoNbQzLJk6aaI0kaZ3Yrkqy6lq4WiE1bWxwiaBBjYoSBDQi9TmtOCDPg9EmQwqMMEZawCQS4Ns/9h45ztxz59zL3Xffmd/7tdZZs/dvP3053Pu5+/zO3r+dqkKS1I59xi5AkrS6DH5JaozBL0mNMfglqTEGvyQ1xuCXpMYMFvxJjkry6SS3JLk5ya/37WcluSPJdf3rhUPVIEnaVYa6jj/JOmBdVV2b5GDgGuClwCuA+6vqPw9yYEnSovYbasdVtQ3Y1k/fl2QL8MTl7OuII46o+fn5FaxOkvZ+11xzzTeram7n9sGCf1KSeeBHgS8BJwGvS/KLwGbgN6vqnsW2n5+fZ/PmzYPXKUl7kyS3LdQ++Je7SQ4CLgbeUFX3Au8AngKsp/tE8LYp221MsjnJ5u3btw9dpiQ1Y9DgT7I/XehfVFUfAaiqu6rqoap6GHgncOJC21bVeVW1oao2zM3t8klFkrRMQ17VE+BdwJaqOmeifd3EaqcBNw1VgyRpV0P28Z8EvAq4Mcl1fdubgNOTrAcK2Ar8yoA1SJJ2MuRVPV8AssCiK4Y6piRp97xzV5IaY/BLUmMMfklqjMEvSY1ZlTt3paHNb7p8lONuPfvUUY4rPRqe8UtSYwx+SWqMwS9JjTH4JakxBr8kNcbgl6TGGPyS1BiDX5IaY/BLUmMMfklqjMEvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGuODWKRHYawHwIAPgdHyecYvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGmPwS1JjDH5JaozBL0mNMfglqTEGvyQ1xuCXpMYMFvxJjkry6SS3JLk5ya/37YcnuTLJ1/p/DxuqBknSroY8438Q+M2qOhb4ceC1SY4FNgFXVdUxwFX9vCRplQwW/FW1raqu7afvA7YATwReAlzYr3Yh8NKhapAk7WpV+viTzAM/CnwJOLKqtvWLvgEcuRo1SJI6gwd/koOAi4E3VNW9k8uqqoCast3GJJuTbN6+ffvQZUpSMwYN/iT704X+RVX1kb75riTr+uXrgLsX2raqzquqDVW1YW5ubsgyJakpQ17VE+BdwJaqOmdi0aXAGf30GcCfDFWDJGlXQz5z9yTgVcCNSa7r294EnA18MMkvAbcBrxiwBknSTgYL/qr6ApApi5831HElSYvzzl1JaozBL0mNMfglqTEGvyQ1xuCXpMYY/JLUGINfkhpj8EtSYwx+SWqMwS9JjTH4JakxBr8kNcbgl6TGGPyS1BiDX5IaY/BLUmMMfklqjMEvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGmPwS1JjDH5JaozBL0mNMfglqTEGvyQ1xuCXpMYY/JLUmP3GLkB7j/lNl49dgqQZeMYvSY0x+CWpMQa/JDVmsOBPcn6Su5PcNNF2VpI7klzXv1441PElSQubKfiTPGMZ+74AOGWB9nOran3/umIZ+5UkPQqznvH/QZKrk/xakkNn2aCqPgd8a/mlSZKGMFPwV9WzgX8BHAVck+R9SX5mmcd8XZIb+q6gw5a5D0nSMs3cx19VXwP+PfBG4DnA25PcmuSfLeF47wCeAqwHtgFvm7Ziko1JNifZvH379iUcQpK0mFn7+J+Z5FxgC/DTwIuq6mn99LmzHqyq7qqqh6rqYeCdwImLrHteVW2oqg1zc3OzHkKStBuznvH/PnAtcFxVvbaqrgWoqjvpPgXMJMm6idnTgJumrStJGsasQzacCvxtVT0EkGQf4ICq+puqeu9CGyR5P3AycESS24E3AycnWQ8UsBX4lUdVvSRpyWYN/k8Bzwfu7+cPBD4JPGvaBlV1+gLN71pSdZKkFTdrV88BVbUj9OmnDxymJEnSkGYN/m8nOX7HTJIfA/52mJIkSUOatavnDcCHktwJBPh+4JVDFSVJGs5MwV9VX07yVOBH+qavVtXfD1eWJGkoS3kQywnAfL/N8UmoqvcMUpUkaTAzBX+S99LdcXsd8FDfXIDBL0l7mFnP+DcAx1ZVDVmMJGl4s17VcxPdF7qSpD3crGf8RwC3JLka+M6Oxqp68SBVSZIGM2vwnzVkEZKk1TPr5ZyfTfIDwDFV9akkBwL7DluaJGkIsw7L/Brgw8Af9k1PBC4ZqCZJ0oBm/XL3tcBJwL3w3YeyPGGooiRJw5k1+L9TVQ/smEmyH911/JKkPcyswf/ZJG8CHts/a/dDwMeGK0uSNJRZg38TsB24ke7hKVewhCdvSZLWjlmv6tnxjNx3DluOJGlos47V85cs0KdfVT+44hVJkga1lLF6djgA+Dng8JUvR5I0tJn6+Kvqryded1TV79E9gF2StIeZtavn+InZfeg+ASxlLH9J0hoxa3i/bWL6QWAr8IoVr0YrYn7T5WOXoFUw1v/nrWf7YX9PN+tVPc8duhBJ0uqYtavnNxZbXlXnrEw5kqShLeWqnhOAS/v5FwFXA18boihJ0nBmDf4nAcdX1X0ASc4CLq+qXxiqMEnSMGYdsuFI4IGJ+Qf6NknSHmbWM/73AFcn+Wg//1LgwkEqkiQNataret6S5OPAs/umV1fVV4YrS5I0lFm7egAOBO6tqv8C3J7kyQPVJEka0KyPXnwz8Ebgt/um/YE/GqooSdJwZj3jPw14MfBtgKq6Ezh4qKIkScOZNfgfqKqiH5o5yeOGK0mSNKRZg/+DSf4QeHyS1wCfwoeySNIeabdX9SQJ8MfAU4F7gR8B/kNVXTlwbZKkAew2+KuqklxRVc8AZg77JOcD/xS4u6qe3rcdTvdHZJ5+hM+qumcZdUuSlmnWrp5rk5ywxH1fAJyyU9sm4KqqOga4qp+XJK2iWYP/HwP/K8n/TnJDkhuT3LDYBlX1OeBbOzW/hEfu+L2Q7g5gSdIqWrSrJ8nRVfVXwAtW6HhHVtW2fvobON6PJK263fXxX0I3KudtSS6uqpet1IH77w5q2vIkG4GNAEcfffRKHVaSmre7rp5MTP/gChzvriTrAPp/7562YlWdV1UbqmrD3NzcChxakgS7D/6aMr1clwJn9NNnAH+yAvuUJC3B7rp6jktyL92Z/2P7afr5qqpDpm2Y5P3AycARSW4H3gycTXcz2C8Bt+ED2yVp1S0a/FW173J3XFWnT1n0vOXuU5L06C1lWGZJ0l7A4Jekxhj8ktSYWZ+5K0kAzG+6fLRjbz371NGOvTfxjF+SGmPwS1JjDH5JaozBL0mNMfglqTEGvyQ1xuCXpMYY/JLUGINfkhpj8EtSYwx+SWqMwS9JjTH4JakxBr8kNcbgl6TGGPyS1BiDX5IaY/BLUmMMfklqjMEvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGmPwS1JjDH5JaozBL0mNMfglqTEGvyQ1Zr8xDppkK3Af8BDwYFVtGKMOSWrRKMHfe25VfXPE40tSk+zqkaTGjBX8BXwyyTVJNo5UgyQ1aayunp+sqjuSPAG4MsmtVfW5yRX6PwgbAY4++ugxapSkvdIoZ/xVdUf/793AR4ETF1jnvKraUFUb5ubmVrtESdprrXrwJ3lckoN3TAP/BLhpteuQpFaN0dVzJPDRJDuO/76q+sQIdUhSk1Y9+KvqL4DjVvu4kqSOl3NKUmMMfklqjMEvSY0x+CWpMQa/JDXG4Jekxhj8ktQYg1+SGmPwS1JjDH5JaozBL0mNMfglqTEGvyQ1xuCXpMYY/JLUGINfkhpj8EtSYwx+SWqMwS9JjTH4JakxBr8kNWa/sQsY2vymy0c79tazTx3t2JI0jWf8ktQYg1+SGmPwS1JjDH5JaozBL0mNMfglqTEGvyQ1Zq+/jn9MY95DIO2NWvydGuJ+IM/4JakxBr8kNcbgl6TGGPyS1JhRgj/
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2022-05-11 15:02:15 +02:00
"Reject the null hypothesis that the means are equal.\n",
"Reject the null hypothesis that the means are equal.\n"
]
}
],
"source": [
"dataset = pd.read_csv('experiment_data.csv')\n",
"make_decision(dataset, ['Weight', 'Age'])"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"interpreter": {
"hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
},
"kernelspec": {
"display_name": "Python 3.9.1 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}