Bootstrap-t-student/bootstrap-t.ipynb

301 lines
12 KiB
Plaintext
Raw Normal View History

2022-05-11 15:02:15 +02:00
{
"cells": [
2022-05-13 22:06:56 +02:00
{
"cell_type": "markdown",
"source": [
"Bootstrapowa wersja testu t.\n",
"Implementacja powinna obejmować test dla jednej próby, dla dwóch prób niezależnych oraz dla dwóch prób zależnych.\n",
"W każdej sytuacji oczekiwanym wejście jest zbiór danych w odpowiednim formacie, a wyjściem p-wartość oraz ostateczna decyzja.\n",
"Dodatkowo powinien być rysowany odpowiedni rozkład statystyki testowej."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Zbiór danych - ???\n",
"Hipoteza zerowa - ???\n",
2022-05-13 23:43:00 +02:00
"Hipoteza alternatywna - ???\n",
"\n",
"Dla każdego z 3 testów inne\n",
"https://www.jmp.com/en_ch/statistics-knowledge-portal/t-test.html"
2022-05-13 22:06:56 +02:00
],
"metadata": {
"collapsed": false
}
},
2022-05-11 15:02:15 +02:00
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 142,
2022-05-11 15:02:15 +02:00
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from math import sqrt\n",
"from scipy.stats import sem\n",
2022-05-13 23:43:00 +02:00
"from scipy.stats import t\n",
2022-05-14 15:31:47 +02:00
"import matplotlib.pyplot as plt\n",
"from statistics import mean, stdev\n",
"from scipy.stats import ttest_ind, ttest_1samp, ttest_rel"
2022-05-11 15:02:15 +02:00
]
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 143,
2022-05-11 15:02:15 +02:00
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def generate_bootstraps(data, n_bootstraps=100):\n",
" data_size = data.shape[0]\n",
2022-05-13 22:06:56 +02:00
" for _ in range(n_bootstraps):\n",
" indices = np.random.choice(len(data), size=data_size)\n",
" yield data.iloc[indices, :]"
2022-05-11 15:02:15 +02:00
]
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 144,
2022-05-14 15:31:47 +02:00
"outputs": [],
"source": [
"def t_stat_single(sample, population_mean):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla jednej próbki\"\"\"\n",
" sample_size = len(sample)\n",
" return (mean(sample) - population_mean) / (stdev(sample) / sqrt(sample_size))"
2022-05-14 15:31:47 +02:00
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 145,
2022-05-14 15:31:47 +02:00
"outputs": [],
"source": [
2022-05-14 16:47:42 +02:00
"def t_stat_ind(sample_1, sample_2):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek niezależnych\"\"\"\n",
2022-05-14 16:47:42 +02:00
" sed = sqrt(sem(sample_1)**2 + sem(sample_2)**2)\n",
" return (mean(sample_1) - mean(sample_2)) / sed"
2022-05-14 15:31:47 +02:00
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 146,
2022-05-14 15:31:47 +02:00
"outputs": [],
"source": [
"def t_stat_dep(sample_1, sample_2):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek zależnych\"\"\"\n",
" differences = [x_1 - x_2 for x_1, x_2 in zip(sample_1, sample_2)]\n",
" sample_size = len(sample_1)\n",
" mu = 0 # The constant = zero if we want to test whether the average of the difference is significantly different.\n",
" return (mean(differences) - mu) / (stdev(differences) / sqrt(sample_size))"
2022-05-14 15:31:47 +02:00
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 147,
2022-05-11 15:02:15 +02:00
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def independent_t_test(data, columns, alpha=0.05):\n",
" t_stat_sum = 0\n",
" for sample in generate_bootstraps(data):\n",
2022-05-14 16:47:42 +02:00
" t_stat_sum += t_stat_ind(sample[columns[0]], sample[columns[1]])\n",
2022-05-11 15:02:15 +02:00
"\n",
" data_size = data.shape[0]\n",
" t_stat = t_stat_sum / data_size\n",
" df = 2 * data_size - 2\n",
" cv = t.ppf(1.0 - alpha, df)\n",
" p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0\n",
" return t_stat, df, cv, p"
]
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 148,
2022-05-11 15:02:15 +02:00
"outputs": [],
"source": [
"def make_decision(data, columns, alpha=0.05):\n",
" t_stat, df, cv, p = independent_t_test(data, columns, alpha)\n",
" print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\\n')\n",
" if abs(t_stat) <= cv:\n",
"\t print('Accept null hypothesis that the means are equal.')\n",
" else:\n",
" print('Reject the null hypothesis that the means are equal.')\n",
" if p > alpha:\n",
" print('Accept null hypothesis that the means are equal.')\n",
" else:\n",
"\t print('Reject the null hypothesis that the means are equal.')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 149,
2022-05-13 23:43:00 +02:00
"outputs": [
{
"data": {
"text/plain": "<Figure size 432x288 with 1 Axes>",
2022-05-14 16:47:42 +02:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAMkElEQVR4nO3dfYxl9V3H8fenrKBttUCZElyIu7GgISaluCKG9EFotO0aF7USGqOblrim0kqpD12MkSb+s2i1YmJqVpZmTRoeRCJYfEIEGxO77WyLwvIQVrqUpQtMY1ufYhH79Y85mx2WGeayM3fufmfer4TMPeeeu+f3yyXv/ObM3DOpKiRJ/bxi0gOQJB0bAy5JTRlwSWrKgEtSUwZckppat5InO+2002rDhg0reUpJam/v3r1fqaqpo/evaMA3bNjA9PT0Sp5SktpL8sR8+72EIklNGXBJasqAS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU2t6CcxpePVhu13TezcB3Zsnti51ZsrcElqyhW4jiuTXAlL3bgCl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLUlAGXpKYMuCQ1ZcAlqSkDLklNGXBJasqAS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNTVSwJNcnWRfkgeT3JTkW5NsTLInyf4ktyQ5cdyDlSQdsWjAk6wHfgnYVFXfB5wAXA5cB3ysql4PfBW4YpwDlSS90KiXUNYB35ZkHfBK4BBwMXDb8Pxu4NJlH50kaUGLBryqngI+CnyJ2XB/HdgLfK2qnh8OOwisn+/1SbYlmU4yPTMzszyjliSNdAnlFGALsBH4TuBVwNtHPUFV7ayqTVW1aWpq6pgHKkl6oVEuobwN+GJVzVTV/wK3AxcBJw+XVADOBJ4a0xglSfMYJeBfAi5M8sokAS4BHgLuBd41HLMVuGM8Q5QkzWeUa+B7mP1h5eeBB4bX7AQ+DHwoyX7gtcCuMY5TknSUdYsfAlV1LXDtUbsfBy5Y9hFJkkbiJzElqSkDLklNGXBJasqAS1JTBlySmjLgktSUAZekpkb6PXBJ47Nh+10TOe+BHZsncl4tH1fgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTXk3Qr3IpO6OJ+nlcQUuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLU1EgBT3JyktuSPJLk4SQ/lOTUJHcneWz4esq4BytJOmLUFfj1wF9X1fcCbwAeBrYD91TV2cA9w7YkaYUsGvAkrwHeDOwCqKrnquprwBZg93DYbuDS8QxRkjSfUVbgG4EZ4BNJvpDkhiSvAk6vqkPDMU8Dp8/34iTbkkwnmZ6ZmVmeUUuSRgr4OuB84ONV9UbgvzjqcklVFVDzvbiqdlbVpqraNDU1tdTxSpIGowT8IHCwqvYM27cxG/RnkpwBMHx9djxDlCTNZ9GAV9XTwJNJvmfYdQnwEHAnsHXYtxW4YywjlCTNa9Q/avwB4JNJTgQeB97DbPxvTXIF8ARw2XiGKEmaz0gBr6r7gU3zPHXJso5GkjQyP4kpSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLUlAGXpKYMuCQ1ZcAlqSkDLklNGXBJasqAS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlPrJj0ASZOxYftdEzv3gR2bJ3bu1cQVuCQ1ZcAlqSkDLklNGXBJasqAS1JTBlySmho54ElOSPKFJJ8atjcm2ZNkf5Jbkpw4vmFKko72clbgVwEPz9m+DvhYVb0e+CpwxXIOTJL00kYKeJIzgc3ADcN2gIuB24ZDdgOXjmF8kqQFjLoC/33g14BvDtuvBb5WVc8P2weB9cs7NEnSS1k04El+DHi2qvYeywmSbEsynWR6ZmbmWP4JSdI8RlmBXwT8eJIDwM3MXjq5Hjg5yeF7qZwJPDXfi6tqZ1VtqqpNU1NTyzBkSRKMEPCquqaqzqyqDcDlwN9X1c8A9wLvGg7bCtwxtlFKkl5kKb8H/mHgQ0n2M3tNfNfyDEmSNIqXdTvZqroPuG94/DhwwfIPSZI0Cj+JKUlN+QcdjmOTvOG+pOOfK3BJasqAS1JTBlySmjLgktSUP8RchD9IlHS8cgUuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLUlAGXpKYMuCQ1ZcAlqSn/oIOkFTepP5RyYMfmiZx3XFyBS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1NSiAU9yVpJ7kzyUZF+Sq4b9pya5O8ljw9dTxj9cSdJho6zAnwd+uarOBS4ErkxyLrAduKeqzgbuGbYlSStk0YBX1aGq+vzw+D+Ah4H1wBZg93DYbuDSMY1RkjSPl3UNPMkG4I3AHuD0qjo0PPU0cPoCr9mWZDrJ9MzMzFLGKkmaY+SAJ3k18GfAB6vq3+c+V1UF1Hyvq6qdVbWpqjZNTU0tabCSpCNGCniSb2E23p+sqtuH3c8kOWN4/gzg2fEMUZI0n1F+CyXALuDhqvq9OU/dCWwdHm8F7lj+4UmSFjLKn1S7CPhZ4IEk9w/7fh3YAdya5ArgCeCysYxQkjSvRQNeVf8IZIGnL1ne4UiSRuUnMSWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJampUT5KL0mrwobtd03kvAd2bB7Lv+sKXJKaMuCS1FSbSyiT+tZHko5XrsAlqSkDLklNGXBJasqAS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLU1JICnuTtSR5Nsj/J9uUalCRpcccc8CQnAH8IvAM4F3h3knOXa2CSpJe2lBX4BcD+qnq8qp4Dbga2LM+wJEmLWbeE164HnpyzfRD4waMPSrIN2DZs/meSR5dwzpV2GvCVSQ9iQtbq3J332rIi8851S/4nvmu+nUsJ+Eiqaiewc9znGYck01W1adLjmIS1OnfnvbZ0n/dSLqE8BZw1Z/vMYZ8kaQUsJeCfA85OsjHJicDlwJ3LMyxJ0mKO+RJKVT2f5P3A3wAnADdW1b5lG9nxoeWln2WyVufuvNeW1vNOVU16DJKkY+AnMSWpKQMuSU2t6YAnuTHJs0kenLPvI0meSnL/8N875zx3zXDbgEeT/OhkRr1088172P+BJI8k2Zfkt+fsX7XzTnLLnPf6QJL75zy3KuYNC879vCSfGeY+neSCYX+S/MEw939Jcv7kRr40C8z7DUn+KckDSf4iyXfMea7Xe15Va/Y/4M3A+cCDc/Z9BPiVeY49F/hn4CRgI/CvwAmTnsMyzvuHgb8DThq2X7cW5n3U878L/OZqm/dLvOd/C7xjePxO4L45j/8KCHAhsGfS41/meX8OeMvw+L3Ab3V9z9f0CryqPg3824iHbwFurqpvVNUXgf3M3k6gnQXm/T5gR1V9Yzjm2WH/ap83MLvqBC4Dbhp2rZp5w4JzL+Dw6vM1wJeHx1uAP6lZnwFOTnLGyox0eS0w73OATw+P7wZ+anjc7j1f0wF/Ce8fvnW8Mckpw775bh2wfuWHNjbnAG9KsifJPyT5gWH/ap/3YW8Cnqmqx4bttTDvDwK/k+RJ4KPANcP+1T73fRy5b9NPc+QDie3mbcBf7OPAdwPnAYeY/bZ6LVgHnMrst8y/Ctw6rErXindzZPW9VrwPuLqqzgKuBnZNeDwr5b3ALybZC3w78NyEx3PMxn4vlG6q6pnDj5P8MfCpYXO13zrgIHB7zV4M/GySbzJ7o5/VPm+SrAN+Evj+ObtX/byBrcB
2022-05-13 23:43:00 +02:00
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
2022-05-13 22:06:56 +02:00
"source": [
"def draw_distribution():\n",
" \"\"\"Funkcja rysuje rozkład statystyki testowej\"\"\"\n",
2022-05-13 23:43:00 +02:00
" dummy = np.random.normal(170, 10, 500)\n",
" plt.hist(dummy)\n",
" plt.show()\n",
" pass\n",
"draw_distribution()"
2022-05-13 22:06:56 +02:00
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 150,
2022-05-11 15:02:15 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Statystyka testowa dla jednej próby:\n",
"1.414213562373095 - z naszej funkcji\n",
"1.414213562373095 - z gotowej biblioteki\n",
"\n",
"Statystyka testowa dla dwóch prób niezależnych:\n",
"-3.0 - z naszej funkcji\n",
"-3.0 - z gotowej biblioteki\n",
"\n",
"Statystyka testowa dla dwóch prób zależnych:\n",
"-1.6329931618554525 - z naszej funkcji\n",
"-1.632993161855452 - z gotowej biblioteki\n"
]
}
],
"source": [
"# Testy\n",
"dummy = [1, 2, 3, 4, 5]\n",
"dummy2 = [4, 5, 6, 7, 8]\n",
"dummy3 = [1, 3 , 3, 4, 6]\n",
"t_stat_selfmade = t_stat_single(dummy, 2)\n",
"t_stat_lib, _ = ttest_1samp(dummy, 2)\n",
"print('Statystyka testowa dla jednej próby:')\n",
"print(t_stat_selfmade, '- z naszej funkcji')\n",
"print(t_stat_lib, '- z gotowej biblioteki')\n",
"print()\n",
2022-05-14 16:47:42 +02:00
"t_stat_selfmade = t_stat_ind(dummy, dummy2)\n",
"t_stat_lib, _ = ttest_ind(dummy, dummy2)\n",
"print('Statystyka testowa dla dwóch prób niezależnych:')\n",
"print(t_stat_selfmade, '- z naszej funkcji')\n",
"print(t_stat_lib, '- z gotowej biblioteki')\n",
"print()\n",
"t_stat_selfmade = t_stat_dep(dummy, dummy3)\n",
"t_stat_lib, _ = ttest_rel(dummy, dummy3)\n",
"print('Statystyka testowa dla dwóch prób zależnych:')\n",
"print(t_stat_selfmade, '- z naszej funkcji')\n",
"print(t_stat_lib, '- z gotowej biblioteki')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
2022-05-14 16:47:42 +02:00
"execution_count": 151,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2022-05-14 16:47:42 +02:00
"t: 6.914346193374633, df: 998, cv: 1.6463818766348755, p: 8.378631122241131e-12\n",
2022-05-11 15:02:15 +02:00
"\n",
"Reject the null hypothesis that the means are equal.\n",
"Reject the null hypothesis that the means are equal.\n"
]
}
],
"source": [
"dataset = pd.read_csv('experiment_data.csv')\n",
"make_decision(dataset, ['Weight', 'Age'])"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"interpreter": {
"hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
},
"kernelspec": {
"display_name": "Python 3.9.1 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}