{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "Bootstrapowa wersja testu t.\n",
    "Implementacja powinna obejmować test dla jednej próby, dla dwóch prób niezależnych oraz dla dwóch prób zależnych.\n",
    "W każdej sytuacji oczekiwanym wejściem jest zbiór danych w odpowiednim formacie, a wyjściem p-wartość oraz ostateczna decyzja.\n",
    "Dodatkowo powinien być rysowany odpowiedni rozkład statystyki testowej."
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "markdown",
   "source": [
    "Zbiór danych - ???\n",
    "Hipoteza zerowa - ???\n",
    "Hipoteza alternatywna - ???\n",
    "\n",
    "Dla każdego z 3 testów inne\n",
    "https://www.jmp.com/en_ch/statistics-knowledge-portal/t-test.html"
   ],
   "metadata": { "collapsed": false }
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": { "pycharm": { "name": "#%%\n" } },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from math import sqrt\n",
    "from scipy.stats import sem\n",
    "from scipy.stats import t\n",
    "import matplotlib.pyplot as plt\n",
    "from statistics import mean, stdev\n",
    "from scipy.stats import ttest_ind, ttest_1samp, ttest_rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": { "pycharm": { "name": "#%%\n" } },
   "outputs": [],
   "source": [
    "def generate_bootstraps(data, n_bootstraps=100):\n",
    "    \"\"\"Yield bootstrap resamples of a data frame.\n",
    "\n",
    "    Every resample has as many rows as ``data`` and is built by picking row\n",
    "    positions at random with replacement, so the values within one row\n",
    "    stay together.\n",
    "\n",
    "    Args:\n",
    "        data: table to resample; must support ``.shape`` and ``.iloc``\n",
    "            (e.g. a pandas DataFrame).\n",
    "        n_bootstraps: number of resamples to yield.\n",
    "\n",
    "    Yields:\n",
    "        The rows of ``data`` selected by one random draw of positions.\n",
    "    \"\"\"\n",
    "    data_size = data.shape[0]\n",
    "    for _ in range(n_bootstraps):\n",
    "        # np.random.choice draws with replacement by default.\n",
    "        indices = np.random.choice(len(data), size=data_size)\n",
    "        yield data.iloc[indices, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "outputs": [],
   "source": [
    "def t_stat_single(sample, population_mean):\n",
    "    \"\"\"Compute the one-sample t statistic.\n",
    "\n",
    "    Args:\n",
    "        sample: sized iterable of numeric observations.\n",
    "        population_mean: hypothesised mean the sample mean is compared with.\n",
    "\n",
    "    Returns:\n",
    "        (mean(sample) - population_mean) divided by the standard error of\n",
    "        the mean, stdev(sample) / sqrt(len(sample)).\n",
    "    \"\"\"\n",
    "    sample_size = len(sample)\n",
    "    return (mean(sample) - population_mean) / (stdev(sample) / sqrt(sample_size))"
   ],
   "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "outputs": [],
   "source": [
    "def t_stat_ind(sample_1, sample_2):\n",
    "    \"\"\"Compute the t statistic for two independent samples.\n",
    "\n",
    "    Returns:\n",
    "        The difference of the two sample means divided by the standard\n",
    "        error of that difference, sqrt(sem(sample_1)**2 + sem(sample_2)**2).\n",
    "    \"\"\"\n",
    "    sed = sqrt(sem(sample_1)**2 + sem(sample_2)**2)\n",
    "    return (mean(sample_1) - mean(sample_2)) / sed"
   ],
   "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "outputs": [],
   "source": [
    "def t_stat_dep(sample_1, sample_2):\n",
    "    \"\"\"Compute the t statistic for two dependent (paired) samples.\n",
    "\n",
    "    The statistic is the one-sample t statistic of the element-wise\n",
    "    differences ``sample_1[i] - sample_2[i]`` tested against zero.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the samples differ in length, because the\n",
    "            observations could not be paired one to one.\n",
    "    \"\"\"\n",
    "    if len(sample_1) != len(sample_2):\n",
    "        raise ValueError('Paired samples must have the same length.')\n",
    "    differences = [x_1 - x_2 for x_1, x_2 in zip(sample_1, sample_2)]\n",
    "    # Count the pairs that were actually formed.\n",
    "    sample_size = len(differences)\n",
    "    mu = 0  # The constant is zero if we want to test whether the average of the difference is significantly different.\n",
    "    return (mean(differences) - mu) / (stdev(differences) / sqrt(sample_size))"
   ],
   "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def bootstrap_one_sample():\n",
    "    \"\"\"Placeholder for the one-sample bootstrap test; currently returns None.\"\"\"\n",
    "    return"
   ],
   "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def bootstrap_independent():\n",
    "    \"\"\"Placeholder for the independent-samples bootstrap test; currently returns None.\"\"\"\n",
    "    return"
   ],
   "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def bootstrap_dependent():\n",
    "    \"\"\"Placeholder for the dependent-samples bootstrap test; currently returns None.\"\"\"\n",
    "    return"
   ],
   "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": { "pycharm": { "name": "#%%\n" } },
   "outputs": [],
   "source": [
    "def independent_t_test(data, columns, alpha=0.05, n_bootstraps=100):\n",
    "    \"\"\"Two-sided t-test for two columns, averaged over bootstrap resamples.\n",
    "\n",
    "    The independent-samples t statistic is computed on every bootstrap\n",
    "    resample of ``data`` and the results are averaged. The averaged\n",
    "    statistic is compared with Student's t distribution with\n",
    "    ``2 * n - 2`` degrees of freedom, where ``n`` is the row count of ``data``.\n",
    "\n",
    "    Args:\n",
    "        data: table holding both samples as columns; it is handed to\n",
    "            generate_bootstraps.\n",
    "        columns: pair of column labels; columns[0] is compared with columns[1].\n",
    "        alpha: significance level of the two-sided test.\n",
    "        n_bootstraps: number of bootstrap resamples to average over.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(t_stat, df, cv, p)``: the averaged statistic, the degrees\n",
    "        of freedom, the two-sided critical value and the two-sided p-value.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if n_bootstraps is smaller than 1.\n",
    "    \"\"\"\n",
    "    if n_bootstraps < 1:\n",
    "        raise ValueError('n_bootstraps must be at least 1.')\n",
    "\n",
    "    t_stats = [\n",
    "        t_stat_ind(sample[columns[0]], sample[columns[1]])\n",
    "        for sample in generate_bootstraps(data, n_bootstraps)\n",
    "    ]\n",
    "    # Average over the resamples that were drawn; dividing by the row\n",
    "    # count instead would rescale the statistic by n_bootstraps / n_rows.\n",
    "    t_stat = sum(t_stats) / len(t_stats)\n",
    "\n",
    "    data_size = data.shape[0]\n",
    "    df = 2 * data_size - 2\n",
    "    # abs(t_stat) is compared with cv and p is doubled, so the test is\n",
    "    # two-sided: alpha has to be split between both tails.\n",
    "    cv = t.ppf(1.0 - alpha / 2.0, df)\n",
    "    # The survival function keeps precision for very small tail areas.\n",
    "    p = 2.0 * t.sf(abs(t_stat), df)\n",
    "    return t_stat, df, cv, p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "outputs": [],
"source": [
    "def make_decision(data, columns, alpha=0.05):\n",
    "    \"\"\"Run independent_t_test on two columns and print both verdicts.\n",
    "\n",
    "    Prints a summary line with the statistic, degrees of freedom, critical\n",
    "    value and p-value, then the verdict of the critical-value rule and the\n",
    "    verdict of the p-value rule. Returns None.\n",
    "\n",
    "    Args:\n",
    "        data: table passed through to independent_t_test.\n",
    "        columns: the two column labels passed through to independent_t_test.\n",
    "        alpha: significance level used by both decision rules.\n",
    "    \"\"\"\n",
    "    keep_h0 = 'Accept null hypothesis that the means are equal.'\n",
    "    drop_h0 = 'Reject the null hypothesis that the means are equal.'\n",
    "\n",
    "    statistic, dof, critical, p_value = independent_t_test(data, columns, alpha)\n",
    "    print(f't: {statistic}, df: {dof}, cv: {critical}, p: {p_value}\\n')\n",
    "\n",
    "    # Rule 1: compare the absolute statistic with the critical value.\n",
    "    print(keep_h0 if abs(statistic) <= critical else drop_h0)\n",
    "    # Rule 2: compare the p-value with the significance level.\n",
    "    print(keep_h0 if p_value > alpha else drop_h0)"
   ],
   "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } }
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "outputs": [
    {
     "data": {
      "text/plain": "
", "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAMkElEQVR4nO3dfYxl9V3H8fenrKBttUCZElyIu7GgISaluCKG9EFotO0aF7USGqOblrim0kqpD12MkSb+s2i1YmJqVpZmTRoeRCJYfEIEGxO77WyLwvIQVrqUpQtMY1ufYhH79Y85mx2WGeayM3fufmfer4TMPeeeu+f3yyXv/ObM3DOpKiRJ/bxi0gOQJB0bAy5JTRlwSWrKgEtSUwZckppat5InO+2002rDhg0reUpJam/v3r1fqaqpo/evaMA3bNjA9PT0Sp5SktpL8sR8+72EIklNGXBJasqAS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU2t6CcxpePVhu13TezcB3Zsnti51ZsrcElqyhW4jiuTXAlL3bgCl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLUlAGXpKYMuCQ1ZcAlqSkDLklNGXBJasqAS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNTVSwJNcnWRfkgeT3JTkW5NsTLInyf4ktyQ5cdyDlSQdsWjAk6wHfgnYVFXfB5wAXA5cB3ysql4PfBW4YpwDlSS90KiXUNYB35ZkHfBK4BBwMXDb8Pxu4NJlH50kaUGLBryqngI+CnyJ2XB/HdgLfK2qnh8OOwisn+/1SbYlmU4yPTMzszyjliSNdAnlFGALsBH4TuBVwNtHPUFV7ayqTVW1aWpq6pgHKkl6oVEuobwN+GJVzVTV/wK3AxcBJw+XVADOBJ4a0xglSfMYJeBfAi5M8sokAS4BHgLuBd41HLMVuGM8Q5QkzWeUa+B7mP1h5eeBB4bX7AQ+DHwoyX7gtcCuMY5TknSUdYsfAlV1LXDtUbsfBy5Y9hFJkkbiJzElqSkDLklNGXBJasqAS1JTBlySmjLgktSUAZekpkb6PXBJ47Nh+10TOe+BHZsncl4tH1fgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTXk3Qr3IpO6OJ+nlcQUuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLU1EgBT3JyktuSPJLk4SQ/lOTUJHcneWz4esq4BytJOmLUFfj1wF9X1fcCbwAeBrYD91TV2cA9w7YkaYUsGvAkrwHeDOwCqKrnquprwBZg93DYbuDS8QxRkjSfUVbgG4EZ4BNJvpDkhiSvAk6vqkPDMU8Dp8/34iTbkkwnmZ6ZmVmeUUuSRgr4OuB84ONV9UbgvzjqcklVFVDzvbiqdlbVpqraNDU1tdTxSpIGowT8IHCwqvYM27cxG/RnkpwBMHx9djxDlCTNZ9GAV9XTwJNJvmfYdQnwEHAnsHXYtxW4YywjlCTNa9Q/avwB4JNJTgQeB97DbPxvTXIF8ARw2XiGKEmaz0gBr6r7gU3zPHXJso5GkjQyP4kpSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLUlAGXpKYMuCQ1ZcAlqSkDLklNGXBJasqAS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlPrJj0ASZOxYftdEzv3gR2bJ3bu1cQVuCQ1ZcAlqSkDLklNGXBJasqAS1JTBlySmho54ElOSPKFJJ8atjcm2ZNkf5Jbkpw4vmFKko72clbgVwEPz9m+DvhYVb0e+CpwxXIOTJL00kYKeJIzgc3
ADcN2gIuB24ZDdgOXjmF8kqQFjLoC/33g14BvDtuvBb5WVc8P2weB9cs7NEnSS1k04El+DHi2qvYeywmSbEsynWR6ZmbmWP4JSdI8RlmBXwT8eJIDwM3MXjq5Hjg5yeF7qZwJPDXfi6tqZ1VtqqpNU1NTyzBkSRKMEPCquqaqzqyqDcDlwN9X1c8A9wLvGg7bCtwxtlFKkl5kKb8H/mHgQ0n2M3tNfNfyDEmSNIqXdTvZqroPuG94/DhwwfIPSZI0Cj+JKUlN+QcdjmOTvOG+pOOfK3BJasqAS1JTBlySmjLgktSUP8RchD9IlHS8cgUuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLUlAGXpKYMuCQ1ZcAlqSn/oIOkFTepP5RyYMfmiZx3XFyBS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1NSiAU9yVpJ7kzyUZF+Sq4b9pya5O8ljw9dTxj9cSdJho6zAnwd+uarOBS4ErkxyLrAduKeqzgbuGbYlSStk0YBX1aGq+vzw+D+Ah4H1wBZg93DYbuDSMY1RkjSPl3UNPMkG4I3AHuD0qjo0PPU0cPoCr9mWZDrJ9MzMzFLGKkmaY+SAJ3k18GfAB6vq3+c+V1UF1Hyvq6qdVbWpqjZNTU0tabCSpCNGCniSb2E23p+sqtuH3c8kOWN4/gzg2fEMUZI0n1F+CyXALuDhqvq9OU/dCWwdHm8F7lj+4UmSFjLKn1S7CPhZ4IEk9w/7fh3YAdya5ArgCeCysYxQkjSvRQNeVf8IZIGnL1ne4UiSRuUnMSWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJampUT5KL0mrwobtd03kvAd2bB7Lv+sKXJKaMuCS1FSbSyiT+tZHko5XrsAlqSkDLklNGXBJasqAS1JTBlySmjLgktSUAZekpgy4JDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1JQBl6SmDLgkNWXAJakpAy5JTRlwSWrKgEtSUwZckpoy4JLU1JICnuTtSR5Nsj/J9uUalCRpcccc8CQnAH8IvAM4F3h3knOXa2CSpJe2lBX4BcD+qnq8qp4Dbga2LM+wJEmLWbeE164HnpyzfRD4waMPSrIN2DZs/meSR5dwzpV2GvCVSQ9iQtbq3J332rIi8851S/4nvmu+nUsJ+Eiqaiewc9znGYck01W1adLjmIS1OnfnvbZ0n/dSLqE8BZw1Z/vMYZ8kaQUsJeCfA85OsjHJicDlwJ3LMyxJ0mKO+RJKVT2f5P3A3wAnADdW1b5lG9nxoeWln2WyVufuvNeW1vNOVU16DJKkY+AnMSWpKQMuSU2t6YAnuTHJs0kenLPvI0meSnL/8N875zx3zXDbgEeT/OhkRr1088172P+BJI8k2Zfkt+fsX7XzTnLLnPf6QJL75zy3KuYNC879vCSfGeY+neSCYX+S/MEw939Jcv7kRr40C8z7DUn+KckDSf4iyXfMea7Xe15Va/Y/4M3A+cCDc/Z9BPiVeY49F/hn4CRgI/CvwAmTnsMyzvuHgb8DThq2X7cW5n3U878L/OZqm/dLvOd/C7xjePxO4L45j/8KCHAhsGfS41/meX8OeMvw+L3Ab3V9z9f0CryqPg3824iHbwFurqpvVNUXgf3M3k6gnQXm/T5gR1V9Yzjm2WH/ap83MLvqBC4Dbhp2rZp5w4JzL+Dw6vM1wJeHx1uAP6lZnwFOTnLGyox0eS0w73OATw+P7wZ+anjc7j1f0wF/Ce8fvnW8Mckpw775bh2wfuWHNjbnAG9KsifJPyT5gWH/ap/3YW8Cnqmqx4bttTDvDwK/k+RJ4KPANcP+1T73fRy5b9NPc+QDie3mbcBf7OPAdwPnAYeY/bZ6LVgHnMrst8y/Ctw6rErXindzZPW9VrwPuLqqzgKuBnZNeDw
r5b3ALybZC3w78NyEx3PMxn4vlG6q6pnDj5P8MfCpYXO13zrgIHB7zV4M/GySbzJ7o5/VPm+SrAN+Evj+ObtX/byBrcBVw+M/BW4YHq/quVfVI8CPACQ5B9g8PNVu3q7Aj3LUtb6fAA7/9PpO4PIkJyXZCJwNfHalxzdGf87sDzIP/099IrN3aVvt8wZ4G/BIVR2cs28tzPvLwFuGxxcDhy8f3Qn83PDbKBcCX6+qQ5MY4Dgked3w9RXAbwB/NDzV7j1f0yvwJDcBbwVOS3IQuBZ4a5LzmP0BzwHgFwCqal+SW4GHgOeBK6vq/yYw7CVbYN43AjcOv271HLB1WI2v6nlX1S5m7+Pzgssnq+n9hgXf858Hrh++A/kfjtz2+S+Z/U2U/cB/A+9Z8QEvkwXm/eokVw6H3A58Anq+536UXpKa8hKKJDVlwCWpKQMuSU0ZcElqyoBLUlMGXJKaMuCS1NT/A059g8y1NrEyAAAAAElFTkSuQmCC\n" }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "def draw_distribution():\n", " \"\"\"Funkcja rysuje rozkład statystyki testowej\"\"\"\n", " # Losowe dane bo nie jestem pewien co tu dać teraz\n", " dummy = np.random.normal(170, 10, 500)\n", " plt.hist(dummy)\n", " plt.show()\n", "draw_distribution() # To trzeba wywalić potem" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 150, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Statystyka testowa dla jednej próby:\n", "1.414213562373095 - z naszej funkcji\n", "1.414213562373095 - z gotowej biblioteki\n", "\n", "Statystyka testowa dla dwóch prób niezależnych:\n", "-3.0 - z naszej funkcji\n", "-3.0 - z gotowej biblioteki\n", "\n", "Statystyka testowa dla dwóch prób zależnych:\n", "-1.6329931618554525 - z naszej funkcji\n", "-1.632993161855452 - z gotowej biblioteki\n" ] } ], "source": [ "# Testy\n", "dummy = [1, 2, 3, 4, 5]\n", "dummy2 = [4, 5, 6, 7, 8]\n", "dummy3 = [1, 3 , 3, 4, 6]\n", "t_stat_selfmade = t_stat_single(dummy, 2)\n", "t_stat_lib, _ = ttest_1samp(dummy, 2)\n", "print('Statystyka testowa dla jednej próby:')\n", "print(t_stat_selfmade, '- z naszej funkcji')\n", "print(t_stat_lib, '- z gotowej biblioteki')\n", "print()\n", "t_stat_selfmade = t_stat_ind(dummy, dummy2)\n", "t_stat_lib, _ = ttest_ind(dummy, dummy2)\n", "print('Statystyka testowa dla dwóch prób niezależnych:')\n", 
"print(t_stat_selfmade, '- z naszej funkcji')\n", "print(t_stat_lib, '- z gotowej biblioteki')\n", "print()\n", "t_stat_selfmade = t_stat_dep(dummy, dummy3)\n", "t_stat_lib, _ = ttest_rel(dummy, dummy3)\n", "print('Statystyka testowa dla dwóch prób zależnych:')\n", "print(t_stat_selfmade, '- z naszej funkcji')\n", "print(t_stat_lib, '- z gotowej biblioteki')" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } }, { "cell_type": "code", "execution_count": 151, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "t: 6.914346193374633, df: 998, cv: 1.6463818766348755, p: 8.378631122241131e-12\n", "\n", "Reject the null hypothesis that the means are equal.\n", "Reject the null hypothesis that the means are equal.\n" ] } ], "source": [ "dataset = pd.read_csv('experiment_data.csv')\n", "make_decision(dataset, ['Weight', 'Age'])" ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } } } ], "metadata": { "interpreter": { "hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594" }, "kernelspec": { "display_name": "Python 3.9.1 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.1" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }