Bootstrap-t-student/bootstrap-t.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "Bootstrapowa wersja testu t.\n",
    "Implementacja powinna obejmować test dla jednej próby, dla dwóch prób niezależnych oraz dla dwóch prób zależnych.\n",
    "W każdej sytuacji oczekiwanym wejściem jest zbiór danych w odpowiednim formacie, a wyjściem p-wartość oraz ostateczna decyzja.\n",
    "Dodatkowo powinien być rysowany odpowiedni rozkład statystyki testowej."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "Zbiór danych          - ???\n",
    "Hipoteza zerowa       - ???\n",
    "Hipoteza alternatywna - ???"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from math import sqrt\n",
    "from scipy.stats import sem\n",
    "from scipy.stats import t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def generate_bootstraps(data, n_bootstraps=100, rng=None):\n",
    "    \"\"\"Yield bootstrap resamples of the rows of ``data``.\n",
    "\n",
    "    Each resample draws ``data.shape[0]`` row positions uniformly at random\n",
    "    with replacement and selects those rows with ``data.iloc``, so values\n",
    "    that share a row in ``data`` stay together in the resample.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Table whose rows are resampled.\n",
    "    n_bootstraps : int, default 100\n",
    "        Number of resamples to yield.\n",
    "    rng : numpy.random.Generator or numpy.random.RandomState, optional\n",
    "        Source of randomness, e.g. ``np.random.default_rng(42)`` for a\n",
    "        reproducible run. When omitted, the global ``np.random`` state is\n",
    "        used, so ``np.random.seed`` keeps working as before.\n",
    "\n",
    "    Yields\n",
    "    ------\n",
    "    pandas.DataFrame\n",
    "        A resample with as many rows as ``data``.\n",
    "    \"\"\"\n",
    "    # Both the legacy module-level API and Generator objects expose ``choice``.\n",
    "    sampler = np.random if rng is None else rng\n",
    "    n_rows = data.shape[0]\n",
    "    for _ in range(n_bootstraps):\n",
    "        positions = sampler.choice(n_rows, size=n_rows, replace=True)\n",
    "        yield data.iloc[positions, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "outputs": [],
   "source": [
    "def get_t_stat(data1, data2):\n",
    "    \"\"\"Return the t statistic for the difference between two sample means.\n",
    "\n",
    "    The difference ``mean(data1) - mean(data2)`` is divided by the standard\n",
    "    error of that difference: the square root of the sum of the squared\n",
    "    standard errors of the two means.\n",
    "    \"\"\"\n",
    "    standard_errors = (sem(data1), sem(data2))\n",
    "    # Standard error of the difference between the two means.\n",
    "    std_err_diff = sqrt(sum(se**2.0 for se in standard_errors))\n",
    "    mean_diff = np.mean(data1) - np.mean(data2)\n",
    "    return mean_diff / std_err_diff"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def independent_t_test(data, columns, alpha=0.05, n_bootstraps=100):\n",
    "    \"\"\"Two-sided bootstrap t-test for the means of two columns of ``data``.\n",
    "\n",
    "    The t statistic is computed on every bootstrap resample of ``data`` and\n",
    "    averaged over the resamples; the averaged statistic is then compared\n",
    "    with Student's t distribution.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Table holding both samples as columns.\n",
    "    columns : sequence\n",
    "        ``columns[0]`` and ``columns[1]`` are the labels of the two samples.\n",
    "    alpha : float, default 0.05\n",
    "        Significance level of the two-sided test.\n",
    "    n_bootstraps : int, default 100\n",
    "        Number of bootstrap resamples to average over.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        ``(t_stat, df, cv, p)``: mean bootstrap t statistic, degrees of\n",
    "        freedom, two-sided critical value and two-sided p-value.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``n_bootstraps`` is smaller than 1.\n",
    "    \"\"\"\n",
    "    if n_bootstraps < 1:\n",
    "        raise ValueError('n_bootstraps must be at least 1')\n",
    "\n",
    "    t_stats = [\n",
    "        get_t_stat(sample[columns[0]], sample[columns[1]])\n",
    "        for sample in generate_bootstraps(data, n_bootstraps)\n",
    "    ]\n",
    "    # Average over the number of resamples, not over the number of rows.\n",
    "    t_stat = sum(t_stats) / len(t_stats)\n",
    "\n",
    "    data_size = data.shape[0]\n",
    "    df = 2 * data_size - 2\n",
    "    # alpha is split between both tails so that the critical value agrees\n",
    "    # with the two-sided p-value computed below.\n",
    "    cv = t.ppf(1.0 - alpha / 2.0, df)\n",
    "    # The survival function avoids the cancellation of 1 - cdf for tiny p.\n",
    "    p = t.sf(abs(t_stat), df) * 2.0\n",
    "    return t_stat, df, cv, p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "outputs": [],
   "source": [
    "def make_decision(data, columns, alpha=0.05):\n",
    "    \"\"\"Run ``independent_t_test`` and print its results and the decisions.\n",
    "\n",
    "    Prints the statistic, degrees of freedom, critical value and p-value,\n",
    "    followed by two decision lines: one from comparing ``|t|`` with the\n",
    "    critical value and one from comparing the p-value with ``alpha``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data, columns, alpha\n",
    "        Passed unchanged to ``independent_t_test``.\n",
    "    \"\"\"\n",
    "    t_stat, df, cv, p = independent_t_test(data, columns, alpha)\n",
    "    print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\\n')\n",
    "\n",
    "    # Decision based on the critical value.\n",
    "    if abs(t_stat) <= cv:\n",
    "        print('Accept null hypothesis that the means are equal.')\n",
    "    else:\n",
    "        print('Reject the null hypothesis that the means are equal.')\n",
    "\n",
    "    # Decision based on the p-value.\n",
    "    if p > alpha:\n",
    "        print('Accept null hypothesis that the means are equal.')\n",
    "    else:\n",
    "        print('Reject the null hypothesis that the means are equal.')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "outputs": [],
   "source": [
    "def draw_distribution():\n",
    "    \"\"\"Draw the distribution of the test statistic.\n",
    "\n",
    "    Placeholder: the function currently does nothing and returns ``None``.\n",
    "    \"\"\"\n",
    "    return None"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "t: 6.891235313595221, df: 998, cv: 1.6463818766348755, p: 9.78683800667568e-12\n",
      "\n",
      "Reject the null hypothesis that the means are equal.\n",
      "Reject the null hypothesis that the means are equal.\n"
     ]
    }
   ],
   "source": [
    "# Load the data set from 'experiment_data.csv' (path relative to the working directory).\n",
    "dataset = pd.read_csv('experiment_data.csv')\n",
    "# Test the 'Weight' and 'Age' columns for equal means; make_decision prints the outcome.\n",
    "make_decision(dataset, ['Weight', 'Age'])"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
  },
  "kernelspec": {
   "display_name": "Python 3.9.1 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
Initial implementation 2022-05-11 15:02:15 +02:00			`{`
			`"cells": [`
misc. changes 2022-05-13 22:06:56 +02:00			`{`
			`"cell_type": "markdown",`
			`"source": [`
			`"Bootstrapowa wersja testu t.\n",`
			`"Implementacja powinna obejmować test dla jednej próby, dla dwóch prób niezależnych oraz dla dwóch prób zależnych.\n",`
			`"W każdej sytuacji oczekiwanym wejście jest zbiór danych w odpowiednim formacie, a wyjściem p-wartość oraz ostateczna decyzja.\n",`
			`"Dodatkowo powinien być rysowany odpowiedni rozkład statystyki testowej."`
			`],`
			`"metadata": {`
			`"collapsed": false`
			`}`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"source": [`
			`"Zbiór danych - ???\n",`
			`"Hipoteza zerowa - ???\n",`
			`"Hipoteza alternatywna - ???"`
			`],`
			`"metadata": {`
			`"collapsed": false`
			`}`
			`},`
Initial implementation 2022-05-11 15:02:15 +02:00			`{`
			`"cell_type": "code",`
misc. changes 2022-05-13 22:06:56 +02:00			`"execution_count": 50,`
Initial implementation 2022-05-11 15:02:15 +02:00			`"metadata": {`
			`"pycharm": {`
			`"name": "#%%\n"`
			`}`
			`},`
			`"outputs": [],`
			`"source": [`
			`"import numpy as np\n",`
			`"import pandas as pd\n",`
			`"from math import sqrt\n",`
			`"from scipy.stats import sem\n",`
			`"from scipy.stats import t"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
misc. changes 2022-05-13 22:06:56 +02:00			`"execution_count": 51,`
Initial implementation 2022-05-11 15:02:15 +02:00			`"metadata": {`
			`"pycharm": {`
			`"name": "#%%\n"`
			`}`
			`},`
			`"outputs": [],`
			`"source": [`
			`"def generate_bootstraps(data, n_bootstraps=100):\n",`
			`" data_size = data.shape[0]\n",`
misc. changes 2022-05-13 22:06:56 +02:00			`" for _ in range(n_bootstraps):\n",`
			`" indices = np.random.choice(len(data), size=data_size)\n",`
			`" yield data.iloc[indices, :]"`
Initial implementation 2022-05-11 15:02:15 +02:00			`]`
			`},`
			`{`
			`"cell_type": "code",`
misc. changes 2022-05-13 22:06:56 +02:00			`"execution_count": 52,`
Initial implementation 2022-05-11 15:02:15 +02:00			`"outputs": [],`
			`"source": [`
			`"def get_t_stat(data1, data2):\n",`
			`" mean1 = np.mean(data1)\n",`
			`" mean2 = np.mean(data2)\n",`
			`" sem1 = sem(data1)\n",`
			`" sem2 = sem(data2)\n",`
			`"\n",`
			`" sed = sqrt(sem1**2.0 + sem2**2.0)\n",`
			`" return (mean1 - mean2) / sed"`
			`],`
			`"metadata": {`
			`"collapsed": false,`
			`"pycharm": {`
			`"name": "#%%\n"`
			`}`
			`}`
			`},`
			`{`
			`"cell_type": "code",`
misc. changes 2022-05-13 22:06:56 +02:00			`"execution_count": 53,`
Initial implementation 2022-05-11 15:02:15 +02:00			`"metadata": {`
			`"pycharm": {`
			`"name": "#%%\n"`
			`}`
			`},`
			`"outputs": [],`
			`"source": [`
			`"def independent_t_test(data, columns, alpha=0.05):\n",`
			`" t_stat_sum = 0\n",`
			`" for sample in generate_bootstraps(data):\n",`
			`" t_stat_sum += get_t_stat(sample[columns[0]], sample[columns[1]])\n",`
			`"\n",`
			`" data_size = data.shape[0]\n",`
			`" t_stat = t_stat_sum / data_size\n",`
			`" df = 2 * data_size - 2\n",`
			`" cv = t.ppf(1.0 - alpha, df)\n",`
			`" p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0\n",`
			`" return t_stat, df, cv, p"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
misc. changes 2022-05-13 22:06:56 +02:00			`"execution_count": 54,`
Initial implementation 2022-05-11 15:02:15 +02:00			`"outputs": [],`
			`"source": [`
			`"def make_decision(data, columns, alpha=0.05):\n",`
			`" t_stat, df, cv, p = independent_t_test(data, columns, alpha)\n",`
			`" print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\\n')\n",`
			`" if abs(t_stat) <= cv:\n",`
			`"\t print('Accept null hypothesis that the means are equal.')\n",`
			`" else:\n",`
			`" print('Reject the null hypothesis that the means are equal.')\n",`
			`" if p > alpha:\n",`
			`" print('Accept null hypothesis that the means are equal.')\n",`
			`" else:\n",`
			`"\t print('Reject the null hypothesis that the means are equal.')"`
			`],`
			`"metadata": {`
			`"collapsed": false,`
			`"pycharm": {`
			`"name": "#%%\n"`
			`}`
			`}`
			`},`
			`{`
			`"cell_type": "code",`
misc. changes 2022-05-13 22:06:56 +02:00			`"execution_count": 55,`
			`"outputs": [],`
			`"source": [`
			`"def draw_distribution():\n",`
			`" \"\"\"Funkcja rysuje rozkład statystyki testowej\"\"\"\n",`
			`" pass"`
			`],`
			`"metadata": {`
			`"collapsed": false,`
			`"pycharm": {`
			`"name": "#%%\n"`
			`}`
			`}`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 56,`
Initial implementation 2022-05-11 15:02:15 +02:00			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
misc. changes 2022-05-13 22:06:56 +02:00			`"t: 6.891235313595221, df: 998, cv: 1.6463818766348755, p: 9.78683800667568e-12\n",`
Initial implementation 2022-05-11 15:02:15 +02:00			`"\n",`
			`"Reject the null hypothesis that the means are equal.\n",`
			`"Reject the null hypothesis that the means are equal.\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"dataset = pd.read_csv('experiment_data.csv')\n",`
			`"make_decision(dataset, ['Weight', 'Age'])"`
			`],`
			`"metadata": {`
			`"collapsed": false,`
			`"pycharm": {`
			`"name": "#%%\n"`
			`}`
			`}`
			`}`
			`],`
			`"metadata": {`
			`"interpreter": {`
			`"hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"`
			`},`
			`"kernelspec": {`
			`"display_name": "Python 3.9.1 64-bit",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.9.1"`
			`},`
			`"orig_nbformat": 4`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`