Bootstrap-t-student/bootstrap-t.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Projekt - Test t studenta\n",
    "\n",
    "- Marcin Kostrzewski\n",
    "- Krystian Wasilewski\n",
    "- Mateusz Tylka\n",
    "\n",
    "## Test t studenta\n",
    "\n",
    "Metoda statystyczna służąca do porównania dwóch średnich między sobą, gdy znamy liczbę badanych próbek, średnią arytmetyczną oraz wartość odchylenia standardowego lub wariancji.\n",
    "Jest to jeden z mniej skomplikowanych i bardzo często wykorzystywanych testów statystycznych używanych do weryfikacji hipotez. Dzięki niemu możemy dowiedzieć się, czy dwie średnie różnią się jedynie w wyniku przypadku, czy są różne istotnie statystycznie (np. z uwagi na naszą manipulację eksperymentalną).\n",
    "Wyróżniamy 3 wersje testu t:\n",
    "\n",
    "1. test t Studenta dla jednej próby\n",
    "2. test t Studenta dla prób niezależnych\n",
    "3. test t Studenta dla prób zależnych\n",
    "\n",
    "Wszystkie rodzaje testów są testami parametrycznymi, a co za tym idzie nasze mierzone zmienne ilościowe powinny mieć rozkład normalny."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from enum import Enum\n",
    "from math import sqrt\n",
    "from scipy import stats\n",
    "from scipy.stats import sem\n",
    "from scipy.stats import t\n",
    "import matplotlib.pyplot as plt\n",
    "from statistics import mean, stdev\n",
    "from scipy.stats import ttest_ind, ttest_1samp, ttest_rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data from experiment_data.csv (path relative to the working directory).\n",
    "dataset = pd.read_csv(filepath_or_buffer='experiment_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Alternatives(Enum):\n",
    "    \"\"\"\n",
    "    One-sided alternative hypotheses supported by the bootstrap tests.\n",
    "\n",
    "    The value of each member is the string forwarded as the ``alternative``\n",
    "    argument of the scipy t-test functions.\n",
    "    \"\"\"\n",
    "\n",
    "    LESS = 'less'\n",
    "    GREATER = 'greater'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_t_difference(t_stat_sample, t_stat_list, alternative):\n",
    "    \"\"\"\n",
    "    Return the fraction of bootstrap test statistics that differ from the\n",
    "    sample statistic in the direction given by the alternative hypothesis.\n",
    "\n",
    "    @param t_stat_sample: test statistic computed from the original sample\n",
    "    @param t_stat_list: non-empty sequence of bootstrap test statistics\n",
    "    @param alternative: Alternatives.LESS counts bootstrap statistics strictly\n",
    "        smaller than t_stat_sample, Alternatives.GREATER strictly larger ones\n",
    "    @return: a value in [0, 1] (a fraction, not a percentage)\n",
    "    @raise ValueError: if t_stat_list is empty or alternative is unsupported\n",
    "    \"\"\"\n",
    "    all_stats = len(t_stat_list)\n",
    "    if all_stats == 0:\n",
    "        # Fail with a clear message instead of a bare ZeroDivisionError below.\n",
    "        raise ValueError('t_stat_list must contain at least one bootstrap statistic')\n",
    "\n",
    "    # The direction does not change inside the loop, so resolve it once.\n",
    "    if alternative is Alternatives.LESS:\n",
    "        stats_different_count = sum(1 for t_stat_boot in t_stat_list if t_stat_boot < t_stat_sample)\n",
    "    elif alternative is Alternatives.GREATER:\n",
    "        stats_different_count = sum(1 for t_stat_boot in t_stat_list if t_stat_boot > t_stat_sample)\n",
    "    else:\n",
    "        # An unknown alternative used to yield a silent 0.0.\n",
    "        raise ValueError(f'unsupported alternative: {alternative!r}')\n",
    "    return stats_different_count / all_stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def t_test_1_samp(sample_1, population_mean=None, alternative=Alternatives.LESS):\n",
    "    \"\"\"\n",
    "    Run the bootstrap version of the one-sample Student's t-test.\n",
    "\n",
    "    @param sample_1: the observed sample\n",
    "    @param population_mean: hypothesised mean the sample is tested against\n",
    "    @param alternative: Alternatives member selecting the one-sided alternative\n",
    "    @return: tuple (p, statistic of the original sample, list of bootstrap statistics),\n",
    "        where p is the fraction returned by calculate_t_difference\n",
    "    \"\"\"\n",
    "    # NOTE(review): the bootstrap statistics are computed against the same\n",
    "    # population_mean as the observed statistic - confirm this gives the\n",
    "    # intended p-value (the saved outputs show p close to 0.5).\n",
    "    observed_stat = ttest_1samp(a=sample_1, popmean=population_mean, alternative=alternative.value)[0]\n",
    "    bootstrap_stats = get_t_stats(\n",
    "        sample_1,\n",
    "        t_stat_fn=ttest_1samp,\n",
    "        alternative=alternative,\n",
    "        population_mean=population_mean,\n",
    "    )\n",
    "    p_value = calculate_t_difference(observed_stat, bootstrap_stats, alternative)\n",
    "    return p_value, observed_stat, bootstrap_stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "def t_test_ind(sample_1, sample_2, alternative=Alternatives.LESS):\n",
    "    \"\"\"\n",
    "    Run the bootstrap version of Student's t-test for two independent samples.\n",
    "\n",
    "    @param sample_1: first sample\n",
    "    @param sample_2: second sample\n",
    "    @param alternative: Alternatives member selecting the one-sided alternative\n",
    "    @return: tuple (p, statistic of the original samples, list of bootstrap statistics),\n",
    "        where p is the fraction returned by calculate_t_difference\n",
    "    \"\"\"\n",
    "    observed_stat = ttest_ind(sample_1, sample_2, alternative=alternative.value)[0]\n",
    "    bootstrap_stats = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_ind)\n",
    "    p_value = calculate_t_difference(observed_stat, bootstrap_stats, alternative)\n",
    "    return p_value, observed_stat, bootstrap_stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "def t_test_dep(sample_1, sample_2, alternative=Alternatives.LESS):\n",
    "    \"\"\"\n",
    "    Run the bootstrap version of Student's t-test for two dependent (paired) samples.\n",
    "\n",
    "    @param sample_1: first sample\n",
    "    @param sample_2: second sample\n",
    "    @param alternative: Alternatives member selecting the one-sided alternative\n",
    "    @return: tuple (p, statistic of the original samples, list of bootstrap statistics),\n",
    "        where p is the fraction returned by calculate_t_difference\n",
    "    \"\"\"\n",
    "    # The bootstrap statistics are generated first, then the observed statistic.\n",
    "    bootstrap_stats = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_rel)\n",
    "    observed_stat = ttest_rel(sample_1, sample_2, alternative=alternative.value)[0]\n",
    "    p_value = calculate_t_difference(observed_stat, bootstrap_stats, alternative)\n",
    "    return p_value, observed_stat, bootstrap_stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_t_stats(sample_1, sample_2=None, t_stat_fn=ttest_1samp, alternative=Alternatives.LESS, population_mean=None):\n",
    "    \"\"\"\n",
    "    Compute the test statistic for every bootstrap sample drawn from sample_1\n",
    "    (one-sample variant) or from the pooled sample_1 and sample_2 (two-sample variant).\n",
    "\n",
    "    @param sample_1: first (or only) sample\n",
    "    @param sample_2: second sample; None together with t_stat_fn=ttest_1samp\n",
    "        selects the one-sample variant\n",
    "    @param t_stat_fn: scipy test function (ttest_1samp, ttest_ind or ttest_rel)\n",
    "    @param alternative: Alternatives member; its value is forwarded to t_stat_fn\n",
    "    @param population_mean: hypothesised mean, required by the one-sample variant\n",
    "    @return: list with one test statistic per bootstrap sample\n",
    "    @raise ValueError: if population_mean (one-sample) or sample_2 (two-sample) is missing\n",
    "    \"\"\"\n",
    "    # One sample test\n",
    "    if t_stat_fn is ttest_1samp and sample_2 is None:\n",
    "        # Compare with None explicitly: a population mean of 0 is a valid hypothesis.\n",
    "        if population_mean is None:\n",
    "            raise ValueError('population_mean not provided')\n",
    "        return [\n",
    "            t_stat_fn(bootstrap, population_mean, alternative=alternative.value)[0]\n",
    "            for bootstrap in generate_bootstraps(sample_1)\n",
    "        ]\n",
    "\n",
    "    # Two sample test\n",
    "    if sample_2 is None:\n",
    "        # pd.concat would silently drop a None entry and resample sample_1 only.\n",
    "        raise ValueError('sample_2 not provided')\n",
    "\n",
    "    # Each pooled bootstrap sample is cut into two disjoint groups that have the\n",
    "    # sizes of the original samples (rows [0, size_1) and rows [size_1, end)).\n",
    "    size_1 = len(sample_1)\n",
    "    t_stat_list = []\n",
    "    for bootstrap_sample in generate_bootstraps(pd.concat((sample_1, sample_2))):\n",
    "        bootstrap_1 = bootstrap_sample.iloc[:size_1]\n",
    "        bootstrap_2 = bootstrap_sample.iloc[size_1:]\n",
    "        stat, _ = t_stat_fn(bootstrap_1, bootstrap_2, alternative=alternative.value)\n",
    "        t_stat_list.append(stat)\n",
    "    return t_stat_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pretty_print_test(p, t_stat_from_sample, t_stat_list, thesis, alternative, max_print=5):\n",
    "    \"\"\"\n",
    "    Print a report of a bootstrap t-test to standard output.\n",
    "\n",
    "    @param p: value reported as p\n",
    "    @param t_stat_from_sample: test statistic of the original sample\n",
    "    @param t_stat_list: bootstrap test statistics\n",
    "    @param thesis: text describing the null hypothesis\n",
    "    @param alternative: Alternatives.LESS prints the 'smaller mean' alternative,\n",
    "        any other value prints the 'greater mean' alternative\n",
    "    @param max_print: maximum number of bootstrap statistics to print\n",
    "    \"\"\"\n",
    "    alternative_line = (\n",
    "        'Hipoteza alternatywna: średnia jest mniejsza'\n",
    "        if alternative is Alternatives.LESS\n",
    "        else 'Hipoteza alternatywna: średnia jest większa'\n",
    "    )\n",
    "\n",
    "    print('Wyniki bootstrapowej wersji testu T-studenta')\n",
    "    print()\n",
    "    print(f'Hipoteza: {thesis}')\n",
    "    print(alternative_line)\n",
    "    print()\n",
    "    print(f'p: {p}')\n",
    "    print(f'Wartość statystyki testowej z próby: {t_stat_from_sample}')\n",
    "    print('Wartości statystyk z prób boostrapowych:')\n",
    "\n",
    "    total = len(t_stat_list)\n",
    "    # Clamp at zero so that a negative max_print prints no values at all.\n",
    "    shown = max(min(max_print, total), 0)\n",
    "    for value in t_stat_list[:shown]:\n",
    "        print(f'{value}, ', end='')\n",
    "    if max_print < total:\n",
    "        print(f'... (i {total - max_print} pozostałych)')\n",
    "\n",
    "    print()\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test Shapiro Wilka\n",
    "\n",
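    "Wszystkie rodzaje testów t są testami parametrycznymi, a co za tym idzie nasze mierzone zmienne ilościowe powinny mieć rozkład normalny. Założenie to można zweryfikować testem Shapiro-Wilka, którego hipoteza zerowa mówi, że próba pochodzi z populacji o rozkładzie normalnym."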
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Testowanie hipotez metodą bootstrap\n",
    "\n",
    "**Bootstrap** – metoda szacowania (estymacji) wyników poprzez wielokrotne losowanie ze zwracaniem z próby. Polega ona na utworzeniu nowego rozkładu wyników, na podstawie posiadanych danych, poprzez wielokrotne losowanie wartości z posiadanej próby. Metoda ze zwracaniem polega na tym, że po wylosowaniu danej wartości, “wraca” ona z powrotem do zbioru.\n",
    "\n",
    "Metoda bootstrapowa znajduje zastosowanie w sytuacji, w której nie znamy rozkładu populacji, z której pochodzi próbka, lub w przypadku prób małych albo rozkładów asymetrycznych. W takim wypadku, dzięki tej metodzie, wyniki testów parametrycznych i analiz opartych o modele liniowe są bardziej precyzyjne. Zazwyczaj losuje się wiele próbek, np. 2000 czy 5000."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def generate_bootstraps(data, n_bootstraps=100):\n",
    "    \"\"\"\n",
    "    Lazily yield bootstrap samples of ``data``.\n",
    "\n",
    "    Every sample has as many rows as ``data`` and is built from row positions\n",
    "    drawn with np.random.choice (sampling with replacement by default).\n",
    "\n",
    "    @param data: pandas DataFrame or Series to resample\n",
    "    @param n_bootstraps: number of bootstrap samples to generate\n",
    "    @return: generator yielding objects of the same pandas type as ``data``\n",
    "    \"\"\"\n",
    "    data_size = len(data)\n",
    "    for _ in range(n_bootstraps):\n",
    "        indices = np.random.choice(data_size, size=data_size)\n",
    "        # A single positional indexer selects rows of a DataFrame as well as\n",
    "        # elements of a Series, so single columns can be passed in directly.\n",
    "        yield data.iloc[indices]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test t studenta dla jednej próby\n",
    "\n",
    "**Test t Studenta dla jednej próby** wykorzystujemy gdy chcemy porównać średnią “teoretyczną” ze średnią, którą faktycznie możemy zaobserwować w naszej bazie danych. Średnia teoretyczna to średnia pochodząca z innych badań lub po prostu bez większych uzasadnień pochodząca z naszej głowy.\n",
    "\n",
    "Wyobraźmy sobie, że mamy dane z takimi zmiennymi jak wzrost pewnej grupy ludzi. Dzięki testowi t Studenta dla jednej próby możemy dowiedzieć się np. czy wzrost naszego młodszego brata wynoszący 155cm odbiega znacząco od średniej wzrostu tej grupy. Hipoteza zerowa w takim badaniu wyglądałaby następująco H0: Badana próba została wylosowana z populacji, w której wzrost osób wynosi średnio 155cm. Z kolei hipoteza alternatywna będzie brzmiała H1: Badana próba nie została wylosowana z populacji gdzie średni wzrost wynosi 155cm\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def bootstrap_one_sample(sample, population_mean, alternative=Alternatives.LESS):\n",
    "    \"\"\"\n",
    "    Run the one-sample bootstrap t-test and print its report.\n",
    "\n",
    "    @param sample: the observed sample\n",
    "    @param population_mean: hypothesised mean the sample is tested against\n",
    "    @param alternative: Alternatives member selecting the one-sided alternative\n",
    "    @return: the tuple (p, sample statistic, bootstrap statistics) from t_test_1_samp\n",
    "    \"\"\"\n",
    "    p_value, sample_stat, bootstrap_stats = t_test_1_samp(\n",
    "        sample_1=sample, population_mean=population_mean, alternative=alternative\n",
    "    )\n",
    "    thesis = f'średnia jest równa {population_mean}'\n",
    "    pretty_print_test(p_value, sample_stat, bootstrap_stats, thesis, alternative)\n",
    "    print()\n",
    "    return p_value, sample_stat, bootstrap_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sprawdzenie czy osoba o wzroście 165cm pasuje do populacji (nie jest odmieńcem)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Small toy data sets (single-column frames) used by the cells below.\n",
    "dummy = pd.DataFrame(list(range(1, 6)))\n",
    "dummy2 = pd.DataFrame(list(range(4, 9)))\n",
    "dummy3 = pd.DataFrame([1, 3, 3, 4, 6])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wyniki bootstrapowej wersji testu T-studenta\n",
      "\n",
      "Hipoteza: średnia jest równa 165\n",
      "Hipoteza alternatywna: średnia jest mniejsza\n",
      "\n",
      "p: 0.58\n",
      "Wartość statystyki testowej z próby: [-229.1025971]\n",
      "Wartości statystyk z prób boostrapowych:\n",
      "[-316.92367438], [-406.5], [-201.5], [-243.92267776], [-330.27286699], ... (i 95 pozostałych)\n"
     ]
    }
   ],
   "source": [
    "# TODO: adapt the code so that individual columns can be passed in\n",
    "\n",
    "p, t, ts = bootstrap_one_sample(sample=dummy, population_mean=165)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODO: Wniosek"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test t studenta dla prób niezależnych\n",
    "\n",
    "**Test t Studenta dla prób niezależnych** jest najczęściej stosowaną metodą statystyczną w celu porównania średnich z dwóch niezależnych od siebie grup. Wykorzystujemy go gdy chcemy porównać dwie grupy pod względem jakiejś zmiennej ilościowej. Na przykład gdy chcemy porównać średni wzrost kobiet i mężczyzn w danej grupie.\n",
    "\n",
    "Zazwyczaj dwie średnie z różnych od siebie grup będą się różnić. Test t Studenta powie nam jednak, czy owe różnice są istotne statystycznie – czy nie są przypadkowe. Hipoteza zerowa takiego testu będzie brzmiała H0: Średni wzrost w grupie mężczyzn jest taki sam jak średni wzrost w grupie kobiet. Hipoteza alternatywna z kolei H1: Kobiety będą różnić się od mężczyzn pod względem wzrostu.\n",
    "Jeśli wynik testu t Studenta będzie istotny na poziomie p < 0,05 możemy odrzucić hipotezę zerową na rzecz hipotezy alternatywnej.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def bootstrap_independent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
    "    \"\"\"\n",
    "    Run the bootstrap t-test for two independent samples and print its report.\n",
    "\n",
    "    @param sample_1: first sample\n",
    "    @param sample_2: second sample\n",
    "    @param alternative: Alternatives member selecting the one-sided alternative\n",
    "    @return: the tuple (p, sample statistic, bootstrap statistics) from t_test_ind\n",
    "    \"\"\"\n",
    "    p_value, sample_stat, bootstrap_stats = t_test_ind(\n",
    "        sample_1=sample_1, sample_2=sample_2, alternative=alternative\n",
    "    )\n",
    "    pretty_print_test(p_value, sample_stat, bootstrap_stats, 'średnie są takie same', alternative)\n",
    "    return p_value, sample_stat, bootstrap_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**TODO:** Wyciągnąć wysokości kobiet i mężczyzn oraz poprawić kod, aby można było podawać kolumny.\n",
    "\n",
    "```python\n",
    "t_stat, df, cv, p, _ = bootstrap_independent(dataset, dataset)\n",
    "pretty_print_full_stats(t_stat, df, cv, p)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODO: Wniosek"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test t studenta dla prób zależnych\n",
    "\n",
    "W odróżnieniu od testu t Studenta dla prób niezależnych, gdzie porównujemy dwie grupy, ten rodzaj testu stosujemy, gdy poddajemy analizie tę samą pojedynczą grupę, ale dwukrotnie w czasie. Na przykład gdy chcemy porównać średnie wagi grupy osób przed dietą oraz po diecie, aby sprawdzić, czy dieta spowodowała istotne statystycznie zmiany.\n",
    "\n",
    "Hipoteza zerowa takiego testu będzie brzmiała H0: Średnia waga osób po diecie jest taka sama jak przed dietą. Hipoteza alternatywna z kolei H1: Dieta znacząco wpłynęła na średnią wagę danej grupy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def bootstrap_dependent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
    "    \"\"\"\n",
    "    Run the bootstrap t-test for two dependent samples and print its report.\n",
    "\n",
    "    @param sample_1: first sample\n",
    "    @param sample_2: second sample\n",
    "    @param alternative: Alternatives member selecting the one-sided alternative\n",
    "    @return: the tuple (p, sample statistic, bootstrap statistics) from t_test_dep\n",
    "    \"\"\"\n",
    "    p_value, sample_stat, bootstrap_stats = t_test_dep(\n",
    "        sample_1=sample_1, sample_2=sample_2, alternative=alternative\n",
    "    )\n",
    "    pretty_print_test(p_value, sample_stat, bootstrap_stats, 'średnie są takie same', alternative)\n",
    "    return p_value, sample_stat, bootstrap_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**TODO:** Wyciągnąć wagi przed dietą i po oraz poprawić kod, aby można było podawać kolumny.\n",
    "\n",
    "```python\n",
    "t_stat, df, cv, p, _ = bootstrap_dependent(dataset, dataset)\n",
    "pretty_print_full_stats(t_stat, df, cv, p)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODO: Wniosek"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Wykresy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "def draw_distribution(stats):\n",
    "    \"\"\"\n",
    "    Plot a histogram of test statistics on the current axes and show the figure.\n",
    "\n",
    "    @param stats: list of test statistics\n",
    "    \"\"\"\n",
    "    axes = plt.gca()\n",
    "    axes.hist(stats)\n",
    "    axes.set_xlabel('Test statistic value')\n",
    "    axes.set_ylabel('Frequency')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Testy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Statystyki dla jednej próby:\n",
      "Wyniki bootstrapowej wersji testu T-studenta\n",
      "\n",
      "Hipoteza: średnia jest równa 2\n",
      "Hipoteza alternatywna: średnia jest mniejsza\n",
      "\n",
      "p: 0.52\n",
      "Wartość statystyki testowej z próby: [1.41421356]\n",
      "Wartości statystyk z prób boostrapowych:\n",
      "[-0.53452248], [0.], [-1.63299316], [1.5], [2.3590713], ... (i 95 pozostałych)\n",
      "Statystyki dla dwóch prób zależnych:\n",
      "Wyniki bootstrapowej wersji testu T-studenta\n",
      "\n",
      "Hipoteza: średnie są takie same\n",
      "Hipoteza alternatywna: średnia jest mniejsza\n",
      "\n",
      "p: 0.0\n",
      "Wartość statystyki testowej z próby: [10.61445555]\n",
      "Wartości statystyk z prób boostrapowych:\n",
      "[nan], [nan], [nan], [nan], [nan], ... (i 95 pozostałych)\n",
      "Statystyki dla dwóch prób niezależnych:\n",
      "Wyniki bootstrapowej wersji testu T-studenta\n",
      "\n",
      "Hipoteza: średnie są takie same\n",
      "Hipoteza alternatywna: średnia jest mniejsza\n",
      "\n",
      "p: 1.0\n",
      "Wartość statystyki testowej z próby: [2.4140394]\n",
      "Wartości statystyk z prób boostrapowych:\n",
      "[0.], [0.], [0.], [0.], [0.], ... (i 95 pozostałych)\n"
     ]
    }
   ],
   "source": [
    "# Bootstrap tests run on the toy data sets\n",
    "\n",
    "\n",
    "print('Statystyki dla jednej próby:')\n",
    "p, t, ts = bootstrap_one_sample(sample=dummy, population_mean=2)\n",
    "\n",
    "print('Statystyki dla dwóch prób zależnych:')\n",
    "p, t, ts = bootstrap_dependent(sample_1=dummy2, sample_2=dummy3)\n",
    "\n",
    "print('Statystyki dla dwóch prób niezależnych:')\n",
    "p, t, ts = bootstrap_independent(sample_1=dummy2, sample_2=dummy3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
  },
  "kernelspec": {
   "display_name": "Python 3.8.10 64-bit",
   "metadata": {
    "interpreter": {
     "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
    }
   },
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								{
 								 "cells": [
-												misc. changes

											
										
										
											2022-05-13 22:06:56 +02:00
+								  {
 								   "cell_type": "markdown",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "metadata": {},
-												misc. changes

											
										
										
											2022-05-13 22:06:56 +02:00
+								   "source": [
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "# Projekt - Test t studenta\n",
 								    "\n",
 								    "- Marcin Kostrzewski\n",
 								    "- Krystian Wasilewski\n",
 								    "- Mateusz Tylka\n",
 								    "\n",
 								    "## Test t studenta\n",
 								    "\n",
 								    "Metoda statystyczna służącą do porównania dwóch średnich między sobą gdy znamy liczbę badanych próbek, średnią arytmetyczną oraz wartość odchylenia standardowego lub wariancji.\n",
 								    "Jest to jeden z mniej skomplikowanych i bardzo często wykorzystywanych testów statystycznych używanych do weryfikacji hipotez. Dzięki niemu możemy dowiedzieć się czy dwie różne średnie są różne niechcący (w wyniku przypadku) czy są różne istotnie statystycznie (np. z uwagi na naszą manipulację eksperymentalna).\n",
 								    "Wyróżniamy 3 wersję testu t:\n",
 								    "\n",
 								    "1. test t Studenta dla jednej próby\n",
 								    "2. test t Studenta dla prób niezależnych\n",
 								    "3. test t Studenta dla prób zależnych\n",
 								    "\n",
 								    "Wszystkie rodzaje testów są testami parametrycznymi, a co za tym idzie nasze mierzone zmienne ilościowe powinny mieć rozkład normalny."
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
-												misc. changes

											
										
										
											2022-05-13 22:06:56 +02:00
+								  },
 								  {
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 61,
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   "metadata": {
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "pycharm": {
 								     "name": "#%%\n"
 								    }
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   },
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "outputs": [],
-												misc. changes

											
										
										
											2022-05-13 22:06:56 +02:00
+								   "source": [
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "import numpy as np\n",
 								    "import pandas as pd\n",
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "from enum import Enum\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "from math import sqrt\n",
-												test shapiro wilka

											
										
										
											2022-05-17 17:58:54 +02:00
+								    "from scipy import stats\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "from scipy.stats import sem\n",
 								    "from scipy.stats import t\n",
 								    "import matplotlib.pyplot as plt\n",
 								    "from statistics import mean, stdev\n",
 								    "from scipy.stats import ttest_ind, ttest_1samp, ttest_rel"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
-												misc. changes

											
										
										
											2022-05-13 22:06:56 +02:00
+								  },
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 62,
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "metadata": {},
-												TODOs

											
										
										
											2022-05-17 13:58:25 +02:00
+								   "outputs": [],
 								   "source": [
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "dataset = pd.read_csv('experiment_data.csv')"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 63,
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "class Alternatives(Enum):\n",
 								    "    LESS = 'less'\n",
 								    "    GREATER = 'greater'"
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   ]
-												TODOs

											
										
										
											2022-05-17 13:58:25 +02:00
+								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 64,
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "def calculate_t_difference(t_stat_sample, t_stat_list, alternative):\n",
 								    "    \"\"\"\n",
 								    "    Funkcja oblicza procent statystyk testowych powstałych z prób bootstrapowych, \n",
 								    "    które róznią się od statystyki testowej powstałej ze zbioru według hipotezy alternatywnej.\n",
 								    "    \"\"\"\n",
 								    "    all_stats = len(t_stat_list)\n",
 								    "    stats_different_count = 0\n",
 								    "    for t_stat_boot in t_stat_list:\n",
 								    "        if alternative is Alternatives.LESS and t_stat_boot < t_stat_sample:\n",
 								    "            stats_different_count += 1 \n",
 								    "        elif alternative is Alternatives.GREATER and t_stat_boot > t_stat_sample:\n",
 								    "            stats_different_count += 1\n",
 								    "    return stats_different_count / all_stats"
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 65,
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								   "metadata": {
 								    "pycharm": {
 								     "name": "#%%\n"
 								    }
 								   },
 								   "outputs": [],
 								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "def t_test_1_samp(sample_1, population_mean=None, alternative=Alternatives.LESS):\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "    \"\"\"\n",
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "    Funkcja przeprowadza test T-studenta dla jednej zmiennej.\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "    \"\"\"\n",
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "    t_stat_from_sample, _ = ttest_1samp(a=sample_1, popmean=population_mean, alternative=alternative.value)\n",
 								    "    t_stat_list = get_t_stats(sample_1, t_stat_fn=ttest_1samp, alternative=alternative, population_mean=population_mean)\n",
 								    "\n",
 								    "    p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "\n",
-												Return sample t-stat

											
										
										
											2022-05-17 20:56:02 +02:00
+								    "    return p, t_stat_from_sample, t_stat_list"
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 66,
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "def t_test_ind(sample_1, sample_2, alternative=Alternatives.LESS):\n",
 								    "    \"\"\"\n",
 								    "    Funkcja przeprowadza test T-studenta dla dwóch zmiennych niezależnych.\n",
 								    "    \"\"\"\n",
 								    "    t_stat_from_sample, _ = ttest_ind(sample_1, sample_2, alternative=alternative.value)\n",
 								    "    t_stat_list = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_ind)\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "\n",
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "    p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "\n",
-												Return sample t-stat

											
										
										
											2022-05-17 20:56:02 +02:00
+								    "    return p, t_stat_from_sample, t_stat_list"
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 67,
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "def t_test_dep(sample_1, sample_2, alternative=Alternatives.LESS):\n",
 								    "    \"\"\"\n",
 								    "    Funkcja przeprowadza test T-studenta dla dwóch zmiennych zależnych.\n",
 								    "    \"\"\"\n",
 								    "    t_stat_list = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_rel)\n",
 								    "    t_stat_from_sample, _ = ttest_rel(sample_1, sample_2, alternative=alternative.value)\n",
 								    "\n",
 								    "    p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
 								    "\n",
-												Return sample t-stat

											
										
										
											2022-05-17 20:56:02 +02:00
+								    "    return p, t_stat_from_sample, t_stat_list"
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 68,
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "def get_t_stats(sample_1, sample_2=None, t_stat_fn=ttest_1samp, alternative=Alternatives.LESS, population_mean=None):\n",
 								    "    \"\"\"Compute the test statistic of every bootstrap sample drawn from the input data.\n",
 								    "\n",
 								    "    One-sample mode (t_stat_fn is ttest_1samp and sample_2 is None): every bootstrap\n",
 								    "    sample of sample_1 is tested against population_mean.\n",
 								    "    Two-sample mode: sample_1 and sample_2 are pooled, the pool is resampled and every\n",
 								    "    resample is split into two disjoint parts that are passed to t_stat_fn.\n",
 								    "\n",
 								    "    Returns the list of statistics, one per bootstrap sample.\n",
 								    "    Raises Exception when population_mean is missing in one-sample mode and\n",
 								    "    ValueError when sample_2 is missing in two-sample mode.\n",
 								    "    \"\"\"\n",
 								    "    t_stat_list = []\n",
 								    "\n",
 								    "    # One sample test\n",
 								    "    if t_stat_fn is ttest_1samp and sample_2 is None:\n",
 								    "        # Compare with None explicitly: a population mean of 0 is a legitimate value.\n",
 								    "        if population_mean is None:\n",
 								    "            raise Exception(\"population_mean not provided\")\n",
 								    "        for bootstrap in generate_bootstraps(sample_1):\n",
 								    "            stat, _ = t_stat_fn(bootstrap, population_mean, alternative=alternative.value)\n",
 								    "            t_stat_list.append(stat)\n",
 								    "        return t_stat_list\n",
 								    "\n",
 								    "    # Two sample test\n",
 								    "    if sample_2 is None:\n",
 								    "        raise ValueError(\"sample_2 not provided\")\n",
 								    "    # Split each resample at len(sample_1) so both parts keep the original group sizes.\n",
 								    "    split_at = len(sample_1)\n",
 								    "    for bootstrap_sample in generate_bootstraps(pd.concat((sample_1, sample_2))):\n",
 								    "        # The parts must be disjoint: taking the same leading rows twice makes the statistic degenerate (0 or NaN).\n",
 								    "        bootstrap_1, bootstrap_2 = bootstrap_sample.iloc[:split_at], bootstrap_sample.iloc[split_at:]\n",
 								    "        stat, _ = t_stat_fn(bootstrap_1, bootstrap_2, alternative=alternative.value)\n",
 								    "        t_stat_list.append(stat)\n",
 								    "    return t_stat_list"
 								   ]
 								  },
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								  {
 								   "cell_type": "code",
 								   "execution_count": 69,
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "def pretty_print_test(p, t_stat_from_sample, t_stat_list, thesis, alternative, max_print=5):\n",
 								    "    \"\"\"Print a human-readable report of a bootstrap t-test.\n",
 								    "\n",
 								    "    The report shows the tested hypothesis, the alternative, the p value, the statistic of\n",
 								    "    the original sample and at most max_print statistics taken from t_stat_list.\n",
 								    "    \"\"\"\n",
 								    "    print('Wyniki bootstrapowej wersji testu T-studenta')\n",
 								    "    print()\n",
 								    "    print(f'Hipoteza: {thesis}')\n",
 								    "    print(\n",
 								    "        'Hipoteza alternatywna: średnia jest mniejsza'\n",
 								    "        if alternative is Alternatives.LESS\n",
 								    "        else 'Hipoteza alternatywna: średnia jest większa'\n",
 								    "    )\n",
 								    "    print()\n",
 								    "    print(f'p: {p}')\n",
 								    "    print(f'Wartość statystyki testowej z próby: {t_stat_from_sample}')\n",
 								    "    print('Wartości statystyk z prób boostrapowych:')\n",
 								    "\n",
 								    "    total = len(t_stat_list)\n",
 								    "    shown = min(max_print, total)\n",
 								    "    for index in range(shown):\n",
 								    "        print(f'{t_stat_list[index]}, ', end='')\n",
 								    "    if max_print < total:\n",
 								    "        # Tell the reader how many statistics were left out of the listing.\n",
 								    "        print(f'... (i {total - max_print} pozostałych)')\n",
 								    "\n",
 								    "    print()\n",
 								    "    print()"
 								   ]
 								  },
-												test shapiro wilka

											
										
										
											2022-05-17 17:58:54 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Test Shapiro Wilka\n",
 								    "\n",
 								    "Wszystkie rodzaje testów są testami parametrycznymi, a co za tym idzie nasze mierzone zmienne ilościowe powinny mieć rozkład normalny."
 								   ]
 								  },
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Testowanie hipotez metodą bootstrap\n",
 								    "\n",
 								    "**Bootstrap** – metoda szacowania (estymacji) wyników poprzez wielokrotne losowanie ze zwracaniem z próby. Polega ona na utworzeniu nowego rozkładu wyników, na podstawie posiadanych danych, poprzez wielokrotne losowanie wartości z posiadanej próby. Metoda ze zwracaniem polega na tym, że po wylosowaniu danej wartości, “wraca” ona z powrotem do zbioru.\n",
 								    "\n",
 								    "Metoda bootstrapowa znajduje zastosowanie w sytuacji, w której nie znamy rozkładu populacji, z której pochodzi próbka, lub w przypadku prób małych bądź o rozkładzie asymetrycznym. W takim wypadku, dzięki tej metodzie, wyniki testów parametrycznych i analiz opartych o modele liniowe są bardziej precyzyjne. Zazwyczaj losuje się wiele próbek, np. 2000 czy 5000."
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 70,
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								   "metadata": {
 								    "pycharm": {
 								     "name": "#%%\n"
 								    }
 								   },
 								   "outputs": [],
 								   "source": [
 								    "def generate_bootstraps(data, n_bootstraps=100):\n",
 								    "    data_size = data.shape[0]\n",
-												misc. changes

											
										
										
											2022-05-13 22:06:56 +02:00
+								    "    for _ in range(n_bootstraps):\n",
 								    "        indices =  np.random.choice(len(data), size=data_size)\n",
 								    "        yield data.iloc[indices, :]"
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								   ]
 								  },
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Test t studenta dla jednej próby\n",
 								    "\n",
 								    "**Test t Studenta dla jednej próby** wykorzystujemy gdy chcemy porównać średnią “teoretyczną” ze średnią, którą faktycznie możemy zaobserwować w naszej bazie danych. Średnia teoretyczna to średnia pochodząca z innych badań lub po prostu bez większych uzasadnień pochodząca z naszej głowy.\n",
 								    "\n",
 								    "Wyobraźmy sobie, że mamy dane z takimi zmiennymi jak wzrost pewnej grupy ludzi. Dzięki testowi t Studenta dla jednej próby możemy dowiedzieć się np. czy wzrost naszego młodszego brata wynoszący 155cm odbiega znacząco od średniej wzrostu tej grupy. Hipoteza zerowa w takim badaniu wyglądałaby następująco H0: Badana próba została wylosowana z populacji, w której wzrost osób wynosi średnio 155cm. Z kolei hipoteza alternatywna będzie brzmiała H1: Badana próba nie została wylosowana z populacji gdzie średni wzrost wynosi 155cm\n"
 								   ]
 								  },
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 71,
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "pycharm": {
 								     "name": "#%%\n"
 								    }
 								   },
-												test statistic function for one sample

											
										
										
											2022-05-14 15:31:47 +02:00
+								   "outputs": [],
 								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "def bootstrap_one_sample(sample, population_mean, alternative=Alternatives.LESS):\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "    p, t, ts = t_test_1_samp(\n",
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "        sample_1=sample,\n",
 								    "        population_mean=population_mean,\n",
 								    "        alternative=alternative,\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "    )\n",
 								    "    \n",
 								    "    pretty_print_test(p, t, ts, f'średnia jest równa {population_mean}', alternative)\n",
 								    "    print()\n",
 								    "    return p, t, ts"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
 								  },
 								  {
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								   "cell_type": "markdown",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "metadata": {},
 								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "### Sprawdzenie czy osoba o wzroście 165cm pasuje do populacji (nie jest odmieńcem)"
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 72,
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "metadata": {},
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								   "outputs": [],
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
 								    "# Small hand-made frames used as example inputs for the bootstrap test calls in later cells.\n",
+								    "dummy = pd.DataFrame([1, 2, 3, 4, 5])\n",
 								    "dummy2 = pd.DataFrame([4, 5, 6, 7, 8])\n",
 								    "dummy3 = pd.DataFrame([1, 3 , 3, 4, 6])"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 73,
-												test statistic function for one sample

											
										
										
											2022-05-14 15:31:47 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "pycharm": {
 								     "name": "#%%\n"
 								    }
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   },
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "outputs": [
 								    {
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								     "name": "stdout",
-												Return sample t-stat

											
										
										
											2022-05-17 20:56:02 +02:00
+								     "output_type": "stream",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								     "text": [
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								      "Wyniki bootstrapowej wersji testu T-studenta\n",
 								      "\n",
 								      "Hipoteza: średnia jest równa 165\n",
 								      "Hipoteza alternatywna: średnia jest mniejsza\n",
 								      "\n",
 								      "p: 0.58\n",
 								      "Wartość statystyki testowej z próby: [-229.1025971]\n",
 								      "Wartości statystyk z prób boostrapowych:\n",
 								      "[-316.92367438], [-406.5], [-201.5], [-243.92267776], [-330.27286699], ... (i 95 pozostałych)\n"
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								     ]
 								    }
 								   ],
-												test statistic function for one sample

											
										
										
											2022-05-14 15:31:47 +02:00
+								   "source": [
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "#TODO: poprawić kod aby można było podawać kolumny\n",
 								    "\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "p, t, ts = bootstrap_one_sample(dummy, 165)"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
-												test statistic function for one sample

											
										
										
											2022-05-14 15:31:47 +02:00
+								  },
-												test shapiro wilka

											
										
										
											2022-05-17 17:58:54 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "TODO: Wniosek"
 								   ]
 								  },
-												declare test functions

											
										
										
											2022-05-14 17:09:29 +02:00
+								  {
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "cell_type": "markdown",
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   "metadata": {},
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "source": [
 								    "## Test t studenta dla prób niezależnych\n",
 								    "\n",
 								    "**Test t Studenta dla prób niezależnych** jest najczęściej stosowaną metodą statystyczną w celu porównania średnich z dwóch niezależnych od siebie grup. Wykorzystujemy go gdy chcemy porównać dwie grupy pod względem jakiejś zmiennej ilościowej. Na przykład gdy chcemy porównać średni wzrost kobiet i mężczyzn w danej grupie.\n",
 								    "\n",
 								    "Zazwyczaj dwie średnie z różnych od siebie grup będą się różnić. Test t Studenta powie nam jednak, czy owe różnice są istotne statystycznie – czy nie są przypadkowe. Hipoteza zerowa takiego testu będzie brzmiała H0: Średni wzrost w grupie mężczyzn jest taki sam jak średni wzrost w grupie kobiet. Hipoteza alternatywna z kolei H1: Kobiety będą różnić się od mężczyzn pod względem wzrostu.\n",
 								    "Jeśli wynik testu t Studenta będzie istotny na poziomie p < 0,05 możemy odrzucić hipotezę zerową na rzecz hipotezy alternatywnej.\n"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 74,
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "pycharm": {
 								     "name": "#%%\n"
 								    }
 								   },
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   "outputs": [],
 								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "def bootstrap_independent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "    p, t, ts = t_test_ind(\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "        sample_1=sample_1,\n",
 								    "        sample_2=sample_2,\n",
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "        alternative=alternative,\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "    )\n",
 								    "    \n",
 								    "    pretty_print_test(p, t, ts, 'średnie są takie same', alternative)\n",
 								    "    return p, t, ts"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
 								  },
 								  {
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "cell_type": "markdown",
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   "metadata": {},
 								   "source": [
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "# TODO: Wyciągnąć wysokości kobiet i mężczyzn oraz poprawić kod aby można było podawać kolumny\n",
 								    "t_stat, df, cv, p, _ = bootstrap_independent(dataset, dataset)\n",
 								    "pretty_print_full_stats(t_stat, df, cv, p)"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
 								  },
-												test shapiro wilka

											
										
										
											2022-05-17 17:58:54 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "TODO: Wniosek"
 								   ]
 								  },
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								  {
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "cell_type": "markdown",
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   "metadata": {},
 								   "source": [
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "## Test t studenta dla prób zależnych\n",
 								    "\n",
 								    "W odróżnieniu od testu t Studenta dla prób niezależnych, gdzie porównujemy dwie grupy, ten rodzaj testu stosujemy, gdy poddajemy analizie tę samą pojedynczą grupę, ale dwukrotnie w czasie. Na przykład gdy chcemy porównać średnie wagi grupy osób przed dietą oraz po diecie, aby sprawdzić, czy dieta spowodowała statystycznie istotne zmiany.\n",
 								    "\n",
 								    "Hipoteza zerowa takiego testu będzie brzmiała H0: Średnia waga osób po diecie jest taka sama jak przed dietą. Hipoteza alternatywna z kolei H1: Dieta znacząco wpłynęła na średnią wagę danej grupy."
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 75,
-												declare test functions

											
										
										
											2022-05-14 17:09:29 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "pycharm": {
 								     "name": "#%%\n"
 								    }
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "def bootstrap_dependent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "    p, t, ts = t_test_dep(\n",
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								    "        sample_1=sample_1,\n",
 								    "        sample_2=sample_2,\n",
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								    "        alternative=alternative,\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "    )\n",
 								    "    \n",
 								    "    pretty_print_test(p, t, ts, 'średnie są takie same', alternative)\n",
 								    "    return p, t, ts"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
 								  },
 								  {
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "cell_type": "markdown",
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   "metadata": {},
 								   "source": [
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "# TODO: Wyciągnąć wagi przed dietą i po oraz poprawić kod aby można było podawać kolumny\n",
 								    "t_stat, df, cv, p, _ = bootstrap_dependent(dataset, dataset)\n",
 								    "pretty_print_full_stats(t_stat, df, cv, p)"
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								   ]
 								  },
-												test shapiro wilka

											
										
										
											2022-05-17 17:58:54 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "TODO: Wniosek"
 								   ]
 								  },
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								  {
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								   "cell_type": "markdown",
 								   "metadata": {},
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								   "source": [
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "## Wykresy"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 76,
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "pycharm": {
 								     "name": "#%%\n"
 								    }
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   },
-												histogram

											
										
										
											2022-05-16 18:52:49 +02:00
+								   "outputs": [],
-												misc. changes

											
										
										
											2022-05-13 22:06:56 +02:00
+								   "source": [
-												TODOs

											
										
										
											2022-05-17 13:58:25 +02:00
+								    "def draw_distribution(stats):\n",
-												histogram

											
										
										
											2022-05-16 18:52:49 +02:00
+								    "    \"\"\"\n",
 								    "    Funkcja rysuje rozkład statystyki testowej\n",
-												fixes

											
										
										
											2022-05-17 16:21:32 +02:00
+								    "    @param stats: lista statystyk testowych\n",
-												histogram

											
										
										
											2022-05-16 18:52:49 +02:00
+								    "    \"\"\"\n",
 								    "    plt.hist(stats)\n",
 								    "    plt.xlabel('Test statistic value')\n",
 								    "    plt.ylabel('Frequency')\n",
 								    "    plt.show()"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
 								  },
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Testy"
 								   ]
 								  },
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								  {
 								   "cell_type": "code",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "execution_count": 77,
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   "metadata": {},
-												test statistic functions for all 3 tests

											
										
										
											2022-05-14 16:40:40 +02:00
+								   "outputs": [
 								    {
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								     "name": "stdout",
-												Return sample t-stat

											
										
										
											2022-05-17 20:56:02 +02:00
+								     "output_type": "stream",
-												test statistic functions for all 3 tests

											
										
										
											2022-05-14 16:40:40 +02:00
+								     "text": [
-												fixes

											
										
										
											2022-05-17 16:21:32 +02:00
+								      "Statystyki dla jednej próby:\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								      "Wyniki bootstrapowej wersji testu T-studenta\n",
 								      "\n",
 								      "Hipoteza: średnia jest równa 2\n",
 								      "Hipoteza alternatywna: średnia jest mniejsza\n",
 								      "\n",
 								      "p: 0.52\n",
 								      "Wartość statystyki testowej z próby: [1.41421356]\n",
 								      "Wartości statystyk z prób boostrapowych:\n",
 								      "[-0.53452248], [0.], [-1.63299316], [1.5], [2.3590713], ... (i 95 pozostałych)\n",
-												fixes

											
										
										
											2022-05-17 16:21:32 +02:00
+								      "Statystyki dla dwóch prób zależnych:\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								      "Wyniki bootstrapowej wersji testu T-studenta\n",
 								      "\n",
 								      "Hipoteza: średnie są takie same\n",
 								      "Hipoteza alternatywna: średnia jest mniejsza\n",
 								      "\n",
 								      "p: 0.0\n",
 								      "Wartość statystyki testowej z próby: [10.61445555]\n",
 								      "Wartości statystyk z prób boostrapowych:\n",
 								      "[nan], [nan], [nan], [nan], [nan], ... (i 95 pozostałych)\n",
-												fixes

											
										
										
											2022-05-17 16:21:32 +02:00
+								      "Statystyki dla dwóch prób niezależnych:\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								      "Wyniki bootstrapowej wersji testu T-studenta\n",
 								      "\n",
 								      "Hipoteza: średnie są takie same\n",
 								      "Hipoteza alternatywna: średnia jest mniejsza\n",
 								      "\n",
 								      "p: 1.0\n",
 								      "Wartość statystyki testowej z próby: [2.4140394]\n",
 								      "Wartości statystyk z prób boostrapowych:\n",
 								      "[0.], [0.], [0.], [0.], [0.], ... (i 95 pozostałych)\n"
-												histogram

											
										
										
											2022-05-16 18:52:49 +02:00
+								     ]
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								    }
 								   ],
 								   "source": [
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								    "# Testy z bootstrappowaniem\n",
 								    "\n",
-												merge presentation with jupyter

											
										
										
											2022-05-17 17:27:59 +02:00
+								    "\n",
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								    "print('Statystyki dla jednej próby:')\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "p, t, ts = bootstrap_one_sample(dummy, 2)\n",
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								    "\n",
 								    "print('Statystyki dla dwóch prób zależnych:')\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "p, t, ts = bootstrap_dependent(dummy2, dummy3)\n",
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								    "\n",
 								    "print('Statystyki dla dwóch prób niezależnych:')\n",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								    "p, t, ts = bootstrap_independent(dummy2, dummy3)"
-												Implemented bootstrap tests

											
										
										
											2022-05-16 23:34:31 +02:00
+								   ]
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "metadata": {},
 								   "outputs": [],
 								   "source": []
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								  }
 								 ],
 								 "metadata": {
 								  "interpreter": {
 								   "hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
 								  },
 								  "kernelspec": {
-												Rewritten to proper bootstrap

											
										
										
											2022-05-17 19:40:13 +02:00
+								   "display_name": "Python 3.8.10 64-bit",
 								   "metadata": {
 								    "interpreter": {
 								     "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
 								    }
-												Return sample t-stat

											
										
										
											2022-05-17 20:56:02 +02:00
+								   },
 								   "name": "python3"
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								  },
 								  "language_info": {
 								   "codemirror_mode": {
 								    "name": "ipython",
 								    "version": 3
 								   },
 								   "file_extension": ".py",
 								   "mimetype": "text/x-python",
 								   "name": "python",
 								   "nbconvert_exporter": "python",
 								   "pygments_lexer": "ipython3",
-												Add pretty prints

											
										
										
											2022-05-17 21:08:54 +02:00
+								   "version": "3.9.1"
-												Initial implementation

											
										
										
											2022-05-11 15:02:15 +02:00
+								  },
 								  "orig_nbformat": 4
 								 },
 								 "nbformat": 4,
 								 "nbformat_minor": 2
-												Return sample t-stat

											
										
										
											2022-05-17 20:56:02 +02:00
+								}