Merge pull request 'Rewritten to proper bootstrap' (#3) from fixes into main

Reviewed-on: #3
This commit is contained in:
Marcin Kostrzewski 2022-05-17 22:43:37 +02:00
commit f4f61b0876

View File

@ -25,7 +25,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 62, "execution_count": 68,
"metadata": { "metadata": {
"pycharm": { "pycharm": {
"name": "#%%\n" "name": "#%%\n"
@ -35,18 +35,14 @@
"source": [ "source": [
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"from math import sqrt\n",
"from scipy import stats\n",
"from scipy.stats import sem\n",
"from scipy.stats import t\n",
"import matplotlib.pyplot as plt\n", "import matplotlib.pyplot as plt\n",
"from statistics import mean, stdev\n", "from enum import Enum\n",
"from scipy.stats import ttest_ind, ttest_1samp, ttest_rel" "from scipy.stats import ttest_ind, ttest_1samp, ttest_rel, shapiro"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 86, "execution_count": 69,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -55,29 +51,39 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 50, "execution_count": 70,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def calculate_p(t_stat, df):\n", "class Alternatives(Enum):\n",
" \"\"\"Funkcja oblicza wartość *p* na podstawie statystyki testowej i stopni swobody\"\"\"\n", " LESS = 'less'\n",
" return (1.0 - t.cdf(abs(t_stat), df)) * 2.0" " GREATER = 'greater'"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 51, "execution_count": 71,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def calculate_cv(df, alpha=0.05):\n", "def calculate_t_difference(t_stat_sample, t_stat_list, alternative):\n",
" \"\"\"Funkcja oblicza wartość krytyczną (critical value)\"\"\"\n", " \"\"\"\n",
" return t.ppf(1.0 - alpha, df)" " Funkcja oblicza procent statystyk testowych powstałych z prób bootstrapowych, \n",
" które róznią się od statystyki testowej powstałej ze zbioru według hipotezy alternatywnej.\n",
" \"\"\"\n",
" all_stats = len(t_stat_list)\n",
" stats_different_count = 0\n",
" for t_stat_boot in t_stat_list:\n",
" if alternative is Alternatives.LESS and t_stat_boot < t_stat_sample:\n",
" stats_different_count += 1 \n",
" elif alternative is Alternatives.GREATER and t_stat_boot > t_stat_sample:\n",
" stats_different_count += 1\n",
" return stats_different_count / all_stats"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 53, "execution_count": 72,
"metadata": { "metadata": {
"pycharm": { "pycharm": {
"name": "#%%\n" "name": "#%%\n"
@ -85,57 +91,112 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"def t_test(sample_1, sample_2=None, df_fn=df_single, t_stat_fn=t_stat_single, population_mean=None, alpha=0.05):\n", "def t_test_1_samp(sample_1, population_mean=None, alternative=Alternatives.LESS):\n",
" \"\"\"\n", " \"\"\"\n",
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych.\n", " Funkcja przeprowadza test T-studenta dla jednej zmiennej.\n",
" liczba kolumn wynosi 1, test jest przeprowadzany dla jednej zmiennej.\n",
" @param df_fn - funkcja obliczająca stopnie swobody\n",
" @param t_stat_fn - funkcja obliczająca statystykę T\n",
" \"\"\"\n", " \"\"\"\n",
" t_stat_list = get_t_stats(sample_1, sample_2, t_stat_fn, population_mean=population_mean)\n", " t_stat_from_sample, _ = ttest_1samp(a=sample_1, popmean=population_mean, alternative=alternative.value)\n",
" t_stat_sum = sum(t_stat_list)\n", " t_stat_list = get_t_stats(sample_1, t_stat_fn=ttest_1samp, alternative=alternative, population_mean=population_mean)\n",
"\n", "\n",
" data_size = sample_1.shape[0]\n", " p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
"\n", "\n",
" t_stat = t_stat_sum / data_size\n", " return p, t_stat_from_sample, t_stat_list"
" # TODO: dolna i górna opcja dają inne wyniki z jakiegoś powodu (???)\n",
" t_stat = mean(t_stat_list)\n",
"\n",
" if sample_2 is None:\n",
" df = df_fn(sample_1)\n",
" else:\n",
" df = df_fn(sample_1, sample_2)\n",
" cv = calculate_cv(df, alpha)\n",
" p = calculate_p(t_stat, df)\n",
" return t_stat, df, cv, p, t_stat_list"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 54, "execution_count": 73,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def get_t_stats(sample_1, sample_2=None, t_stat_fn=t_stat_single, population_mean=None):\n", "def t_test_ind(sample_1, sample_2, alternative=Alternatives.LESS):\n",
" \"\"\"\n",
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych niezależnych.\n",
" \"\"\"\n",
" t_stat_from_sample, _ = ttest_ind(sample_1, sample_2, alternative=alternative.value)\n",
" t_stat_list = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_ind)\n",
"\n",
" p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
"\n",
" return p, t_stat_from_sample, t_stat_list"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"def t_test_dep(sample_1, sample_2, alternative=Alternatives.LESS):\n",
" \"\"\"\n",
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych zależnych.\n",
" \"\"\"\n",
" t_stat_list = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_rel)\n",
" t_stat_from_sample, _ = ttest_rel(sample_1, sample_2, alternative=alternative.value)\n",
"\n",
" p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
"\n",
" return p, t_stat_from_sample, t_stat_list"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"def get_t_stats(sample_1, sample_2=None, t_stat_fn=ttest_1samp, alternative=Alternatives.LESS, population_mean=None):\n",
" \"\"\"Funkcja oblicza listę statystyk testowych dla każdej próbki bootstrapowej wybranej na podstawie danych sample_1 i sample_2\"\"\"\n", " \"\"\"Funkcja oblicza listę statystyk testowych dla każdej próbki bootstrapowej wybranej na podstawie danych sample_1 i sample_2\"\"\"\n",
" t_stat_list = []\n", " t_stat_list = []\n",
"\n", "\n",
" # One sample test\n", " # One sample test\n",
" if t_stat_fn==t_stat_single:\n", " if t_stat_fn is ttest_1samp and sample_2 is None:\n",
" if not population_mean:\n", " if not population_mean:\n",
" raise Exception(\"population_mean not provided\")\n", " raise Exception(\"population_mean not provided\")\n",
" for bootstrap in generate_bootstraps(sample_1):\n", " for bootstrap in generate_bootstraps(sample_1):\n",
" stat = t_stat_fn(bootstrap, population_mean)\n", " stat, _ = t_stat_fn(bootstrap, population_mean, alternative=alternative.value)\n",
" t_stat_list.append(stat)\n", " t_stat_list.append(stat)\n",
" return t_stat_list\n", " return t_stat_list\n",
"\n", "\n",
" # Two sample test\n", " # Two sample test\n",
" for bootstrap_1, bootstrap_2 in zip(generate_bootstraps(sample_1), generate_bootstraps(sample_2)):\n", " for bootstrap_sample in generate_bootstraps(pd.concat((sample_1, sample_2), ignore_index=True)):\n",
" stat = t_stat_fn(bootstrap_1, bootstrap_2)\n", " bootstrap_1 = bootstrap_sample.iloc[: len(bootstrap_sample) // 2]\n",
" bootstrap_2 = bootstrap_sample.iloc[len(bootstrap_sample) // 2 :]\n",
" stat, _ = t_stat_fn(bootstrap_1, bootstrap_2, alternative=alternative.value)\n",
" t_stat_list.append(stat)\n", " t_stat_list.append(stat)\n",
" return t_stat_list" " return t_stat_list"
] ]
}, },
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"def pretty_print_test(p, t_stat_from_sample, t_stat_list, thesis, alternative, max_print=5):\n",
" print('Wyniki bootstrapowej wersji testu T-studenta')\n",
" print()\n",
" print(f'Hipoteza: {thesis}')\n",
" if alternative is Alternatives.LESS:\n",
" print(f'Hipoteza alternatywna: średnia jest mniejsza')\n",
" else:\n",
" print(f'Hipoteza alternatywna: średnia jest większa')\n",
" print()\n",
" print(f'p: {p}')\n",
" print(f'Wartość statystyki testowej z próby: {t_stat_from_sample}')\n",
" print(f'Wartości statystyk z prób boostrapowych:')\n",
"\n",
" t_stat_list_len = len(t_stat_list)\n",
" for i in range(min(max_print, t_stat_list_len)):\n",
" print(f'{t_stat_list[i]}, ', end='')\n",
" if max_print < t_stat_list_len:\n",
" remaining = t_stat_list_len - max_print\n",
" print(f'... (i {remaining} pozostałych)')\n",
"\n",
" print()\n",
" print()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -147,7 +208,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 87, "execution_count": 77,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -164,7 +225,7 @@
"source": [ "source": [
"ALPHA = 0.05\n", "ALPHA = 0.05\n",
"female_heights = dataset['Female height'].to_numpy()\n", "female_heights = dataset['Female height'].to_numpy()\n",
"shapiro_test = stats.shapiro(female_heights)\n", "shapiro_test = shapiro(female_heights)\n",
"\n", "\n",
"if shapiro_test.pvalue > ALPHA:\n", "if shapiro_test.pvalue > ALPHA:\n",
" print(\"Female height: Dane mają rozkład normalny.\")\n", " print(\"Female height: Dane mają rozkład normalny.\")\n",
@ -172,7 +233,7 @@
" print(\"Female height: Dane nie mają rozkładu normalnego.\")\n", " print(\"Female height: Dane nie mają rozkładu normalnego.\")\n",
"\n", "\n",
"male_heights = dataset['Male height'].to_numpy()\n", "male_heights = dataset['Male height'].to_numpy()\n",
"shapiro_test = stats.shapiro(male_heights)\n", "shapiro_test = shapiro(male_heights)\n",
"\n", "\n",
"if shapiro_test.pvalue > ALPHA:\n", "if shapiro_test.pvalue > ALPHA:\n",
" print(\"Male height: Dane mają rozkład normalny.\")\n", " print(\"Male height: Dane mają rozkład normalny.\")\n",
@ -180,7 +241,7 @@
" print(\"Male height: Dane nie mają rozkładu normalnego.\")\n", " print(\"Male height: Dane nie mają rozkładu normalnego.\")\n",
"\n", "\n",
"weights_before = dataset['Weight before'].to_numpy()\n", "weights_before = dataset['Weight before'].to_numpy()\n",
"shapiro_test = stats.shapiro(weights_before)\n", "shapiro_test = shapiro(weights_before)\n",
"\n", "\n",
"if shapiro_test.pvalue > ALPHA:\n", "if shapiro_test.pvalue > ALPHA:\n",
" print(\"Weight before: Dane mają rozkład normalny.\")\n", " print(\"Weight before: Dane mają rozkład normalny.\")\n",
@ -188,7 +249,7 @@
" print(\"Weight before: Dane nie mają rozkładu normalnego.\")\n", " print(\"Weight before: Dane nie mają rozkładu normalnego.\")\n",
"\n", "\n",
"weights_after = dataset['Weight after'].to_numpy()\n", "weights_after = dataset['Weight after'].to_numpy()\n",
"shapiro_test = stats.shapiro(weights_after)\n", "shapiro_test = shapiro(weights_after)\n",
"\n", "\n",
"if shapiro_test.pvalue > ALPHA:\n", "if shapiro_test.pvalue > ALPHA:\n",
" print(\"Weight after: Dane mają rozkład normalny.\")\n", " print(\"Weight after: Dane mają rozkład normalny.\")\n",
@ -211,7 +272,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 55, "execution_count": 78,
"metadata": { "metadata": {
"pycharm": { "pycharm": {
"name": "#%%\n" "name": "#%%\n"
@ -239,7 +300,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 60, "execution_count": 79,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -248,45 +309,16 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"def t_stat_single(sample, population_mean):\n", "def bootstrap_one_sample(sample, population_mean, alternative=Alternatives.LESS):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla jednej próbki\"\"\"\n", " p, t, ts = t_test_1_samp(\n",
" if sample.empty:\n",
" raise Exception(\"Empty sample\")\n",
" sample = sample['Height'].values.tolist()\n",
" sample_size = len(sample)\n",
" return (mean(sample) - population_mean) / (stdev(sample) / sqrt(sample_size))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"def df_single(sample_1):\n",
" \"\"\"Funkcja oblicza stopnie swobody dla jednej próbki\"\"\"\n",
" # TODO: I have no clue what to return from here\n",
" return len(sample_1)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def bootstrap_one_sample(sample, population_mean):\n",
" return t_test(\n",
" sample_1=sample,\n", " sample_1=sample,\n",
" df_fn=df_single,\n", " population_mean=population_mean,\n",
" t_stat_fn=t_stat_single,\n", " alternative=alternative,\n",
" population_mean=population_mean\n", " )\n",
" )" " \n",
" pretty_print_test(p, t, ts, f'średnia jest równa {population_mean}', alternative)\n",
" print()\n",
" return p, t, ts"
] ]
}, },
{ {
@ -298,7 +330,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 61, "execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"dummy = pd.DataFrame([1, 2, 3, 4, 5])\n",
"dummy2 = pd.DataFrame([4, 5, 6, 7, 8])\n",
"dummy3 = pd.DataFrame([1, 3 , 3, 4, 6])"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -310,7 +353,17 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"t: 6.854929920812628, df: 500, cv: 1.6479068539295045, p: 2.1091128843409024e-11\n", "Wyniki bootstrapowej wersji testu T-studenta\n",
"\n",
"Hipoteza: średnia jest równa 165\n",
"Hipoteza alternatywna: średnia jest mniejsza\n",
"\n",
"p: 0.72\n",
"Wartość statystyki testowej z próby: [-229.1025971]\n",
"Wartości statystyk z prób boostrapowych:\n",
"[-239.4457368], [-201.5], [-176.97470898], [-256.14449047], [-436.1703468], ... (i 95 pozostałych)\n",
"\n",
"\n",
"\n" "\n"
] ]
} }
@ -318,8 +371,7 @@
"source": [ "source": [
"#TODO: poprawić kod aby można było podawać kolumny\n", "#TODO: poprawić kod aby można było podawać kolumny\n",
"\n", "\n",
"t_stat, df, cv, p, _ = bootstrap_one_sample(dataset, 165)\n", "p, t, ts = bootstrap_one_sample(dummy, 165)"
"pretty_print_full_stats(t_stat, df, cv, p)"
] ]
}, },
{ {
@ -343,7 +395,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 159, "execution_count": 82,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -352,56 +404,15 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"def t_stat_ind(sample_1, sample_2):\n", "def bootstrap_independent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek niezależnych\"\"\"\n", " p, t, ts = t_test_ind(\n",
" if sample_1.empty or sample_2.empty:\n",
" raise Exception(\"Empty sample\")\n",
" sample_1 = sample_1[0].values.tolist()\n",
" sample_2 = sample_2[0].values.tolist()\n",
" sed = sqrt(sem(sample_1)**2 + sem(sample_2)**2)\n",
" return (mean(sample_1) - mean(sample_2)) / sed"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"def df_ind(sample_1, sample_2):\n",
" \"\"\"Funkcja oblicza stopnie swobody dla dwóch próbek niezależnych\"\"\"\n",
" return len(sample_1) + len(sample_2) - 2"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def bootstrap_independent(sample_1, sample_2):\n",
" return t_test(\n",
" sample_1=sample_1,\n", " sample_1=sample_1,\n",
" sample_2=sample_2,\n", " sample_2=sample_2,\n",
" df_fn=df_ind,\n", " alternative=alternative,\n",
" t_stat_fn=t_stat_ind\n", " )\n",
" )" " \n",
] " pretty_print_test(p, t, ts, 'średnie są takie same', alternative)\n",
}, " return p, t, ts"
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#TODO: Wyciągnąć wysokości kobiet i mężczyzn oraz poprawić kod aby można było podawać kolumny\n",
"t_stat, df, cv, p, _ = bootstrap_independent(dataset, dataset)\n",
"pretty_print_full_stats(t_stat, df, cv, p)"
] ]
}, },
{ {
@ -424,7 +435,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 160, "execution_count": 83,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -433,49 +444,15 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"def t_stat_dep(sample_1, sample_2, mu=0):\n", "def bootstrap_dependent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek zależnych\"\"\"\n", " p, t, ts = t_test_dep(\n",
" if sample_1.empty or sample_2.empty:\n",
" raise Exception(\"Empty sample\")\n",
" sample_1 = sample_1[0].values.tolist()\n",
" sample_2 = sample_2[0].values.tolist()\n",
" differences = [x_1 - x_2 for x_1, x_2 in zip(sample_1, sample_2)]\n",
" sample_size = len(sample_1)\n",
" return (mean(differences) - mu) / (stdev(differences) / sqrt(sample_size))"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"def df_dep(sample_1, sample_2):\n",
" \"\"\"Funkcja oblicza stopnie swobody dla dwóch próbek zależnych\"\"\"\n",
" l1, l2 = len(sample_1), len(sample_2)\n",
" if l1 != l2:\n",
" raise Exception(\"Samples aren't of equal length\")\n",
" return l1"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def bootstrap_dependent(sample_1, sample_2):\n",
" return t_test(\n",
" sample_1=sample_1,\n", " sample_1=sample_1,\n",
" sample_2=sample_2,\n", " sample_2=sample_2,\n",
" df_fn=df_dep,\n", " alternative=alternative,\n",
" t_stat_fn=t_stat_dep\n", " )\n",
" )" " \n",
" pretty_print_test(p, t, ts, 'średnie są takie same', alternative)\n",
" return p, t, ts"
] ]
}, },
{ {
@ -503,7 +480,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 171, "execution_count": 84,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -532,76 +509,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 31, "execution_count": 85,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Statystyka testowa dla jednej próby:\n",
"1.414213562373095 - z naszej funkcji\n",
"[1.41421356] - z gotowej biblioteki\n",
"\n",
"Statystyka testowa dla dwóch prób niezależnych:\n",
"-3.0 - z naszej funkcji\n",
"[-3.] - z gotowej biblioteki\n",
"\n",
"Statystyka testowa dla dwóch prób zależnych:\n",
"-1.6329931618554525 - z naszej funkcji\n",
"[-1.63299316] - z gotowej biblioteki\n",
"\n"
]
}
],
"source": [
"# Testy dla samych statystyk testowych\n",
"def pretty_print_stats(t_stat_selfmade, t_stat_lib, suffix):\n",
" print(f'Statystyka testowa dla {suffix}:')\n",
" print(t_stat_selfmade, '- z naszej funkcji')\n",
" print(t_stat_lib, '- z gotowej biblioteki')\n",
" print()\n",
" \n",
"dummy = pd.DataFrame([1, 2, 3, 4, 5])\n",
"dummy2 = pd.DataFrame([4, 5, 6, 7, 8])\n",
"dummy3 = pd.DataFrame([1, 3 , 3, 4, 6])\n",
"\n",
"t_stat_selfmade = t_stat_single(dummy, 2)\n",
"t_stat_lib, _ = ttest_1samp(dummy, 2)\n",
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'jednej próby')\n",
"\n",
"t_stat_selfmade = t_stat_ind(dummy, dummy2)\n",
"t_stat_lib, _ = ttest_ind(dummy, dummy2)\n",
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'dwóch prób niezależnych')\n",
"\n",
"t_stat_selfmade = t_stat_dep(dummy, dummy3)\n",
"t_stat_lib, _ = ttest_rel(dummy, dummy3)\n",
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'dwóch prób zależnych')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Statystyki dla jednej próby:\n", "Statystyki dla jednej próby:\n",
"t: 1.8073147056683616, df: 5, cv: 2.015048372669157, p: 0.13052275003443325\n", "Wyniki bootstrapowej wersji testu T-studenta\n",
"\n",
"Hipoteza: średnia jest równa 2\n",
"Hipoteza alternatywna: średnia jest mniejsza\n",
"\n",
"p: 0.35\n",
"Wartość statystyki testowej z próby: [1.41421356]\n",
"Wartości statystyk z prób boostrapowych:\n",
"[2.44948974], [3.13785816], [1.72328087], [0.27216553], [1.17669681], ... (i 95 pozostałych)\n",
"\n", "\n",
"Statystyki dla dwóch prób zależnych:\n",
"t: 3.0790273716290404, df: 5, cv: 2.015048372669157, p: 0.027500015466573435\n",
"\n", "\n",
"Statystyki dla dwóch prób niezależnych:\n",
"t: 2.8109511013364576, df: 8, cv: 1.8595480375228421, p: 0.02280961069987497\n",
"\n" "\n"
] ]
} }
@ -609,22 +535,66 @@
"source": [ "source": [
"# Testy z bootstrappowaniem\n", "# Testy z bootstrappowaniem\n",
"\n", "\n",
"def pretty_print_full_stats(t_stat, df, cv, p):\n",
" print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\\n')\n",
"\n",
"print(type(dummy))\n",
"\n",
"print('Statystyki dla jednej próby:')\n", "print('Statystyki dla jednej próby:')\n",
"t_stat, df, cv, p, _ = bootstrap_one_sample(dummy, 2)\n", "p, t, ts = bootstrap_one_sample(dummy, 2)"
"pretty_print_full_stats(t_stat, df, cv, p)\n", ]
"\n", },
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Statystyki dla dwóch prób zależnych:\n",
"Wyniki bootstrapowej wersji testu T-studenta\n",
"\n",
"Hipoteza: średnie są takie same\n",
"Hipoteza alternatywna: średnia jest mniejsza\n",
"\n",
"p: 1.0\n",
"Wartość statystyki testowej z próby: [10.61445555]\n",
"Wartości statystyk z prób boostrapowych:\n",
"[-2.66666667], [-0.14359163], [0.21199958], [0.11470787], [0.76696499], ... (i 95 pozostałych)\n",
"\n",
"\n"
]
}
],
"source": [
"print('Statystyki dla dwóch prób zależnych:')\n", "print('Statystyki dla dwóch prób zależnych:')\n",
"t_stat, df, cv, p, _ = bootstrap_dependent(dummy2, dummy3)\n", "p, t, ts = bootstrap_dependent(dummy2, dummy3)"
"pretty_print_full_stats(t_stat, df, cv, p)\n", ]
"\n", },
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Statystyki dla dwóch prób niezależnych:\n",
"Wyniki bootstrapowej wersji testu T-studenta\n",
"\n",
"Hipoteza: średnie są takie same\n",
"Hipoteza alternatywna: średnia jest mniejsza\n",
"\n",
"p: 0.95\n",
"Wartość statystyki testowej z próby: [2.4140394]\n",
"Wartości statystyk z prób boostrapowych:\n",
"[-2.20937908], [0.13187609], [-0.81110711], [-0.94280904], [-0.77151675], ... (i 95 pozostałych)\n",
"\n",
"\n"
]
}
],
"source": [
"print('Statystyki dla dwóch prób niezależnych:')\n", "print('Statystyki dla dwóch prób niezależnych:')\n",
"t_stat, df, cv, p, _ = bootstrap_independent(dummy2, dummy3)\n", "p, t, ts = bootstrap_independent(dummy2, dummy3)"
"pretty_print_full_stats(t_stat, df, cv, p)"
] ]
} }
], ],
@ -633,8 +603,12 @@
"hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594" "hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
}, },
"kernelspec": { "kernelspec": {
"display_name": "Python 3.9.1 64-bit", "display_name": "Python 3.8.10 64-bit",
"language": "python", "metadata": {
"interpreter": {
"hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
}
},
"name": "python3" "name": "python3"
}, },
"language_info": { "language_info": {
@ -648,8 +622,7 @@
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.1" "version": "3.9.1"
}, }
"orig_nbformat": 4
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2 "nbformat_minor": 2