Rewritten to proper bootstrap

This commit is contained in:
emkarcinos 2022-05-17 19:40:13 +02:00
parent ccd4517925
commit 7abf326bbb

View File

@ -25,7 +25,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 62, "execution_count": 313,
"metadata": { "metadata": {
"pycharm": { "pycharm": {
"name": "#%%\n" "name": "#%%\n"
@ -35,6 +35,7 @@
"source": [ "source": [
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"from enum import Enum\n",
"from math import sqrt\n", "from math import sqrt\n",
"from scipy import stats\n", "from scipy import stats\n",
"from scipy.stats import sem\n", "from scipy.stats import sem\n",
@ -46,7 +47,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 49, "execution_count": 314,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -55,29 +56,39 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 50, "execution_count": 315,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def calculate_p(t_stat, df):\n", "class Alternatives(Enum):\n",
" \"\"\"Funkcja oblicza wartość *p* na podstawie statystyki testowej i stopni swobody\"\"\"\n", " LESS = 'less'\n",
" return (1.0 - t.cdf(abs(t_stat), df)) * 2.0" " GREATER = 'greater'"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 51, "execution_count": 316,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def calculate_cv(df, alpha=0.05):\n", "def calculate_t_difference(t_stat_sample, t_stat_list, alternative):\n",
" \"\"\"Funkcja oblicza wartość krytyczną (critical value)\"\"\"\n", " \"\"\"\n",
" return t.ppf(1.0 - alpha, df)" " Funkcja oblicza procent statystyk testowych powstałych z prób bootstrapowych, \n",
" które róznią się od statystyki testowej powstałej ze zbioru według hipotezy alternatywnej.\n",
" \"\"\"\n",
" all_stats = len(t_stat_list)\n",
" stats_different_count = 0\n",
" for t_stat_boot in t_stat_list:\n",
" if alternative is Alternatives.LESS and t_stat_boot < t_stat_sample:\n",
" stats_different_count += 1 \n",
" elif alternative is Alternatives.GREATER and t_stat_boot > t_stat_sample:\n",
" stats_different_count += 1\n",
" return stats_different_count / all_stats"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 53, "execution_count": 317,
"metadata": { "metadata": {
"pycharm": { "pycharm": {
"name": "#%%\n" "name": "#%%\n"
@ -85,53 +96,77 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"def t_test(sample_1, sample_2=None, df_fn=df_single, t_stat_fn=t_stat_single, population_mean=None, alpha=0.05):\n", "def t_test_1_samp(sample_1, population_mean=None, alternative=Alternatives.LESS):\n",
" \"\"\"\n", " \"\"\"\n",
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych.\n", " Funkcja przeprowadza test T-studenta dla jednej zmiennej.\n",
" liczba kolumn wynosi 1, test jest przeprowadzany dla jednej zmiennej.\n",
" @param df_fn - funkcja obliczająca stopnie swobody\n",
" @param t_stat_fn - funkcja obliczająca statystykę T\n",
" \"\"\"\n", " \"\"\"\n",
" t_stat_list = get_t_stats(sample_1, sample_2, t_stat_fn, population_mean=population_mean)\n", " t_stat_from_sample, _ = ttest_1samp(a=sample_1, popmean=population_mean, alternative=alternative.value)\n",
" t_stat_sum = sum(t_stat_list)\n", " t_stat_list = get_t_stats(sample_1, t_stat_fn=ttest_1samp, alternative=alternative, population_mean=population_mean)\n",
"\n", "\n",
" data_size = sample_1.shape[0]\n", " p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
"\n", "\n",
" t_stat = t_stat_sum / data_size\n", " return p, t_stat_list"
" # TODO: dolna i górna opcja dają inne wyniki z jakiegoś powodu (???)\n",
" t_stat = mean(t_stat_list)\n",
"\n",
" if sample_2 is None:\n",
" df = df_fn(sample_1)\n",
" else:\n",
" df = df_fn(sample_1, sample_2)\n",
" cv = calculate_cv(df, alpha)\n",
" p = calculate_p(t_stat, df)\n",
" return t_stat, df, cv, p, t_stat_list"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 54, "execution_count": 318,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def get_t_stats(sample_1, sample_2=None, t_stat_fn=t_stat_single, population_mean=None):\n", "def t_test_ind(sample_1, sample_2, alternative=Alternatives.LESS):\n",
" \"\"\"\n",
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych niezależnych.\n",
" \"\"\"\n",
" t_stat_from_sample, _ = ttest_ind(sample_1, sample_2, alternative=alternative.value)\n",
" t_stat_list = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_ind)\n",
"\n",
" p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
"\n",
" return p, t_stat_list"
]
},
{
"cell_type": "code",
"execution_count": 319,
"metadata": {},
"outputs": [],
"source": [
"def t_test_dep(sample_1, sample_2, alternative=Alternatives.LESS):\n",
" \"\"\"\n",
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych zależnych.\n",
" \"\"\"\n",
" t_stat_list = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_rel)\n",
" t_stat_from_sample, _ = ttest_rel(sample_1, sample_2, alternative=alternative.value)\n",
"\n",
" p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
"\n",
" return p, t_stat_list"
]
},
{
"cell_type": "code",
"execution_count": 320,
"metadata": {},
"outputs": [],
"source": [
"def get_t_stats(sample_1, sample_2=None, t_stat_fn=ttest_1samp, alternative=Alternatives.LESS, population_mean=None):\n",
" \"\"\"Funkcja oblicza listę statystyk testowych dla każdej próbki bootstrapowej wybranej na podstawie danych sample_1 i sample_2\"\"\"\n", " \"\"\"Funkcja oblicza listę statystyk testowych dla każdej próbki bootstrapowej wybranej na podstawie danych sample_1 i sample_2\"\"\"\n",
" t_stat_list = []\n", " t_stat_list = []\n",
"\n", "\n",
" # One sample test\n", " # One sample test\n",
" if t_stat_fn==t_stat_single:\n", " if t_stat_fn is ttest_1samp and sample_2 is None:\n",
" if not population_mean:\n", " if not population_mean:\n",
" raise Exception(\"population_mean not provided\")\n", " raise Exception(\"population_mean not provided\")\n",
" for bootstrap in generate_bootstraps(sample_1):\n", " for bootstrap in generate_bootstraps(sample_1):\n",
" stat = t_stat_fn(bootstrap, population_mean)\n", " stat, _ = t_stat_fn(bootstrap, population_mean, alternative=alternative.value)\n",
" t_stat_list.append(stat)\n", " t_stat_list.append(stat)\n",
" return t_stat_list\n", " return t_stat_list\n",
"\n", "\n",
" # Two sample test\n", " # Two sample test\n",
" for bootstrap_1, bootstrap_2 in zip(generate_bootstraps(sample_1), generate_bootstraps(sample_2)):\n", " for bootstrap_sample in generate_bootstraps(pd.concat((sample_1, sample_2))):\n",
" stat = t_stat_fn(bootstrap_1, bootstrap_2)\n", " bootstrap_1, bootstrap_2 = bootstrap_sample.iloc[: round(len(bootstrap_sample) * 0.5)], bootstrap_sample.iloc[: round(-len(bootstrap_sample) * 0.5)]\n",
" stat, _ = t_stat_fn(bootstrap_1, bootstrap_2, alternative=alternative.value)\n",
" t_stat_list.append(stat)\n", " t_stat_list.append(stat)\n",
" return t_stat_list" " return t_stat_list"
] ]
@ -145,34 +180,6 @@
"Wszystkie rodzaje testów są testami parametrycznymi, a co za tym idzie nasze mierzone zmienne ilościowe powinny mieć rozkład normalny." "Wszystkie rodzaje testów są testami parametrycznymi, a co za tym idzie nasze mierzone zmienne ilościowe powinny mieć rozkład normalny."
] ]
}, },
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9606528878211975\n",
"2.666284970587185e-10\n",
"Dane nie mają rozkładu normalnego.\n"
]
}
],
"source": [
"# TODO: Test Shapiro Wilka sprawdzający czy nasze dane mają rozkład normalny\n",
"x = dataset['Height'].to_numpy()\n",
"shapiro_test = stats.shapiro(x)\n",
"print(shapiro_test.statistic)\n",
"print(shapiro_test.pvalue)\n",
"\n",
"if shapiro_test.pvalue > shapiro_test.statistic:\n",
" print(\"Dane mają rozkład normalny.\")\n",
"else:\n",
" print(\"Dane nie mają rozkładu normalnego.\")"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -186,7 +193,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 55, "execution_count": 321,
"metadata": { "metadata": {
"pycharm": { "pycharm": {
"name": "#%%\n" "name": "#%%\n"
@ -214,7 +221,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 60, "execution_count": 322,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -223,44 +230,11 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"def t_stat_single(sample, population_mean):\n", "def bootstrap_one_sample(sample, population_mean, alternative=Alternatives.LESS):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla jednej próbki\"\"\"\n", " return t_test_1_samp(\n",
" if sample.empty:\n",
" raise Exception(\"Empty sample\")\n",
" sample = sample['Height'].values.tolist()\n",
" sample_size = len(sample)\n",
" return (mean(sample) - population_mean) / (stdev(sample) / sqrt(sample_size))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"def df_single(sample_1):\n",
" \"\"\"Funkcja oblicza stopnie swobody dla jednej próbki\"\"\"\n",
" # TODO: I have no clue what to return from here\n",
" return len(sample_1)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def bootstrap_one_sample(sample, population_mean):\n",
" return t_test(\n",
" sample_1=sample,\n", " sample_1=sample,\n",
" df_fn=df_single,\n", " population_mean=population_mean,\n",
" t_stat_fn=t_stat_single,\n", " alternative=alternative,\n",
" population_mean=population_mean\n",
" )" " )"
] ]
}, },
@ -273,7 +247,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 61, "execution_count": 323,
"metadata": {},
"outputs": [],
"source": [
"dummy = pd.DataFrame([1, 2, 3, 4, 5])\n",
"dummy2 = pd.DataFrame([4, 5, 6, 7, 8])\n",
"dummy3 = pd.DataFrame([1, 3 , 3, 4, 6])"
]
},
{
"cell_type": "code",
"execution_count": 324,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -282,19 +267,18 @@
}, },
"outputs": [ "outputs": [
{ {
"name": "stdout",
"output_type": "stream", "output_type": "stream",
"name": "stdout",
"text": [ "text": [
"t: 6.854929920812628, df: 500, cv: 1.6479068539295045, p: 2.1091128843409024e-11\n", "p: 0.73\n"
"\n"
] ]
} }
], ],
"source": [ "source": [
"#TODO: poprawić kod aby można było podawać kolumny\n", "#TODO: poprawić kod aby można było podawać kolumny\n",
"\n", "\n",
"t_stat, df, cv, p, _ = bootstrap_one_sample(dataset, 165)\n", "p, _ = bootstrap_one_sample(dummy, 165)\n",
"pretty_print_full_stats(t_stat, df, cv, p)" "print(f'p: {p}')"
] ]
}, },
{ {
@ -318,7 +302,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 159, "execution_count": 325,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -327,44 +311,11 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"def t_stat_ind(sample_1, sample_2):\n", "def bootstrap_independent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek niezależnych\"\"\"\n", " return t_test_ind(\n",
" if sample_1.empty or sample_2.empty:\n",
" raise Exception(\"Empty sample\")\n",
" sample_1 = sample_1[0].values.tolist()\n",
" sample_2 = sample_2[0].values.tolist()\n",
" sed = sqrt(sem(sample_1)**2 + sem(sample_2)**2)\n",
" return (mean(sample_1) - mean(sample_2)) / sed"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"def df_ind(sample_1, sample_2):\n",
" \"\"\"Funkcja oblicza stopnie swobody dla dwóch próbek niezależnych\"\"\"\n",
" return len(sample_1) + len(sample_2) - 2"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def bootstrap_independent(sample_1, sample_2):\n",
" return t_test(\n",
" sample_1=sample_1,\n", " sample_1=sample_1,\n",
" sample_2=sample_2,\n", " sample_2=sample_2,\n",
" df_fn=df_ind,\n", " alternative=alternative,\n",
" t_stat_fn=t_stat_ind\n",
" )" " )"
] ]
}, },
@ -397,7 +348,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 160, "execution_count": 326,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -406,48 +357,11 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"def t_stat_dep(sample_1, sample_2, mu=0):\n", "def bootstrap_dependent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek zależnych\"\"\"\n", " return t_test_dep(\n",
" if sample_1.empty or sample_2.empty:\n",
" raise Exception(\"Empty sample\")\n",
" sample_1 = sample_1[0].values.tolist()\n",
" sample_2 = sample_2[0].values.tolist()\n",
" differences = [x_1 - x_2 for x_1, x_2 in zip(sample_1, sample_2)]\n",
" sample_size = len(sample_1)\n",
" return (mean(differences) - mu) / (stdev(differences) / sqrt(sample_size))"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"def df_dep(sample_1, sample_2):\n",
" \"\"\"Funkcja oblicza stopnie swobody dla dwóch próbek zależnych\"\"\"\n",
" l1, l2 = len(sample_1), len(sample_2)\n",
" if l1 != l2:\n",
" raise Exception(\"Samples aren't of equal length\")\n",
" return l1"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"def bootstrap_dependent(sample_1, sample_2):\n",
" return t_test(\n",
" sample_1=sample_1,\n", " sample_1=sample_1,\n",
" sample_2=sample_2,\n", " sample_2=sample_2,\n",
" df_fn=df_dep,\n", " alternative=alternative,\n",
" t_stat_fn=t_stat_dep\n",
" )" " )"
] ]
}, },
@ -476,7 +390,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 171, "execution_count": 327,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
@ -505,100 +419,45 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 31, "execution_count": 328,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Statystyka testowa dla jednej próby:\n",
"1.414213562373095 - z naszej funkcji\n",
"[1.41421356] - z gotowej biblioteki\n",
"\n",
"Statystyka testowa dla dwóch prób niezależnych:\n",
"-3.0 - z naszej funkcji\n",
"[-3.] - z gotowej biblioteki\n",
"\n",
"Statystyka testowa dla dwóch prób zależnych:\n",
"-1.6329931618554525 - z naszej funkcji\n",
"[-1.63299316] - z gotowej biblioteki\n",
"\n"
]
}
],
"source": [
"# Testy dla samych statystyk testowych\n",
"def pretty_print_stats(t_stat_selfmade, t_stat_lib, suffix):\n",
" print(f'Statystyka testowa dla {suffix}:')\n",
" print(t_stat_selfmade, '- z naszej funkcji')\n",
" print(t_stat_lib, '- z gotowej biblioteki')\n",
" print()\n",
" \n",
"dummy = pd.DataFrame([1, 2, 3, 4, 5])\n",
"dummy2 = pd.DataFrame([4, 5, 6, 7, 8])\n",
"dummy3 = pd.DataFrame([1, 3 , 3, 4, 6])\n",
"\n",
"t_stat_selfmade = t_stat_single(dummy, 2)\n",
"t_stat_lib, _ = ttest_1samp(dummy, 2)\n",
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'jednej próby')\n",
"\n",
"t_stat_selfmade = t_stat_ind(dummy, dummy2)\n",
"t_stat_lib, _ = ttest_ind(dummy, dummy2)\n",
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'dwóch prób niezależnych')\n",
"\n",
"t_stat_selfmade = t_stat_dep(dummy, dummy3)\n",
"t_stat_lib, _ = ttest_rel(dummy, dummy3)\n",
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'dwóch prób zależnych')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout",
"output_type": "stream", "output_type": "stream",
"name": "stdout",
"text": [ "text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Statystyki dla jednej próby:\n", "Statystyki dla jednej próby:\n",
"t: 1.8073147056683616, df: 5, cv: 2.015048372669157, p: 0.13052275003443325\n", "0.44\n",
"\n",
"Statystyki dla dwóch prób zależnych:\n", "Statystyki dla dwóch prób zależnych:\n",
"t: 3.0790273716290404, df: 5, cv: 2.015048372669157, p: 0.027500015466573435\n", "0.0\n",
"\n",
"Statystyki dla dwóch prób niezależnych:\n", "Statystyki dla dwóch prób niezależnych:\n",
"t: 2.8109511013364576, df: 8, cv: 1.8595480375228421, p: 0.02280961069987497\n", "1.0\n"
"\n"
] ]
} }
], ],
"source": [ "source": [
"# Testy z bootstrappowaniem\n", "# Testy z bootstrappowaniem\n",
"\n", "\n",
"def pretty_print_full_stats(t_stat, df, cv, p):\n",
" print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\\n')\n",
"\n",
"print(type(dummy))\n",
"\n", "\n",
"print('Statystyki dla jednej próby:')\n", "print('Statystyki dla jednej próby:')\n",
"t_stat, df, cv, p, _ = bootstrap_one_sample(dummy, 2)\n", "p, _ = bootstrap_one_sample(dummy, 2)\n",
"pretty_print_full_stats(t_stat, df, cv, p)\n", "print(f'p {p}')\n",
"\n", "\n",
"print('Statystyki dla dwóch prób zależnych:')\n", "print('Statystyki dla dwóch prób zależnych:')\n",
"t_stat, df, cv, p, _ = bootstrap_dependent(dummy2, dummy3)\n", "p, _ = bootstrap_dependent(dummy2, dummy3)\n",
"pretty_print_full_stats(t_stat, df, cv, p)\n", "print(f'p {p}')\n",
"\n", "\n",
"print('Statystyki dla dwóch prób niezależnych:')\n", "print('Statystyki dla dwóch prób niezależnych:')\n",
"t_stat, df, cv, p, _ = bootstrap_independent(dummy2, dummy3)\n", "p, _ = bootstrap_independent(dummy2, dummy3)\n",
"pretty_print_full_stats(t_stat, df, cv, p)" "print(f'p {p}')"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -606,9 +465,13 @@
"hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594" "hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
}, },
"kernelspec": { "kernelspec": {
"display_name": "Python 3.9.1 64-bit", "name": "python3",
"language": "python", "display_name": "Python 3.8.10 64-bit",
"name": "python3" "metadata": {
"interpreter": {
"hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
}
}
}, },
"language_info": { "language_info": {
"codemirror_mode": { "codemirror_mode": {
@ -620,7 +483,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.1" "version": "3.8.10-final"
}, },
"orig_nbformat": 4 "orig_nbformat": 4
}, },