Merge pull request 'Rewritten to proper bootstrap' (#3) from fixes into main
Reviewed-on: #3
This commit is contained in:
commit
f4f61b0876
@ -25,7 +25,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 62,
|
"execution_count": 68,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"pycharm": {
|
"pycharm": {
|
||||||
"name": "#%%\n"
|
"name": "#%%\n"
|
||||||
@ -35,18 +35,14 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"from math import sqrt\n",
|
|
||||||
"from scipy import stats\n",
|
|
||||||
"from scipy.stats import sem\n",
|
|
||||||
"from scipy.stats import t\n",
|
|
||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt\n",
|
||||||
"from statistics import mean, stdev\n",
|
"from enum import Enum\n",
|
||||||
"from scipy.stats import ttest_ind, ttest_1samp, ttest_rel"
|
"from scipy.stats import ttest_ind, ttest_1samp, ttest_rel, shapiro"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 86,
|
"execution_count": 69,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -55,29 +51,39 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 50,
|
"execution_count": 70,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def calculate_p(t_stat, df):\n",
|
"class Alternatives(Enum):\n",
|
||||||
" \"\"\"Funkcja oblicza wartość *p* na podstawie statystyki testowej i stopni swobody\"\"\"\n",
|
" LESS = 'less'\n",
|
||||||
" return (1.0 - t.cdf(abs(t_stat), df)) * 2.0"
|
" GREATER = 'greater'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 51,
|
"execution_count": 71,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def calculate_cv(df, alpha=0.05):\n",
|
"def calculate_t_difference(t_stat_sample, t_stat_list, alternative):\n",
|
||||||
" \"\"\"Funkcja oblicza wartość krytyczną (critical value)\"\"\"\n",
|
" \"\"\"\n",
|
||||||
" return t.ppf(1.0 - alpha, df)"
|
" Funkcja oblicza procent statystyk testowych powstałych z prób bootstrapowych, \n",
|
||||||
|
" które róznią się od statystyki testowej powstałej ze zbioru według hipotezy alternatywnej.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" all_stats = len(t_stat_list)\n",
|
||||||
|
" stats_different_count = 0\n",
|
||||||
|
" for t_stat_boot in t_stat_list:\n",
|
||||||
|
" if alternative is Alternatives.LESS and t_stat_boot < t_stat_sample:\n",
|
||||||
|
" stats_different_count += 1 \n",
|
||||||
|
" elif alternative is Alternatives.GREATER and t_stat_boot > t_stat_sample:\n",
|
||||||
|
" stats_different_count += 1\n",
|
||||||
|
" return stats_different_count / all_stats"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 53,
|
"execution_count": 72,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"pycharm": {
|
"pycharm": {
|
||||||
"name": "#%%\n"
|
"name": "#%%\n"
|
||||||
@ -85,57 +91,112 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def t_test(sample_1, sample_2=None, df_fn=df_single, t_stat_fn=t_stat_single, population_mean=None, alpha=0.05):\n",
|
"def t_test_1_samp(sample_1, population_mean=None, alternative=Alternatives.LESS):\n",
|
||||||
" \"\"\"\n",
|
" \"\"\"\n",
|
||||||
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych.\n",
|
" Funkcja przeprowadza test T-studenta dla jednej zmiennej.\n",
|
||||||
" liczba kolumn wynosi 1, test jest przeprowadzany dla jednej zmiennej.\n",
|
|
||||||
" @param df_fn - funkcja obliczająca stopnie swobody\n",
|
|
||||||
" @param t_stat_fn - funkcja obliczająca statystykę T\n",
|
|
||||||
" \"\"\"\n",
|
" \"\"\"\n",
|
||||||
" t_stat_list = get_t_stats(sample_1, sample_2, t_stat_fn, population_mean=population_mean)\n",
|
" t_stat_from_sample, _ = ttest_1samp(a=sample_1, popmean=population_mean, alternative=alternative.value)\n",
|
||||||
" t_stat_sum = sum(t_stat_list)\n",
|
" t_stat_list = get_t_stats(sample_1, t_stat_fn=ttest_1samp, alternative=alternative, population_mean=population_mean)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" data_size = sample_1.shape[0]\n",
|
" p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" t_stat = t_stat_sum / data_size\n",
|
" return p, t_stat_from_sample, t_stat_list"
|
||||||
" # TODO: dolna i górna opcja dają inne wyniki z jakiegoś powodu (???)\n",
|
|
||||||
" t_stat = mean(t_stat_list)\n",
|
|
||||||
"\n",
|
|
||||||
" if sample_2 is None:\n",
|
|
||||||
" df = df_fn(sample_1)\n",
|
|
||||||
" else:\n",
|
|
||||||
" df = df_fn(sample_1, sample_2)\n",
|
|
||||||
" cv = calculate_cv(df, alpha)\n",
|
|
||||||
" p = calculate_p(t_stat, df)\n",
|
|
||||||
" return t_stat, df, cv, p, t_stat_list"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 54,
|
"execution_count": 73,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def get_t_stats(sample_1, sample_2=None, t_stat_fn=t_stat_single, population_mean=None):\n",
|
"def t_test_ind(sample_1, sample_2, alternative=Alternatives.LESS):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych niezależnych.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" t_stat_from_sample, _ = ttest_ind(sample_1, sample_2, alternative=alternative.value)\n",
|
||||||
|
" t_stat_list = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_ind)\n",
|
||||||
|
"\n",
|
||||||
|
" p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
|
||||||
|
"\n",
|
||||||
|
" return p, t_stat_from_sample, t_stat_list"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 74,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def t_test_dep(sample_1, sample_2, alternative=Alternatives.LESS):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Funkcja przeprowadza test T-studenta dla dwóch zmiennych zależnych.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" t_stat_list = get_t_stats(sample_1, sample_2, alternative=alternative, t_stat_fn=ttest_rel)\n",
|
||||||
|
" t_stat_from_sample, _ = ttest_rel(sample_1, sample_2, alternative=alternative.value)\n",
|
||||||
|
"\n",
|
||||||
|
" p = calculate_t_difference(t_stat_from_sample, t_stat_list, alternative)\n",
|
||||||
|
"\n",
|
||||||
|
" return p, t_stat_from_sample, t_stat_list"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 75,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def get_t_stats(sample_1, sample_2=None, t_stat_fn=ttest_1samp, alternative=Alternatives.LESS, population_mean=None):\n",
|
||||||
" \"\"\"Funkcja oblicza listę statystyk testowych dla każdej próbki bootstrapowej wybranej na podstawie danych sample_1 i sample_2\"\"\"\n",
|
" \"\"\"Funkcja oblicza listę statystyk testowych dla każdej próbki bootstrapowej wybranej na podstawie danych sample_1 i sample_2\"\"\"\n",
|
||||||
" t_stat_list = []\n",
|
" t_stat_list = []\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # One sample test\n",
|
" # One sample test\n",
|
||||||
" if t_stat_fn==t_stat_single:\n",
|
" if t_stat_fn is ttest_1samp and sample_2 is None:\n",
|
||||||
" if not population_mean:\n",
|
" if not population_mean:\n",
|
||||||
" raise Exception(\"population_mean not provided\")\n",
|
" raise Exception(\"population_mean not provided\")\n",
|
||||||
" for bootstrap in generate_bootstraps(sample_1):\n",
|
" for bootstrap in generate_bootstraps(sample_1):\n",
|
||||||
" stat = t_stat_fn(bootstrap, population_mean)\n",
|
" stat, _ = t_stat_fn(bootstrap, population_mean, alternative=alternative.value)\n",
|
||||||
" t_stat_list.append(stat)\n",
|
" t_stat_list.append(stat)\n",
|
||||||
" return t_stat_list\n",
|
" return t_stat_list\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Two sample test\n",
|
" # Two sample test\n",
|
||||||
" for bootstrap_1, bootstrap_2 in zip(generate_bootstraps(sample_1), generate_bootstraps(sample_2)):\n",
|
" for bootstrap_sample in generate_bootstraps(pd.concat((sample_1, sample_2), ignore_index=True)):\n",
|
||||||
" stat = t_stat_fn(bootstrap_1, bootstrap_2)\n",
|
" bootstrap_1 = bootstrap_sample.iloc[: len(bootstrap_sample) // 2]\n",
|
||||||
|
" bootstrap_2 = bootstrap_sample.iloc[len(bootstrap_sample) // 2 :]\n",
|
||||||
|
" stat, _ = t_stat_fn(bootstrap_1, bootstrap_2, alternative=alternative.value)\n",
|
||||||
" t_stat_list.append(stat)\n",
|
" t_stat_list.append(stat)\n",
|
||||||
" return t_stat_list"
|
" return t_stat_list"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 76,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def pretty_print_test(p, t_stat_from_sample, t_stat_list, thesis, alternative, max_print=5):\n",
|
||||||
|
" print('Wyniki bootstrapowej wersji testu T-studenta')\n",
|
||||||
|
" print()\n",
|
||||||
|
" print(f'Hipoteza: {thesis}')\n",
|
||||||
|
" if alternative is Alternatives.LESS:\n",
|
||||||
|
" print(f'Hipoteza alternatywna: średnia jest mniejsza')\n",
|
||||||
|
" else:\n",
|
||||||
|
" print(f'Hipoteza alternatywna: średnia jest większa')\n",
|
||||||
|
" print()\n",
|
||||||
|
" print(f'p: {p}')\n",
|
||||||
|
" print(f'Wartość statystyki testowej z próby: {t_stat_from_sample}')\n",
|
||||||
|
" print(f'Wartości statystyk z prób boostrapowych:')\n",
|
||||||
|
"\n",
|
||||||
|
" t_stat_list_len = len(t_stat_list)\n",
|
||||||
|
" for i in range(min(max_print, t_stat_list_len)):\n",
|
||||||
|
" print(f'{t_stat_list[i]}, ', end='')\n",
|
||||||
|
" if max_print < t_stat_list_len:\n",
|
||||||
|
" remaining = t_stat_list_len - max_print\n",
|
||||||
|
" print(f'... (i {remaining} pozostałych)')\n",
|
||||||
|
"\n",
|
||||||
|
" print()\n",
|
||||||
|
" print()"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -147,7 +208,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 87,
|
"execution_count": 77,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -164,7 +225,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"ALPHA = 0.05\n",
|
"ALPHA = 0.05\n",
|
||||||
"female_heights = dataset['Female height'].to_numpy()\n",
|
"female_heights = dataset['Female height'].to_numpy()\n",
|
||||||
"shapiro_test = stats.shapiro(female_heights)\n",
|
"shapiro_test = shapiro(female_heights)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if shapiro_test.pvalue > ALPHA:\n",
|
"if shapiro_test.pvalue > ALPHA:\n",
|
||||||
" print(\"Female height: Dane mają rozkład normalny.\")\n",
|
" print(\"Female height: Dane mają rozkład normalny.\")\n",
|
||||||
@ -172,7 +233,7 @@
|
|||||||
" print(\"Female height: Dane nie mają rozkładu normalnego.\")\n",
|
" print(\"Female height: Dane nie mają rozkładu normalnego.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"male_heights = dataset['Male height'].to_numpy()\n",
|
"male_heights = dataset['Male height'].to_numpy()\n",
|
||||||
"shapiro_test = stats.shapiro(male_heights)\n",
|
"shapiro_test = shapiro(male_heights)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if shapiro_test.pvalue > ALPHA:\n",
|
"if shapiro_test.pvalue > ALPHA:\n",
|
||||||
" print(\"Male height: Dane mają rozkład normalny.\")\n",
|
" print(\"Male height: Dane mają rozkład normalny.\")\n",
|
||||||
@ -180,7 +241,7 @@
|
|||||||
" print(\"Male height: Dane nie mają rozkładu normalnego.\")\n",
|
" print(\"Male height: Dane nie mają rozkładu normalnego.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"weights_before = dataset['Weight before'].to_numpy()\n",
|
"weights_before = dataset['Weight before'].to_numpy()\n",
|
||||||
"shapiro_test = stats.shapiro(weights_before)\n",
|
"shapiro_test = shapiro(weights_before)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if shapiro_test.pvalue > ALPHA:\n",
|
"if shapiro_test.pvalue > ALPHA:\n",
|
||||||
" print(\"Weight before: Dane mają rozkład normalny.\")\n",
|
" print(\"Weight before: Dane mają rozkład normalny.\")\n",
|
||||||
@ -188,7 +249,7 @@
|
|||||||
" print(\"Weight before: Dane nie mają rozkładu normalnego.\")\n",
|
" print(\"Weight before: Dane nie mają rozkładu normalnego.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"weights_after = dataset['Weight after'].to_numpy()\n",
|
"weights_after = dataset['Weight after'].to_numpy()\n",
|
||||||
"shapiro_test = stats.shapiro(weights_after)\n",
|
"shapiro_test = shapiro(weights_after)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if shapiro_test.pvalue > ALPHA:\n",
|
"if shapiro_test.pvalue > ALPHA:\n",
|
||||||
" print(\"Weight after: Dane mają rozkład normalny.\")\n",
|
" print(\"Weight after: Dane mają rozkład normalny.\")\n",
|
||||||
@ -211,7 +272,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 55,
|
"execution_count": 78,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"pycharm": {
|
"pycharm": {
|
||||||
"name": "#%%\n"
|
"name": "#%%\n"
|
||||||
@ -239,7 +300,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 60,
|
"execution_count": 79,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"pycharm": {
|
"pycharm": {
|
||||||
@ -248,45 +309,16 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def t_stat_single(sample, population_mean):\n",
|
"def bootstrap_one_sample(sample, population_mean, alternative=Alternatives.LESS):\n",
|
||||||
" \"\"\"Funkcja oblicza wartość statystyki testowej dla jednej próbki\"\"\"\n",
|
" p, t, ts = t_test_1_samp(\n",
|
||||||
" if sample.empty:\n",
|
|
||||||
" raise Exception(\"Empty sample\")\n",
|
|
||||||
" sample = sample['Height'].values.tolist()\n",
|
|
||||||
" sample_size = len(sample)\n",
|
|
||||||
" return (mean(sample) - population_mean) / (stdev(sample) / sqrt(sample_size))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 57,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def df_single(sample_1):\n",
|
|
||||||
" \"\"\"Funkcja oblicza stopnie swobody dla jednej próbki\"\"\"\n",
|
|
||||||
" # TODO: I have no clue what to return from here\n",
|
|
||||||
" return len(sample_1)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 58,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false,
|
|
||||||
"pycharm": {
|
|
||||||
"name": "#%%\n"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def bootstrap_one_sample(sample, population_mean):\n",
|
|
||||||
" return t_test(\n",
|
|
||||||
" sample_1=sample,\n",
|
" sample_1=sample,\n",
|
||||||
" df_fn=df_single,\n",
|
" population_mean=population_mean,\n",
|
||||||
" t_stat_fn=t_stat_single,\n",
|
" alternative=alternative,\n",
|
||||||
" population_mean=population_mean\n",
|
" )\n",
|
||||||
" )"
|
" \n",
|
||||||
|
" pretty_print_test(p, t, ts, f'średnia jest równa {population_mean}', alternative)\n",
|
||||||
|
" print()\n",
|
||||||
|
" return p, t, ts"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -298,7 +330,18 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 61,
|
"execution_count": 80,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dummy = pd.DataFrame([1, 2, 3, 4, 5])\n",
|
||||||
|
"dummy2 = pd.DataFrame([4, 5, 6, 7, 8])\n",
|
||||||
|
"dummy3 = pd.DataFrame([1, 3 , 3, 4, 6])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 81,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"pycharm": {
|
"pycharm": {
|
||||||
@ -310,7 +353,17 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"t: 6.854929920812628, df: 500, cv: 1.6479068539295045, p: 2.1091128843409024e-11\n",
|
"Wyniki bootstrapowej wersji testu T-studenta\n",
|
||||||
|
"\n",
|
||||||
|
"Hipoteza: średnia jest równa 165\n",
|
||||||
|
"Hipoteza alternatywna: średnia jest mniejsza\n",
|
||||||
|
"\n",
|
||||||
|
"p: 0.72\n",
|
||||||
|
"Wartość statystyki testowej z próby: [-229.1025971]\n",
|
||||||
|
"Wartości statystyk z prób boostrapowych:\n",
|
||||||
|
"[-239.4457368], [-201.5], [-176.97470898], [-256.14449047], [-436.1703468], ... (i 95 pozostałych)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"\n"
|
"\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -318,8 +371,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"#TODO: poprawić kod aby można było podawać kolumny\n",
|
"#TODO: poprawić kod aby można było podawać kolumny\n",
|
||||||
"\n",
|
"\n",
|
||||||
"t_stat, df, cv, p, _ = bootstrap_one_sample(dataset, 165)\n",
|
"p, t, ts = bootstrap_one_sample(dummy, 165)"
|
||||||
"pretty_print_full_stats(t_stat, df, cv, p)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -343,7 +395,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 159,
|
"execution_count": 82,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"pycharm": {
|
"pycharm": {
|
||||||
@ -352,56 +404,15 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def t_stat_ind(sample_1, sample_2):\n",
|
"def bootstrap_independent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
|
||||||
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek niezależnych\"\"\"\n",
|
" p, t, ts = t_test_ind(\n",
|
||||||
" if sample_1.empty or sample_2.empty:\n",
|
|
||||||
" raise Exception(\"Empty sample\")\n",
|
|
||||||
" sample_1 = sample_1[0].values.tolist()\n",
|
|
||||||
" sample_2 = sample_2[0].values.tolist()\n",
|
|
||||||
" sed = sqrt(sem(sample_1)**2 + sem(sample_2)**2)\n",
|
|
||||||
" return (mean(sample_1) - mean(sample_2)) / sed"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 162,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def df_ind(sample_1, sample_2):\n",
|
|
||||||
" \"\"\"Funkcja oblicza stopnie swobody dla dwóch próbek niezależnych\"\"\"\n",
|
|
||||||
" return len(sample_1) + len(sample_2) - 2"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 167,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false,
|
|
||||||
"pycharm": {
|
|
||||||
"name": "#%%\n"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def bootstrap_independent(sample_1, sample_2):\n",
|
|
||||||
" return t_test(\n",
|
|
||||||
" sample_1=sample_1,\n",
|
" sample_1=sample_1,\n",
|
||||||
" sample_2=sample_2,\n",
|
" sample_2=sample_2,\n",
|
||||||
" df_fn=df_ind,\n",
|
" alternative=alternative,\n",
|
||||||
" t_stat_fn=t_stat_ind\n",
|
" )\n",
|
||||||
" )"
|
" \n",
|
||||||
]
|
" pretty_print_test(p, t, ts, 'średnie są takie same', alternative)\n",
|
||||||
},
|
" return p, t, ts"
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#TODO: Wyciągnąć wysokości kobiet i mężczyzn oraz poprawić kod aby można było podawać kolumny\n",
|
|
||||||
"t_stat, df, cv, p, _ = bootstrap_independent(dataset, dataset)\n",
|
|
||||||
"pretty_print_full_stats(t_stat, df, cv, p)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -424,7 +435,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 160,
|
"execution_count": 83,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"pycharm": {
|
"pycharm": {
|
||||||
@ -433,49 +444,15 @@
|
|||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def t_stat_dep(sample_1, sample_2, mu=0):\n",
|
"def bootstrap_dependent(sample_1, sample_2, alternative=Alternatives.LESS):\n",
|
||||||
" \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek zależnych\"\"\"\n",
|
" p, t, ts = t_test_dep(\n",
|
||||||
" if sample_1.empty or sample_2.empty:\n",
|
|
||||||
" raise Exception(\"Empty sample\")\n",
|
|
||||||
" sample_1 = sample_1[0].values.tolist()\n",
|
|
||||||
" sample_2 = sample_2[0].values.tolist()\n",
|
|
||||||
" differences = [x_1 - x_2 for x_1, x_2 in zip(sample_1, sample_2)]\n",
|
|
||||||
" sample_size = len(sample_1)\n",
|
|
||||||
" return (mean(differences) - mu) / (stdev(differences) / sqrt(sample_size))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 161,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def df_dep(sample_1, sample_2):\n",
|
|
||||||
" \"\"\"Funkcja oblicza stopnie swobody dla dwóch próbek zależnych\"\"\"\n",
|
|
||||||
" l1, l2 = len(sample_1), len(sample_2)\n",
|
|
||||||
" if l1 != l2:\n",
|
|
||||||
" raise Exception(\"Samples aren't of equal length\")\n",
|
|
||||||
" return l1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 168,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": false,
|
|
||||||
"pycharm": {
|
|
||||||
"name": "#%%\n"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def bootstrap_dependent(sample_1, sample_2):\n",
|
|
||||||
" return t_test(\n",
|
|
||||||
" sample_1=sample_1,\n",
|
" sample_1=sample_1,\n",
|
||||||
" sample_2=sample_2,\n",
|
" sample_2=sample_2,\n",
|
||||||
" df_fn=df_dep,\n",
|
" alternative=alternative,\n",
|
||||||
" t_stat_fn=t_stat_dep\n",
|
" )\n",
|
||||||
" )"
|
" \n",
|
||||||
|
" pretty_print_test(p, t, ts, 'średnie są takie same', alternative)\n",
|
||||||
|
" return p, t, ts"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -503,7 +480,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 171,
|
"execution_count": 84,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false,
|
"collapsed": false,
|
||||||
"pycharm": {
|
"pycharm": {
|
||||||
@ -532,76 +509,25 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 31,
|
"execution_count": 85,
|
||||||
"metadata": {
|
|
||||||
"collapsed": false,
|
|
||||||
"pycharm": {
|
|
||||||
"name": "#%%\n"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Statystyka testowa dla jednej próby:\n",
|
|
||||||
"1.414213562373095 - z naszej funkcji\n",
|
|
||||||
"[1.41421356] - z gotowej biblioteki\n",
|
|
||||||
"\n",
|
|
||||||
"Statystyka testowa dla dwóch prób niezależnych:\n",
|
|
||||||
"-3.0 - z naszej funkcji\n",
|
|
||||||
"[-3.] - z gotowej biblioteki\n",
|
|
||||||
"\n",
|
|
||||||
"Statystyka testowa dla dwóch prób zależnych:\n",
|
|
||||||
"-1.6329931618554525 - z naszej funkcji\n",
|
|
||||||
"[-1.63299316] - z gotowej biblioteki\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# Testy dla samych statystyk testowych\n",
|
|
||||||
"def pretty_print_stats(t_stat_selfmade, t_stat_lib, suffix):\n",
|
|
||||||
" print(f'Statystyka testowa dla {suffix}:')\n",
|
|
||||||
" print(t_stat_selfmade, '- z naszej funkcji')\n",
|
|
||||||
" print(t_stat_lib, '- z gotowej biblioteki')\n",
|
|
||||||
" print()\n",
|
|
||||||
" \n",
|
|
||||||
"dummy = pd.DataFrame([1, 2, 3, 4, 5])\n",
|
|
||||||
"dummy2 = pd.DataFrame([4, 5, 6, 7, 8])\n",
|
|
||||||
"dummy3 = pd.DataFrame([1, 3 , 3, 4, 6])\n",
|
|
||||||
"\n",
|
|
||||||
"t_stat_selfmade = t_stat_single(dummy, 2)\n",
|
|
||||||
"t_stat_lib, _ = ttest_1samp(dummy, 2)\n",
|
|
||||||
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'jednej próby')\n",
|
|
||||||
"\n",
|
|
||||||
"t_stat_selfmade = t_stat_ind(dummy, dummy2)\n",
|
|
||||||
"t_stat_lib, _ = ttest_ind(dummy, dummy2)\n",
|
|
||||||
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'dwóch prób niezależnych')\n",
|
|
||||||
"\n",
|
|
||||||
"t_stat_selfmade = t_stat_dep(dummy, dummy3)\n",
|
|
||||||
"t_stat_lib, _ = ttest_rel(dummy, dummy3)\n",
|
|
||||||
"pretty_print_stats(t_stat_selfmade, t_stat_lib, 'dwóch prób zależnych')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 39,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
||||||
"Statystyki dla jednej próby:\n",
|
"Statystyki dla jednej próby:\n",
|
||||||
"t: 1.8073147056683616, df: 5, cv: 2.015048372669157, p: 0.13052275003443325\n",
|
"Wyniki bootstrapowej wersji testu T-studenta\n",
|
||||||
|
"\n",
|
||||||
|
"Hipoteza: średnia jest równa 2\n",
|
||||||
|
"Hipoteza alternatywna: średnia jest mniejsza\n",
|
||||||
|
"\n",
|
||||||
|
"p: 0.35\n",
|
||||||
|
"Wartość statystyki testowej z próby: [1.41421356]\n",
|
||||||
|
"Wartości statystyk z prób boostrapowych:\n",
|
||||||
|
"[2.44948974], [3.13785816], [1.72328087], [0.27216553], [1.17669681], ... (i 95 pozostałych)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Statystyki dla dwóch prób zależnych:\n",
|
|
||||||
"t: 3.0790273716290404, df: 5, cv: 2.015048372669157, p: 0.027500015466573435\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"Statystyki dla dwóch prób niezależnych:\n",
|
|
||||||
"t: 2.8109511013364576, df: 8, cv: 1.8595480375228421, p: 0.02280961069987497\n",
|
|
||||||
"\n"
|
"\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -609,22 +535,66 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# Testy z bootstrappowaniem\n",
|
"# Testy z bootstrappowaniem\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def pretty_print_full_stats(t_stat, df, cv, p):\n",
|
|
||||||
" print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\\n')\n",
|
|
||||||
"\n",
|
|
||||||
"print(type(dummy))\n",
|
|
||||||
"\n",
|
|
||||||
"print('Statystyki dla jednej próby:')\n",
|
"print('Statystyki dla jednej próby:')\n",
|
||||||
"t_stat, df, cv, p, _ = bootstrap_one_sample(dummy, 2)\n",
|
"p, t, ts = bootstrap_one_sample(dummy, 2)"
|
||||||
"pretty_print_full_stats(t_stat, df, cv, p)\n",
|
]
|
||||||
"\n",
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 86,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Statystyki dla dwóch prób zależnych:\n",
|
||||||
|
"Wyniki bootstrapowej wersji testu T-studenta\n",
|
||||||
|
"\n",
|
||||||
|
"Hipoteza: średnie są takie same\n",
|
||||||
|
"Hipoteza alternatywna: średnia jest mniejsza\n",
|
||||||
|
"\n",
|
||||||
|
"p: 1.0\n",
|
||||||
|
"Wartość statystyki testowej z próby: [10.61445555]\n",
|
||||||
|
"Wartości statystyk z prób boostrapowych:\n",
|
||||||
|
"[-2.66666667], [-0.14359163], [0.21199958], [0.11470787], [0.76696499], ... (i 95 pozostałych)\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
"print('Statystyki dla dwóch prób zależnych:')\n",
|
"print('Statystyki dla dwóch prób zależnych:')\n",
|
||||||
"t_stat, df, cv, p, _ = bootstrap_dependent(dummy2, dummy3)\n",
|
"p, t, ts = bootstrap_dependent(dummy2, dummy3)"
|
||||||
"pretty_print_full_stats(t_stat, df, cv, p)\n",
|
]
|
||||||
"\n",
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 87,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Statystyki dla dwóch prób niezależnych:\n",
|
||||||
|
"Wyniki bootstrapowej wersji testu T-studenta\n",
|
||||||
|
"\n",
|
||||||
|
"Hipoteza: średnie są takie same\n",
|
||||||
|
"Hipoteza alternatywna: średnia jest mniejsza\n",
|
||||||
|
"\n",
|
||||||
|
"p: 0.95\n",
|
||||||
|
"Wartość statystyki testowej z próby: [2.4140394]\n",
|
||||||
|
"Wartości statystyk z prób boostrapowych:\n",
|
||||||
|
"[-2.20937908], [0.13187609], [-0.81110711], [-0.94280904], [-0.77151675], ... (i 95 pozostałych)\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
"print('Statystyki dla dwóch prób niezależnych:')\n",
|
"print('Statystyki dla dwóch prób niezależnych:')\n",
|
||||||
"t_stat, df, cv, p, _ = bootstrap_independent(dummy2, dummy3)\n",
|
"p, t, ts = bootstrap_independent(dummy2, dummy3)"
|
||||||
"pretty_print_full_stats(t_stat, df, cv, p)"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -633,8 +603,12 @@
|
|||||||
"hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
|
"hash": "11938c6bc6919ae2720b4d5011047913343b08a43b18698fd82dedb0d4417594"
|
||||||
},
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3.9.1 64-bit",
|
"display_name": "Python 3.8.10 64-bit",
|
||||||
"language": "python",
|
"metadata": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
|
||||||
|
}
|
||||||
|
},
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
"language_info": {
|
"language_info": {
|
||||||
@ -648,8 +622,7 @@
|
|||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.1"
|
"version": "3.9.1"
|
||||||
},
|
}
|
||||||
"orig_nbformat": 4
|
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 2
|
"nbformat_minor": 2
|
||||||
|
Loading…
Reference in New Issue
Block a user