diff --git a/bootstrap-t.ipynb b/bootstrap-t.ipynb index f89a173..16a57d2 100644 --- a/bootstrap-t.ipynb +++ b/bootstrap-t.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 155, "outputs": [], "source": [ "# TODO: Poprzestawiać kolejność definicji funkcji?" @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 252, + "execution_count": 156, "metadata": { "pycharm": { "name": "#%%\n" @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 253, + "execution_count": 157, "metadata": { "pycharm": { "name": "#%%\n" @@ -79,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 254, + "execution_count": 158, "metadata": { "collapsed": false, "pycharm": { @@ -88,19 +88,18 @@ }, "outputs": [], "source": [ - "def t_stat_single(sample, population_mean=2):\n", - " # TODO: Wywalić min, funkcja nie powinna działać dla pustej próbki\n", - " # TODO: population mean nie powinien mieć defaultowego argumentu\n", + "def t_stat_single(sample, population_mean):\n", " \"\"\"Funkcja oblicza wartość statystyki testowej dla jednej próbki\"\"\"\n", + " if sample.empty:\n", + " raise Exception(\"Empty sample\")\n", " sample = sample[0].values.tolist()\n", " sample_size = len(sample)\n", - " # min is to fix near-zero values causing zero division erros\n", - " return (mean(sample) - population_mean) / (stdev(sample) / min(0.00000001, sqrt(sample_size)))" + " return (mean(sample) - population_mean) / (stdev(sample) / sqrt(sample_size))" ] }, { "cell_type": "code", - "execution_count": 255, + "execution_count": 159, "metadata": { "collapsed": false, "pycharm": { @@ -111,6 +110,8 @@ "source": [ "def t_stat_ind(sample_1, sample_2):\n", " \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek niezależnych\"\"\"\n", + " if sample_1.empty or sample_2.empty:\n", + " raise Exception(\"Empty sample\")\n", " sample_1 = sample_1[0].values.tolist()\n", " sample_2 = sample_2[0].values.tolist()\n", " sed = sqrt(sem(sample_1)**2 + sem(sample_2)**2)\n", @@ -119,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 256, + "execution_count": 160, "metadata": { "collapsed": false, "pycharm": { @@ -128,36 +129,34 @@ }, "outputs": [], "source": [ - "def t_stat_dep(sample_1, sample_2):\n", + "def t_stat_dep(sample_1, sample_2, mu=0):\n", " \"\"\"Funkcja oblicza wartość statystyki testowej dla dwóch próbek zależnych\"\"\"\n", - " # TODO: Wywalić min\n", - " # TODO: Przenieść mu jako opcjonalny argument?\n", + " if sample_1.empty or sample_2.empty:\n", + " raise Exception(\"Empty sample\")\n", " sample_1 = sample_1[0].values.tolist()\n", " sample_2 = sample_2[0].values.tolist()\n", " differences = [x_1 - x_2 for x_1, x_2 in zip(sample_1, sample_2)]\n", " sample_size = len(sample_1)\n", - " mu = 0 # The constant is zero if we want to test whether the average of the difference is significantly different.\n", - " return (mean(differences) - mu) / (stdev(differences) / min(0.00000001, sqrt(sample_size)))" + " return (mean(differences) - mu) / (stdev(differences) / sqrt(sample_size))" ] }, { "cell_type": "code", - "execution_count": 257, + "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "def df_dep(sample_1, sample_2):\n", " \"\"\"Funkcja oblicza stopnie swobody dla dwóch próbek zależnych\"\"\"\n", - " # TODO: Assert działa chyba tylko w trybie debugowania\n", " l1, l2 = len(sample_1), len(sample_2)\n", - " assert l1 == l2 \n", - "\n", + " if l1 != l2:\n", + " raise Exception(\"Samples aren't of equal length\")\n", " return l1" ] }, { 
"cell_type": "code", - "execution_count": 258, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ @@ -168,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 259, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 260, + "execution_count": 164, "metadata": {}, "outputs": [], "source": [ @@ -191,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 261, + "execution_count": 165, "metadata": {}, "outputs": [], "source": [ @@ -202,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 262, + "execution_count": 166, "metadata": { "collapsed": false, "pycharm": { @@ -211,17 +210,18 @@ }, "outputs": [], "source": [ - "def bootstrap_one_sample(sample):\n", + "def bootstrap_one_sample(sample, population_mean):\n", " return t_test(\n", " sample_1=sample,\n", " df_fn=df_single,\n", - " t_stat_fn=t_stat_single\n", + " t_stat_fn=t_stat_single,\n", + " population_mean=population_mean\n", " )" ] }, { "cell_type": "code", - "execution_count": 263, + "execution_count": 167, "metadata": { "collapsed": false, "pycharm": { @@ -241,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 264, + "execution_count": 168, "metadata": { "collapsed": false, "pycharm": { @@ -261,31 +261,33 @@ }, { "cell_type": "code", - "execution_count": 265, + "execution_count": 169, "metadata": {}, "outputs": [], "source": [ - "def get_t_stats(sample_1, sample_2=None, t_stat_fn=t_stat_ind):\n", + "def get_t_stats(sample_1, sample_2=None, t_stat_fn=t_stat_single, population_mean=None):\n", " \"\"\"Funkcja oblicza listę statystyk testowych dla każdej próbki bootstrapowej wybranej na podstawie danych sample_1 i sample_2\"\"\"\n", " t_stat_list = []\n", "\n", - " # Separate case for single tests\n", - " if sample_2 is None:\n", + " # One sample test\n", + " if t_stat_fn==t_stat_single:\n", + " if not population_mean:\n", + " raise Exception(\"population_mean not provided\")\n", " for bootstrap in generate_bootstraps(sample_1):\n", - " stat = t_stat_fn(bootstrap)\n", + " stat = t_stat_fn(bootstrap, population_mean)\n", " t_stat_list.append(stat)\n", " return t_stat_list\n", - " \n", + "\n", + " # Two sample test\n", " for bootstrap_1, bootstrap_2 in zip(generate_bootstraps(sample_1), generate_bootstraps(sample_2)):\n", " stat = t_stat_fn(bootstrap_1, bootstrap_2)\n", " t_stat_list.append(stat)\n", - " \n", " return t_stat_list" ] }, { "cell_type": "code", - "execution_count": 266, + "execution_count": 170, "metadata": { "pycharm": { "name": "#%%\n" @@ -293,34 +295,34 @@ }, "outputs": [], "source": [ - "def t_test(sample_1, sample_2=None, df_fn=df_ind, t_stat_fn=t_stat_ind, alpha=0.05):\n", + "def t_test(sample_1, sample_2=None, df_fn=df_single, t_stat_fn=t_stat_single, population_mean=None, alpha=0.05):\n", " \"\"\"\n", " Funkcja przeprowadza test T-studenta dla dwóch zmiennych.\n", " liczba kolumn wynosi 1, test jest przeprowadzany dla jednej zmiennej.\n", " @param df_fn - funkcja obliczająca stopnie swobody\n", " @param t_stat_fn - funkcja obliczająca statystykę T\n", " \"\"\"\n", - " t_stat_list = get_t_stats(sample_1, sample_2, t_stat_fn)\n", + " t_stat_list = get_t_stats(sample_1, sample_2, t_stat_fn, population_mean=population_mean)\n", " t_stat_sum = sum(t_stat_list)\n", "\n", " data_size = sample_1.shape[0]\n", "\n", " t_stat = t_stat_sum / data_size\n", + " # TODO: dolna i górna opcja dają inne wyniki z jakiegoś powodu (???)\n", + " t_stat = mean(t_stat_list)\n", "\n", - " df = 0.0\n", " if sample_2 is 
None:\n", " df = df_fn(sample_1)\n", " else:\n", " df = df_fn(sample_1, sample_2)\n", " cv = calculate_cv(df, alpha)\n", " p = calculate_p(t_stat, df)\n", - " \n", " return t_stat, df, cv, p, t_stat_list" ] }, { "cell_type": "code", - "execution_count": 267, + "execution_count": 171, "metadata": { "collapsed": false, "pycharm": { @@ -332,7 +334,7 @@ "def draw_distribution(stats):\n", " \"\"\"\n", " Funkcja rysuje rozkład statystyki testowej\n", - " stats: lista statystyk testowych\n", + " @param stats: lista statystyk testowych\n", " \"\"\"\n", " plt.hist(stats)\n", " plt.xlabel('Test statistic value')\n", @@ -342,23 +344,7 @@ }, { "cell_type": "code", - "execution_count": 268, - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "def make_decision(data, columns):\n", - " # TODO: Potrzebna ta funkcja w ogóle? Decyzja jest zależna od wybranych hipotez chyba.\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 269, + "execution_count": 172, "metadata": { "collapsed": false, "pycharm": { @@ -371,7 +357,7 @@ "output_type": "stream", "text": [ "Statystyka testowa dla jednej próby:\n", - "6.324555320336758e-09 - z naszej funkcji\n", + "1.414213562373095 - z naszej funkcji\n", "[1.41421356] - z gotowej biblioteki\n", "\n", "Statystyka testowa dla dwóch prób niezależnych:\n", @@ -379,7 +365,7 @@ "[-3.] - z gotowej biblioteki\n", "\n", "Statystyka testowa dla dwóch prób zależnych:\n", - "-7.302967433402215e-09 - z naszej funkcji\n", + "-1.6329931618554525 - z naszej funkcji\n", "[-1.63299316] - z gotowej biblioteki\n", "\n" ] @@ -412,28 +398,22 @@ }, { "cell_type": "code", - "execution_count": 270, + "execution_count": 173, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Statystyki dla jednej próby:\n" - ] - }, - { - "ename": "TypeError", - "evalue": "t_stat_single() missing 1 required positional argument: 'population_mean'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)", - "Input \u001B[1;32mIn [270]\u001B[0m, in \u001B[0;36m\u001B[1;34m()\u001B[0m\n\u001B[0;32m 4\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mt: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mt_stat\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m, df: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mdf\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m, cv: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcv\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m, p: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mp\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124m'\u001B[39m)\n\u001B[0;32m 6\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mStatystyki dla jednej próby:\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[1;32m----> 7\u001B[0m t_stat, df, cv, p, _ \u001B[38;5;241m=\u001B[39m \u001B[43mbootstrap_one_sample\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdummy\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 8\u001B[0m pretty_print_full_stats(t_stat, df, cv, p)\n\u001B[0;32m 10\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mStatystyki dla dwóch prób zależnych:\u001B[39m\u001B[38;5;124m'\u001B[39m)\n", - "Input \u001B[1;32mIn [262]\u001B[0m, in \u001B[0;36mbootstrap_one_sample\u001B[1;34m(sample)\u001B[0m\n\u001B[0;32m 
1\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mbootstrap_one_sample\u001B[39m(sample):\n\u001B[1;32m----> 2\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mt_test\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m 3\u001B[0m \u001B[43m \u001B[49m\u001B[43msample_1\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43msample\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 4\u001B[0m \u001B[43m \u001B[49m\u001B[43mdf_fn\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdf_single\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 5\u001B[0m \u001B[43m \u001B[49m\u001B[43mt_stat_fn\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mt_stat_single\u001B[49m\n\u001B[0;32m 6\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n", - "Input \u001B[1;32mIn [266]\u001B[0m, in \u001B[0;36mt_test\u001B[1;34m(sample_1, sample_2, df_fn, t_stat_fn, alpha)\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mt_test\u001B[39m(sample_1, sample_2\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m, df_fn\u001B[38;5;241m=\u001B[39mdf_ind, t_stat_fn\u001B[38;5;241m=\u001B[39mt_stat_ind, alpha\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0.05\u001B[39m):\n\u001B[0;32m 2\u001B[0m \u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 3\u001B[0m \u001B[38;5;124;03m Funkcja przeprowadza test T-studenta dla dwóch zmiennych.\u001B[39;00m\n\u001B[0;32m 4\u001B[0m \u001B[38;5;124;03m liczba kolumn wynosi 1, test jest przeprowadzany dla jednej zmiennej.\u001B[39;00m\n\u001B[0;32m 5\u001B[0m \u001B[38;5;124;03m @param df_fn - funkcja obliczająca stopnie swobody\u001B[39;00m\n\u001B[0;32m 6\u001B[0m \u001B[38;5;124;03m @param t_stat_fn - funkcja obliczająca statystykę T\u001B[39;00m\n\u001B[0;32m 7\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m----> 8\u001B[0m t_stat_list \u001B[38;5;241m=\u001B[39m \u001B[43mget_t_stats\u001B[49m\u001B[43m(\u001B[49m\u001B[43msample_1\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43msample_2\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mt_stat_fn\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 9\u001B[0m t_stat_sum \u001B[38;5;241m=\u001B[39m \u001B[38;5;28msum\u001B[39m(t_stat_list)\n\u001B[0;32m 11\u001B[0m data_size \u001B[38;5;241m=\u001B[39m sample_1\u001B[38;5;241m.\u001B[39mshape[\u001B[38;5;241m0\u001B[39m]\n", - "Input \u001B[1;32mIn [265]\u001B[0m, in \u001B[0;36mget_t_stats\u001B[1;34m(sample_1, sample_2, t_stat_fn)\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m sample_2 \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m 7\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m bootstrap \u001B[38;5;129;01min\u001B[39;00m generate_bootstraps(sample_1):\n\u001B[1;32m----> 8\u001B[0m stat \u001B[38;5;241m=\u001B[39m \u001B[43mt_stat_fn\u001B[49m\u001B[43m(\u001B[49m\u001B[43mbootstrap\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 9\u001B[0m t_stat_list\u001B[38;5;241m.\u001B[39mappend(stat)\n\u001B[0;32m 10\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m t_stat_list\n", - "\u001B[1;31mTypeError\u001B[0m: t_stat_single() missing 1 required positional argument: 'population_mean'" + "Statystyki dla jednej próby:\n", + "t: 1.8524997668616348, df: 5, cv: 2.015048372669157, p: 0.12315232406912302\n", + "\n", + "Statystyki dla dwóch prób zależnych:\n", + "t: 3.166992562129946, df: 5, cv: 2.015048372669157, p: 0.02489883191814224\n", + "\n", + "Statystyki dla dwóch prób niezależnych:\n", + "t: 3.0429202631473986, df: 8, cv: 1.8595480375228421, p: 
0.015992147409949586\n", + "\n" ] } ], @@ -444,7 +424,7 @@ " print(f't: {t_stat}, df: {df}, cv: {cv}, p: {p}\\n')\n", "\n", "print('Statystyki dla jednej próby:')\n", - "t_stat, df, cv, p, _ = bootstrap_one_sample(dummy)\n", + "t_stat, df, cv, p, _ = bootstrap_one_sample(dummy, 2)\n", "pretty_print_full_stats(t_stat, df, cv, p)\n", "\n", "print('Statystyki dla dwóch prób zależnych:')\n", @@ -458,7 +438,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 174, "metadata": { "collapsed": false, "pycharm": { @@ -468,7 +448,7 @@ "outputs": [], "source": [ "dataset = pd.read_csv('experiment_data.csv')\n", - "make_decision(dataset, ['Weight', 'Age'])" + "#make_decision(dataset, ['Weight', 'Age'])" ] } ],
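
The refactor threads population_mean explicitly through bootstrap_one_sample -> t_test -> get_t_stats -> t_stat_single, which is what turns the former TypeError traceback into the printed one-sample statistics. For reference, here is a minimal, self-contained sketch of that one-sample path. It relies on assumptions the diff does not show: generate_bootstraps is stood in for by plain resampling with replacement under a fixed seed, df_single is taken to be n - 1, scipy's t distribution is used in place of calculate_cv / calculate_p, and the dummy data is a placeholder rather than the notebook's experiment_data.csv.

from math import sqrt
from statistics import mean, stdev

import numpy as np
import pandas as pd
from scipy import stats as scipy_stats


def t_stat_single(sample, population_mean):
    # Same signature as in the diff: the caller must now supply population_mean.
    if sample.empty:
        raise Exception("Empty sample")
    values = sample[0].values.tolist()
    return (mean(values) - population_mean) / (stdev(values) / sqrt(len(values)))


def generate_bootstraps(sample, n_bootstraps=1000, seed=0):
    # Hypothetical stand-in for the notebook's generator: rows drawn with
    # replacement, each resample the same size as the original data.
    rng = np.random.default_rng(seed)
    n = len(sample)
    for _ in range(n_bootstraps):
        yield sample.iloc[rng.integers(0, n, size=n)]


def bootstrap_one_sample(sample, population_mean, alpha=0.05):
    # Average the per-resample t statistics, as t_test now does via mean(t_stat_list).
    t_stat_list = [t_stat_single(b, population_mean) for b in generate_bootstraps(sample)]
    t_stat = mean(t_stat_list)
    df = len(sample) - 1                                   # assumption: df_single returns n - 1
    cv = scipy_stats.t.ppf(1.0 - alpha, df)                # assumed equivalent of calculate_cv
    p = (1.0 - scipy_stats.t.cdf(abs(t_stat), df)) * 2.0   # assumed equivalent of calculate_p (two-sided)
    return t_stat, df, cv, p, t_stat_list


if __name__ == "__main__":
    # Placeholder data, not the notebook's experiment_data.csv.
    dummy = pd.DataFrame([2.1, 3.4, 1.9, 2.8, 4.2, 2.5, 3.1, 1.7, 3.9, 2.2])
    t_stat, df, cv, p, _ = bootstrap_one_sample(dummy, population_mean=2)
    print(f"t: {t_stat}, df: {df}, cv: {cv}, p: {p}")

Run as a script this prints t, df, cv and p in the same format as pretty_print_full_stats; the numbers will differ from the output above because the data and the bootstrap generator are stand-ins, not the notebook's own helpers.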