From bc694d08fcd9b5d925ebd1a737986545b7f98c3b Mon Sep 17 00:00:00 2001 From: Adrian Charkiewicz Date: Sat, 19 Mar 2022 22:17:19 +0100 Subject: [PATCH] 8:2 split changed to 8:1:1 --- lab2.ipynb | 214 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 122 insertions(+), 92 deletions(-) diff --git a/lab2.ipynb b/lab2.ipynb index 571a685..55847a8 100644 --- a/lab2.ipynb +++ b/lab2.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 39, "id": "35674c19", "metadata": { "scrolled": true @@ -13,15 +13,15 @@ "output_type": "stream", "text": [ "Requirement already satisfied: opendatasets in c:\\users\\riraa\\anaconda3\\lib\\site-packages (0.1.20)\n", - "Requirement already satisfied: tqdm in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (4.59.0)\n", "Requirement already satisfied: click in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (7.1.2)\n", "Requirement already satisfied: kaggle in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from opendatasets) (1.5.12)\n", - "Requirement already satisfied: requests in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.25.1)\n", - "Requirement already satisfied: six>=1.10 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.15.0)\n", - "Requirement already satisfied: certifi in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2020.12.5)\n", - "Requirement already satisfied: urllib3 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.26.4)\n", + "Requirement already satisfied: tqdm in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (4.59.0)\n", "Requirement already satisfied: python-slugify in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from kaggle->opendatasets) (6.1.1)\n", "Requirement already satisfied: python-dateutil in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.8.1)\n", + "Requirement already satisfied: requests in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.25.1)\n", + "Requirement already satisfied: urllib3 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.26.4)\n", + "Requirement already satisfied: certifi in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2020.12.5)\n", + "Requirement already satisfied: six>=1.10 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.15.0)\n", "Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from python-slugify->kaggle->opendatasets) (1.3)\n", "Requirement already satisfied: idna<3,>=2.5 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from requests->kaggle->opendatasets) (2.10)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from requests->kaggle->opendatasets) (4.0.0)\n" @@ -35,30 +35,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 40, "id": "5e8e5ea8", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████████████████████████████████████████████████████████████████████████| 25.6k/25.6k [00:00<00:00, 1.68MB/s]" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Downloading red-wine-quality-cortez-et-al-2009.zip to .\\red-wine-quality-cortez-et-al-2009\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "Skipping, found downloaded files in \".\\red-wine-quality-cortez-et-al-2009\" (use force=True to force download)\n" ] } ], @@ -69,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 41, "id": "1d0f072e", "metadata": {}, "outputs": [ @@ -322,7 +307,7 @@ "[1599 rows x 12 columns]" ] }, - "execution_count": 6, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -335,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 42, "id": "9a675582", "metadata": { "scrolled": true @@ -353,7 +338,7 @@ "Name: quality, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -382,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 43, "id": "3197a613", "metadata": {}, "outputs": [ @@ -524,7 +509,7 @@ "4 9.4 5 " ] }, - "execution_count": 8, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -535,7 +520,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 44, "id": "18dcd194", "metadata": { "scrolled": false @@ -733,7 +718,7 @@ "max 4.010000 2.000000 14.900000 8.000000 " ] }, - "execution_count": 9, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -744,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 45, "id": "0948ca45", "metadata": { "scrolled": false @@ -762,7 +747,7 @@ "Name: quality, dtype: int64" ] }, - "execution_count": 10, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -773,7 +758,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 46, "id": "7245500d", "metadata": {}, "outputs": [ @@ -783,7 +768,7 @@ "" ] }, - "execution_count": 11, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" }, @@ -814,7 +799,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 47, "id": "f2d00efe", "metadata": { "scrolled": true @@ -958,7 +943,7 @@ "166 10.2 5 " ] }, - "execution_count": 12, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -969,7 +954,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 48, "id": "e074e787", "metadata": {}, "outputs": [ @@ -1165,7 +1150,7 @@ "max 4.010000 1.980000 14.900000 8.000000 " ] }, - "execution_count": 13, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -1176,7 +1161,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 49, "id": "34f511dd", "metadata": { "scrolled": false @@ -1194,7 +1179,7 @@ "Name: quality, dtype: int64" ] }, - "execution_count": 14, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -1203,9 +1188,17 @@ "wine_train[\"quality\"].value_counts().sort_index(ascending=False) #indexy oznaczają jakość wina" ] }, + { + "cell_type": "markdown", + "id": "29e301ee", + "metadata": {}, + "source": [ + "#### Sortowanie jest po to, by szły od najlepszego do najgorszego, zamiast po największej ilość próbek" + ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 50, "id": "466eb483", "metadata": {}, "outputs": [ @@ -1215,7 +1208,7 @@ "" ] }, - "execution_count": 15, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" }, @@ -1252,7 +1245,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 51, "id": "d6b697ec", "metadata": {}, "outputs": [ @@ -1394,7 +1387,7 @@ "288 10.6 7 " ] }, - "execution_count": 16, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -1405,7 +1398,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 52, "id": "bc91d2fb", "metadata": {}, "outputs": [ @@ -1601,7 +1594,7 @@ "max 3.710000 2.000000 12.800000 8.000000 " ] }, - "execution_count": 17, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -1612,7 +1605,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 53, "id": "72ce755c", "metadata": {}, "outputs": [ @@ -1627,7 +1620,7 @@ "Name: quality, dtype: int64" ] }, - "execution_count": 18, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -1638,7 +1631,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 54, "id": "fc355d95", "metadata": { "scrolled": true @@ -1650,7 +1643,7 @@ "" ] }, - "execution_count": 19, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" }, @@ -1671,35 +1664,27 @@ "wine_test[\"quality\"].value_counts().sort_index(ascending=False).plot(kind=\"bar\")" ] }, - { - "cell_type": "markdown", - "id": "518f05c2", - "metadata": {}, - "source": [ - "## Normalizacja" - ] - }, { "cell_type": "markdown", "id": "0d904976", "metadata": {}, "source": [ - "# Podział z wyróżnieniem data/target" + "# Podział z wyróżnieniem data/remain" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 55, "id": "2f1c75ab", "metadata": {}, "outputs": [], "source": [ - "x_train,x_test,y_train,y_test = train_test_split(wine.iloc[:,:-1],wine.iloc[:,-1], test_size=0.2, random_state=1,stratify=wine[\"quality\"])" + "X_train,X_rem,y_train,y_rem = train_test_split(wine.iloc[:,:-1],wine.iloc[:,-1], test_size=0.2, random_state=1,stratify=wine[\"quality\"])" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 56, "id": "c2b16170", "metadata": {}, "outputs": [ @@ -1709,7 +1694,7 @@ "1279" ] }, - "execution_count": 21, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -1720,7 +1705,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 57, "id": "772560b4", "metadata": {}, "outputs": [ @@ -1730,13 +1715,59 @@ "320" ] }, - "execution_count": 22, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "y_test.value_counts().sum()" + "y_rem.value_counts().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "e6bca841", + "metadata": {}, + "source": [ + "#### Mamy teraz podział 8:2, a chcemy mieć 8:1:1, więc pozostały zbiór dzielimy na pół\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "c6bca605", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1279, 11)\n", + "(1279,)\n", + "(160, 11)\n", + "(160,)\n", + "(160, 11)\n", + "(160,)\n" + ] + }, + { + "data": { + "text/plain": [ + "(None, None)" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)\n", + "\n", + "print(X_train.shape), print(y_train.shape)\n", + "print(X_valid.shape), print(y_valid.shape)\n", + "print(X_test.shape), print(y_test.shape)" ] }, { @@ -1749,51 +1780,50 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 59, "id": "a4ac6f00", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "norm = MinMaxScaler()\n", - "norm_fit = norm.fit(x_train)\n", - "norm_x_train = norm_fit.transform(x_train)\n", - "norm_x_test = norm_fit.transform(x_test)" + "norm_fit = norm.fit(X_train)\n", + "norm_X_train = norm_fit.transform(X_train)\n", + "norm_X_test = norm_fit.transform(X_test)\n", + "norm_X_valid =norm_fit.transform(X_valid)" + ] + }, + { + "cell_type": "markdown", + "id": "599c4102", + "metadata": {}, + "source": [ + "###### Wygląd po normalizacji: mieści się w zakresie [0,1]" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 69, "id": "be0d1121", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { "text/plain": [ - "array([[0.31858407, 0.15702479, 0.50632911, 0.0890411 , 0.1010989 ,\n", - " 0.07042254, 0.01413428, 0.38839941, 0.39130435, 0.21212121,\n", - " 0.43076923],\n", - " [0.26548673, 0.14049587, 0.62025316, 0.12328767, 0.17582418,\n", - " 0.33802817, 0.19081272, 0.51615272, 0.39130435, 0.16969697,\n", - " 0.26153846],\n", - " [0.23893805, 0.17355372, 0.59493671, 0.08219178, 0.14285714,\n", - " 0.05633803, 0.01766784, 0.42070485, 0.40869565, 0.12121212,\n", - " 0.29230769],\n", - " [0.19469027, 0.31404959, 0.13924051, 0.04109589, 0.13846154,\n", - " 0.21126761, 0.15194346, 0.39500734, 0.43478261, 0.27878788,\n", - " 0.16923077],\n", - " [0.27433628, 0.65702479, 0.15189873, 0.0890411 , 0.28791209,\n", - " 0.08450704, 0.06007067, 0.46475771, 0.42608696, 0.19393939,\n", - " 0.27692308]])" + "array([0.26548673, 0.14049587, 0.62025316, 0.12328767, 0.17582418,\n", + " 0.33802817, 0.19081272, 0.51615272, 0.39130435, 0.16969697,\n", + " 0.26153846])" ] }, - "execution_count": 27, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "norm_x_train[:5]" + "norm_X_train[1]" ] }, { @@ -1806,7 +1836,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 64, "id": "670062c0", "metadata": {}, "outputs": [ @@ -1828,7 +1858,7 @@ "dtype: int64" ] }, - "execution_count": 24, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" }