8:2 split changed to 8:1:1

This commit is contained in:
Adrian Charkiewicz 2022-03-19 22:17:19 +01:00
parent a1cf4b8e5d
commit bc694d08fc

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 39,
"id": "35674c19",
"metadata": {
"scrolled": true
@ -13,15 +13,15 @@
"output_type": "stream",
"text": [
"Requirement already satisfied: opendatasets in c:\\users\\riraa\\anaconda3\\lib\\site-packages (0.1.20)\n",
"Requirement already satisfied: tqdm in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (4.59.0)\n",
"Requirement already satisfied: click in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (7.1.2)\n",
"Requirement already satisfied: kaggle in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from opendatasets) (1.5.12)\n",
"Requirement already satisfied: requests in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.25.1)\n",
"Requirement already satisfied: six>=1.10 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.15.0)\n",
"Requirement already satisfied: certifi in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2020.12.5)\n",
"Requirement already satisfied: urllib3 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.26.4)\n",
"Requirement already satisfied: tqdm in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (4.59.0)\n",
"Requirement already satisfied: python-slugify in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from kaggle->opendatasets) (6.1.1)\n",
"Requirement already satisfied: python-dateutil in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.8.1)\n",
"Requirement already satisfied: requests in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.25.1)\n",
"Requirement already satisfied: urllib3 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.26.4)\n",
"Requirement already satisfied: certifi in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2020.12.5)\n",
"Requirement already satisfied: six>=1.10 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.15.0)\n",
"Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from python-slugify->kaggle->opendatasets) (1.3)\n",
"Requirement already satisfied: idna<3,>=2.5 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from requests->kaggle->opendatasets) (2.10)\n",
"Requirement already satisfied: chardet<5,>=3.0.2 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from requests->kaggle->opendatasets) (4.0.0)\n"
@ -35,30 +35,15 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 40,
"id": "5e8e5ea8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████████| 25.6k/25.6k [00:00<00:00, 1.68MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading red-wine-quality-cortez-et-al-2009.zip to .\\red-wine-quality-cortez-et-al-2009\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
"Skipping, found downloaded files in \".\\red-wine-quality-cortez-et-al-2009\" (use force=True to force download)\n"
]
}
],
@ -69,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 41,
"id": "1d0f072e",
"metadata": {},
"outputs": [
@ -322,7 +307,7 @@
"[1599 rows x 12 columns]"
]
},
"execution_count": 6,
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
@ -335,7 +320,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 42,
"id": "9a675582",
"metadata": {
"scrolled": true
@ -353,7 +338,7 @@
"Name: quality, dtype: int64"
]
},
"execution_count": 7,
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
@ -382,7 +367,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 43,
"id": "3197a613",
"metadata": {},
"outputs": [
@ -524,7 +509,7 @@
"4 9.4 5 "
]
},
"execution_count": 8,
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
@ -535,7 +520,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 44,
"id": "18dcd194",
"metadata": {
"scrolled": false
@ -733,7 +718,7 @@
"max 4.010000 2.000000 14.900000 8.000000 "
]
},
"execution_count": 9,
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
@ -744,7 +729,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 45,
"id": "0948ca45",
"metadata": {
"scrolled": false
@ -762,7 +747,7 @@
"Name: quality, dtype: int64"
]
},
"execution_count": 10,
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
@ -773,7 +758,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 46,
"id": "7245500d",
"metadata": {},
"outputs": [
@ -783,7 +768,7 @@
"<AxesSubplot:>"
]
},
"execution_count": 11,
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
},
@ -814,7 +799,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 47,
"id": "f2d00efe",
"metadata": {
"scrolled": true
@ -958,7 +943,7 @@
"166 10.2 5 "
]
},
"execution_count": 12,
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
@ -969,7 +954,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 48,
"id": "e074e787",
"metadata": {},
"outputs": [
@ -1165,7 +1150,7 @@
"max 4.010000 1.980000 14.900000 8.000000 "
]
},
"execution_count": 13,
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
@ -1176,7 +1161,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 49,
"id": "34f511dd",
"metadata": {
"scrolled": false
@ -1194,7 +1179,7 @@
"Name: quality, dtype: int64"
]
},
"execution_count": 14,
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
@ -1203,9 +1188,17 @@
"wine_train[\"quality\"].value_counts().sort_index(ascending=False) #indexy oznaczają jakość wina"
]
},
{
"cell_type": "markdown",
"id": "29e301ee",
"metadata": {},
"source": [
"#### Sortowanie po indeksie jest po to, by klasy jakości szły od najlepszej do najgorszej, a nie według liczby próbek"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 50,
"id": "466eb483",
"metadata": {},
"outputs": [
@ -1215,7 +1208,7 @@
"<AxesSubplot:>"
]
},
"execution_count": 15,
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
},
@ -1252,7 +1245,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 51,
"id": "d6b697ec",
"metadata": {},
"outputs": [
@ -1394,7 +1387,7 @@
"288 10.6 7 "
]
},
"execution_count": 16,
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
@ -1405,7 +1398,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 52,
"id": "bc91d2fb",
"metadata": {},
"outputs": [
@ -1601,7 +1594,7 @@
"max 3.710000 2.000000 12.800000 8.000000 "
]
},
"execution_count": 17,
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
@ -1612,7 +1605,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 53,
"id": "72ce755c",
"metadata": {},
"outputs": [
@ -1627,7 +1620,7 @@
"Name: quality, dtype: int64"
]
},
"execution_count": 18,
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
@ -1638,7 +1631,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 54,
"id": "fc355d95",
"metadata": {
"scrolled": true
@ -1650,7 +1643,7 @@
"<AxesSubplot:>"
]
},
"execution_count": 19,
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
},
@ -1671,35 +1664,27 @@
"wine_test[\"quality\"].value_counts().sort_index(ascending=False).plot(kind=\"bar\")"
]
},
{
"cell_type": "markdown",
"id": "518f05c2",
"metadata": {},
"source": [
"## Normalizacja"
]
},
{
"cell_type": "markdown",
"id": "0d904976",
"metadata": {},
"source": [
"# Podział z wyróżnieniem data/target"
"# Podział z wyróżnieniem data/remain"
]
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 55,
"id": "2f1c75ab",
"metadata": {},
"outputs": [],
"source": [
"x_train,x_test,y_train,y_test = train_test_split(wine.iloc[:,:-1],wine.iloc[:,-1], test_size=0.2, random_state=1,stratify=wine[\"quality\"])"
"X_train,X_rem,y_train,y_rem = train_test_split(wine.iloc[:,:-1],wine.iloc[:,-1], test_size=0.2, random_state=1,stratify=wine[\"quality\"])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 56,
"id": "c2b16170",
"metadata": {},
"outputs": [
@ -1709,7 +1694,7 @@
"1279"
]
},
"execution_count": 21,
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
@ -1720,7 +1705,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 57,
"id": "772560b4",
"metadata": {},
"outputs": [
@ -1730,13 +1715,59 @@
"320"
]
},
"execution_count": 22,
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test.value_counts().sum()"
"y_rem.value_counts().sum()"
]
},
{
"cell_type": "markdown",
"id": "e6bca841",
"metadata": {},
"source": [
"#### Mamy teraz podział 8:2, a chcemy mieć 8:1:1, więc pozostały zbiór dzielimy na pół\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "c6bca605",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1279, 11)\n",
"(1279,)\n",
"(160, 11)\n",
"(160,)\n",
"(160, 11)\n",
"(160,)\n"
]
},
{
"data": {
"text/plain": [
"(None, None)"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)\n",
"\n",
"print(X_train.shape), print(y_train.shape)\n",
"print(X_valid.shape), print(y_valid.shape)\n",
"print(X_test.shape), print(y_test.shape)"
]
},
{
@ -1749,51 +1780,50 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 59,
"id": "a4ac6f00",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"norm = MinMaxScaler()\n",
"norm_fit = norm.fit(x_train)\n",
"norm_x_train = norm_fit.transform(x_train)\n",
"norm_x_test = norm_fit.transform(x_test)"
"norm_fit = norm.fit(X_train)\n",
"norm_X_train = norm_fit.transform(X_train)\n",
"norm_X_test = norm_fit.transform(X_test)\n",
"norm_X_valid =norm_fit.transform(X_valid)"
]
},
{
"cell_type": "markdown",
"id": "599c4102",
"metadata": {},
"source": [
"###### Wygląd po normalizacji: mieści się w zakresie [0,1]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 69,
"id": "be0d1121",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.31858407, 0.15702479, 0.50632911, 0.0890411 , 0.1010989 ,\n",
" 0.07042254, 0.01413428, 0.38839941, 0.39130435, 0.21212121,\n",
" 0.43076923],\n",
" [0.26548673, 0.14049587, 0.62025316, 0.12328767, 0.17582418,\n",
"array([0.26548673, 0.14049587, 0.62025316, 0.12328767, 0.17582418,\n",
" 0.33802817, 0.19081272, 0.51615272, 0.39130435, 0.16969697,\n",
" 0.26153846],\n",
" [0.23893805, 0.17355372, 0.59493671, 0.08219178, 0.14285714,\n",
" 0.05633803, 0.01766784, 0.42070485, 0.40869565, 0.12121212,\n",
" 0.29230769],\n",
" [0.19469027, 0.31404959, 0.13924051, 0.04109589, 0.13846154,\n",
" 0.21126761, 0.15194346, 0.39500734, 0.43478261, 0.27878788,\n",
" 0.16923077],\n",
" [0.27433628, 0.65702479, 0.15189873, 0.0890411 , 0.28791209,\n",
" 0.08450704, 0.06007067, 0.46475771, 0.42608696, 0.19393939,\n",
" 0.27692308]])"
" 0.26153846])"
]
},
"execution_count": 27,
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"norm_x_train[:5]"
"norm_X_train[1]"
]
},
{
@ -1806,7 +1836,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 64,
"id": "670062c0",
"metadata": {},
"outputs": [
@ -1828,7 +1858,7 @@
"dtype: int64"
]
},
"execution_count": 24,
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}