8:2 split changed to 8:1:1
This commit is contained in:
parent
a1cf4b8e5d
commit
bc694d08fc
214
lab2.ipynb
214
lab2.ipynb
@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 39,
|
||||
"id": "35674c19",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
@ -13,15 +13,15 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: opendatasets in c:\\users\\riraa\\anaconda3\\lib\\site-packages (0.1.20)\n",
|
||||
"Requirement already satisfied: tqdm in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (4.59.0)\n",
|
||||
"Requirement already satisfied: click in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (7.1.2)\n",
|
||||
"Requirement already satisfied: kaggle in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from opendatasets) (1.5.12)\n",
|
||||
"Requirement already satisfied: requests in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.25.1)\n",
|
||||
"Requirement already satisfied: six>=1.10 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.15.0)\n",
|
||||
"Requirement already satisfied: certifi in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2020.12.5)\n",
|
||||
"Requirement already satisfied: urllib3 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.26.4)\n",
|
||||
"Requirement already satisfied: tqdm in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from opendatasets) (4.59.0)\n",
|
||||
"Requirement already satisfied: python-slugify in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from kaggle->opendatasets) (6.1.1)\n",
|
||||
"Requirement already satisfied: python-dateutil in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.8.1)\n",
|
||||
"Requirement already satisfied: requests in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2.25.1)\n",
|
||||
"Requirement already satisfied: urllib3 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.26.4)\n",
|
||||
"Requirement already satisfied: certifi in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (2020.12.5)\n",
|
||||
"Requirement already satisfied: six>=1.10 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from kaggle->opendatasets) (1.15.0)\n",
|
||||
"Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\riraa\\appdata\\roaming\\python\\python38\\site-packages (from python-slugify->kaggle->opendatasets) (1.3)\n",
|
||||
"Requirement already satisfied: idna<3,>=2.5 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from requests->kaggle->opendatasets) (2.10)\n",
|
||||
"Requirement already satisfied: chardet<5,>=3.0.2 in c:\\users\\riraa\\anaconda3\\lib\\site-packages (from requests->kaggle->opendatasets) (4.0.0)\n"
|
||||
@ -35,30 +35,15 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 40,
|
||||
"id": "5e8e5ea8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|█████████████████████████████████████████████████████████████████████████████| 25.6k/25.6k [00:00<00:00, 1.68MB/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading red-wine-quality-cortez-et-al-2009.zip to .\\red-wine-quality-cortez-et-al-2009\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
"Skipping, found downloaded files in \".\\red-wine-quality-cortez-et-al-2009\" (use force=True to force download)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -69,7 +54,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 41,
|
||||
"id": "1d0f072e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -322,7 +307,7 @@
|
||||
"[1599 rows x 12 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -335,7 +320,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 42,
|
||||
"id": "9a675582",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
@ -353,7 +338,7 @@
|
||||
"Name: quality, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -382,7 +367,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 43,
|
||||
"id": "3197a613",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -524,7 +509,7 @@
|
||||
"4 9.4 5 "
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -535,7 +520,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 44,
|
||||
"id": "18dcd194",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
@ -733,7 +718,7 @@
|
||||
"max 4.010000 2.000000 14.900000 8.000000 "
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -744,7 +729,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 45,
|
||||
"id": "0948ca45",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
@ -762,7 +747,7 @@
|
||||
"Name: quality, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -773,7 +758,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 46,
|
||||
"id": "7245500d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -783,7 +768,7 @@
|
||||
"<AxesSubplot:>"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
},
|
||||
@ -814,7 +799,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 47,
|
||||
"id": "f2d00efe",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
@ -958,7 +943,7 @@
|
||||
"166 10.2 5 "
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -969,7 +954,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 48,
|
||||
"id": "e074e787",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -1165,7 +1150,7 @@
|
||||
"max 4.010000 1.980000 14.900000 8.000000 "
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -1176,7 +1161,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 49,
|
||||
"id": "34f511dd",
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
@ -1194,7 +1179,7 @@
|
||||
"Name: quality, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -1203,9 +1188,17 @@
|
||||
"wine_train[\"quality\"].value_counts().sort_index(ascending=False) #indexy oznaczają jakość wina"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "29e301ee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Sortowanie jest po to, by szły od najlepszego do najgorszego, zamiast po największej ilość próbek"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 50,
|
||||
"id": "466eb483",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -1215,7 +1208,7 @@
|
||||
"<AxesSubplot:>"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"execution_count": 50,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
},
|
||||
@ -1252,7 +1245,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 51,
|
||||
"id": "d6b697ec",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -1394,7 +1387,7 @@
|
||||
"288 10.6 7 "
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"execution_count": 51,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -1405,7 +1398,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 52,
|
||||
"id": "bc91d2fb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -1601,7 +1594,7 @@
|
||||
"max 3.710000 2.000000 12.800000 8.000000 "
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -1612,7 +1605,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 53,
|
||||
"id": "72ce755c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -1627,7 +1620,7 @@
|
||||
"Name: quality, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -1638,7 +1631,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 54,
|
||||
"id": "fc355d95",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
@ -1650,7 +1643,7 @@
|
||||
"<AxesSubplot:>"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
},
|
||||
@ -1671,35 +1664,27 @@
|
||||
"wine_test[\"quality\"].value_counts().sort_index(ascending=False).plot(kind=\"bar\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "518f05c2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Normalizacja"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0d904976",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Podział z wyróżnieniem data/target"
|
||||
"# Podział z wyróżnieniem data/remain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 55,
|
||||
"id": "2f1c75ab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_train,x_test,y_train,y_test = train_test_split(wine.iloc[:,:-1],wine.iloc[:,-1], test_size=0.2, random_state=1,stratify=wine[\"quality\"])"
|
||||
"X_train,X_rem,y_train,y_rem = train_test_split(wine.iloc[:,:-1],wine.iloc[:,-1], test_size=0.2, random_state=1,stratify=wine[\"quality\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": 56,
|
||||
"id": "c2b16170",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -1709,7 +1694,7 @@
|
||||
"1279"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"execution_count": 56,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -1720,7 +1705,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 57,
|
||||
"id": "772560b4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -1730,13 +1715,59 @@
|
||||
"320"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"y_test.value_counts().sum()"
|
||||
"y_rem.value_counts().sum()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e6bca841",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Mamy teraz podział 8:2, a chcemy mieć 8:1:1, więc pozostały zbiór dzielimy na pół\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"id": "c6bca605",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(1279, 11)\n",
|
||||
"(1279,)\n",
|
||||
"(160, 11)\n",
|
||||
"(160,)\n",
|
||||
"(160, 11)\n",
|
||||
"(160,)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(None, None)"
|
||||
]
|
||||
},
|
||||
"execution_count": 58,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)\n",
|
||||
"\n",
|
||||
"print(X_train.shape), print(y_train.shape)\n",
|
||||
"print(X_valid.shape), print(y_valid.shape)\n",
|
||||
"print(X_test.shape), print(y_test.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -1749,51 +1780,50 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": 59,
|
||||
"id": "a4ac6f00",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import MinMaxScaler\n",
|
||||
"norm = MinMaxScaler()\n",
|
||||
"norm_fit = norm.fit(x_train)\n",
|
||||
"norm_x_train = norm_fit.transform(x_train)\n",
|
||||
"norm_x_test = norm_fit.transform(x_test)"
|
||||
"norm_fit = norm.fit(X_train)\n",
|
||||
"norm_X_train = norm_fit.transform(X_train)\n",
|
||||
"norm_X_test = norm_fit.transform(X_test)\n",
|
||||
"norm_X_valid =norm_fit.transform(X_valid)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "599c4102",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"###### Wygląd po normalizacji: mieści się w zakresie [0,1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 69,
|
||||
"id": "be0d1121",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([[0.31858407, 0.15702479, 0.50632911, 0.0890411 , 0.1010989 ,\n",
|
||||
" 0.07042254, 0.01413428, 0.38839941, 0.39130435, 0.21212121,\n",
|
||||
" 0.43076923],\n",
|
||||
" [0.26548673, 0.14049587, 0.62025316, 0.12328767, 0.17582418,\n",
|
||||
" 0.33802817, 0.19081272, 0.51615272, 0.39130435, 0.16969697,\n",
|
||||
" 0.26153846],\n",
|
||||
" [0.23893805, 0.17355372, 0.59493671, 0.08219178, 0.14285714,\n",
|
||||
" 0.05633803, 0.01766784, 0.42070485, 0.40869565, 0.12121212,\n",
|
||||
" 0.29230769],\n",
|
||||
" [0.19469027, 0.31404959, 0.13924051, 0.04109589, 0.13846154,\n",
|
||||
" 0.21126761, 0.15194346, 0.39500734, 0.43478261, 0.27878788,\n",
|
||||
" 0.16923077],\n",
|
||||
" [0.27433628, 0.65702479, 0.15189873, 0.0890411 , 0.28791209,\n",
|
||||
" 0.08450704, 0.06007067, 0.46475771, 0.42608696, 0.19393939,\n",
|
||||
" 0.27692308]])"
|
||||
"array([0.26548673, 0.14049587, 0.62025316, 0.12328767, 0.17582418,\n",
|
||||
" 0.33802817, 0.19081272, 0.51615272, 0.39130435, 0.16969697,\n",
|
||||
" 0.26153846])"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"execution_count": 69,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"norm_x_train[:5]"
|
||||
"norm_X_train[1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -1806,7 +1836,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 64,
|
||||
"id": "670062c0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -1828,7 +1858,7 @@
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"execution_count": 64,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user