import pandas as pd
import sklearn.model_selection
from datasets import load_dataset
dataset = load_dataset("mstz/wine", "wine")
Found cached dataset wine (C:/Users/macty/.cache/huggingface/datasets/mstz___wine/wine/1.0.0/7c3844cac7ac7a22d5fbbaf60fc1d4e9c9deb1b9b9c4dbae6a7b1a962dbc96d8) 100%|██████████| 1/1 [00:00<00:00, 49.24it/s]
dataset["train"]
Dataset({ features: ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'is_red'], num_rows: 6497 })
wine_dataset = pd.DataFrame(dataset["train"])
wine_dataset.head()  # data preview
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | is_red |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | 0 |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | 0 |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | 0 |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
wine_dataset.describe(include='all')
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | is_red |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
mean | 7.215307 | 0.339666 | 0.318633 | 5.443235 | 0.056034 | 30.525319 | 115.744574 | 0.994697 | 3.218501 | 0.531268 | 10.491801 | 5.818378 | 0.753886 |
std | 1.296434 | 0.164636 | 0.145318 | 4.757804 | 0.035034 | 17.749400 | 56.521855 | 0.002999 | 0.160787 | 0.148806 | 1.192712 | 0.873255 | 0.430779 |
min | 3.800000 | 0.080000 | 0.000000 | 0.600000 | 0.009000 | 1.000000 | 6.000000 | 0.987110 | 2.720000 | 0.220000 | 8.000000 | 3.000000 | 0.000000 |
25% | 6.400000 | 0.230000 | 0.250000 | 1.800000 | 0.038000 | 17.000000 | 77.000000 | 0.992340 | 3.110000 | 0.430000 | 9.500000 | 5.000000 | 1.000000 |
50% | 7.000000 | 0.290000 | 0.310000 | 3.000000 | 0.047000 | 29.000000 | 118.000000 | 0.994890 | 3.210000 | 0.510000 | 10.300000 | 6.000000 | 1.000000 |
75% | 7.700000 | 0.400000 | 0.390000 | 8.100000 | 0.065000 | 41.000000 | 156.000000 | 0.996990 | 3.320000 | 0.600000 | 11.300000 | 6.000000 | 1.000000 |
max | 15.900000 | 1.580000 | 1.660000 | 65.800000 | 0.611000 | 289.000000 | 440.000000 | 1.038980 | 4.010000 | 2.000000 | 14.900000 | 9.000000 | 1.000000 |
wine_dataset["is_red"].value_counts().plot(kind="bar")
[bar chart: counts of red vs. white wines (is_red)]
wine_dataset["fixed_acidity"].std()
1.2964337577998153
import numpy as np
np.where(pd.isnull(wine_dataset))  # check whether there are any missing values
(array([], dtype=int64), array([], dtype=int64))
for column in wine_dataset.columns:
    wine_dataset[column] = wine_dataset[column] / wine_dataset[column].abs().max()  # max-abs normalization
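Note that the scaling above is fit on the full dataset before splitting, so the test rows contribute to the scaling statistics. A minimal sketch of the more cautious variant (hypothetical, not what this notebook does) would learn the per-column scale from the training split only; `raw_train` and `raw_test` below are assumed unscaled splits, not notebook variables.

```python
# Sketch only: fit the scaling statistics on the training split, reuse them elsewhere.
def fit_max_abs(df):
    return df.abs().max()          # per-column max absolute value, from training data only

def apply_max_abs(df, scale):
    return df / scale

# scale = fit_max_abs(raw_train)
# raw_train_scaled = apply_max_abs(raw_train, scale)
# raw_test_scaled = apply_max_abs(raw_test, scale)   # same statistics as the train split
```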
wine_dataset.describe(include='all')  # check values after normalization
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | is_red |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 | 6497.000000 |
mean | 0.453793 | 0.214978 | 0.191948 | 0.082724 | 0.091708 | 0.105624 | 0.263056 | 0.957378 | 0.802619 | 0.265634 | 0.704148 | 0.646486 | 0.753886 |
std | 0.081537 | 0.104200 | 0.087541 | 0.072307 | 0.057338 | 0.061417 | 0.128459 | 0.002886 | 0.040097 | 0.074403 | 0.080048 | 0.097028 | 0.430779 |
min | 0.238994 | 0.050633 | 0.000000 | 0.009119 | 0.014730 | 0.003460 | 0.013636 | 0.950076 | 0.678304 | 0.110000 | 0.536913 | 0.333333 | 0.000000 |
25% | 0.402516 | 0.145570 | 0.150602 | 0.027356 | 0.062193 | 0.058824 | 0.175000 | 0.955110 | 0.775561 | 0.215000 | 0.637584 | 0.555556 | 1.000000 |
50% | 0.440252 | 0.183544 | 0.186747 | 0.045593 | 0.076923 | 0.100346 | 0.268182 | 0.957564 | 0.800499 | 0.255000 | 0.691275 | 0.666667 | 1.000000 |
75% | 0.484277 | 0.253165 | 0.234940 | 0.123100 | 0.106383 | 0.141869 | 0.354545 | 0.959585 | 0.827930 | 0.300000 | 0.758389 | 0.666667 | 1.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
wine_dataset["fixed_acidity"].nlargest(10) #sprawdza czy najwyższe wartości mają sens
652     1.000000
442     0.981132
557     0.981132
554     0.974843
555     0.974843
243     0.943396
244     0.943396
544     0.899371
3125    0.893082
374     0.880503
Name: fixed_acidity, dtype: float64
from sklearn.model_selection import train_test_split
wine_train, wine_test = train_test_split(wine_dataset, test_size=0.1, random_state=1, stratify=wine_dataset["is_red"])  # split into train and test sets
wine_train["is_red"].value_counts()
# podzielenie na train i test
1.0    4408
0.0    1439
Name: is_red, dtype: int64
wine_test["is_red"].value_counts()
1.0    490
0.0    160
Name: is_red, dtype: int64
wine_test, wine_val = train_test_split(wine_test, test_size=0.5, random_state=1, stratify=wine_test["is_red"])  # split the held-out data into test and validation sets
wine_test["is_red"].value_counts()
1.0    245
0.0     80
Name: is_red, dtype: int64
wine_val["is_red"].value_counts()
1.0    245
0.0     80
Name: is_red, dtype: int64
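Since both splits are stratified on `is_red`, the class ratio should be nearly identical across the three subsets. A quick check (not part of the original notebook) is to compare normalized value counts:

```python
# Compare the red/white class proportions across the three stratified splits.
for name, split in [("train", wine_train), ("test", wine_test), ("val", wine_val)]:
    print(name, split["is_red"].value_counts(normalize=True).round(3).to_dict())
```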
import seaborn as sns
sns.set_theme()
len(wine_dataset.columns)
13
#sns.pairplot(data=wine_dataset, hue="is_red")
wine_test.describe()
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | is_red |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 |
mean | 0.448244 | 0.217069 | 0.180630 | 0.078990 | 0.088742 | 0.103024 | 0.257462 | 0.957255 | 0.803553 | 0.263877 | 0.703930 | 0.646154 | 0.753846 |
std | 0.074301 | 0.107627 | 0.078046 | 0.070045 | 0.051400 | 0.054750 | 0.125165 | 0.002786 | 0.039808 | 0.072275 | 0.078704 | 0.095014 | 0.431433 |
min | 0.314465 | 0.063291 | 0.000000 | 0.012158 | 0.031097 | 0.010381 | 0.020455 | 0.951116 | 0.713217 | 0.130000 | 0.570470 | 0.333333 | 0.000000 |
25% | 0.402516 | 0.145570 | 0.144578 | 0.027356 | 0.060556 | 0.058824 | 0.168182 | 0.955168 | 0.775561 | 0.210000 | 0.637584 | 0.555556 | 1.000000 |
50% | 0.433962 | 0.177215 | 0.180723 | 0.042553 | 0.078560 | 0.100346 | 0.261364 | 0.957478 | 0.800499 | 0.250000 | 0.691275 | 0.666667 | 1.000000 |
75% | 0.471698 | 0.253165 | 0.222892 | 0.113982 | 0.101473 | 0.141869 | 0.343182 | 0.959354 | 0.827930 | 0.300000 | 0.758389 | 0.666667 | 1.000000 |
max | 0.817610 | 0.569620 | 0.445783 | 0.334347 | 0.679214 | 0.231834 | 0.575000 | 0.965264 | 0.917706 | 0.585000 | 0.939597 | 1.000000 | 1.000000 |
wine_train.describe()
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | is_red |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 | 5847.000000 |
mean | 0.453848 | 0.215061 | 0.192235 | 0.082331 | 0.092161 | 0.105659 | 0.262894 | 0.957364 | 0.802569 | 0.265798 | 0.704326 | 0.646732 | 0.753891 |
std | 0.081742 | 0.104315 | 0.088036 | 0.071982 | 0.058619 | 0.061749 | 0.128256 | 0.002882 | 0.039880 | 0.074864 | 0.079852 | 0.096928 | 0.430780 |
min | 0.238994 | 0.050633 | 0.000000 | 0.009119 | 0.014730 | 0.003460 | 0.013636 | 0.950076 | 0.678304 | 0.110000 | 0.536913 | 0.333333 | 0.000000 |
25% | 0.402516 | 0.145570 | 0.150602 | 0.027356 | 0.062193 | 0.058824 | 0.176136 | 0.955071 | 0.775561 | 0.215000 | 0.637584 | 0.555556 | 1.000000 |
50% | 0.440252 | 0.183544 | 0.186747 | 0.045593 | 0.076923 | 0.100346 | 0.268182 | 0.957516 | 0.800499 | 0.255000 | 0.691275 | 0.666667 | 1.000000 |
75% | 0.484277 | 0.253165 | 0.234940 | 0.123100 | 0.106383 | 0.141869 | 0.353409 | 0.959581 | 0.827930 | 0.300000 | 0.758389 | 0.666667 | 1.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
wine_val.describe()
| | fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | is_red |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 | 325.000000 |
mean | 0.458355 | 0.211412 | 0.198091 | 0.093521 | 0.086537 | 0.107596 | 0.271556 | 0.957757 | 0.802570 | 0.264446 | 0.701160 | 0.642393 | 0.753846 |
std | 0.084621 | 0.098749 | 0.086862 | 0.079346 | 0.035141 | 0.061805 | 0.135185 | 0.003031 | 0.044183 | 0.068086 | 0.084939 | 0.100957 | 0.431433 |
min | 0.295597 | 0.056962 | 0.000000 | 0.012158 | 0.019640 | 0.010381 | 0.018182 | 0.950413 | 0.715711 | 0.140000 | 0.563758 | 0.333333 | 0.000000 |
25% | 0.402516 | 0.145570 | 0.156627 | 0.030395 | 0.063830 | 0.055363 | 0.179545 | 0.955456 | 0.773067 | 0.215000 | 0.630872 | 0.555556 | 1.000000 |
50% | 0.446541 | 0.183544 | 0.186747 | 0.069149 | 0.078560 | 0.100346 | 0.284091 | 0.957978 | 0.800499 | 0.250000 | 0.684564 | 0.666667 | 1.000000 |
75% | 0.490566 | 0.253165 | 0.240964 | 0.133739 | 0.098200 | 0.155709 | 0.370455 | 0.960028 | 0.827930 | 0.305000 | 0.758389 | 0.666667 | 1.000000 |
max | 0.943396 | 0.746835 | 0.445783 | 0.480243 | 0.278232 | 0.266436 | 0.570455 | 0.972396 | 1.000000 | 0.570000 | 0.939597 | 0.888889 | 1.000000 |
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
class TabularDataset(Dataset):
    """Wraps a DataFrame: all columns except the last are features, the last column (is_red) is the target."""

    def __init__(self, data):
        self.data = data.values.astype('float32')

    def __getitem__(self, index):
        x = torch.tensor(self.data[index, :-1])  # feature vector
        y = torch.tensor(self.data[index, -1])   # target label
        return x, y

    def __len__(self):
        return len(self.data)
batch_size = 64
train_dataset = TabularDataset(wine_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TabularDataset(wine_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
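Before training, it helps to pull one batch from the loader and confirm the shapes and dtypes match what the model expects. A small sanity check (not in the original notebook) might look like:

```python
# Grab one mini-batch and inspect its shape/dtype:
# features should be (batch_size, 12) float32, labels (batch_size,) float32
# (labels are cast to LongTensor later, just before the loss).
xb, yb = next(iter(train_dataloader))
print(xb.shape, xb.dtype)  # expected: torch.Size([64, 12]) torch.float32
print(yb.shape, yb.dtype)  # expected: torch.Size([64]) torch.float32
```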
class TabularModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # note: nn.CrossEntropyLoss (used below) already applies log-softmax internally,
        # so this explicit Softmax is redundant; it is kept here as in the original run
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.softmax(out)
        return out
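Because `nn.CrossEntropyLoss` expects raw logits, the explicit `Softmax` in `forward` is usually dropped. A hypothetical variant (a sketch, not the model trained below) with the same layers but logits as output:

```python
class TabularLogitModel(nn.Module):
    # Hypothetical variant: same architecture, but forward() returns raw logits,
    # which is what nn.CrossEntropyLoss expects.
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        # apply torch.softmax(logits, dim=1) only when probabilities are needed
        return self.net(x)
```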
input_dim = wine_train.shape[1] - 1  # all columns except the is_red target
hidden_dim = 32
output_dim = 2

model = TabularModel(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        inputs, labels = data
        labels = labels.type(torch.LongTensor)  # CrossEntropyLoss expects integer class labels
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # print the average loss every other epoch
    if epoch % 2 == 0:
        print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader):.4f}')
print('Finished Training')
Epoch 1, loss: 0.5358
Epoch 3, loss: 0.3417
Epoch 5, loss: 0.3344
Epoch 7, loss: 0.3338
Epoch 9, loss: 0.3318
Finished Training
correct = 0
total = 0
with torch.no_grad():
    for data in test_dataloader:
        inputs, labels = data
        outputs = model(inputs.float())
        _, predicted = torch.max(outputs.data, 1)  # index of the highest class score
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy on test set: %d %%' % (100 * correct / total))
Accuracy on test set: 98 %
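The validation split (`wine_val`) created earlier is never evaluated in the notebook; a minimal sketch of the analogous check, reusing `TabularDataset`, `DataLoader`, and the trained `model`, could look like this:

```python
# Evaluate the trained model on the held-out validation split as well.
val_dataloader = DataLoader(TabularDataset(wine_val), batch_size=batch_size, shuffle=False)

correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in val_dataloader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy on validation set: %d %%' % (100 * correct / total))
```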