#!/usr/bin/env python
# coding: utf-8
"""Explore the ``mstz/wine`` dataset and train a small red/white wine classifier.

The script loads the dataset, scales every column by its maximum absolute
value, splits it into stratified train / test / validation parts, trains a
two-layer fully connected network on the training part and prints the
accuracy measured on the test part.

Several bare expressions (``head()``, ``describe()``, ``value_counts()``)
are inspection leftovers from the notebook this script was exported from;
they are kept so the script mirrors the notebook cell by cell.
"""

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.model_selection
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset

# Name of the label column; it is used both for stratification and as the
# training target.
TARGET_COLUMN = "is_red"


# --- Load and inspect the data ------------------------------------------------

dataset = load_dataset("mstz/wine", "wine")
dataset["train"]

wine_dataset = pd.DataFrame(dataset["train"])
wine_dataset.head()  # data preview
wine_dataset.describe(include='all')
wine_dataset["is_red"].value_counts().plot(kind="bar")
wine_dataset["fixed_acidity"].std()

np.where(pd.isnull(wine_dataset))  # check whether any values are missing

# Normalization: divide every column by its largest absolute value.
# The division is aligned on column labels, so each column is scaled by its
# own maximum.
wine_dataset = wine_dataset / wine_dataset.abs().max()

wine_dataset.describe(include='all')  # inspect the values after normalization
wine_dataset["fixed_acidity"].nlargest(10)  # check that the highest values make sense


# --- Train / test / validation split ------------------------------------------

# Split off 10% of the rows, stratified on the label.
wine_train, wine_test = train_test_split(
    wine_dataset,
    test_size=0.1,
    random_state=1,
    stratify=wine_dataset["is_red"],
)
wine_train["is_red"].value_counts()
wine_test["is_red"].value_counts()

# Split the held-out 10% in half: one half for testing, one for validation.
# NOTE(review): wine_val is only inspected below and is not used for
# training or evaluation anywhere in this script.
wine_test, wine_val = train_test_split(
    wine_test,
    test_size=0.5,
    random_state=1,
    stratify=wine_test["is_red"],
)
wine_test["is_red"].value_counts()
wine_val["is_red"].value_counts()

sns.set_theme()

len(wine_dataset.columns)

# Pairwise feature plot, left disabled as in the original notebook.
# sns.pairplot(data=wine_dataset, hue="is_red")

wine_test.describe()
wine_train.describe()
wine_val.describe()


# --- Dataset and model definitions --------------------------------------------

def _target_last(frame: pd.DataFrame) -> pd.DataFrame:
    """Return *frame* with ``TARGET_COLUMN`` moved to the last position.

    ``TabularDataset`` treats the last column as the label, so the column
    order is made explicit here instead of relying on how the dataset
    happens to be laid out. If the label is already last this is a no-op
    reordering.
    """
    feature_columns = [name for name in frame.columns if name != TARGET_COLUMN]
    return frame[feature_columns + [TARGET_COLUMN]]


class TabularDataset(Dataset):
    """Map-style dataset over a DataFrame whose last column is the label.

    Args:
        data: Frame whose values can be cast to ``float32``. All columns but
            the last are features; the last column is the target.
    """

    def __init__(self, data):
        # Convert once up front so __getitem__ only slices a NumPy array.
        self.data = data.values.astype('float32')

    def __getitem__(self, index):
        """Return ``(features, label)`` tensors for the row at *index*."""
        x = torch.tensor(self.data[index, :-1])
        y = torch.tensor(self.data[index, -1])
        return x, y

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)


class TabularModel(nn.Module):
    """Two-layer fully connected classifier that outputs raw class logits.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return unnormalized class scores (logits) for a batch *x*.

        No softmax is applied here: ``nn.CrossEntropyLoss`` performs the
        log-softmax itself, and applying softmax twice flattens the
        gradients. Use ``torch.softmax(logits, dim=1)`` when probabilities
        are needed.
        """
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


# --- Data loaders -------------------------------------------------------------

batch_size = 64

train_dataset = TabularDataset(_target_last(wine_train))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TabularDataset(_target_last(wine_test))
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# --- Model, loss and optimizer ------------------------------------------------

input_dim = wine_train.shape[1] - 1  # every column except the label
hidden_dim = 32
output_dim = 2

model = TabularModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


# --- Training -----------------------------------------------------------------

num_epochs = 10

model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_dataloader:
        # CrossEntropyLoss expects integer class indices as targets.
        labels = labels.long()

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss on every second epoch.
    if (epoch % 2) == 0:
        print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader):.4f}')

print('Finished Training')


# --- Evaluation on the test split ---------------------------------------------

correct = 0
total = 0

model.eval()
with torch.no_grad():
    for inputs, labels in test_dataloader:
        outputs = model(inputs.float())
        # The predicted class is the index of the largest logit.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels.long()).sum().item()

print('Accuracy on test set: %d %%' % (100 * correct / total))