ium_487176/zad1.py

137 lines
4.1 KiB
Python
Raw Normal View History

import pandas as pd
import sklearn.model_selection
from datasets import load_dataset
import mlflow
import mlflow.sklearn
import numpy as np
import logging
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# logging.basicConfig(level=logging.WARN)
# logger = logging.getLogger(__name__)
# mlflow.set_tracking_uri("http://localhost:5000")
# mlflow.set_experiment("s123456")
# def eval_metrics(actual, pred):
# rmse = np.sqrt(mean_squared_error(actual, pred))
# mae = mean_absolute_error(actual, pred)
# r2 = r2_score(actual, pred)
# return rmse, mae, r2
import requests
# Download the wine-quality CSV and store it next to the script.
url = "https://huggingface.co/datasets/mstz/wine/raw/main/Wine_Quality_Data.csv"
save_path = "Wine_Quality_Data.csv"
# Without a timeout a stalled connection would block the script forever.
response = requests.get(url, timeout=30)
response.raise_for_status()  # abort on HTTP errors instead of saving an error page
with open(save_path, "wb") as f:
    f.write(response.content)

# Load the file that was just written (same path, no duplicated literal).
wine_dataset = pd.read_csv(save_path)
# Encode the wine colour as a numeric class label: red -> 1, white -> 0.
wine_dataset['color'] = wine_dataset['color'].replace({'red': 1, 'white': 0})
# Normalization: divide every column by its largest absolute value.
for column in wine_dataset.columns:
    wine_dataset[column] = wine_dataset[column] / wine_dataset[column].abs().max()
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows, stratified on the colour label.
wine_train, wine_test = train_test_split(
    wine_dataset,
    test_size=0.1,
    random_state=1,
    stratify=wine_dataset["color"],
)
# Class-balance checks; the results are discarded when run as a script.
wine_train["color"].value_counts()
wine_test["color"].value_counts()
# Split the held-out rows in half: one part for test, one for validation.
wine_test, wine_val = train_test_split(
    wine_test,
    test_size=0.5,
    random_state=1,
    stratify=wine_test["color"],
)
wine_test["color"].value_counts()
wine_val["color"].value_counts()
import seaborn as sns
sns.set_theme()
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
class TabularDataset(Dataset):
    """Dataset over a table whose last column is the target.

    ``data`` must expose ``.values`` (as a pandas DataFrame does); the
    values are stored as a float32 array in ``self.data``.
    """

    def __init__(self, data):
        raw_values = data.values
        self.data = raw_values.astype('float32')

    def __len__(self):
        # Number of rows in the stored array.
        return len(self.data)

    def __getitem__(self, index):
        # All columns but the last are features; the last one is the target.
        features = self.data[index, :-1]
        target = self.data[index, -1]
        return torch.tensor(features), torch.tensor(target)
# Mini-batch size shared by the training and test loaders.
batch_size = 64

# Wrap the train and test splits as datasets.
train_dataset = TabularDataset(wine_train)
test_dataset = TabularDataset(wine_test)

# Training batches are reshuffled every epoch; test batches keep their order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)
class TabularModel(nn.Module):
    """Feed-forward classifier with one hidden ReLU layer.

    ``forward`` returns raw, unnormalised class scores (logits).
    ``nn.CrossEntropyLoss`` applies log-softmax internally, so feeding it
    already-softmaxed outputs would normalise twice and flatten gradients.
    Use ``model.softmax(logits)`` when explicit probabilities are needed;
    the arg-max prediction is the same either way.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the hidden layer.
        output_dim: number of classes (size of the logit vector).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Kept as an attribute for callers that want probabilities; it is
        # deliberately NOT applied inside forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits of shape ``(batch, output_dim)`` for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
# Model hyperparameters: every column except the trailing label is a feature.
input_dim = wine_train.shape[1] - 1
hidden_dim = 32
output_dim = 2

# Build the model, loss and optimizer exactly once.
model = TabularModel(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_dataloader:
        # CrossEntropyLoss needs int64 class indices; the dataset yields floats.
        labels = labels.long()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean per-batch loss on every second epoch.
    if epoch % 2 == 0:
        print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader):.4f}')
print('Finished Training')
# Count correct predictions over the test batches.
correct = 0
total = 0
with torch.no_grad():  # inference only, no gradient bookkeeping
    for inputs, labels in test_dataloader:
        outputs = model(inputs.float())
        # Predicted class is the index of the largest score in each row.
        predicted = outputs.max(dim=1).indices
        total += labels.shape[0]
        correct += (predicted == labels).sum().item()
print('Accuracy on test set: %d %%' % (100 * correct / total))