# Data preparation for the wine experiment: download the CSV, encode and
# scale the columns, split into train/test/validation, and wrap the training
# and test frames in torch datasets.
import logging

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import sklearn.model_selection
import torch
from datasets import load_dataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("s487176")

sns.set_theme()

# --- Download the raw dataset -------------------------------------------------
url = "https://huggingface.co/datasets/mstz/wine/raw/main/Wine_Quality_Data.csv"
save_path = "Wine_Quality_Data.csv"

# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(save_path, "wb") as f:
    f.write(response.content)

wine_dataset = pd.read_csv(save_path)

# --- Preprocessing --------------------------------------------------------------
# Encode the wine color as an integer (red -> 1, white -> 0). The explicit
# cast makes the dtype deterministic and fails loudly on an unexpected label
# instead of leaving strings in the column.
wine_dataset['color'] = wine_dataset['color'].map({'red': 1, 'white': 0}).astype('int64')

# Normalization: scale every column by its maximum absolute value.
# NOTE(review): the maxima are computed on the full dataset before splitting,
# so test/validation statistics influence the training features.
wine_dataset = wine_dataset / wine_dataset.abs().max()

# --- Splits ---------------------------------------------------------------------
# Split into train (90%) and a held-out part (10%), stratified by color.
wine_train, wine_test = sklearn.model_selection.train_test_split(
    wine_dataset,
    test_size=0.1,
    random_state=1,
    stratify=wine_dataset["color"],
)

# Split the held-out part evenly into test and validation, again stratified.
wine_test, wine_val = sklearn.model_selection.train_test_split(
    wine_test,
    test_size=0.5,
    random_state=1,
    stratify=wine_test["color"],
)

# Class balance per split; only emitted when the log level allows INFO.
logger.info("train color counts: %s", wine_train["color"].value_counts().to_dict())
logger.info("test color counts: %s", wine_test["color"].value_counts().to_dict())
logger.info("validation color counts: %s", wine_val["color"].value_counts().to_dict())


class TabularDataset(Dataset):
    """Map-style dataset over a tabular frame.

    The frame is converted once to a float32 array. For each row, every
    column except the last is returned as the feature vector and the last
    column as the target.
    NOTE(review): this assumes the label column ('color') is the last column
    of the CSV — confirm against the file.
    """

    def __init__(self, data):
        # `data` must expose `.values` (e.g. a pandas DataFrame).
        self.data = data.values.astype('float32')

    def __getitem__(self, index):
        features = torch.tensor(self.data[index, :-1])
        target = torch.tensor(self.data[index, -1])
        return features, target

    def __len__(self):
        return len(self.data)


batch_size = 64

train_dataset = TabularDataset(wine_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TabularDataset(wine_test)
# The model below is a torch.nn.Module, so it is logged with MLflow's PyTorch
# flavor; imported here in line with the script's other mid-file imports.
import mlflow.pytorch

test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


class TabularModel(nn.Module):
    """Two-layer feed-forward classifier (Linear -> ReLU -> Linear).

    Args:
        input_dim: number of input features.
        hidden_dim: width of the hidden layer.
        output_dim: number of classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return raw class logits of shape (batch, output_dim).

        No softmax is applied here: nn.CrossEntropyLoss applies log-softmax
        itself, so feeding it probabilities would squash the gradients.
        """
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

    def predict_proba(self, x):
        """Return class probabilities (softmax over the logits), without gradients."""
        with torch.no_grad():
            return self.softmax(self.forward(x))

    def predict(self, x):
        """Return the index of the highest-scoring class per row, without gradients."""
        with torch.no_grad():
            return self.forward(x).argmax(dim=1)


# --- Hyperparameters --------------------------------------------------------------
input_dim = wine_train.shape[1] - 1  # every column except the target
hidden_dim = 32
output_dim = 2
num_epochs = 10
lr = 0.01
alpha = 0.01  # L2 penalty, passed to Adam as weight_decay

model = TabularModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=alpha)

# Training, evaluation and logging all happen inside one MLflow run so the
# parameters, the metric and the model artifact end up on the same run.
with mlflow.start_run():
    mlflow.log_params({"learning rate": lr, "alpha": alpha})

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            # CrossEntropyLoss expects integer class indices as targets.
            labels = labels.long()

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Report the mean batch loss on every second epoch.
        if epoch % 2 == 0:
            print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader):.4f}')

    print('Finished Training')

    # --- Evaluation on the test split -------------------------------------------
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            predicted = model.predict(inputs.float())
            total += labels.size(0)
            correct += (predicted == labels.long()).sum().item()

    accuracy = 100 * correct / total
    print('Accuracy on test set: %d %%' % accuracy)

    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.pytorch.log_model(model, "model")