# ium_487176/zad1.py

import pandas as pd
import sklearn.model_selection
import mlflow
import mlflow.sklearn
import numpy as np
import logging


import argparse

# Command-line interface: the training hyperparameters can be overridden per run.
parser = argparse.ArgumentParser(description='IUM script')
parser.add_argument('--num_epochs', type=int, default=10, help='Number of epochs')
parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
# --alpha is handed to Adam as weight_decay later in this script, so the help
# text describes it as such instead of repeating the --lr description.
parser.add_argument('--alpha', type=float, default=0.001, help='Weight decay (L2 penalty)')
args = parser.parse_args()

# Only warnings and errors are emitted by default.
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Record runs on the local MLflow tracking server under this experiment name.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("s487176")

import requests

# Download the raw CSV and keep a local copy next to the script.
url = "https://huggingface.co/datasets/mstz/wine/raw/main/Wine_Quality_Data.csv"
save_path = "Wine_Quality_Data.csv"

# Without a timeout requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()

with open(save_path, "wb") as f:
    f.write(response.content)

# Load the file that was just written (reuse save_path rather than repeating
# the literal, so the two cannot drift apart).
wine_dataset = pd.read_csv(save_path)
# Encode the colour column numerically: red -> 1, white -> 0.
wine_dataset['color'] = wine_dataset['color'].replace({'red': 1, 'white': 0})
# Normalisation: divide every column by its largest absolute value.
for column in wine_dataset.columns:
    wine_dataset[column] = wine_dataset[column] / wine_dataset[column].abs().max()


from sklearn.model_selection import train_test_split
# Split into train and held-out rows: 10% is held out, stratified on colour
# so both parts keep the same red/white proportions.
wine_train, wine_test = sklearn.model_selection.train_test_split(
    wine_dataset, test_size=0.1, random_state=1, stratify=wine_dataset["color"]
)

# Split the held-out rows in half into the test and validation sets,
# again stratified on colour.
wine_test, wine_val = sklearn.model_selection.train_test_split(
    wine_test, test_size=0.5, random_state=1, stratify=wine_test["color"]
)

import seaborn as sns
sns.set_theme()

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset


class TabularDataset(Dataset):
    """Serve a table as (features, target) pairs of float32 tensors.

    The last column of the table is the target; every column before it is
    a feature.
    """

    def __init__(self, data):
        # ``data`` must expose ``.values`` (e.g. a pandas DataFrame); the
        # whole table is converted to a float32 NumPy array once, up front.
        table = data.values
        self.data = table.astype('float32')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        features = self.data[index, :-1]
        target = self.data[index, -1]
        return torch.tensor(features), torch.tensor(target)


# Mini-batch size shared by both loaders.
batch_size = 64

# Wrap the train and test tables so they yield (features, target) tensors.
train_dataset = TabularDataset(wine_train)
test_dataset = TabularDataset(wine_test)

# Training batches are shuffled; test batches keep the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


class TabularModel(nn.Module):
    """Two-layer fully connected classifier for tabular inputs.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so passing it softmax probabilities would
    normalise twice and weaken the gradients. Use ``predict_proba`` when
    probabilities are needed and ``predict`` for class indices.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Still exposed as an attribute and used by predict_proba, but no
        # longer applied inside forward.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return one row of logits (``output_dim`` scores) per input row."""
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

    def predict_proba(self, x):
        """Return class probabilities (softmax over dim 1), without gradients."""
        with torch.no_grad():
            return self.softmax(self.forward(x))

    def predict(self, x):
        """Return the index of the highest-scoring class for each input row."""
        with torch.no_grad():
            logits = self.forward(x)
            # Softmax is monotonic, so the argmax of the logits is the same
            # class the probabilities would select.
            _, predicted = torch.max(logits, 1)
            return predicted

# Hyperparameters supplied on the command line.
num_epochs = args.num_epochs
lr = args.lr
alpha = args.alpha

# Network dimensions: every column except the last one is an input feature.
input_dim = wine_train.shape[1] - 1
hidden_dim = 32
output_dim = 2

# Build the model, loss and optimizer exactly once. A first set used to be
# created here with Adam's default settings and was immediately overwritten
# by the configured one below, so only the configured set is kept.
model = TabularModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
criterion = nn.CrossEntropyLoss()
# alpha is applied as Adam's weight_decay.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=alpha)
# The model is a torch.nn.Module, so it is logged with MLflow's PyTorch
# flavor rather than the scikit-learn one.
import mlflow.pytorch

# Training, evaluation and all logging happen inside one run so that the
# parameters, the metric and the model artifact are attached to the same run.
with mlflow.start_run():
    mlflow.log_params({"learning rate": lr, "alpha": alpha, "num_epochs": num_epochs})

    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            # The dataset yields float32 targets; CrossEntropyLoss needs
            # integer class indices.
            labels = labels.type(torch.LongTensor)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Report the mean batch loss on every second epoch (1, 3, 5, ...).
        if (epoch % 2) == 0:
            print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader):.4f}')

    print('Finished Training')

    # Accuracy on the test split.
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            predicted = model.predict(inputs.float())
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print('Accuracy on test set: %d %%' % accuracy)

    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.pytorch.log_model(model, "model")
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00			`import pandas as pd`
			`import sklearn.model_selection`
fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`import mlflow`
			`import mlflow.sklearn`
			`import numpy as np`
			`import logging`
added enviroment yml file 2023-06-09 16:56:36 +02:00
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
added parameters and mlflow to dockerfile 2023-06-28 13:31:49 +02:00			`import argparse`

			`parser = argparse.ArgumentParser(description='IUM script')`
			`parser.add_argument('--num_epochs', type=int, default=10, help='Number of epochs')`
			`parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')`
			`parser.add_argument('--alpha', type=float, default=0.001, help='Learning rate')`
			`args = parser.parse_args()`
added mlflow logging 2023-06-09 15:44:54 +02:00			`logging.basicConfig(level=logging.WARN)`
			`logger = logging.getLogger(__name__)`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
added mlflow logging 2023-06-09 15:44:54 +02:00			`mlflow.set_tracking_uri("http://localhost:5000")`
			`mlflow.set_experiment("s487176")`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`import requests`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`url = "https://huggingface.co/datasets/mstz/wine/raw/main/Wine_Quality_Data.csv"`
			`save_path = "Wine_Quality_Data.csv"`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`response = requests.get(url)`
			`response.raise_for_status()`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`with open(save_path, "wb") as f:`
			`f.write(response.content)`
			`wine_dataset = pd.read_csv("Wine_Quality_Data.csv")`
			`wine_dataset['color'] = wine_dataset['color'].replace({'red': 1, 'white': 0})`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00			`for column in wine_dataset.columns:`
			`wine_dataset[column] = wine_dataset[column] / wine_dataset[column].abs().max() # normalizacja`


			`from sklearn.model_selection import train_test_split`
fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`wine_train, wine_test = sklearn.model_selection.train_test_split(wine_dataset, test_size=0.1, random_state=1, stratify=wine_dataset["color"])`
			`wine_train["color"].value_counts()`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00			`# podzielenie na train i test`

fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`wine_test["color"].value_counts()`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00

fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`wine_test, wine_val = sklearn.model_selection.train_test_split(wine_test, test_size=0.5, random_state=1, stratify=wine_test["color"]) # podzielenie na test i validation`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`wine_test["color"].value_counts()`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
fixed the issue with dataset being different in link and in loading it via huggingface load dataset 2023-06-08 20:20:28 +02:00			`wine_val["color"].value_counts()`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
			`import seaborn as sns`
			`sns.set_theme()`

			`import torch`
			`from torch import nn`
			`from torch.utils.data import DataLoader, Dataset`


			`class TabularDataset(Dataset):`
			`def __init__(self, data):`
			`self.data = data.values.astype('float32')`

			`def __getitem__(self, index):`
			`x = torch.tensor(self.data[index, :-1])`
			`y = torch.tensor(self.data[index, -1])`
			`return x, y`

			`def __len__(self):`
			`return len(self.data)`


			`batch_size = 64`
			`train_dataset = TabularDataset(wine_train)`
			`train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)`
			`test_dataset = TabularDataset(wine_test)`
			`test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)`


			`class TabularModel(nn.Module):`
			`def __init__(self, input_dim, hidden_dim, output_dim):`
			`super(TabularModel, self).__init__()`
			`self.fc1 = nn.Linear(input_dim, hidden_dim)`
			`self.relu = nn.ReLU()`
			`self.fc2 = nn.Linear(hidden_dim, output_dim)`
			`self.softmax = nn.Softmax(dim=1)`

			`def forward(self, x):`
			`out = self.fc1(x)`
			`out = self.relu(out)`
			`out = self.fc2(out)`
			`out = self.softmax(out)`
			`return out`
added mlflow logging 2023-06-09 15:44:54 +02:00
			`def predict(self, x):`
			`with torch.no_grad():`
			`output = self.forward(x)`
			`_, predicted = torch.max(output, 1)`
			`return predicted`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
			`input_dim = wine_train.shape[1] - 1`
			`hidden_dim = 32`
			`output_dim = 2`
			`model = TabularModel(input_dim, hidden_dim, output_dim)`
			`criterion = nn.CrossEntropyLoss()`
			`optimizer = torch.optim.Adam(model.parameters())`

added parameters and mlflow to dockerfile 2023-06-28 13:31:49 +02:00			`num_epochs = args.num_epochs`
			`lr = args.lr`
			`alpha = args.alpha`
added mlflow logging 2023-06-09 15:44:54 +02:00			`model = TabularModel(input_dim=len(wine_train.columns)-1, hidden_dim=hidden_dim, output_dim=output_dim)`
			`criterion = nn.CrossEntropyLoss()`
			`optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=alpha)`
			`with mlflow.start_run():`
			`mlflow.log_params({"learning rate":lr,"alpha":alpha})`


			`for epoch in range(num_epochs):`
			`running_loss = 0.0`
			`for i, data in enumerate(train_dataloader, 0):`
			`inputs, labels = data`
			`labels = labels.type(torch.LongTensor)`
			`optimizer.zero_grad()`
			`outputs = model(inputs)`
			`loss = criterion(outputs, labels)`
			`loss.backward()`
			`optimizer.step()`
			`running_loss += loss.item()`

			`# Print the loss every 1000 mini-batches`
			`if (epoch%2) == 0:`
			`print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader):.4f}')`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
			`print('Finished Training')`


			`correct = 0`
			`total = 0`
			`with torch.no_grad():`
			`for data in test_dataloader:`
			`inputs, labels = data`
added mlflow logging 2023-06-09 15:44:54 +02:00			`predicted = model.predict(inputs.float())`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00			`total += labels.size(0)`
			`correct += (predicted == labels).sum().item()`
added mlflow logging 2023-06-09 15:44:54 +02:00
			`accuracy= 100 * correct / total`
			`print('Accuracy on test set: %d %%' % accuracy)`
modified dockerfile and added py script based on jupyter notebook 2023-04-19 21:37:40 +02:00
added mlflow logging 2023-06-09 15:44:54 +02:00			`mlflow.log_metric("test_accuracy", accuracy)`
			`mlflow.sklearn.log_model(model, "model")`