2023-04-19 21:37:40 +02:00
import pandas as pd
import sklearn . model_selection
2023-06-08 20:20:28 +02:00
import mlflow
import mlflow . sklearn
import numpy as np
import logging
2023-06-09 16:56:36 +02:00
2023-04-19 21:37:40 +02:00
2023-06-28 13:31:49 +02:00
import argparse
# Command-line hyperparameters for the training run.
parser = argparse.ArgumentParser(description='IUM script')
parser.add_argument('--num_epochs', type=int, default=10, help='Number of epochs')
parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
# --alpha is handed to Adam as ``weight_decay`` later in this script, so the
# help text says so instead of repeating "Learning rate".
parser.add_argument('--alpha', type=float, default=0.001,
                    help='Weight decay (L2 penalty) passed to the Adam optimizer')
# Parses sys.argv at import time; unknown arguments abort the script.
args = parser.parse_args()
2023-06-09 15:44:54 +02:00
# Only warnings and above reach the root handler.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Send every MLflow call in this script to the local tracking server and
# group the runs under this experiment name.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("s487176")
2023-04-19 21:37:40 +02:00
2023-06-08 20:20:28 +02:00
import requests
2023-04-19 21:37:40 +02:00
2023-06-08 20:20:28 +02:00
# Download the wine-quality CSV, store it next to the script and load it.
url = "https://huggingface.co/datasets/mstz/wine/raw/main/Wine_Quality_Data.csv"
save_path = "Wine_Quality_Data.csv"

# requests applies no timeout by default, so an unresponsive server would
# block the script forever; fail after 30 seconds instead.
response = requests.get(url, timeout=30)
# Abort on any 4xx/5xx answer rather than saving an error page as CSV.
response.raise_for_status()

with open(save_path, "wb") as f:
    f.write(response.content)

# Read back the file that was just written (same path, no repeated literal).
wine_dataset = pd.read_csv(save_path)
# Encode the colour label numerically: red -> 1, white -> 0.
wine_dataset['color'] = wine_dataset['color'].replace({'red': 1, 'white': 0})
2023-04-19 21:37:40 +02:00
# Normalisation: divide every column by its own largest absolute value so
# all values end up in [-1, 1]. The statistics come from the full dataset,
# i.e. they are computed before the train/test split below.
column_scale = wine_dataset.abs().max()
wine_dataset = wine_dataset.div(column_scale, axis="columns")
from sklearn . model_selection import train_test_split
2023-06-08 20:20:28 +02:00
# Split into train and held-out rows: 10% is held out, stratified on
# "color" so both parts keep the same class ratio.
wine_train, wine_test = sklearn.model_selection.train_test_split(
    wine_dataset,
    test_size=0.1,
    random_state=1,
    stratify=wine_dataset["color"],
)

# Split the held-out rows in half into test and validation, again
# stratified on "color".
# NOTE(review): wine_val is not used in the visible part of this script.
wine_test, wine_val = sklearn.model_selection.train_test_split(
    wine_test,
    test_size=0.5,
    random_state=1,
    stratify=wine_test["color"],
)

# The bare ``...["color"].value_counts()`` expressions that used to follow
# each split were dropped: their results were discarded, so in a script
# they had no effect.
2023-04-19 21:37:40 +02:00
import seaborn as sns
# Activate seaborn's theme with its default arguments. No plotting call
# appears in the visible part of this script.
sns . set_theme ( )
import torch
from torch import nn
from torch . utils . data import DataLoader , Dataset
class TabularDataset(Dataset):
    """Dataset over a table whose last column is the target.

    The table is converted once, at construction, to a float32 NumPy array.
    Indexing yields a ``(features, target)`` pair of tensors, where
    ``features`` holds every column but the last and ``target`` holds the
    last column.
    """

    def __init__(self, data):
        # ``data`` must expose ``.values`` (as a pandas DataFrame does).
        table = data.values
        self.data = table.astype('float32')

    def __len__(self):
        # Number of rows in the table.
        return len(self.data)

    def __getitem__(self, index):
        features = self.data[index, :-1]
        target = self.data[index, -1]
        return torch.tensor(features), torch.tensor(target)
# Mini-batch size shared by both loaders.
batch_size = 64

# Wrap the two data frames so they can be served in batches.
train_dataset, test_dataset = (
    TabularDataset(frame) for frame in (wine_train, wine_test)
)

# Training batches are reshuffled on every pass; test batches keep row order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
class TabularModel(nn.Module):
    """Two-layer perceptron classifier: Linear -> ReLU -> Linear.

    ``forward`` returns raw, unnormalised class scores (logits). This is
    the input ``nn.CrossEntropyLoss`` expects, because that loss applies
    log-softmax itself; running softmax inside ``forward`` as well would
    normalise twice and squash the gradients. Apply ``self.softmax`` to the
    result of ``forward`` when class probabilities are needed.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the hidden layer.
        output_dim: number of classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Not used by forward(); kept so callers can still turn logits into
        # probabilities with ``model.softmax(model(x))``.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the class logits for a batch ``x``.

        The result has one row per sample and ``output_dim`` columns.
        """
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

    def predict(self, x):
        """Return the index of the highest-scoring class for each row of ``x``.

        Runs without gradient tracking. Softmax is monotonic, so taking the
        argmax of the logits gives the same classes as the argmax of the
        probabilities.
        """
        with torch.no_grad():
            logits = self.forward(x)
            return torch.argmax(logits, dim=1)
2023-04-19 21:37:40 +02:00
# Network dimensions: every column except the last is a feature (the
# dataset class treats the last column as the target); two output classes.
input_dim = wine_train.shape[1] - 1
hidden_dim = 32
output_dim = 2

# Hyperparameters taken from the command line.
num_epochs = args.num_epochs
lr = args.lr
alpha = args.alpha

# Build the model, loss and optimizer once. A first model/criterion/
# optimizer triple used to be created with default Adam settings and was
# overwritten straight away; that dead construction is removed.
model = TabularModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
criterion = nn.CrossEntropyLoss()
# ``alpha`` is applied as Adam's weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=alpha)
# The model is a torch ``nn.Module``, so it is logged with MLflow's PyTorch
# flavour; the submodule is imported here because the file header only
# imports ``mlflow.sklearn``.
import mlflow.pytorch

# Train the model, evaluate it on the test split and record parameters,
# metric and model in a single MLflow run.
with mlflow.start_run():
    mlflow.log_params({"learning rate": lr, "alpha": alpha})

    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            # The dataset yields float32 targets; CrossEntropyLoss needs
            # integer class indices.
            labels = labels.long()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Report the mean batch loss on every second epoch (1, 3, 5, ...).
        if epoch % 2 == 0:
            print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader):.4f}')

    print('Finished Training')

    # Accuracy on the test split, computed without gradient tracking.
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            predicted = model.predict(inputs.float())
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print('Accuracy on test set: %d %%' % accuracy)

    mlflow.log_metric("test_accuracy", accuracy)
    # Was mlflow.sklearn.log_model, which is the flavour for scikit-learn
    # estimators, not torch modules.
    mlflow.pytorch.log_model(model, "model")