diff --git a/.dvc/config b/.dvc/config index e69de29..f11a7f9 100644 --- a/.dvc/config +++ b/.dvc/config @@ -0,0 +1,6 @@ +[core] + remote = ium_ssh_remote +['remote "my_local_remote"'] + url = /dvcstore +['remote "ium_ssh_remote"'] + url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl diff --git a/.gitignore b/.gitignore index 207d123..a4627ad 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,8 @@ ipython_config.py # Remove previous ipynb_checkpoints # git rm -r .ipynb_checkpoints/ +/X_train.csv +/X_test.csv +/y_train.csv +/y_test.csv +/model.pth diff --git a/Dockerfile b/Dockerfile index eada525..83f646a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,5 +13,6 @@ WORKDIR /app COPY ./body-performance-data.zip ./ -COPY ./classification_net.py ./ +COPY ./prepare_datasets.py ./ +COPY ./train.py ./ diff --git a/classification_net.py b/classification_net.py deleted file mode 100644 index d8b2348..0000000 --- a/classification_net.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# In[ ]: - - -# get_ipython().system('kaggle datasets download -d kukuroo3/body-performance-data') - - -# In[ ]: - - -get_ipython().system('unzip -o body-performance-data.zip') - - -# In[114]: - - -import numpy as np -import pandas as pd -from sklearn.model_selection import train_test_split -from sklearn.metrics import classification_report -import torch -from torch import nn, optim -import torch.nn.functional as F - - -# In[115]: - - -df = pd.read_csv('bodyPerformance.csv') -df.shape - - -# In[116]: - - -df.head() - - -# In[117]: - - -cols = ['gender', 'height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm'] -df = df[cols] - -# male - 0, female - 1 -df['gender'].replace({'M': 0, 'F': 1}, inplace = True) -df = df.dropna(how='any') - - -# In[118]: - - -df.gender.value_counts() / df.shape[0] - - -# In[119]: - - -X = df[['height_cm', 'weight_kg', 'body fat_%', 'sit-ups counts', 'broad jump_cm']] -y = df[['gender']] - -X_train, X_test, y_train, y_test 
= train_test_split(X, y, test_size=0.2, random_state=42) - - -# In[120]: - - -X_train = torch.from_numpy(np.array(X_train)).float() -y_train = torch.squeeze(torch.from_numpy(y_train.values).float()) - -X_test = torch.from_numpy(np.array(X_test)).float() -y_test = torch.squeeze(torch.from_numpy(y_test.values).float()) - -print(X_train.shape, y_train.shape) -print(X_test.shape, y_test.shape) - - -# In[121]: - - -class Net(nn.Module): - def __init__(self, n_features): - super(Net, self).__init__() - self.fc1 = nn.Linear(n_features, 5) - self.fc2 = nn.Linear(5, 3) - self.fc3 = nn.Linear(3, 1) - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - return torch.sigmoid(self.fc3(x)) -net = Net(X_train.shape[1]) - - -# In[122]: - - -criterion = nn.BCELoss() - - -# In[123]: - - -optimizer = optim.Adam(net.parameters(), lr=0.001) - - -# In[124]: - - -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - -# In[125]: - - -X_train = X_train.to(device) -y_train = y_train.to(device) -X_test = X_test.to(device) -y_test = y_test.to(device) - - -# In[126]: - - -net = net.to(device) -criterion = criterion.to(device) - - -# In[127]: - - -def calculate_accuracy(y_true, y_pred): - predicted = y_pred.ge(.5).view(-1) - return (y_true == predicted).sum().float() / len(y_true) - - -# In[128]: - - -def round_tensor(t, decimal_places=3): - return round(t.item(), decimal_places) -for epoch in range(1000): - y_pred = net(X_train) - y_pred = torch.squeeze(y_pred) - train_loss = criterion(y_pred, y_train) - if epoch % 100 == 0: - train_acc = calculate_accuracy(y_train, y_pred) - y_test_pred = net(X_test) - y_test_pred = torch.squeeze(y_test_pred) - test_loss = criterion(y_test_pred, y_test) - test_acc = calculate_accuracy(y_test, y_test_pred) - print( -f'''epoch {epoch} -Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)} -Test set - loss: {round_tensor(test_loss)}, accuracy: {round_tensor(test_acc)} -''') - 
optimizer.zero_grad() - train_loss.backward() - optimizer.step() - - -# In[129]: - - -# torch.save(net, 'model.pth') - - -# In[130]: - - -# net = torch.load('model.pth') - - -# In[131]: - - -classes = ['Male', 'Female'] -y_pred = net(X_test) -y_pred = y_pred.ge(.5).view(-1).cpu() -y_test = y_test.cpu() -print(classification_report(y_test, y_pred, target_names=classes)) - - -# In[132]: - - -with open('test_out.csv', 'w') as file: - for y in y_pred: - file.write(classes[y.item()]) - file.write('\n') - diff --git a/dvc.Jenkinsfile b/dvc.Jenkinsfile new file mode 100644 index 0000000..cd35608 --- /dev/null +++ b/dvc.Jenkinsfile @@ -0,0 +1,17 @@ +pipeline { + agent { + dockerfile true + } + stages { + stage('Dvc pull and reproduce') { + steps { + checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444421', url: 'https://git.wmi.amu.edu.pl/s444421/ium_444421.git']]]) + withCredentials([string(credentialsId: 'ium-sftp-password', variable: 'IUM_SFTP_PASS')]) { + sh 'dvc remote add -d ium_ssh_remote ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl/ium-sftp' + sh 'dvc remote modify --local ium_ssh_remote password $IUM_SFTP_PASS' + sh 'dvc pull' + } + } + } + } +} \ No newline at end of file diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..cfce856 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,19 @@ +stages: + prepare_datasets: + cmd: python prepare_datasets.py X_train.csv X_test.csv y_train.csv y_test.csv + deps: + - data/bodyPerformance.csv + - prepare_datasets.py + outs: + - X_test.csv + - X_train.csv + - y_test.csv + - y_train.csv + train: + cmd: python train.py model.pth + deps: + - X_train.csv + - train.py + - y_train.csv + outs: + - model.pth diff --git a/prepare_datasets.py b/prepare_datasets.py index a437097..a2d6405 100644 --- a/prepare_datasets.py +++ b/prepare_datasets.py @@ -4,7 +4,7 @@ # In[ ]: -get_ipython().system('unzip -o body-performance-data.zip') +# get_ipython().system('unzip -o 
body-performance-data.zip') # In[4]: @@ -17,7 +17,7 @@ from sklearn.model_selection import train_test_split # In[21]: -df = pd.read_csv('bodyPerformance.csv') +df = pd.read_csv('data/bodyPerformance.csv') # In[22]: diff --git a/train.py b/train.py new file mode 100644 index 0000000..fa8b13c --- /dev/null +++ b/train.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +import numpy as np +import pandas as pd +import torch +from torch import nn, optim +import torch.nn.functional as F +import sys + + +# In[ ]: + + +X_train = pd.read_csv('X_train.csv') +y_train = pd.read_csv('y_train.csv') + + +# In[ ]: + + +X_train = torch.from_numpy(np.array(X_train)).float() +y_train = torch.squeeze(torch.from_numpy(y_train.values).float()) + + +# In[ ]: + + +class Net(nn.Module): + def __init__(self, n_features): + super(Net, self).__init__() + self.fc1 = nn.Linear(n_features, 5) + self.fc2 = nn.Linear(5, 3) + self.fc3 = nn.Linear(3, 1) + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return torch.sigmoid(self.fc3(x)) + + +# In[ ]: + + +net = Net(X_train.shape[1]) +criterion = nn.BCELoss() +optimizer = optim.Adam(net.parameters(), lr=0.001) + + +# In[ ]: + + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +X_train = X_train.to(device) +y_train = y_train.to(device) + +net = net.to(device) +criterion = criterion.to(device) + + +# In[ ]: + + +def calculate_accuracy(y_true, y_pred): + predicted = y_pred.ge(.5).view(-1) + return (y_true == predicted).sum().float() / len(y_true) + +def round_tensor(t, decimal_places=3): + return round(t.item(), decimal_places) + + +for epoch in range(1000): + y_pred = net(X_train) + y_pred = torch.squeeze(y_pred) + train_loss = criterion(y_pred, y_train) + if epoch % 100 == 0: + train_acc = calculate_accuracy(y_train, y_pred) + print( + f'''epoch {epoch} + Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)} + ''') + optimizer.zero_grad() + 
train_loss.backward() + optimizer.step() + + +# In[ ]: + + +torch.save(net, 'model.pth') +