From fae14d38c1371efe7602f04039ed1d7f0d3a111a Mon Sep 17 00:00:00 2001
From: s434732 <mickub7@st.amu.edu.pl>
Date: Wed, 9 Jun 2021 17:52:05 +0200
Subject: [PATCH] Add DVC pipeline (split and train stages) and Jenkins DVC job

---
 .dvc/config    |  4 +++
 Dockerfile     |  3 +++
 JenkinsFileDvc | 49 ++++++++++++++++++++++++++++++++++++
 dvc.yaml       | 18 +++++++++++++
 split_10.py    | 28 +++++++++++++++++++++
 train_10.py    | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 170 insertions(+)
 create mode 100644 JenkinsFileDvc
 create mode 100644 dvc.yaml
 create mode 100644 split_10.py
 create mode 100644 train_10.py

diff --git a/.dvc/config b/.dvc/config
index e69de29..c02d6a2 100644
--- a/.dvc/config
+++ b/.dvc/config
@@ -0,0 +1,4 @@
+[core]
+    remote = ium_ssh_remote
+['remote "ium_ssh_remote"']
+    url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl/ium-sftp
diff --git a/Dockerfile b/Dockerfile
index 013ba47..fdd1083 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,6 +8,8 @@ RUN pip3 install torch torchvision torchaudio
 RUN apt install -y curl
 RUN pip3 install --user wget
 RUN pip3 install sacred && pip3 install GitPython && pip3 install pymongo
+RUN pip3 install dvc
+RUN pip3 install dvc[ssh] paramiko
 
 WORKDIR /app
 
@@ -18,6 +20,7 @@ COPY ./IUM_05.py ./
 COPY ./training.py ./
 COPY ./mongoObserver.py ./
 COPY ./fileObserver.py ./
+COPY ./fileObserver.py ./
 
 RUN mkdir /.kaggle
 RUN chmod -R 777 /.kaggle
diff --git a/JenkinsFileDvc b/JenkinsFileDvc
new file mode 100644
index 0000000..5db166b
--- /dev/null
+++ b/JenkinsFileDvc
@@ -0,0 +1,49 @@
+pipeline {
+   agent {
+      dockerfile true
+   }
+   parameters{
+      buildSelector(
+            defaultSelector: lastSuccessful(),
+            description: 'Which build to use for copying artifacts',
+            name: 'WHICH_BUILD'
+        )
+        string(
+            defaultValue: '10',
+            description: 'batch size',
+            name: 'BATCH_SIZE'
+        )
+        string(
+            defaultValue: '5',
+            description: 'epochs',
+            name: 'EPOCHS'
+
+        )
+   }
+   stages {
+      stage('dvc') {
+            steps {
+                // Pull DVC-tracked data over SSH, then rebuild the dvc.yaml pipeline (the subcommand is `repro`, not `reproduce`).
+                withCredentials([sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY', passphraseVariable: '', usernameVariable: '')]) {
+                    copyArtifacts fingerprintArtifacts: true, projectName: 's434732-create-dataset', selector: buildParameter('WHICH_BUILD')
+                    // NOTE(review): this ssh call passes no remote command; confirm it is still needed in CI.
+                    sh 'ssh ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl -i $IUM_SFTP_KEY'
+                    sh 'dvc remote modify --local ium_ssh_remote keyfile $IUM_SFTP_KEY'
+                    sh 'dvc pull'
+                    sh 'dvc repro'
+                }
+            }
+      }
+    }
+    post {
+        success {
+            mail body: 'SUCCESS DVC', subject: 's434732', to: '26ab8f35.uam.onmicrosoft.com@emea.teams.ms'
+            archiveArtifacts 'accuracy.txt'
+
+        }
+
+        failure {
+            mail body: 'FAILURE DVC', subject: 's434732', to: '26ab8f35.uam.onmicrosoft.com@emea.teams.ms'
+        }
+    }
+}
\ No newline at end of file
diff --git a/dvc.yaml b/dvc.yaml
new file mode 100644
index 0000000..b128883
--- /dev/null
+++ b/dvc.yaml
@@ -0,0 +1,20 @@
+stages:
+  download_and_split:
+    cmd: python3 split_10.py
+    deps:
+    - heart_failure_clinical_records_dataset.csv
+    - split_10.py
+    outs:
+    - train.csv
+    - valid.csv
+    - test.csv
+  train_model:
+    cmd: python3 train_10.py
+    deps:
+    # The script itself is a dependency so that editing it re-runs this stage.
+    - train_10.py
+    - train.csv
+    - valid.csv
+    - test.csv
+    outs:
+    - accuracy.txt
\ No newline at end of file
diff --git a/split_10.py b/split_10.py
new file mode 100644
index 0000000..d28d79d
--- /dev/null
+++ b/split_10.py
@@ -0,0 +1,28 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+import numpy as np
+
+results = pd.read_csv('heart_failure_clinical_records_dataset.csv')
+
+# Drop rows containing NaN; dropna() returns a new frame, so the result must be reassigned.
+results = results.dropna()
+
+# Store age as an integer column (the cast would fail if NaN values were still present).
+results = results.astype({"age": np.int64})
+
+# Rescale every remaining float64 column to the [0, 1] range.
+for col in results.columns:
+    if results[col].dtype == np.float64:
+        dataReshaped = results[col].values.reshape(-1, 1)  # scaler expects a 2-D array
+        scaler = MinMaxScaler(feature_range=(0, 1))
+        results[col] = scaler.fit_transform(dataReshaped)
+
+# Split the data 60% train / 20% validation / 20% test.
+train, test = train_test_split(results, test_size=1 - 0.6)
+valid, test = train_test_split(test, test_size=0.5)
+
+# Write the three subsets as CSV files without the index column.
+train.to_csv("train.csv", index=False)
+valid.to_csv("valid.csv", index=False)
+test.to_csv("test.csv", index=False)
diff --git a/train_10.py b/train_10.py
new file mode 100644
index 0000000..d42ce3b
--- /dev/null
+++ b/train_10.py
@@ -0,0 +1,68 @@
+import torch
+import sys
+from torch import nn
+import numpy as np
+import pandas as pd
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import f1_score
+np.set_printoptions(suppress=False)
+
+
+class LogisticRegressionModel(nn.Module):
+    """Single linear layer followed by a sigmoid activation."""
+    def __init__(self, input_dim, output_dim):
+        super().__init__()
+        self.linear = nn.Linear(input_dim, output_dim)
+        self.sigmoid = nn.Sigmoid()
+    def forward(self, x):
+        return self.sigmoid(self.linear(x))
+
+
+# Feature columns fed to the model; DEATH_EVENT is the binary target column.
+features = ['age','anaemia','creatinine_phosphokinase','diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking']
+
+train = pd.read_csv("train.csv")
+test = pd.read_csv("test.csv")
+valid = pd.read_csv("valid.csv")  # loaded but not used below
+
+xtrain = train[features].astype(np.float32)
+ytrain = train['DEATH_EVENT'].astype(np.float32)
+xtest = test[features].astype(np.float32)
+ytest = test['DEATH_EVENT'].astype(np.float32)
+
+xTrain = torch.from_numpy(xtrain.values)
+# reshape(-1, 1) gives an (N, 1) target matching the model output for any training-set size.
+yTrain = torch.from_numpy(ytrain.values.reshape(-1, 1))
+xTest = torch.from_numpy(xtest.values)
+yTest = torch.from_numpy(ytest.values)
+
+# Optional CLI arguments: batch size first, then number of epochs.
+batch_size = int(sys.argv[1]) if len(sys.argv) > 1 else 10  # parsed but unused: training is full-batch
+num_epochs = int(sys.argv[2]) if len(sys.argv) > 2 else 5
+learning_rate = 0.002
+input_dim = len(features)
+output_dim = 1
+
+model = LogisticRegressionModel(input_dim, output_dim)
+criterion = torch.nn.BCELoss(reduction='mean')
+optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
+
+# Full-batch gradient descent: one optimizer step per epoch over the whole training set.
+for epoch in range(num_epochs):
+    model.train()
+    optimizer.zero_grad()
+    y_pred = model(xTrain)
+    loss = criterion(y_pred, yTrain)
+    loss.backward()
+    optimizer.step()
+
+# One sigmoid output per row: threshold at 0.5 (argmax over a single column is always 0).
+predictions = model(xTest)
+predicted_labels = (predictions.detach().numpy().ravel() >= 0.5).astype(int)
+
+accuracy_result = accuracy_score(yTest, predicted_labels)
+print("accuracy_score", accuracy_result)
+print("F1", f1_score(yTest, predicted_labels, average=None))
+
+with open("accuracy.txt", "w") as text_file:
+    text_file.write(f"accuracy: {accuracy_result}")
\ No newline at end of file