Jenkinsfile and Dockerfile modified for s444498-dvc pipeline

This commit is contained in:
Wirusik 2022-06-05 18:59:04 +02:00
parent d4d89d25a8
commit 89d8c6e9e9
10 changed files with 127 additions and 12 deletions

View File

@ -1,5 +1,5 @@
[core]
autostage = true
remote = ium_ssh_remote
['remote "ium_ssh_remote"']
url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl
url = ssh://tzietkiewicz.vm.wmi.amu.edu.pl:/home/ium-sftp
user = ium-sftp

3
.gitignore vendored
View File

@ -1,6 +1,7 @@
*.csv
*.zip
*.png
*.txt
__pycache__
/prepared
model.zip
sacred_runs/1/model.zip

View File

@ -19,6 +19,8 @@ RUN pip3 install matplotlib
RUN pip3 install torchvision
RUN pip3 install sacred
RUN pip3 install pymongo
RUN pip3 install dvc
RUN pip3 install 'dvc[ssh]' paramiko
# Args
ARG KAGGLE_USERNAME
@ -31,5 +33,8 @@ WORKDIR /app
# Copy everything from jenkins to /app
COPY . .
# Create user
RUN useradd -r -u 111 jenkins
# Create kaggle catalog for authenticate
RUN mkdir /.kaggle/ && chmod o+w /.kaggle

42
Jenkinsfile-dvc Normal file
View File

@ -0,0 +1,42 @@
// Jenkins declarative pipeline: build the project image, check out the repo,
// then pull data and reproduce the experiment with DVC over an SSH remote.
pipeline {
    agent {
        dockerfile {
            // Double quotes are required here: Groovy only interpolates
            // ${...} inside double-quoted strings. The original single-quoted
            // string passed the literal text '${params.KAGGLE_USERNAME}' to
            // docker instead of the parameter values.
            args "-e KAGGLE_USERNAME=${params.KAGGLE_USERNAME} -e KAGGLE_KEY=${params.KAGGLE_KEY}"
        }
    }
    parameters {
        string (
            defaultValue: 'wirus006',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password (
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
    }
    stages {
        stage("Git clone") {
            steps {
                checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444498', url: 'https://git.wmi.amu.edu.pl/s444498/ium_444498.git']]])
            }
        }
        stage("Run DVC") {
            steps {
                withCredentials(
                    [sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY', passphraseVariable: '', usernameVariable: 'USER')]) {
                    // Single-quoted sh strings are deliberate: $IUM_SFTP_KEY is
                    // expanded by the shell from the step environment, keeping
                    // the secret out of Groovy string interpolation.
                    sh 'dvc remote modify --local ium_ssh_remote keyfile $IUM_SFTP_KEY'
                    // SECURITY(review): hard-coded remote password in the
                    // pipeline source — move it into a Jenkins credential and
                    // inject it via withCredentials instead of plain text.
                    sh 'dvc remote modify --local ium_ssh_remote password IUM@2021'
                    sh 'dvc remote list'
                    sh 'cat .dvc/config'
                    // NOTE(review): dumping config.local prints the password
                    // configured above into the build log — consider removing
                    // this debug step once the remote setup is verified.
                    sh 'cat .dvc/config.local'
                    sh 'dvc pull'
                    sh 'ls -al'
                    sh 'dvc repro'
                }
            }
        }
    }
}

BIN
atp-and-wta-tennis-data.zip Normal file

Binary file not shown.

View File

@ -1,4 +1,4 @@
outs:
- md5: 16cefb2b04f963bcf0fbb6f256496219
size: 2466716
- md5: d32a6cf1889199066cace68f8f56890b
size: 2431316
path: atp_dev.csv

View File

@ -1,4 +1,4 @@
outs:
- md5: b5b50c11ef644df2ef799ca56e7d1ced
size: 2466156
- md5: 389fd474d4db00db1c113683177d5880
size: 2430180
path: atp_test.csv

View File

@ -1,4 +1,4 @@
outs:
- md5: 314cd14a051bd61bf7e1f3a160c02dd2
size: 7408451
- md5: 50969b14a70db98c17a62cf7d99edb5a
size: 7302503
path: atp_train.csv

View File

@ -1,5 +1,5 @@
stages:
prepare:
cmd: python init.py
train:
cmd: python neutral_network.py
cmd: python3 neutral_network.py
prepare:
cmd: python3 init2.py

67
init2.py Normal file
View File

@ -0,0 +1,67 @@
import subprocess
from os.path import exists
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib
from pathlib import Path
import math

# Textual round names mapped onto signed integer codes.
# Negative codes count backwards from the final: -1 final, -2 semifinal,
# -3 quarterfinal, -4 round robin; positive codes are early rounds 1-4.
ROUND_CODES = {
    "The Final": -1,
    "Semifinals": -2,
    "Quarterfinals": -3,
    "Round Robin": -4,
    "1st Round": 1,
    "2nd Round": 2,
    "3rd Round": 3,
    "4th Round": 4,
}


def load_data():
    """Unzip the raw dataset if df_atp.csv is missing, then load it."""
    if not exists("./df_atp.csv"):
        # check=True: fail loudly if the archive is absent or corrupt instead
        # of silently continuing and crashing on the read_csv below.
        subprocess.run(["unzip", "-o", "atp-and-wta-tennis-data.zip"], check=True)
    return pd.read_csv("df_atp.csv")


def print_stats(atp_data):
    """Print basic statistics of first-set games won by match winners.

    numeric_only=True is required for mean/std/median: "Winner" is a string
    column, and pandas >= 2.0 raises TypeError when reducing a frame that
    mixes string and numeric columns without it.  min/max are well defined
    for strings, so they stay column-wise over both columns.
    """
    # Mean number of first-set games won by match winners
    print(atp_data[["Winner", "W1"]].mean(numeric_only=True))
    # Minimum first-set games won by match winners
    print(atp_data[["Winner", "W1"]].min())
    # Maximum first-set games won by match winners
    print(atp_data[["Winner", "W1"]].max())
    # Standard deviation of first-set games won by match winners
    print(atp_data[["Winner", "W1"]].std(numeric_only=True))
    # Median of first-set games won by match winners
    print(atp_data[["Winner", "W1"]].median(numeric_only=True))


def normalize_rounds(atp_data):
    """Replace textual round names with integer codes, in place.

    Returns the same DataFrame for convenience.
    """
    for name, code in ROUND_CODES.items():
        atp_data.loc[atp_data["Round"] == name, "Round"] = code
    return atp_data


def split_and_save(atp_data):
    """Split 6:2:2 into train/dev/test, report sizes, and write the CSVs."""
    # First split off 40%, then halve that 40% into dev and test (20% each).
    atp_train, atp_test = train_test_split(atp_data, test_size=0.4, random_state=1)
    atp_dev, atp_test = train_test_split(atp_test, test_size=0.5, random_state=1)
    print("\nElements of total set: " + str(len(atp_data)))
    print("\nElements of test set: " + str(len(atp_test)))
    print("\nElements of dev set: " + str(len(atp_dev)))
    print("\nElements of train set: " + str(len(atp_train)))
    atp_test.to_csv("atp_test.csv", encoding="utf-8", index=False)
    atp_dev.to_csv("atp_dev.csv", encoding="utf-8", index=False)
    atp_train.to_csv("atp_train.csv", encoding="utf-8", index=False)


def main():
    """Load the raw ATP data, clean it, and emit train/dev/test CSV splits."""
    atp_data = load_data()
    print_stats(atp_data)
    # The CSV's index column arrives unnamed; give it a usable name.
    atp_data.rename(columns={"Unnamed: 0": "ID"}, inplace=True)
    # How many distinct matches each player won
    print(atp_data.groupby("Winner")["ID"].nunique())
    normalize_rounds(atp_data)
    print(atp_data["Round"])
    # "########" is spreadsheet overflow residue in the Date column; blank it.
    atp_data.loc[atp_data["Date"] == "########", "Date"] = ""
    print(atp_data["Date"])
    split_and_save(atp_data)


if __name__ == "__main__":
    main()