diff --git a/.dvc/config b/.dvc/config
index 3416bf3..95c5b0c 100644
--- a/.dvc/config
+++ b/.dvc/config
@@ -1,5 +1,5 @@
 [core]
-    autostage = true
     remote = ium_ssh_remote
 ['remote "ium_ssh_remote"']
-    url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl
+    url = ssh://tzietkiewicz.vm.wmi.amu.edu.pl:/home/ium-sftp
+    user = ium-sftp
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index b170011..c37f473 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.csv
-*.zip
 *.png
 *.txt
 __pycache__
 /prepared
+model.zip
+sacred_runs/1/model.zip
diff --git a/Dockerfile b/Dockerfile
index a474b57..d5d57b2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,6 +19,8 @@ RUN pip3 install matplotlib
 RUN pip3 install torchvision
 RUN pip3 install sacred
 RUN pip3 install pymongo
+RUN pip3 install dvc
+RUN pip3 install 'dvc[ssh]' paramiko
 
 # Args
 ARG KAGGLE_USERNAME
@@ -31,5 +33,8 @@ WORKDIR /app
 # Copy everything from jenkins to /app
 COPY . .
 
+# Create user
+RUN useradd -r -u 111 jenkins
+
 # Create kaggle catalog for authenticate
 RUN mkdir /.kaggle/ && chmod o+w /.kaggle
diff --git a/Jenkinsfile-dvc b/Jenkinsfile-dvc
new file mode 100644
index 0000000..2e665aa
--- /dev/null
+++ b/Jenkinsfile-dvc
@@ -0,0 +1,42 @@
+pipeline {
+    agent {
+        dockerfile {
+            args '-e KAGGLE_USERNAME=${params.KAGGLE_USERNAME} -e KAGGLE_KEY=${params.KAGGLE_KEY}'
+        }
+    }
+    parameters {
+        string (
+            defaultValue: 'wirus006',
+            description: 'Kaggle username',
+            name: 'KAGGLE_USERNAME',
+            trim: false
+        )
+        password (
+            defaultValue: '',
+            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
+            name: 'KAGGLE_KEY'
+        )
+    }
+    stages {
+        stage("Git clone") {
+            steps {
+                checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's444498', url: 'https://git.wmi.amu.edu.pl/s444498/ium_444498.git']]])
+            }
+        }
+        stage("Run DVC") {
+            steps{
+                withCredentials(
+                    [sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY', passphraseVariable: '', usernameVariable: 'USER')]) {
+                    // Authenticate with the bound private key only: no password is committed
+                    // to the repository and .dvc/config.local is never echoed into the build log.
+                    sh 'dvc remote modify --local ium_ssh_remote keyfile "$IUM_SFTP_KEY"'
+                    sh 'dvc remote list'
+                    sh 'cat .dvc/config'
+                    sh 'dvc pull'
+                    sh 'ls -al'
+                    sh 'dvc repro'
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/atp-and-wta-tennis-data.zip b/atp-and-wta-tennis-data.zip
new file mode 100644
index 0000000..a73ac1c
Binary files /dev/null and b/atp-and-wta-tennis-data.zip differ
diff --git a/atp_dev.csv.dvc b/atp_dev.csv.dvc
index e8cfd67..2120c68 100644
--- a/atp_dev.csv.dvc
+++ b/atp_dev.csv.dvc
@@ -1,4 +1,4 @@
 outs:
-- md5: 16cefb2b04f963bcf0fbb6f256496219
-  size: 2466716
+- md5: d32a6cf1889199066cace68f8f56890b
+  size: 2431316
   path: atp_dev.csv
diff --git a/atp_test.csv.dvc b/atp_test.csv.dvc
index 54ea118..d5e4708 100644
--- a/atp_test.csv.dvc
+++ b/atp_test.csv.dvc
@@ -1,4 +1,4 @@
 outs:
-- md5: b5b50c11ef644df2ef799ca56e7d1ced
-  size: 2466156
+- md5: 389fd474d4db00db1c113683177d5880
+  size: 2430180
   path: atp_test.csv
diff --git a/atp_train.csv.dvc b/atp_train.csv.dvc
index d7b9c20..d91a385 100644
--- a/atp_train.csv.dvc
+++ b/atp_train.csv.dvc
@@ -1,4 +1,4 @@
 outs:
-- md5: 314cd14a051bd61bf7e1f3a160c02dd2
-  size: 7408451
+- md5: 50969b14a70db98c17a62cf7d99edb5a
+  size: 7302503
   path: atp_train.csv
diff --git a/dvc.yaml b/dvc.yaml
index 6c3ab73..786b3df 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,5 +1,5 @@
 stages:
-  prepare:
-    cmd: python init.py
   train:
-    cmd: python neutral_network.py
+    cmd: python3 neutral_network.py
+  prepare:
+    cmd: python3 init2.py
\ No newline at end of file
diff --git a/init2.py b/init2.py
new file mode 100644
index 0000000..7d54e8b
--- /dev/null
+++ b/init2.py
@@ -0,0 +1,67 @@
+import subprocess
+from os.path import exists
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import matplotlib
+from pathlib import Path
+import math
+
+# Data initialisation: unpack the archive when df_atp.csv is missing; check=True aborts on unzip failure
+file_exists = exists("./df_atp.csv")
+if not file_exists:
+    subprocess.run(["unzip", "-o", "atp-and-wta-tennis-data.zip"], check=True)
+
+atp_data = pd.read_csv("df_atp.csv")
+
+# Mean number of first-set games won by match winners (numeric_only=True skips non-numeric columns)
+print(atp_data[["Winner", "W1"]].mean(numeric_only=True))
+
+# Minimum number of first-set games won by match winners
+print(atp_data[["Winner", "W1"]].min(numeric_only=True))
+
+# Maximum number of first-set games won by match winners
+print(atp_data[["Winner", "W1"]].max(numeric_only=True))
+
+# Standard deviation of first-set games won by match winners
+print(atp_data[["Winner", "W1"]].std(numeric_only=True))
+
+# Median of first-set games won by match winners
+print(atp_data[["Winner", "W1"]].median(numeric_only=True))
+
+# Rename the unnamed column
+atp_data.rename(columns={"Unnamed: 0": "ID"}, inplace=True)
+
+# How often each player was the winner
+print(atp_data.groupby("Winner")["ID"].nunique())
+
+# Round normalisation -1: final, -2: semifinal, -3: quarterfinal, -4: round robin
+# 1: first round, 2: second round, 3: third round, 4: fourth round
+atp_data.loc[atp_data["Round"] == "The Final", "Round"] = -1
+atp_data.loc[atp_data["Round"] == "Semifinals", "Round"] = -2
+atp_data.loc[atp_data["Round"] == "Quarterfinals", "Round"] = -3
+atp_data.loc[atp_data["Round"] == "Round Robin", "Round"] = -4
+atp_data.loc[atp_data["Round"] == "1st Round", "Round"] = 1
+atp_data.loc[atp_data["Round"] == "2nd Round", "Round"] = 2
+atp_data.loc[atp_data["Round"] == "3rd Round", "Round"] = 3
+atp_data.loc[atp_data["Round"] == "4th Round", "Round"] = 4
+print(atp_data["Round"])
+
+# Cleaning: replace the ######## placeholder in the date field with an empty string
+atp_data.loc[atp_data["Date"] == "########", "Date"] = ""
+print(atp_data["Date"])
+
+# Split into training, test and validation (dev) subsets in 6:2:2 proportions
+atp_train, atp_test = train_test_split(atp_data, test_size=0.4, random_state=1)
+atp_dev, atp_test = train_test_split(atp_test, test_size=0.5, random_state=1)
+
+# Size of the full set and of each subset
+print("\nElements of total set: " + str(len(atp_data)))
+print("\nElements of test set: " + str(len(atp_test)))
+print("\nElements of dev set: " + str(len(atp_dev)))
+print("\nElements of train set: " + str(len(atp_train)))
+
+# Write the test, dev and training subsets to CSV files
+atp_test.to_csv("atp_test.csv", encoding="utf-8", index=False)
+atp_dev.to_csv("atp_dev.csv", encoding="utf-8", index=False)
+atp_train.to_csv("atp_train.csv", encoding="utf-8", index=False)