From 7f7f14fd9eb9f9747637229559751acdf1a9b7da Mon Sep 17 00:00:00 2001
From: s434732
Date: Sat, 10 Apr 2021 15:08:18 +0200
Subject: [PATCH] docker-v1

---
NOTE(review): this patch was reflowed from a whitespace-collapsed copy.
Indentation and blank context lines (notably in the Jenkinsfile hunk) are a
best-effort reconstruction, and the blob ids on the "index" lines predate the
content fixes in this revision. Regenerate with `git format-patch` from a
real commit before applying.

Fixes in the three (identical) Python scripts:
  * results.dropna() discarded its return value, so NaN rows were kept.
  * The split comment claimed 6:1:1; the code produces 6:2:2.
  * The Jenkinsfile archives "train", "valid" and "test", but no script
    wrote those files; the splits are now saved under those names.

 Dockerfile         | 17 +++++++++++++++++
 Jenkinsfile        |  9 ++++-----
 skrypt.py          | 36 ++++++++++++++++++++++++++++++++++++
 skrypt_download.py | 36 ++++++++++++++++++++++++++++++++++++
 skrypt_stat.py     | 36 ++++++++++++++++++++++++++++++++++++
 5 files changed, 129 insertions(+), 5 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 skrypt.py
 create mode 100644 skrypt_download.py
 create mode 100644 skrypt_stat.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..74decb2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,17 @@
+FROM ubuntu:latest
+
+RUN apt update && apt install -y python3 && apt install -y nano
+
+RUN apt update && apt install python3-pip -y
+RUN pip3 install --user kaggle && pip3 install --user pandas && pip3 install scikit-learn && pip3 install matplotlib
+RUN apt install -y curl
+RUN pip3 install --user wget
+
+WORKDIR /app
+
+# Copy our scripts into the /app directory inside the container
+COPY ./skrypt_download.py ./
+COPY ./skrypt_stat.py ./
+
+
+
diff --git a/Jenkinsfile b/Jenkinsfile
index 421c234..ec7c9a1 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -26,14 +26,13 @@ node {
 
         checkout([$class: 'GitSCM', branches: [[name: '*/master']], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[url: 'https://git.wmi.amu.edu.pl/s434732/ium_434732']]])
 
-        sh "chmod 777 ./skrypt_zad2.sh"
-        sh "./skrypt_zad2.sh"
+        sh 'python3 ./skrypt_download.py'
 
 
 
-        archiveArtifacts "results.csv_cut.dev"
-        archiveArtifacts "results.csv_cut.test"
-        archiveArtifacts "results.csv_cut.train"
+        archiveArtifacts "valid"
+        archiveArtifacts "test"
+        archiveArtifacts "train"
     }
 
 }
diff --git a/skrypt.py b/skrypt.py
new file mode 100644
index 0000000..6e8715d
--- /dev/null
+++ b/skrypt.py
@@ -0,0 +1,36 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn import preprocessing
+import kaggle
+
+kaggle.api.authenticate()
+
+kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True)
+
+results = pd.read_csv('results.csv')
+
+# Drop rows containing NaN; dropna() returns a new frame, so rebind the name
+results = results.dropna()
+
+# Normalization: lower-case the text columns
+for collumn in ['home_team', 'away_team', 'tournament', 'city', 'country']:
+    results[collumn] = results[collumn].str.lower()
+
+# Split the data set 6:2:2 (train : valid : test)
+train, test = train_test_split(results, test_size= 1 - 0.6)
+
+valid, test = train_test_split(test, test_size=0.5)
+
+# Save the splits under the file names the Jenkinsfile archives
+train.to_csv('train', index=False)
+valid.to_csv('valid', index=False)
+test.to_csv('test', index=False)
+
+print("All data: ", results.size)
+print("Train size: ", train.size)
+print("Test size: ", test.size)
+print("Validate size: ", valid.size)
+print(results.describe(include='all'))
+
+# Sanity check: the subset sizes should add up to the size of the full data set
+print(train.size+test.size+valid.size)
diff --git a/skrypt_download.py b/skrypt_download.py
new file mode 100644
index 0000000..6e8715d
--- /dev/null
+++ b/skrypt_download.py
@@ -0,0 +1,36 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn import preprocessing
+import kaggle
+
+kaggle.api.authenticate()
+
+kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True)
+
+results = pd.read_csv('results.csv')
+
+# Drop rows containing NaN; dropna() returns a new frame, so rebind the name
+results = results.dropna()
+
+# Normalization: lower-case the text columns
+for collumn in ['home_team', 'away_team', 'tournament', 'city', 'country']:
+    results[collumn] = results[collumn].str.lower()
+
+# Split the data set 6:2:2 (train : valid : test)
+train, test = train_test_split(results, test_size= 1 - 0.6)
+
+valid, test = train_test_split(test, test_size=0.5)
+
+# Save the splits under the file names the Jenkinsfile archives
+train.to_csv('train', index=False)
+valid.to_csv('valid', index=False)
+test.to_csv('test', index=False)
+
+print("All data: ", results.size)
+print("Train size: ", train.size)
+print("Test size: ", test.size)
+print("Validate size: ", valid.size)
+print(results.describe(include='all'))
+
+# Sanity check: the subset sizes should add up to the size of the full data set
+print(train.size+test.size+valid.size)
diff --git a/skrypt_stat.py b/skrypt_stat.py
new file mode 100644
index 0000000..6e8715d
--- /dev/null
+++ b/skrypt_stat.py
@@ -0,0 +1,36 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn import preprocessing
+import kaggle
+
+kaggle.api.authenticate()
+
+kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True)
+
+results = pd.read_csv('results.csv')
+
+# Drop rows containing NaN; dropna() returns a new frame, so rebind the name
+results = results.dropna()
+
+# Normalization: lower-case the text columns
+for collumn in ['home_team', 'away_team', 'tournament', 'city', 'country']:
+    results[collumn] = results[collumn].str.lower()
+
+# Split the data set 6:2:2 (train : valid : test)
+train, test = train_test_split(results, test_size= 1 - 0.6)
+
+valid, test = train_test_split(test, test_size=0.5)
+
+# Save the splits under the file names the Jenkinsfile archives
+train.to_csv('train', index=False)
+valid.to_csv('valid', index=False)
+test.to_csv('test', index=False)
+
+print("All data: ", results.size)
+print("Train size: ", train.size)
+print("Test size: ", test.size)
+print("Validate size: ", valid.size)
+print(results.describe(include='all'))
+
+# Sanity check: the subset sizes should add up to the size of the full data set
+print(train.size+test.size+valid.size)