From 316471eda92fd98dc55ab4c86b3baef216cf3648 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 20:28:16 +0200 Subject: [PATCH 01/17] r1 --- Dockerfile | 16 ++++++++-------- Jenkinsfile-docker | 2 +- script.py => script-download.py | 31 ++++++------------------------- script-stats.py | 17 +++++++++++++++++ 4 files changed, 32 insertions(+), 34 deletions(-) rename script.py => script-download.py (67%) create mode 100644 script-stats.py diff --git a/Dockerfile b/Dockerfile index cb900a6..f658aa9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:latest -# COPY ./kaggle.json /root/.kaggle/kaggle.json +COPY ./kaggle.json /root/.kaggle/kaggle.json WORKDIR /app @@ -12,16 +12,16 @@ RUN apt-get install -y python3-pip RUN python3 -m pip --version RUN python3 -m pip install kaggle RUN python3 -m pip install pandas - RUN python3 -m pip freeze -COPY ./download.sh ./ -COPY ./script.py ./ +ENV PATH="/root/.local/bin:${PATH}" +COPY . . -ARG KAGGLE_USERNAME=testKAGGLE_USERNAME -ARG KAGGLE_KEY=test1KAGGLE_KEY +ARG KAGGLE_USERNAME +ARG KAGGLE_KEY -RUN chmod u+x ./script.py +RUN chmod u+x ./script-download.py +RUN chmod u+x ./script-stats.py # RUN ./download.sh 117928 -# RUN python3 ./script.py \ No newline at end of file +RUN python3 ./script-download.py \ No newline at end of file diff --git a/Jenkinsfile-docker b/Jenkinsfile-docker index 1bcfe50..b549ff5 100644 --- a/Jenkinsfile-docker +++ b/Jenkinsfile-docker @@ -20,7 +20,7 @@ pipeline { } agent { dockerfile{ - additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --build-arg --no-cache=true' + additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --no-cache=true' } } stages { diff --git a/script.py b/script-download.py similarity index 67% rename from script.py rename to script-download.py index 7d2a0fc..c6286ae 100644 --- a/script.py +++ b/script-download.py @@ -5,26 +5,20 @@ import os import numpy as np -def install_dependencies(): - """Install kaggle and pandas.""" - subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip']) - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kaggle']) - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas']) - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn']) - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn']) - def unzip_package(): """Unzip dataset""" print('Unzipping dataset...') os.system('unzip -o car-prices-poland.zip') print('Dataset unzipped') - + print('Removing .zip file...') + os.system('rm ./car-prices-poland.zip') + print('Zip file removed') def download_dataset(): """Download kaggle dataset.""" print('Downloading dataset...') - os.system('kaggle datasets download -d anikannal/solar-power-generation-data') + os.system('kaggle datasets download -d aleksandrglotov/car-prices-poland') print('Dir after downloading') os.system('ls -la') @@ -54,17 +48,6 @@ def divide_dataset(dataset): print('Dataset devided') -def get_statistics(dataset): - """Mean, min, max, median etc.""" - - print(f'--------------- Normalized dataset length ---------------') - print(len(dataset)) - - print(f'---------------Describe dataset---------------') - pd.set_option('display.max_columns', None) - print(dataset.describe(include='all')) - - def normalize_dataset(dataset): """Drop unnecessary columns and set numeric values to [0,1] range""" @@ -78,16 +61,14 @@ def normalize_dataset(dataset): # normalize numbers to [0, 1] for column in dataset.columns: if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64): - dataset[column] = (dataset[column] - dataset[column].min()) / ( - dataset[column].max() - dataset[column].min()) + dataset[column] = (dataset[column] - dataset[column].min()) / (dataset[column].max() - dataset[column].min()) return dataset -# print(os.system('python3 -m pip freeze')) download_dataset() unzip_package() cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv') df = pd.DataFrame(cars) df = normalize_dataset(df) divide_dataset(df) -get_statistics(df) + diff --git a/script-stats.py b/script-stats.py new file mode 100644 index 0000000..76b5504 --- /dev/null +++ b/script-stats.py @@ -0,0 +1,17 @@ +import pandas as pd + +print('--Full dataset stats--') +cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv') +print(cars.describe(include='all')) + +print('Dev dataset stats') +cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv') +print(cars_dev.describe(include='all')) + +print('# statystyki dla zbioru test') +cars_test = pd.read_csv('./Car_Prices_Poland_Kaggle_test.csv') +print(cars_test.describe(include='all')) + +print('# statystyki dla zbioru train') +cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv') +print(cars_train.describe(include='all')) From 0af5db8cf113bc308f1fa768f68819eacacb1c48 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 20:30:07 +0200 Subject: [PATCH 02/17] r2 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f658aa9..df944ef 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:latest -COPY ./kaggle.json /root/.kaggle/kaggle.json +# COPY ./kaggle.json /root/.kaggle/kaggle.json WORKDIR /app From f019b042b709310795c86335d321b0403632db4d Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 20:34:02 +0200 Subject: [PATCH 03/17] r3 --- Jenkinsfile-docker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile-docker b/Jenkinsfile-docker index b549ff5..51bc47a 100644 --- a/Jenkinsfile-docker +++ b/Jenkinsfile-docker @@ -26,7 +26,7 @@ pipeline { stages { stage('Prepare dataset') { steps { - sh 'python3 ./script.py' + sh 'python3 ./script-stats.py' } } } From e09e36265eb6193773c9da0ae7880222057b1bda Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 20:47:36 +0200 Subject: [PATCH 04/17] utf8 --- script-stats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/script-stats.py b/script-stats.py index 76b5504..258504e 100644 --- a/script-stats.py +++ b/script-stats.py @@ -1,17 +1,17 @@ import pandas as pd print('--Full dataset stats--') -cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv') +cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv', encoding="utf-8") print(cars.describe(include='all')) print('Dev dataset stats') -cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv') +cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv', encoding="utf-8") print(cars_dev.describe(include='all')) print('# statystyki dla zbioru test') -cars_test = pd.read_csv('./Car_Prices_Poland_Kaggle_test.csv') +cars_test = pd.read_csv('./Car_Prices_Poland_Kaggle_test.csv', encoding="utf-8") print(cars_test.describe(include='all')) print('# statystyki dla zbioru train') -cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv') +cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv', encoding="utf-8") print(cars_train.describe(include='all')) From 8320d74da33c841e5ad265914e9280a7774d8cba Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:21:12 +0200 Subject: [PATCH 05/17] r4 --- Dockerfile | 2 +- Jenkinsfile-docker | 4 ++-- Jenkinsfile-stats | 38 ++++++++++++++++++++++++++++++++++++++ script-download.py | 2 +- stats-docker.sh | 12 ++++++++++++ 5 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 Jenkinsfile-stats create mode 100644 stats-docker.sh diff --git a/Dockerfile b/Dockerfile index df944ef..6a23d5c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,7 @@ COPY . . ARG KAGGLE_USERNAME ARG KAGGLE_KEY -RUN chmod u+x ./script-download.py +RUN chmod u+x ./stats-docker.sh RUN chmod u+x ./script-stats.py # RUN ./download.sh 117928 diff --git a/Jenkinsfile-docker b/Jenkinsfile-docker index 51bc47a..e685d49 100644 --- a/Jenkinsfile-docker +++ b/Jenkinsfile-docker @@ -20,13 +20,13 @@ pipeline { } agent { dockerfile{ - additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --no-cache=true' + additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s444507_create_dataset_image' } } stages { stage('Prepare dataset') { steps { - sh 'python3 ./script-stats.py' + sh './stats-docker.sh' } } } diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats new file mode 100644 index 0000000..1bf1bd7 --- /dev/null +++ b/Jenkinsfile-stats @@ -0,0 +1,38 @@ +pipeline { + parameters { + string( + defaultValue: 'heatedboss2', + description: 'Kaggle username', + name: 'KAGGLE_USERNAME', + trim: false + ) + password( + defaultValue: '', + description: 'Kaggle token', + name: 'KAGGLE_KEY' + ) + string( + defaultValue: '117928', + description: 'Cutoff', + name: 'CUTOFF', + trim: false + ) + } + agent { + dockerfile{ + additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s444507_create_dataset_image' + } + } + stages { + stage('Prepare dataset') { + steps { + sh 'python3 ./script-stats.py' + } + } + } + post { + success { + archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle_*', followSymlinks: false + } + } +} \ No newline at end of file diff --git a/script-download.py b/script-download.py index c6286ae..8f0145d 100644 --- a/script-download.py +++ b/script-download.py @@ -9,7 +9,7 @@ import numpy as np def unzip_package(): """Unzip dataset""" print('Unzipping dataset...') - os.system('unzip -o car-prices-poland.zip') + os.system('unzip -o ./car-prices-poland.zip') print('Dataset unzipped') print('Removing .zip file...') os.system('rm ./car-prices-poland.zip') diff --git a/stats-docker.sh b/stats-docker.sh new file mode 100644 index 0000000..ceb586e --- /dev/null +++ b/stats-docker.sh @@ -0,0 +1,12 @@ + +echo 'Total elements in Car Prices Poland dataset:' +wc -l ./Car_Prices_Poland_Kaggle.csv + +echo 'Total elements in train dataset:' +wc -l ./Car_Prices_Poland_Kaggle_train.csv + +echo 'Total elements in test dataset:' +wc -l ./Car_Prices_Poland_Kaggle_test.csv + +echo 'Total elements in dev dataset:' +wc -l ./Car_Prices_Poland_Kaggle_dev.csv From ab256b7c2a412b6e15eedf55da19676a5921401a Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:24:15 +0200 Subject: [PATCH 06/17] r5 --- Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6a23d5c..4427188 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,13 +15,15 @@ RUN python3 -m pip install pandas RUN python3 -m pip freeze ENV PATH="/root/.local/bin:${PATH}" + +RUN chmod u+x ./stats-docker.sh +RUN chmod u+x ./script-stats.py + COPY . . ARG KAGGLE_USERNAME ARG KAGGLE_KEY -RUN chmod u+x ./stats-docker.sh -RUN chmod u+x ./script-stats.py # RUN ./download.sh 117928 RUN python3 ./script-download.py \ No newline at end of file From 73daf4b2a3eb5b2cbba90e5cb655138fbe656db3 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:26:34 +0200 Subject: [PATCH 07/17] r6 --- Dockerfile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4427188..ebbbd09 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,15 +15,13 @@ RUN python3 -m pip install pandas RUN python3 -m pip freeze ENV PATH="/root/.local/bin:${PATH}" - -RUN chmod u+x ./stats-docker.sh -RUN chmod u+x ./script-stats.py - COPY . . ARG KAGGLE_USERNAME ARG KAGGLE_KEY +RUN chmod a+x ./stats-docker.sh +RUN chmod a+x ./script-stats.py # RUN ./download.sh 117928 RUN python3 ./script-download.py \ No newline at end of file From 5fff814fddf9b37ff2036930159d038c7f2bd070 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:27:42 +0200 Subject: [PATCH 08/17] r7 --- stats-docker.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 stats-docker.sh diff --git a/stats-docker.sh b/stats-docker.sh old mode 100644 new mode 100755 From 5c89f40c7085a1b82d6014ac75555ea2f6874048 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:33:16 +0200 Subject: [PATCH 09/17] r8 --- Jenkinsfile-stats | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats index 1bf1bd7..46011c5 100644 --- a/Jenkinsfile-stats +++ b/Jenkinsfile-stats @@ -1,38 +1,17 @@ pipeline { - parameters { - string( - defaultValue: 'heatedboss2', - description: 'Kaggle username', - name: 'KAGGLE_USERNAME', - trim: false - ) - password( - defaultValue: '', - description: 'Kaggle token', - name: 'KAGGLE_KEY' - ) - string( - defaultValue: '117928', - description: 'Cutoff', - name: 'CUTOFF', - trim: false - ) - } - agent { - dockerfile{ - additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s444507_create_dataset_image' - } + agent { + docker { image 's444507_create_dataset_image:latest' } } stages { - stage('Prepare dataset') { + stage('Get arifacts') { steps { - sh 'python3 ./script-stats.py' + copyArtifacts fingerprintArtifacts: true, projectName: 's444507_create_dataset_image', selector: lastSuccessful() + } + } + stage('Show stats') { + steps { + sh "./stats-docker.sh" } } } - post { - success { - archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle_*', followSymlinks: false - } - } } \ No newline at end of file From 731d9d8c829c37b8ccf50581c30622455692de35 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:36:41 +0200 Subject: [PATCH 10/17] r9 --- Jenkinsfile-stats | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats index 46011c5..82b6648 100644 --- a/Jenkinsfile-stats +++ b/Jenkinsfile-stats @@ -1,7 +1,5 @@ pipeline { - agent { - docker { image 's444507_create_dataset_image:latest' } - } + agent any stages { stage('Get arifacts') { steps { @@ -10,6 +8,7 @@ pipeline { } stage('Show stats') { steps { + sh " docker image ls" sh "./stats-docker.sh" } } From 93e2249deb9727c25b46e71f06450797329add08 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:39:07 +0200 Subject: [PATCH 11/17] r10 --- Jenkinsfile-stats | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats index 82b6648..eb9559d 100644 --- a/Jenkinsfile-stats +++ b/Jenkinsfile-stats @@ -1,14 +1,15 @@ pipeline { - agent any + agent { + docker { image 's444507_create_dataset_image:latest' } + } stages { stage('Get arifacts') { steps { - copyArtifacts fingerprintArtifacts: true, projectName: 's444507_create_dataset_image', selector: lastSuccessful() + copyArtifacts fingerprintArtifacts: true, projectName: 's444507-create-dataset', selector: lastSuccessful() } } stage('Show stats') { steps { - sh " docker image ls" sh "./stats-docker.sh" } } From 64dab60a1ea3edcc574ff9f9fb1cfe13afe53100 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:43:11 +0200 Subject: [PATCH 12/17] r11 --- Jenkinsfile-stats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats index eb9559d..00c3c69 100644 --- a/Jenkinsfile-stats +++ b/Jenkinsfile-stats @@ -10,7 +10,7 @@ pipeline { } stage('Show stats') { steps { - sh "./stats-docker.sh" + sh "./script-stats.py" } } } From 87b2996c6777bdd23adbe2fb7087774c3c153de8 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:44:45 +0200 Subject: [PATCH 13/17] r12 --- script-download.py | 0 script-stats.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 script-download.py mode change 100644 => 100755 script-stats.py diff --git a/script-download.py b/script-download.py old mode 100644 new mode 100755 diff --git a/script-stats.py b/script-stats.py old mode 100644 new mode 100755 From 06d57230a7f91eabeeadfb3bf0c5a75c62e57cca Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:45:55 +0200 Subject: [PATCH 14/17] r13 --- Jenkinsfile-stats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats index 00c3c69..e699d59 100644 --- a/Jenkinsfile-stats +++ b/Jenkinsfile-stats @@ -10,7 +10,7 @@ pipeline { } stage('Show stats') { steps { - sh "./script-stats.py" + sh "python3 ./script-stats.py" } } } From 1359958be8d2b0a3b87576b1a45be8b251246872 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:46:48 +0200 Subject: [PATCH 15/17] r14 --- Jenkinsfile-docker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile-docker b/Jenkinsfile-docker index e685d49..b8a4beb 100644 --- a/Jenkinsfile-docker +++ b/Jenkinsfile-docker @@ -32,7 +32,7 @@ pipeline { } post { success { - archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle_*', followSymlinks: false + archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle*', followSymlinks: false } } } \ No newline at end of file From dc4e39a1483d83daa12b6b0416c04998d044c966 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 22:49:02 +0200 Subject: [PATCH 16/17] utf8 --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index ebbbd09..14a912f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,5 @@ FROM ubuntu:latest +RUN apt-get update && apt-get install -y locales && locale-gen en_US.UTF-8 # COPY ./kaggle.json /root/.kaggle/kaggle.json From d84f626933af659d7a5e00395aa1d43422f99b4c Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 23:02:39 +0200 Subject: [PATCH 17/17] utf8 2 --- Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Dockerfile b/Dockerfile index 14a912f..1381810 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,12 @@ RUN apt-get install -y python3 RUN apt-get install -y unzip RUN apt-get install -y python3-pip +ENV PYTHONIOENCODING=utf-8 +RUN apt-get install -y locales locales-all +ENV LC_ALL en_US.UTF-8 +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US.UTF-8 + RUN python3 -m pip --version RUN python3 -m pip install kaggle RUN python3 -m pip install pandas