From 316471eda92fd98dc55ab4c86b3baef216cf3648 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 2 Apr 2022 20:28:16 +0200 Subject: [PATCH] r1 --- Dockerfile | 16 ++++++++-------- Jenkinsfile-docker | 2 +- script.py => script-download.py | 31 ++++++------------------------- script-stats.py | 17 +++++++++++++++++ 4 files changed, 32 insertions(+), 34 deletions(-) rename script.py => script-download.py (67%) create mode 100644 script-stats.py diff --git a/Dockerfile b/Dockerfile index cb900a6..f658aa9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:latest -# COPY ./kaggle.json /root/.kaggle/kaggle.json +COPY ./kaggle.json /root/.kaggle/kaggle.json WORKDIR /app @@ -12,16 +12,16 @@ RUN apt-get install -y python3-pip RUN python3 -m pip --version RUN python3 -m pip install kaggle RUN python3 -m pip install pandas - RUN python3 -m pip freeze -COPY ./download.sh ./ -COPY ./script.py ./ +ENV PATH="/root/.local/bin:${PATH}" +COPY . . -ARG KAGGLE_USERNAME=testKAGGLE_USERNAME -ARG KAGGLE_KEY=test1KAGGLE_KEY +ARG KAGGLE_USERNAME +ARG KAGGLE_KEY -RUN chmod u+x ./script.py +RUN chmod u+x ./script-download.py +RUN chmod u+x ./script-stats.py # RUN ./download.sh 117928 -# RUN python3 ./script.py \ No newline at end of file +RUN python3 ./script-download.py \ No newline at end of file diff --git a/Jenkinsfile-docker b/Jenkinsfile-docker index 1bcfe50..b549ff5 100644 --- a/Jenkinsfile-docker +++ b/Jenkinsfile-docker @@ -20,7 +20,7 @@ pipeline { } agent { dockerfile{ - additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --build-arg --no-cache=true' + additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --no-cache=true' } } stages { diff --git a/script.py b/script-download.py similarity index 67% rename from script.py rename to script-download.py index 7d2a0fc..c6286ae 100644 --- a/script.py +++ b/script-download.py @@ -5,26 +5,20 @@ import os import numpy as np -def install_dependencies(): - """Install kaggle and pandas.""" - subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip']) - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kaggle']) - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas']) - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn']) - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn']) - def unzip_package(): """Unzip dataset""" print('Unzipping dataset...') os.system('unzip -o car-prices-poland.zip') print('Dataset unzipped') - + print('Removing .zip file...') + os.system('rm ./car-prices-poland.zip') + print('Zip file removed') def download_dataset(): """Download kaggle dataset.""" print('Downloading dataset...') - os.system('kaggle datasets download -d anikannal/solar-power-generation-data') + os.system('kaggle datasets download -d aleksandrglotov/car-prices-poland') print('Dir after downloading') os.system('ls -la') @@ -54,17 +48,6 @@ def divide_dataset(dataset): print('Dataset devided') -def get_statistics(dataset): - """Mean, min, max, median etc.""" - - print(f'--------------- Normalized dataset length ---------------') - print(len(dataset)) - - print(f'---------------Describe dataset---------------') - pd.set_option('display.max_columns', None) - print(dataset.describe(include='all')) - - def normalize_dataset(dataset): """Drop unnecessary columns and set numeric values to [0,1] range""" @@ -78,16 +61,14 @@ def normalize_dataset(dataset): # normalize numbers to [0, 1] for column in dataset.columns: if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64): - dataset[column] = (dataset[column] - dataset[column].min()) / ( - dataset[column].max() - dataset[column].min()) + dataset[column] = (dataset[column] - dataset[column].min()) / (dataset[column].max() - dataset[column].min()) return dataset -# print(os.system('python3 -m pip freeze')) download_dataset() unzip_package() cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv') df = pd.DataFrame(cars) df = normalize_dataset(df) divide_dataset(df) -get_statistics(df) + diff --git a/script-stats.py b/script-stats.py new file mode 100644 index 0000000..76b5504 --- /dev/null +++ b/script-stats.py @@ -0,0 +1,17 @@ +import pandas as pd + +print('--Full dataset stats--') +cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv') +print(cars.describe(include='all')) + +print('Dev dataset stats') +cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv') +print(cars_dev.describe(include='all')) + +print('# statystyki dla zbioru test') +cars_test = pd.read_csv('./Car_Prices_Poland_Kaggle_test.csv') +print(cars_test.describe(include='all')) + +print('# statystyki dla zbioru train') +cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv') +print(cars_train.describe(include='all'))