# Patch summary (commentary only; patch tools skip text that precedes the
# first "diff --git" header).
#
# * Dockerfile: generate the en_US.UTF-8 locale and export UTF-8 locale
#   variables, copy the whole build context instead of two single files,
#   drop the placeholder defaults of the KAGGLE_* build arguments, and run
#   script-download.py during the image build.
# * Jenkinsfile-docker: pass "-t s444507_create_dataset_image" to the image
#   build, run stats-docker.sh in the stage, and widen the archived-artifact
#   glob to "Car_Prices_Poland_Kaggle*".
# * Jenkinsfile-stats (new): copy the artifacts of s444507-create-dataset and
#   run script-stats.py inside s444507_create_dataset_image:latest.
# * script.py -> script-download.py: remove install_dependencies() and
#   get_statistics(), delete the zip archive after extraction, and download
#   aleksandrglotov/car-prices-poland.
# * script-stats.py (new): print describe() output for the full, dev, test
#   and train CSV files.
# * stats-docker.sh (new): print "wc -l" line counts for the same four files.
#
# NOTE(review): blank lines and indentation were reconstructed to satisfy the
# hunk line counts; confirm against the target tree before applying. The
# "index" blob hashes are the ones recorded with the original commit.
# NOTE(review): KAGGLE_USERNAME / KAGGLE_KEY are still supplied through
# --build-arg, which leaves them readable in the image history; consider a
# BuildKit secret mount (RUN --mount=type=secret) instead.

diff --git a/Dockerfile b/Dockerfile
index cb900a6..1381810 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,5 @@
 FROM ubuntu:latest
+RUN apt-get update && apt-get install -y locales && locale-gen en_US.UTF-8
 
 # COPY ./kaggle.json /root/.kaggle/kaggle.json
 
@@ -9,19 +10,25 @@ RUN apt-get install -y python3
 RUN apt-get install -y unzip
 RUN apt-get install -y python3-pip
 
+ENV PYTHONIOENCODING=utf-8
+RUN apt-get update && apt-get install -y locales locales-all
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
 RUN python3 -m pip --version
 RUN python3 -m pip install kaggle
 RUN python3 -m pip install pandas
-
 RUN python3 -m pip freeze
 
-COPY ./download.sh ./
-COPY ./script.py ./
+ENV PATH="/root/.local/bin:${PATH}"
+COPY . .
 
-ARG KAGGLE_USERNAME=testKAGGLE_USERNAME
-ARG KAGGLE_KEY=test1KAGGLE_KEY
+ARG KAGGLE_USERNAME
+ARG KAGGLE_KEY
 
-RUN chmod u+x ./script.py
+RUN chmod a+x ./stats-docker.sh
+RUN chmod a+x ./script-stats.py
 
 # RUN ./download.sh 117928
-# RUN python3 ./script.py
\ No newline at end of file
+RUN python3 ./script-download.py
\ No newline at end of file
diff --git a/Jenkinsfile-docker b/Jenkinsfile-docker
index 1bcfe50..b8a4beb 100644
--- a/Jenkinsfile-docker
+++ b/Jenkinsfile-docker
@@ -20,19 +20,19 @@ pipeline {
     }
     agent {
         dockerfile{
-            additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --build-arg --no-cache=true'
+            additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s444507_create_dataset_image'
         }
     }
     stages {
         stage('Prepare dataset') {
             steps {
-                sh 'python3 ./script.py'
+                sh './stats-docker.sh'
             }
         }
     }
     post {
         success {
-            archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle_*', followSymlinks: false
+            archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle*', followSymlinks: false
         }
     }
 }
\ No newline at end of file
diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats
new file mode 100644
index 0000000..e699d59
--- /dev/null
+++ b/Jenkinsfile-stats
@@ -0,0 +1,17 @@
+pipeline {
+    agent {
+        docker { image 's444507_create_dataset_image:latest' }
+    }
+    stages {
+        stage('Get arifacts') {
+            steps {
+                copyArtifacts fingerprintArtifacts: true, projectName: 's444507-create-dataset', selector: lastSuccessful()
+            }
+        }
+        stage('Show stats') {
+            steps {
+                sh "python3 ./script-stats.py"
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/script.py b/script-download.py
old mode 100644
new mode 100755
similarity index 66%
rename from script.py
rename to script-download.py
index 7d2a0fc..8f0145d
--- a/script.py
+++ b/script-download.py
@@ -5,26 +5,20 @@ import os
 import numpy as np
 
 
-def install_dependencies():
-    """Install kaggle and pandas."""
-    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])
-    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kaggle'])
-    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])
-    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn'])
-    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
-
 def unzip_package():
     """Unzip dataset"""
     print('Unzipping dataset...')
-    os.system('unzip -o car-prices-poland.zip')
+    os.system('unzip -o ./car-prices-poland.zip')
     print('Dataset unzipped')
-
+    print('Removing .zip file...')
+    os.system('rm ./car-prices-poland.zip')
+    print('Zip file removed')
 
 
 def download_dataset():
     """Download kaggle dataset."""
     print('Downloading dataset...')
-    os.system('kaggle datasets download -d anikannal/solar-power-generation-data')
+    os.system('kaggle datasets download -d aleksandrglotov/car-prices-poland')
     print('Dir after downloading')
     os.system('ls -la')
 
@@ -54,17 +48,6 @@ def divide_dataset(dataset):
     print('Dataset devided')
 
 
-def get_statistics(dataset):
-    """Mean, min, max, median etc."""
-
-    print(f'--------------- Normalized dataset length ---------------')
-    print(len(dataset))
-
-    print(f'---------------Describe dataset---------------')
-    pd.set_option('display.max_columns', None)
-    print(dataset.describe(include='all'))
-
-
 def normalize_dataset(dataset):
     """Drop unnecessary columns and set numeric values to [0,1] range"""
 
@@ -78,16 +61,14 @@ def normalize_dataset(dataset):
     # normalize numbers to [0, 1]
     for column in dataset.columns:
         if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
-            dataset[column] = (dataset[column] - dataset[column].min()) / (
-                    dataset[column].max() - dataset[column].min())
+            dataset[column] = (dataset[column] - dataset[column].min()) / (dataset[column].max() - dataset[column].min())
     return dataset
 
 
-# print(os.system('python3 -m pip freeze'))
 download_dataset()
 unzip_package()
 cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
 df = pd.DataFrame(cars)
 df = normalize_dataset(df)
 divide_dataset(df)
-get_statistics(df)
+
diff --git a/script-stats.py b/script-stats.py
new file mode 100755
index 0000000..258504e
--- /dev/null
+++ b/script-stats.py
@@ -0,0 +1,17 @@
+import pandas as pd
+
+print('--Full dataset stats--')
+cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv', encoding="utf-8")
+print(cars.describe(include='all'))
+
+print('Dev dataset stats')
+cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv', encoding="utf-8")
+print(cars_dev.describe(include='all'))
+
+print('# statystyki dla zbioru test')
+cars_test = pd.read_csv('./Car_Prices_Poland_Kaggle_test.csv', encoding="utf-8")
+print(cars_test.describe(include='all'))
+
+print('# statystyki dla zbioru train')
+cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv', encoding="utf-8")
+print(cars_train.describe(include='all'))
diff --git a/stats-docker.sh b/stats-docker.sh
new file mode 100755
index 0000000..ceb586e
--- /dev/null
+++ b/stats-docker.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+echo 'Total elements in Car Prices Poland dataset:'
+wc -l ./Car_Prices_Poland_Kaggle.csv
+
+echo 'Total elements in train dataset:'
+wc -l ./Car_Prices_Poland_Kaggle_train.csv
+
+echo 'Total elements in test dataset:'
+wc -l ./Car_Prices_Poland_Kaggle_test.csv
+
+echo 'Total elements in dev dataset:'
+wc -l ./Car_Prices_Poland_Kaggle_dev.csv