Merge branch 'refactor'

This commit is contained in:
Adam Wojdyla 2022-04-02 23:07:02 +02:00
commit 500dd2beee
6 changed files with 70 additions and 36 deletions

View File

@ -1,4 +1,5 @@
FROM ubuntu:latest FROM ubuntu:latest
RUN apt-get update && apt-get install -y locales && locale-gen en_US.UTF-8
# COPY ./kaggle.json /root/.kaggle/kaggle.json # COPY ./kaggle.json /root/.kaggle/kaggle.json
@ -9,19 +10,25 @@ RUN apt-get install -y python3
RUN apt-get install -y unzip RUN apt-get install -y unzip
RUN apt-get install -y python3-pip RUN apt-get install -y python3-pip
ENV PYTHONIOENCODING=utf-8
RUN apt-get install -y locales locales-all
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN python3 -m pip --version RUN python3 -m pip --version
RUN python3 -m pip install kaggle RUN python3 -m pip install kaggle
RUN python3 -m pip install pandas RUN python3 -m pip install pandas
RUN python3 -m pip freeze RUN python3 -m pip freeze
COPY ./download.sh ./ ENV PATH="/root/.local/bin:${PATH}"
COPY ./script.py ./ COPY . .
ARG KAGGLE_USERNAME=testKAGGLE_USERNAME ARG KAGGLE_USERNAME
ARG KAGGLE_KEY=test1KAGGLE_KEY ARG KAGGLE_KEY
RUN chmod u+x ./script.py RUN chmod a+x ./stats-docker.sh
RUN chmod a+x ./script-stats.py
# RUN ./download.sh 117928 # RUN ./download.sh 117928
# RUN python3 ./script.py RUN python3 ./script-download.py

View File

@ -20,19 +20,19 @@ pipeline {
} }
agent { agent {
dockerfile{ dockerfile{
additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --build-arg --no-cache=true' additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s444507_create_dataset_image'
} }
} }
stages { stages {
stage('Prepare dataset') { stage('Prepare dataset') {
steps { steps {
sh 'python3 ./script.py' sh './stats-docker.sh'
} }
} }
} }
post { post {
success { success {
archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle_*', followSymlinks: false archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle*', followSymlinks: false
} }
} }
} }

17
Jenkinsfile-stats Normal file
View File

@ -0,0 +1,17 @@
// Declarative pipeline: fetches the artifacts archived by the
// 's444507-create-dataset' job and runs the statistics script on them.
pipeline {
    // All stages execute inside a container started from this image.
    agent {
        docker {
            image 's444507_create_dataset_image:latest'
        }
    }
    stages {
        stage('Get arifacts') {
            steps {
                // Pull the artifacts of the last successful build of the
                // dataset job into the workspace, recording fingerprints.
                copyArtifacts(
                    projectName: 's444507-create-dataset',
                    selector: lastSuccessful(),
                    fingerprintArtifacts: true
                )
            }
        }
        stage('Show stats') {
            steps {
                sh "python3 ./script-stats.py"
            }
        }
    }
}

33
script.py → script-download.py Normal file → Executable file
View File

@ -5,26 +5,20 @@ import os
import numpy as np import numpy as np
def install_dependencies():
"""Install kaggle and pandas."""
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kaggle'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
def unzip_package(): def unzip_package():
"""Unzip dataset""" """Unzip dataset"""
print('Unzipping dataset...') print('Unzipping dataset...')
os.system('unzip -o car-prices-poland.zip') os.system('unzip -o ./car-prices-poland.zip')
print('Dataset unzipped') print('Dataset unzipped')
print('Removing .zip file...')
os.system('rm ./car-prices-poland.zip')
print('Zip file removed')
def download_dataset(): def download_dataset():
"""Download kaggle dataset.""" """Download kaggle dataset."""
print('Downloading dataset...') print('Downloading dataset...')
os.system('kaggle datasets download -d anikannal/solar-power-generation-data') os.system('kaggle datasets download -d aleksandrglotov/car-prices-poland')
print('Dir after downloading') print('Dir after downloading')
os.system('ls -la') os.system('ls -la')
@ -54,17 +48,6 @@ def divide_dataset(dataset):
print('Dataset devided') print('Dataset devided')
def get_statistics(dataset):
"""Mean, min, max, median etc."""
print(f'--------------- Normalized dataset length ---------------')
print(len(dataset))
print(f'---------------Describe dataset---------------')
pd.set_option('display.max_columns', None)
print(dataset.describe(include='all'))
def normalize_dataset(dataset): def normalize_dataset(dataset):
"""Drop unnecessary columns and set numeric values to [0,1] range""" """Drop unnecessary columns and set numeric values to [0,1] range"""
@ -78,16 +61,14 @@ def normalize_dataset(dataset):
# normalize numbers to [0, 1] # normalize numbers to [0, 1]
for column in dataset.columns: for column in dataset.columns:
if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64): if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
dataset[column] = (dataset[column] - dataset[column].min()) / ( dataset[column] = (dataset[column] - dataset[column].min()) / (dataset[column].max() - dataset[column].min())
dataset[column].max() - dataset[column].min())
return dataset return dataset
# print(os.system('python3 -m pip freeze'))
download_dataset() download_dataset()
unzip_package() unzip_package()
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv') cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
df = pd.DataFrame(cars) df = pd.DataFrame(cars)
df = normalize_dataset(df) df = normalize_dataset(df)
divide_dataset(df) divide_dataset(df)
get_statistics(df)

17
script-stats.py Executable file
View File

@ -0,0 +1,17 @@
"""Print descriptive statistics for the full dataset CSV and its dev/test/train CSVs."""
import pandas as pd

# (heading printed before the statistics, CSV file to summarise), in output order.
REPORTS = (
    ('--Full dataset stats--', './Car_Prices_Poland_Kaggle.csv'),
    ('Dev dataset stats', './Car_Prices_Poland_Kaggle_dev.csv'),
    ('# statystyki dla zbioru test', './Car_Prices_Poland_Kaggle_test.csv'),
    ('# statystyki dla zbioru train', './Car_Prices_Poland_Kaggle_train.csv'),
)

for heading, csv_path in REPORTS:
    print(heading)
    frame = pd.read_csv(csv_path, encoding="utf-8")
    # include='all' makes describe() report on every column, not only numeric ones.
    print(frame.describe(include='all'))

12
stats-docker.sh Executable file
View File

@ -0,0 +1,12 @@
#!/bin/sh
# Print the line count of the full dataset CSV and of its train/test/dev CSVs.
# Note: `wc -l` counts every line of a file, so a CSV header row, if present,
# is included in the reported number.

# Abort on the first failing command (e.g. a missing CSV) or on use of an unset
# variable, so the script cannot exit successfully after reporting only some
# of the counts.
set -eu

# report_lines HEADING FILE: print HEADING, then the `wc -l` output for FILE.
report_lines() {
    echo "$1"
    wc -l "$2"
}

report_lines 'Total elements in Car Prices Poland dataset:' ./Car_Prices_Poland_Kaggle.csv
report_lines 'Total elements in train dataset:' ./Car_Prices_Poland_Kaggle_train.csv
report_lines 'Total elements in test dataset:' ./Car_Prices_Poland_Kaggle_test.csv
report_lines 'Total elements in dev dataset:' ./Car_Prices_Poland_Kaggle_dev.csv