Merge branch 'refactor'
This commit is contained in:
commit
500dd2beee
21
Dockerfile
21
Dockerfile
@ -1,4 +1,5 @@
|
|||||||
FROM ubuntu:latest
|
FROM ubuntu:latest
|
||||||
|
RUN apt-get update && apt-get install -y locales && locale-gen en_US.UTF-8
|
||||||
|
|
||||||
# COPY ./kaggle.json /root/.kaggle/kaggle.json
|
# COPY ./kaggle.json /root/.kaggle/kaggle.json
|
||||||
|
|
||||||
@ -9,19 +10,25 @@ RUN apt-get install -y python3
|
|||||||
RUN apt-get install -y unzip
|
RUN apt-get install -y unzip
|
||||||
RUN apt-get install -y python3-pip
|
RUN apt-get install -y python3-pip
|
||||||
|
|
||||||
|
ENV PYTHONIOENCODING=utf-8
|
||||||
|
RUN apt-get install -y locales locales-all
|
||||||
|
ENV LC_ALL en_US.UTF-8
|
||||||
|
ENV LANG en_US.UTF-8
|
||||||
|
ENV LANGUAGE en_US.UTF-8
|
||||||
|
|
||||||
RUN python3 -m pip --version
|
RUN python3 -m pip --version
|
||||||
RUN python3 -m pip install kaggle
|
RUN python3 -m pip install kaggle
|
||||||
RUN python3 -m pip install pandas
|
RUN python3 -m pip install pandas
|
||||||
|
|
||||||
RUN python3 -m pip freeze
|
RUN python3 -m pip freeze
|
||||||
|
|
||||||
COPY ./download.sh ./
|
ENV PATH="/root/.local/bin:${PATH}"
|
||||||
COPY ./script.py ./
|
COPY . .
|
||||||
|
|
||||||
ARG KAGGLE_USERNAME=testKAGGLE_USERNAME
|
ARG KAGGLE_USERNAME
|
||||||
ARG KAGGLE_KEY=test1KAGGLE_KEY
|
ARG KAGGLE_KEY
|
||||||
|
|
||||||
RUN chmod u+x ./script.py
|
RUN chmod a+x ./stats-docker.sh
|
||||||
|
RUN chmod a+x ./script-stats.py
|
||||||
|
|
||||||
# RUN ./download.sh 117928
|
# RUN ./download.sh 117928
|
||||||
# RUN python3 ./script.py
|
RUN python3 ./script-download.py
|
@ -20,19 +20,19 @@ pipeline {
|
|||||||
}
|
}
|
||||||
agent {
|
agent {
|
||||||
dockerfile{
|
dockerfile{
|
||||||
additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" --build-arg --no-cache=true'
|
additionalBuildArgs '--build-arg KAGGLE_USERNAME="$KAGGLE_USERNAME" --build-arg KAGGLE_KEY="$KAGGLE_KEY" -t s444507_create_dataset_image'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stages {
|
stages {
|
||||||
stage('Prepare dataset') {
|
stage('Prepare dataset') {
|
||||||
steps {
|
steps {
|
||||||
sh 'python3 ./script.py'
|
sh './stats-docker.sh'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
post {
|
post {
|
||||||
success {
|
success {
|
||||||
archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle_*', followSymlinks: false
|
archiveArtifacts artifacts: 'Car_Prices_Poland_Kaggle*', followSymlinks: false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
17
Jenkinsfile-stats
Normal file
17
Jenkinsfile-stats
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
pipeline {
|
||||||
|
agent {
|
||||||
|
docker { image 's444507_create_dataset_image:latest' }
|
||||||
|
}
|
||||||
|
stages {
|
||||||
|
stage('Get arifacts') {
|
||||||
|
steps {
|
||||||
|
copyArtifacts fingerprintArtifacts: true, projectName: 's444507-create-dataset', selector: lastSuccessful()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stage('Show stats') {
|
||||||
|
steps {
|
||||||
|
sh "python3 ./script-stats.py"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
33
script.py → script-download.py
Normal file → Executable file
33
script.py → script-download.py
Normal file → Executable file
@ -5,26 +5,20 @@ import os
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def install_dependencies():
|
|
||||||
"""Install kaggle and pandas."""
|
|
||||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])
|
|
||||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kaggle'])
|
|
||||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])
|
|
||||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'seaborn'])
|
|
||||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
|
|
||||||
|
|
||||||
|
|
||||||
def unzip_package():
|
def unzip_package():
|
||||||
"""Unzip dataset"""
|
"""Unzip dataset"""
|
||||||
print('Unzipping dataset...')
|
print('Unzipping dataset...')
|
||||||
os.system('unzip -o car-prices-poland.zip')
|
os.system('unzip -o ./car-prices-poland.zip')
|
||||||
print('Dataset unzipped')
|
print('Dataset unzipped')
|
||||||
|
print('Removing .zip file...')
|
||||||
|
os.system('rm ./car-prices-poland.zip')
|
||||||
|
print('Zip file removed')
|
||||||
|
|
||||||
def download_dataset():
|
def download_dataset():
|
||||||
"""Download kaggle dataset."""
|
"""Download kaggle dataset."""
|
||||||
print('Downloading dataset...')
|
print('Downloading dataset...')
|
||||||
os.system('kaggle datasets download -d anikannal/solar-power-generation-data')
|
os.system('kaggle datasets download -d aleksandrglotov/car-prices-poland')
|
||||||
|
|
||||||
print('Dir after downloading')
|
print('Dir after downloading')
|
||||||
os.system('ls -la')
|
os.system('ls -la')
|
||||||
@ -54,17 +48,6 @@ def divide_dataset(dataset):
|
|||||||
print('Dataset devided')
|
print('Dataset devided')
|
||||||
|
|
||||||
|
|
||||||
def get_statistics(dataset):
|
|
||||||
"""Mean, min, max, median etc."""
|
|
||||||
|
|
||||||
print(f'--------------- Normalized dataset length ---------------')
|
|
||||||
print(len(dataset))
|
|
||||||
|
|
||||||
print(f'---------------Describe dataset---------------')
|
|
||||||
pd.set_option('display.max_columns', None)
|
|
||||||
print(dataset.describe(include='all'))
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_dataset(dataset):
|
def normalize_dataset(dataset):
|
||||||
"""Drop unnecessary columns and set numeric values to [0,1] range"""
|
"""Drop unnecessary columns and set numeric values to [0,1] range"""
|
||||||
|
|
||||||
@ -78,16 +61,14 @@ def normalize_dataset(dataset):
|
|||||||
# normalize numbers to [0, 1]
|
# normalize numbers to [0, 1]
|
||||||
for column in dataset.columns:
|
for column in dataset.columns:
|
||||||
if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
|
if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64):
|
||||||
dataset[column] = (dataset[column] - dataset[column].min()) / (
|
dataset[column] = (dataset[column] - dataset[column].min()) / (dataset[column].max() - dataset[column].min())
|
||||||
dataset[column].max() - dataset[column].min())
|
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
# print(os.system('python3 -m pip freeze'))
|
|
||||||
download_dataset()
|
download_dataset()
|
||||||
unzip_package()
|
unzip_package()
|
||||||
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
|
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
|
||||||
df = pd.DataFrame(cars)
|
df = pd.DataFrame(cars)
|
||||||
df = normalize_dataset(df)
|
df = normalize_dataset(df)
|
||||||
divide_dataset(df)
|
divide_dataset(df)
|
||||||
get_statistics(df)
|
|
17
script-stats.py
Executable file
17
script-stats.py
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
print('--Full dataset stats--')
|
||||||
|
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv', encoding="utf-8")
|
||||||
|
print(cars.describe(include='all'))
|
||||||
|
|
||||||
|
print('Dev dataset stats')
|
||||||
|
cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv', encoding="utf-8")
|
||||||
|
print(cars_dev.describe(include='all'))
|
||||||
|
|
||||||
|
print('# statystyki dla zbioru test')
|
||||||
|
cars_test = pd.read_csv('./Car_Prices_Poland_Kaggle_test.csv', encoding="utf-8")
|
||||||
|
print(cars_test.describe(include='all'))
|
||||||
|
|
||||||
|
print('# statystyki dla zbioru train')
|
||||||
|
cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv', encoding="utf-8")
|
||||||
|
print(cars_train.describe(include='all'))
|
12
stats-docker.sh
Executable file
12
stats-docker.sh
Executable file
@ -0,0 +1,12 @@
|
|||||||
|
|
||||||
|
echo 'Total elements in Car Prices Poland dataset:'
|
||||||
|
wc -l ./Car_Prices_Poland_Kaggle.csv
|
||||||
|
|
||||||
|
echo 'Total elements in train dataset:'
|
||||||
|
wc -l ./Car_Prices_Poland_Kaggle_train.csv
|
||||||
|
|
||||||
|
echo 'Total elements in test dataset:'
|
||||||
|
wc -l ./Car_Prices_Poland_Kaggle_test.csv
|
||||||
|
|
||||||
|
echo 'Total elements in dev dataset:'
|
||||||
|
wc -l ./Car_Prices_Poland_Kaggle_dev.csv
|
Loading…
Reference in New Issue
Block a user