This commit is contained in:
Kacper 2022-04-02 15:36:46 +02:00
parent 8b54c42295
commit 2cf15fcc25
4 changed files with 74 additions and 8 deletions

View File

@ -2,15 +2,16 @@ FROM ubuntu:latest
# Image setup steps: work in /ium, install pip plus zip tooling, install the
# Kaggle CLI and Python data libraries, and copy the helper scripts in.
# NOTE(review): this listing looks like a rendered diff hunk whose +/- markers
# were stripped, so removed and added lines may be interleaved (see the two
# apt install lines and the two kaggle installs) - confirm against the real file.
WORKDIR /ium
COPY ./skrypt.sh ./
COPY ./statystyki.sh ./
RUN apt update && apt install -y python3-pip unzip
# Make the copied shell scripts executable for their owner.
RUN chmod u+x ./skrypt.sh ./statystyki.sh
# NOTE(review): this variant installs `zip` rather than `unzip`; download_dataset.sh
# calls `unzip`, so confirm which of the two apt lines is the live one.
RUN apt update && apt install -y python3-pip zip
RUN pip3 install kaggle
RUN mkdir ~/.kaggle
# Write the Kaggle API credentials file read by the kaggle CLI.
# NOTE(review): a username/key pair is hard-coded here, so it ends up in the
# repository history and in the image layers - presumably it should be revoked
# and supplied at build or run time instead. No ARG for KAGGLE_USERNAME /
# KAGGLE_KEY is declared in the lines shown.
RUN echo "{\"username\":\"kaerde\",\"key\":\"9831adcf128300863404c16935674937\"}" > ~/.kaggle/kaggle.json
RUN pip3 install --user kaggle
# Append an export to ~/.bashrc that prepends $HOME/.local/bin to PATH.
# NOTE(review): ~/.bashrc is presumably only read by interactive bash shells,
# so this may not affect `sh` steps run by CI - verify.
RUN echo "export PATH=\"\$HOME/.local/bin:\$PATH\"" >> ~/.bashrc
RUN pip3 install --user pandas
RUN pip3 install --user numpy
# NOTE(review): the PyPI name `sklearn` is a deprecated alias; `scikit-learn`
# is presumably what is intended - confirm before changing.
RUN pip3 install --user sklearn
COPY ./download_dataset.sh ./
# NOTE(review): the processing script elsewhere in this change is named
# process_dataset.py; confirm that process_data.py exists in the build context.
COPY ./process_data.py ./
RUN chmod u+x ./download_dataset.sh ./process_data.py

32
Jenkinsfile vendored Normal file
View File

@ -0,0 +1,32 @@
// Declarative pipeline: builds the agent image from the repository's Dockerfile
// (forwarding the Kaggle credentials as build arguments), then downloads and
// processes the dataset inside that container.
pipeline {
    parameters {
        // Kaggle account name used for the dataset download.
        string(
            name: 'KAGGLE_USERNAME',
            description: 'Kaggle username',
            defaultValue: 'kaerde',
            trim: false
        )
        // Kaggle API token.
        // NOTE(review): the default below is a real-looking secret committed to
        // the repository; presumably it should be revoked and moved to Jenkins
        // credentials - confirm with the owner.
        password(
            name: 'KAGGLE_KEY',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            defaultValue: 'e3fdd794699dbaf3ca7517dd8bb15d16'
        )
    }
    agent {
        dockerfile {
            // Both parameters are handed to `docker build` as --build-arg values.
            additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY}"
        }
    }
    stages {
        stage('Stage 1') {
            steps {
                // Step 1: fetch and unpack the dataset.
                echo 'Downloading dataset...'
                sh './download_dataset.sh'
                echo 'Dataset downloaded'

                // Step 2: clean the data and write the train/valid/test splits.
                echo 'Processing dataset...'
                sh 'python3 process_dataset.py'
                echo 'Dataset processed'
            }
        }
    }
}

3
download_dataset.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/sh
# Download the mterzolo/lego-sets dataset with the Kaggle CLI and unpack it
# into the current directory, overwriting previously extracted files.
#
# Abort on the first failing command: without this, a failed download would
# still reach `unzip`, which could silently unpack a stale lego-sets.zip left
# over from an earlier run and make the script report success.
set -e

kaggle datasets download -d mterzolo/lego-sets
unzip -o lego-sets.zip

30
process_dataset.py Normal file
View File

@ -0,0 +1,30 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the raw data and drop every row that contains a missing value.
lego = pd.read_csv('lego_sets.csv').dropna()

# Prices are kept with at most two decimal places.
lego['list_price'] = lego['list_price'].round(2)

# Review counts, piece counts and product ids are converted to integers.
for column in ('num_reviews', 'piece_count', 'prod_id'):
    lego[column] = lego[column].apply(np.int64)

# Print the cleaned frame and its summary statistics for inspection.
print(lego)
print(lego.describe(include='all'))

# First split: 80% of the rows become the training set.
lego_train, lego_rem = train_test_split(lego, train_size=0.8, random_state=1)
# Second split: the remainder is halved into validation and test sets.
lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5, random_state=1)

# Write the full cleaned set and each split to its own CSV (no index column).
for path, frame in (
    ('lego_sets_clean.csv', lego),
    ('lego_sets_clean_train.csv', lego_train),
    ('lego_sets_clean_valid.csv', lego_valid),
    ('lego_sets_clean_test.csv', lego_test),
):
    frame.to_csv(path, index=None, header=True)