diff --git a/Dockerfile b/Dockerfile
index 31fe6fa..8d205c1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,15 +2,16 @@ FROM ubuntu:latest
 
 WORKDIR /ium
 
-COPY ./skrypt.sh ./
-COPY ./statystyki.sh ./
+RUN apt update && apt install -y python3-pip unzip
 
-RUN chmod u+x ./skrypt.sh ./statystyki.sh
+RUN pip3 install --user kaggle
+RUN echo "export PATH=\"\$HOME/.local/bin:\$PATH\"" >> ~/.bashrc
 
-RUN apt update && apt install -y python3-pip zip
+RUN pip3 install --user pandas
+RUN pip3 install --user numpy
+RUN pip3 install --user scikit-learn
 
-RUN pip3 install kaggle
+COPY ./download_dataset.sh ./
+COPY ./process_dataset.py ./
 
-RUN mkdir ~/.kaggle
-RUN echo "{\"username\":\"kaerde\",\"key\":\"9831adcf128300863404c16935674937\"}" > ~/.kaggle/kaggle.json
-RUN echo "export PATH=\"\$HOME/.local/bin:\$PATH\"" >> ~/.bashrc
\ No newline at end of file
+RUN chmod u+x ./download_dataset.sh ./process_dataset.py
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..82f4c17
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,32 @@
+pipeline {
+    parameters {
+        string(
+            defaultValue: 'kaerde',
+            description: 'Kaggle username',
+            name: 'KAGGLE_USERNAME',
+            trim: false
+        )
+        password(
+            defaultValue: 'e3fdd794699dbaf3ca7517dd8bb15d16',
+            description: 'Kaggle token taken from the kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
+            name: 'KAGGLE_KEY'
+        )
+    }
+    agent {
+        dockerfile {
+            additionalBuildArgs "--build-arg KAGGLE_USERNAME=${params.KAGGLE_USERNAME} --build-arg KAGGLE_KEY=${params.KAGGLE_KEY}"
+        }
+    }
+    stages {
+        stage('Stage 1') {
+            steps {
+                echo 'Downloading dataset...'
+                sh './download_dataset.sh'
+                echo 'Dataset downloaded'
+                echo 'Processing dataset...'
+                sh 'python3 process_dataset.py'
+                echo 'Dataset processed'
+            }
+        }
+    }
+}
diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100644
index 0000000..bae3c01
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+kaggle datasets download -d mterzolo/lego-sets
+unzip -o lego-sets.zip
diff --git a/process_dataset.py b/process_dataset.py
new file mode 100644
index 0000000..2384cc9
--- /dev/null
+++ b/process_dataset.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+# load the dataset, dropping rows with missing values along the way
+lego = pd.read_csv('lego_sets.csv').dropna()
+
+# list_price only needs two decimal places
+lego['list_price'] = lego['list_price'].round(2)
+
+# num_reviews, piece_count and prod_id can be stored as integers
+lego['num_reviews'] = lego['num_reviews'].apply(np.int64)
+lego['piece_count'] = lego['piece_count'].apply(np.int64)
+lego['prod_id'] = lego['prod_id'].apply(np.int64)
+
+# quick look at the data and its summary statistics
+print(lego)
+print(lego.describe(include='all'))
+
+# first split: carve out the training set
+lego_train, lego_rem = train_test_split(lego, train_size=0.8, random_state=1)
+
+# second split: divide the remainder into validation and test sets
+lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5, random_state=1)
+
+# save the cleaned dataset and the three splits
+lego.to_csv('lego_sets_clean.csv', index=None, header=True)
+lego_train.to_csv('lego_sets_clean_train.csv', index=None, header=True)
+lego_valid.to_csv('lego_sets_clean_valid.csv', index=None, header=True)
+lego_test.to_csv('lego_sets_clean_test.csv', index=None, header=True)
\ No newline at end of file
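
Note on the credential flow: the Jenkinsfile forwards KAGGLE_USERNAME and KAGGLE_KEY as --build-arg values, but the Dockerfile declares no matching ARG (or ENV) entries, so those build args are currently not consumed anywhere; the kaggle CLI does, however, read KAGGLE_USERNAME and KAGGLE_KEY from the environment at run time. A minimal local smoke test under that assumption is sketched below; the image tag ium and the credential placeholders are illustrative only, and the PATH export repeats what the Dockerfile appends to ~/.bashrc (which a non-interactive sh never sources).

    # build the image from the patched Dockerfile (no build args needed until ARG entries exist)
    docker build -t ium .

    # run both pipeline steps by hand, passing the Kaggle credentials as runtime env vars
    docker run --rm \
        -e KAGGLE_USERNAME=<your-username> -e KAGGLE_KEY=<your-key> \
        ium sh -c 'export PATH="$HOME/.local/bin:$PATH"; ./download_dataset.sh && python3 process_dataset.py'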
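
Given the train_size=0.8 and test_size=0.5 arguments in process_dataset.py, the cleaned file and the three splits should hold roughly 100% / 80% / 10% / 10% of the rows (plus one header line each). A quick sanity check, run wherever the CSVs end up (e.g. the container working directory or the Jenkins workspace):

    wc -l lego_sets_clean.csv lego_sets_clean_train.csv lego_sets_clean_valid.csv lego_sets_clean_test.csv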