diff --git a/Dockerfile b/Dockerfile index eb89aad..804a083 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,11 +3,4 @@ FROM ubuntu:latest RUN apt-get update && \ apt-get install -y python3 python3-pip -RUN pip3 install pandas scikit-learn kaggle - -WORKDIR /app - -COPY create-dataset.py /app -COPY data/barcelona_weekends.csv /app - -CMD ["python3", "create-dataset.py"] \ No newline at end of file +RUN pip3 install pandas numpy diff --git a/Jenkinsfile-create-dataset-docker b/Jenkinsfile-create-dataset-docker index 3aaf7a7..d28e69e 100644 --- a/Jenkinsfile-create-dataset-docker +++ b/Jenkinsfile-create-dataset-docker @@ -1,5 +1,7 @@ pipeline { - agent any + agent { + dockerfile true + } parameters{ string( defaultValue: 'piotrwrzodak', @@ -36,19 +38,10 @@ pipeline { sh 'kaggle datasets download -d thedevastator/airbnb-prices-in-european-cities' sh 'unzip airbnb-prices-in-european-cities.zip -d data' sh 'ls' + sh 'python create-dataset.py' + archiveArtifacts artifacts: 'data/barcelona_weekends.train.csv, data/barcelona_weekends.dev.csv, data/barcelona_weekends.test.csv', fingerprint: true } } } - stage('Docker') { - agent { - dockerfile { - filename 'Dockerfile' - reuseNode true - } - } - steps { - archiveArtifacts artifacts: 'barcelona_weekends.train.csv, barcelona_weekends.dev.csv, barcelona_weekends.test.csv', fingerprint: true - } - } } } \ No newline at end of file diff --git a/create-dataset.py b/create-dataset.py index d93da1e..281d1be 100644 --- a/create-dataset.py +++ b/create-dataset.py @@ -5,16 +5,16 @@ import numpy as np cutoff = 10 -data = pd.read_csv('./barcelona_weekends.csv') +data = pd.read_csv('./data/barcelona_weekends.csv') data = data.sample(cutoff) data = data.iloc[:, 1:] train_set, dev_set, test_set = np.split(data.sample(frac=1, random_state=42), [int(.6 * len(data)), int(.8 * len(data))]) -train_set.to_csv('barcelona_weekends.train.csv', index=False) -dev_set.to_csv('barcelona_weekends.dev.csv', index=False) -test_set.to_csv('barcelona_weekends.test.csv', index=False) +train_set.to_csv('data/barcelona_weekends.train.csv', index=False) +dev_set.to_csv('data/barcelona_weekends.dev.csv', index=False) +test_set.to_csv('data/barcelona_weekends.test.csv', index=False) check = pd.read_csv('./train.csv')