Compare commits

..

2 Commits

Author SHA1 Message Date
piotrwrzodak
31ffbd656c update Jenkinsfile 2023-04-20 21:20:12 +02:00
piotrwrzodak
1afa0cf50e add create-dataset python file 2023-04-20 21:03:45 +02:00
2 changed files with 50 additions and 7 deletions

30
Jenkinsfile vendored
View File

@ -1,9 +1,35 @@
// NOTE(review): this span looks like a side-by-side diff rendering rather than
// a runnable Jenkinsfile - lines that exist in both revisions show their text
// twice (old column, then new column). TODO confirm against the real file.
pipeline { pipeline {
agent any agent any
// Build parameters declared by the newer revision.
parameters{
string(
defaultValue: 'piotrwrzodak',
description: 'Kaggle username',
name: 'KAGGLE_USERNAME',
trim: false
)
// Declared as a password parameter; the default is an empty string.
password(
defaultValue: '',
description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
name: 'KAGGLE_KEY'
)
// NOTE(review): CUTOFF is declared here but none of the steps visible below
// reference it - confirm where it is meant to be consumed.
string(
defaultValue: '1000',
description: 'CUTOFF',
name: 'CUTOFF',
trim: false
)
}
stages { stages {
// The stage name differs between the two columns: 'Stage 1' vs 'Build'.
stage('Stage 1') { stage('Build') {
steps { steps {
// Left column: the earlier echo step. Right column: clone of the project repository.
echo 'Hello world!' sh 'git clone https://git.wmi.amu.edu.pl/s444510/ium_z444510.git'
// Passes the two Kaggle parameters to the enclosed shell steps as environment variables.
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
// Download the dataset archive, unpack it into the cloned directory, then delete the archive.
sh 'kaggle datasets download -d thedevastator/airbnb-prices-in-european-cities'
sh 'unzip airbnb-prices-in-european-cities.zip -d ./ium_z444510'
sh 'rm airbnb-prices-in-european-cities.zip'
// List the workspace and the unpacked directory in the build log.
sh 'ls -a'
sh 'ls -a ./ium_z444510'
}
} }
} }
} }

17
create-dataset.py Normal file
View File

@ -0,0 +1,17 @@
import pandas as pd
import os
import numpy as np
# Build train/dev/test CSV files from a random subsample of
# ./ium_z444510/barcelona_weekends.csv.
#
# Input:  the CUTOFF environment variable (number of rows to keep; must parse
#         as an int, otherwise int() raises ValueError; a missing variable
#         raises KeyError).
# Output: train.csv, dev.csv and test.csv in the current working directory,
#         holding roughly 60% / 20% / 20% of the sampled rows.
cutoff = int(os.environ['CUTOFF'])
data = pd.read_csv('./ium_z444510/barcelona_weekends.csv')

# DataFrame.sample (without replacement) raises ValueError when asked for more
# rows than the frame holds, so cap the request at the size of the file.
# NOTE(review): this subsample is unseeded, so the output differs between runs
# even though the shuffle below is seeded - confirm whether that is intended.
data = data.sample(min(cutoff, len(data)))

# Drop the first column (presumably a saved row index - verify against the CSV).
data = data.iloc[:, 1:]

# Shuffle with a fixed seed, then cut at the 60% and 80% marks. Positional
# slicing yields the same three pieces np.split(frame, [a, b]) would, without
# passing a DataFrame through NumPy's array-splitting helpers.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.6 * len(shuffled))
dev_end = int(.8 * len(shuffled))
train_set = shuffled.iloc[:train_end]
dev_set = shuffled.iloc[train_end:dev_end]
test_set = shuffled.iloc[dev_end:]

train_set.to_csv('train.csv', index=False)
dev_set.to_csv('dev.csv', index=False)
test_set.to_csv('test.csv', index=False)