"""Create train/dev/test CSV splits from the Barcelona weekends dataset.

Script ``create-dataset.py``: reads the source CSV, keeps a random subsample
of ``CUTOFF`` rows (``CUTOFF`` is a required environment variable), drops the
first column, shuffles the rows and writes a 60/20/20 split to ``train.csv``,
``dev.csv`` and ``test.csv`` in the current working directory.
"""
import os

import pandas as pd

# Location of the raw dataset, relative to the working directory.
INPUT_CSV = './ium_z444510/barcelona_weekends.csv'

# Single seed used for both the subsample and the shuffle so that repeated
# runs with the same CUTOFF produce identical output files.
RANDOM_STATE = 42

# Cumulative split boundaries: rows [0, 60%) -> train, [60%, 80%) -> dev,
# [80%, 100%] -> test.
TRAIN_END = .6
DEV_END = .8


def split_dataset(data, train_end=TRAIN_END, dev_end=DEV_END,
                  random_state=RANDOM_STATE):
    """Shuffle *data* and partition it into train, dev and test frames.

    Args:
        data: The ``pandas.DataFrame`` to split.
        train_end: Cumulative fraction of rows at which the train set ends.
        dev_end: Cumulative fraction of rows at which the dev set ends; the
            remaining rows form the test set.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        A ``(train, dev, test)`` tuple of DataFrames that together contain
        every row of *data* exactly once.
    """
    shuffled = data.sample(frac=1, random_state=random_state)
    first_cut = int(train_end * len(shuffled))
    second_cut = int(dev_end * len(shuffled))
    # Positional slicing is used instead of ``np.split``: calling np.split on
    # a DataFrame goes through the deprecated ``DataFrame.swapaxes``.
    return (
        shuffled.iloc[:first_cut],
        shuffled.iloc[first_cut:second_cut],
        shuffled.iloc[second_cut:],
    )


def main():
    """Read the dataset, subsample it and write the three split files.

    Raises:
        KeyError: If the ``CUTOFF`` environment variable is not set.
        ValueError: If ``CUTOFF`` is not an integer, or if pandas rejects it
            as a sample size (e.g. larger than the number of rows).
    """
    cutoff = int(os.environ['CUTOFF'])

    data = pd.read_csv(INPUT_CSV)
    # Seeded so the selected subset is reproducible between runs.
    data = data.sample(cutoff, random_state=RANDOM_STATE)
    # Drop the first column (presumably an index/id column - confirm against
    # the CSV schema).
    data = data.iloc[:, 1:]

    train_set, dev_set, test_set = split_dataset(data)

    train_set.to_csv('train.csv', index=False)
    dev_set.to_csv('dev.csv', index=False)
    test_set.to_csv('test.csv', index=False)


if __name__ == '__main__':
    main()