# Origin: git patch 054eb5a64bd909d011e2244e2dd1833b53a228dd
# Author: Mateusz Kuc, Thu, 20 Apr 2023 18:48:29 +0200
# Subject: [PATCH] added data preparation for trainset (adds create-dataset.py)
"""Build train/dev/test CSV files from the salaries dataset.

Reads ``./ium_458023/ds_salaries.csv``, drops rows containing missing
values, draws a random sample of at most ``SAMPLE_SIZE`` rows, splits it
into train/dev/test parts and writes ``salaries_train.csv``,
``salaries_dev.csv`` and ``salaries_test.csv`` to the working directory.
"""
import os

import pandas
from sklearn.model_selection import train_test_split

# Required environment variable: a missing or non-integer value aborts the
# script before any data is touched.
# NOTE(review): CUTOFF is parsed but never used below -- presumably it was
# meant to control how many rows are kept; confirm before wiring it in.
CUTOFF = int(os.environ['CUTOFF'])

SOURCE_CSV = './ium_458023/ds_salaries.csv'
SAMPLE_SIZE = 100        # upper bound on the number of rows kept
HOLDOUT_FRACTION = 0.2   # share handed to the "test" side of each split
RANDOM_STATE = 1         # seed shared by both splits


def split_train_dev_test(frame: pandas.DataFrame):
    """Split *frame* into ``(train, dev, test)`` data frames.

    The first split keeps roughly 80% of the rows for training and holds
    out the rest; the held-out part is then split again with the same
    fraction, so dev gets roughly 80% of it and test the remainder.

    Both splits use ``RANDOM_STATE`` so the partition of a given frame is
    repeatable.

    Raises:
        ValueError: propagated from ``train_test_split`` when *frame* (or
            the held-out part) has too few rows to split.
    """
    # A single-array split yields the same partition as passing the frame
    # twice as X and Y, without the duplicated outputs.
    train, holdout = train_test_split(
        frame, test_size=HOLDOUT_FRACTION, random_state=RANDOM_STATE
    )
    dev, test = train_test_split(
        holdout, test_size=HOLDOUT_FRACTION, random_state=RANDOM_STATE
    )
    return train, dev, test


def main():
    """Load the source CSV, sample it, split it and write the three CSVs."""
    salaries = pandas.read_csv(
        SOURCE_CSV, engine='python', encoding='ISO-8859-1', sep=','
    )
    salaries = salaries.dropna()

    # DataFrame.sample raises when asked for more rows than exist, so cap
    # the request at what survived dropna().
    salaries = salaries.sample(min(SAMPLE_SIZE, len(salaries)))

    salaries_train, salaries_dev, salaries_test = split_train_dev_test(salaries)

    salaries_train.to_csv('salaries_train.csv', index=False)
    salaries_dev.to_csv('salaries_dev.csv', index=False)
    salaries_test.to_csv('salaries_test.csv', index=False)


if __name__ == '__main__':
    main()