added data preparation for trainset
This commit is contained in:
parent
b000b5f248
commit
054eb5a64b
22
create-dataset.py
Normal file
22
create-dataset.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
import pandas
|
||||||
|
import os
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
|
# Data preparation script: load the salaries CSV, drop incomplete rows,
# keep a random sample of CUTOFF rows and write train/dev/test CSV files
# to the current working directory.

# Maximum number of rows to keep from the cleaned dataset. Read from the
# CUTOFF environment variable; a missing variable raises KeyError and a
# non-integer value raises ValueError right here.
CUTOFF = int(os.environ['CUTOFF'])

# One fixed seed for sampling and for both splits, so that repeated runs
# on the same input produce the same three output files.
RANDOM_STATE = 1

salaries = pandas.read_csv(
    './ium_458023/ds_salaries.csv',
    engine='python',
    encoding='ISO-8859-1',
    sep=',',
)

# Discard every row that has at least one missing value.
salaries = salaries.dropna()

# Honour CUTOFF (previously a hard-coded 100 was used and CUTOFF was
# ignored). The sample size is capped at the number of available rows,
# because DataFrame.sample raises when asked for more rows than exist
# (sampling without replacement).
salaries = salaries.sample(
    min(CUTOFF, len(salaries)),
    random_state=RANDOM_STATE,
)

# SPLIT BETWEEN TRAIN, DEV, AND TEST
# First split: 80% train, 20% held out. train_test_split accepts a single
# frame, so there is no need to pass the same frame twice and unpack the
# duplicated results.
salaries_train, salaries_temp = train_test_split(
    salaries,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Second split: the held-out part is divided 80% dev / 20% test.
salaries_dev, salaries_test = train_test_split(
    salaries_temp,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# index=False keeps the pandas row index out of the written files.
salaries_train.to_csv('salaries_train.csv', index=False)
salaries_dev.to_csv('salaries_dev.csv', index=False)
salaries_test.to_csv('salaries_test.csv', index=False)
|
Loading…
Reference in New Issue
Block a user