Move subset generation to the end of the init script

2022-03-20 21:34:03 +01:00 · 2022-03-20 21:34:03 +01:00 · ee8737fc15
commit ee8737fc15
parent 16081a3564
1 changed files with 12 additions and 12 deletions
--- a/init.py
+++ b/init.py
@@ -20,18 +20,6 @@ if not file_exists:
 atp_data = pd.read_csv('df_atp.csv')
 print(atp_data)

-# Podział na podzbiory: trenujący, testowy, walidujący
-
-atp_train, atp_test = train_test_split(atp_data, test_size=0.4, random_state=1)
-atp_dev, atp_test = train_test_split(atp_test, test_size=0.5, random_state=1)
-
-# Wielkość zbioru i podzbiorów
-
-print("Elements of total set: " + str(len(atp_data)))
-print("Elements of test set: " + str(len(atp_test)))
-print("Elements of dev set: " + str(len(atp_dev)))
-print("Elements of train set: " + str(len(atp_train)))
-
 # Średnia ilość gemów w pierwszym secie zwycięzców meczu

 print(atp_data[["Winner", "W1"]].mean())
@@ -77,3 +65,15 @@ print(atp_data["Round"])

 atp_data.loc[atp_data["Date"] == '########', "Date"] = ''
 print(atp_data["Date"])
+
+# Split into subsets: training, test, validation in a 6:2:2 ratio
+
+atp_train, atp_test = train_test_split(atp_data, test_size=0.4, random_state=1)
+atp_dev, atp_test = train_test_split(atp_test, test_size=0.5, random_state=1)
+
+# Size of the full set and of each subset
+
+print("Elements of total set: " + str(len(atp_data)))
+print("Elements of test set: " + str(len(atp_test)))
+print("Elements of dev set: " + str(len(atp_dev)))
+print("Elements of train set: " + str(len(atp_train)))