From fa74b171c4a73723948b175e6d929171792e1c34 Mon Sep 17 00:00:00 2001
From: AdamOsiowy123 <adaosi@st.amu.edu.pl>
Date: Mon, 21 Mar 2022 00:03:25 +0100
Subject: [PATCH] lab2 - data

---
 .gitignore   |  2 ++
 lab2_data.py | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 lab2_data.py

diff --git a/.gitignore b/.gitignore
index 55be276..970c783 100644
--- a/.gitignore
+++ b/.gitignore
@@ -152,3 +152,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
+/data/
+
diff --git a/lab2_data.py b/lab2_data.py
new file mode 100644
index 0000000..29a7d3a
--- /dev/null
+++ b/lab2_data.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+from kaggle import api
+from pandas import read_csv, DataFrame
+from sklearn.model_selection import train_test_split
+
+
+def download_and_save_dataset():
+    """Authenticate with the Kaggle API, then download and unzip the dataset into ./data."""
+    api.authenticate()
+    dataset = 'shivamb/real-or-fake-fake-jobposting-prediction'
+    api.dataset_download_files(dataset, path='./data', unzip=True)
+
+
+def split_dataset(data: DataFrame):
+    """Split all-but-last columns (x) and the last column (y) 60/20/20 into train, val, test."""
+    train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2
+    features = data.iloc[:, :-1]
+    target = data.iloc[:, -1:]
+    # Carve off the training part first; the held-out remainder is divided again below.
+    x_train, x_rest, y_train, y_rest = train_test_split(
+        features, target, test_size=1 - train_ratio, random_state=123)
+    holdout_share = test_ratio / (test_ratio + validation_ratio)
+    x_val, x_test, y_val, y_test = train_test_split(x_rest, y_rest, test_size=holdout_share, random_state=123)
+    return x_train, x_val, x_test, y_train, y_val, y_test
+
+
+def main():
+    """Read the dataset CSV, print its summary and shape, then the shape of every split."""
+    # download_and_save_dataset()  # left disabled; presumably the CSV already exists in ./data
+    frame = read_csv('./data/fake_job_postings.csv')
+    print(frame.describe(include='all'), frame.shape, sep='\n')
+    parts = split_dataset(frame)
+    print(*(part.shape for part in parts[:3]))
+    print(*(part.shape for part in parts[3:]))
+
+
+if __name__ == '__main__':  # run main() only when executed as a script, not on import
+    main()