From fa74b171c4a73723948b175e6d929171792e1c34 Mon Sep 17 00:00:00 2001
From: AdamOsiowy123
Date: Mon, 21 Mar 2022 00:03:25 +0100
Subject: [PATCH] lab2 - data

---
Review note: the blob ids on the `index` lines below are those of the original
commit and do not reflect the revised file contents.

 .gitignore   |  2 ++
 lab2_data.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 lab2_data.py

diff --git a/.gitignore b/.gitignore
index 55be276..970c783 100644
--- a/.gitignore
+++ b/.gitignore
@@ -152,3 +152,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
+/data/
+
diff --git a/lab2_data.py b/lab2_data.py
new file mode 100644
index 0000000..29a7d3a
--- /dev/null
+++ b/lab2_data.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+"""Download the Kaggle fake job postings dataset and split it into subsets."""
+from math import isclose
+from os.path import isfile
+
+from pandas import read_csv, DataFrame
+from sklearn.model_selection import train_test_split
+
+DATA_DIR = './data'
+DATASET_CSV = './data/fake_job_postings.csv'
+
+
+def download_and_save_dataset():
+    """Download the dataset from Kaggle and unzip it into DATA_DIR."""
+    # Imported here rather than at module level: the kaggle package looks for
+    # API credentials as soon as it is imported, which would break runs that
+    # only read an already downloaded CSV.
+    from kaggle import api
+
+    api.authenticate()
+    api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction',
+                               path=DATA_DIR,
+                               unzip=True)
+
+
+def split_dataset(data: DataFrame, train_ratio: float = 0.6,
+                  validation_ratio: float = 0.2, test_ratio: float = 0.2,
+                  random_state: int = 123):
+    """Split ``data`` into train, validation and test subsets.
+
+    The last column is used as the target, all preceding columns as features.
+
+    :param data: full dataset with the target in the last column
+    :param train_ratio: fraction of rows assigned to the training subset
+    :param validation_ratio: fraction of rows assigned to the validation subset
+    :param test_ratio: fraction of rows assigned to the test subset
+    :param random_state: seed passed to both shuffled splits
+    :return: ``x_train, x_val, x_test, y_train, y_val, y_test``
+    :raises ValueError: if the three ratios do not sum to 1
+    """
+    if not isclose(train_ratio + validation_ratio + test_ratio, 1.0):
+        raise ValueError('train, validation and test ratios must sum to 1')
+
+    data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]
+
+    # First cut: training rows versus everything that is held out.
+    x_train, x_rest, y_train, y_rest = train_test_split(
+        data_x, data_y, test_size=1 - train_ratio, random_state=random_state)
+
+    # Second cut: divide the held-out rows between validation and test.
+    x_val, x_test, y_val, y_test = train_test_split(
+        x_rest, y_rest,
+        test_size=test_ratio / (test_ratio + validation_ratio),
+        random_state=random_state)
+
+    return x_train, x_val, x_test, y_train, y_val, y_test
+
+
+def main():
+    """Load the dataset (downloading it first if needed) and print split sizes."""
+    if not isfile(DATASET_CSV):
+        download_and_save_dataset()
+    df = read_csv(DATASET_CSV)
+    print(df.describe(include='all'))
+    print(df.shape)
+    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(df)
+    print(x_train.shape, x_val.shape, x_test.shape)
+    print(y_train.shape, y_val.shape, y_test.shape)
+
+
+if __name__ == '__main__':
+    main()