#!/usr/bin/python
"""Download the Kaggle fake-job-postings dataset and split it for training.

Run as a script, it loads ``./data/fake_job_postings.csv`` (fetching it from
Kaggle first when the file is not present), prints summary statistics, and
prints the shapes of the train / validation / test partitions.
"""
import math
from pathlib import Path

# NOTE(review): the kaggle package may try to authenticate as soon as it is
# imported, which would require credentials even when no download happens --
# confirm against the installed kaggle version.
from kaggle import api
from pandas import DataFrame, read_csv
from sklearn.model_selection import train_test_split

DATASET_SLUG = 'shivamb/real-or-fake-fake-jobposting-prediction'
DATA_DIR = './data'
CSV_PATH = './data/fake_job_postings.csv'


def download_and_save_dataset():
    """Authenticate against Kaggle and unzip the dataset into ``./data``."""
    api.authenticate()
    api.dataset_download_files(DATASET_SLUG, path=DATA_DIR, unzip=True)


def split_dataset(data: DataFrame, train_ratio: float = 0.6,
                  validation_ratio: float = 0.2, test_ratio: float = 0.2,
                  random_state: int = 123):
    """Split *data* into train, validation and test partitions.

    Every column except the last is used as features; the last column is
    used as the target (kept as a one-column DataFrame).

    Args:
        data: Frame whose last column holds the label.
        train_ratio: Fraction of rows assigned to the training set.
        validation_ratio: Fraction of rows assigned to the validation set.
        test_ratio: Fraction of rows assigned to the test set.
        random_state: Seed passed to both shuffles so splits are repeatable.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If any ratio is outside the open interval (0, 1) or the
            three ratios do not sum to 1.
    """
    ratios = (train_ratio, validation_ratio, test_ratio)
    if not all(0 < ratio < 1 for ratio in ratios):
        raise ValueError(
            f'split ratios must each be between 0 and 1, got {ratios}')
    if not math.isclose(sum(ratios), 1.0):
        raise ValueError(f'split ratios must sum to 1, got {ratios}')

    data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]

    # First cut: carve off the training rows; the remainder is shared by
    # the validation and test sets.
    x_train, x_rest, y_train, y_rest = train_test_split(
        data_x, data_y,
        test_size=1 - train_ratio,
        random_state=random_state,
    )
    # Second cut: divide the remainder; test_size here is relative to the
    # held-out rows, not to the full dataset.
    x_val, x_test, y_val, y_test = train_test_split(
        x_rest, y_rest,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=random_state,
    )
    return x_train, x_val, x_test, y_train, y_val, y_test


def main():
    """Load the CSV, print summary statistics and the split shapes."""
    # Only hit the Kaggle API when the CSV has not been downloaded yet.
    if not Path(CSV_PATH).is_file():
        download_and_save_dataset()

    df = read_csv(CSV_PATH)
    print(df.describe(include='all'))
    print(df.shape)

    x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(df)
    print(x_train.shape, x_val.shape, x_test.shape)
    print(y_train.shape, y_val.shape, y_test.shape)


if __name__ == '__main__':
    main()