ium_444452/lab2_data.py

#!/usr/bin/python
from kaggle import api
from pandas import read_csv, DataFrame
from sklearn.model_selection import train_test_split


def download_and_save_dataset():
    """Authenticate against the Kaggle API and download the job-postings dataset.

    The dataset files are requested into the ``./data`` directory with
    ``unzip=True``.
    """
    dataset_slug = 'shivamb/real-or-fake-fake-jobposting-prediction'
    target_dir = './data'

    api.authenticate()
    api.dataset_download_files(dataset_slug, path=target_dir, unzip=True)


def split_dataset(data: DataFrame, *, train_ratio: float = 0.6, validation_ratio: float = 0.2,
                  test_ratio: float = 0.2, random_state: int = 123):
    """Split ``data`` into train, validation and test subsets.

    Every column except the last one is treated as the feature set; the last
    column is treated as the target and is kept as a single-column frame.

    Args:
        data: Frame to split; its last column is used as the target.
        train_ratio: Fraction of rows assigned to the training subset.
        validation_ratio: Fraction of rows assigned to the validation subset.
        test_ratio: Fraction of rows assigned to the test subset.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        The tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If any ratio is not positive or the ratios do not sum to 1.
    """
    ratios = (train_ratio, validation_ratio, test_ratio)
    if any(ratio <= 0 for ratio in ratios):
        raise ValueError(f'split ratios must all be positive, got {ratios}')
    # Tolerance instead of equality: the ratios are floats.
    if abs(sum(ratios) - 1.0) > 1e-9:
        raise ValueError(f'split ratios must sum to 1, got {ratios}')

    data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]

    # Step 1: separate the training rows; everything else is a hold-out set.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        data_x, data_y, test_size=1 - train_ratio, random_state=random_state)

    # Step 2: divide the hold-out set between validation and test. The test
    # share is expressed relative to the hold-out set, not to the full data.
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=random_state)

    return x_train, x_val, x_test, y_train, y_val, y_test


def main():
    """Load the job-postings CSV, print summary statistics and the split shapes."""
    # The download step is disabled here; presumably the CSV was fetched
    # earlier via download_and_save_dataset() - verify before running.
    # download_and_save_dataset()
    postings = read_csv('./data/fake_job_postings.csv')

    summary = postings.describe(include='all')
    print(summary)
    print(postings.shape)

    # split_dataset returns three feature subsets followed by three target subsets.
    subsets = split_dataset(postings)
    feature_parts, target_parts = subsets[:3], subsets[3:]
    print(*(part.shape for part in feature_parts))
    print(*(part.shape for part in target_parts))


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
lab2 - data 2022-03-21 00:03:25 +01:00			`#!/usr/bin/python`
			`from kaggle import api`
			`from pandas import read_csv, DataFrame`
			`from sklearn.model_selection import train_test_split`


			`def download_and_save_dataset():`
jenkins test1 2022-03-21 11:16:36 +01:00			`api.authenticate()`
lab2 - data 2022-03-21 00:03:25 +01:00			`api.dataset_download_files('shivamb/real-or-fake-fake-jobposting-prediction',`
			`path='./data',`
			`unzip=True)`


			`def split_dataset(data: DataFrame):`
			`train_ratio, validation_ratio, test_ratio = 0.6, 0.2, 0.2`
			`data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]`

			`x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=1 - train_ratio, random_state=123)`

			`x_val, x_test, y_val, y_test = train_test_split(x_test, y_test,`
			`test_size=test_ratio / (test_ratio + validation_ratio),`
			`random_state=123)`

			`return x_train, x_val, x_test, y_train, y_val, y_test`


			`def main():`
			`# download_and_save_dataset()`
			`df = read_csv('./data/fake_job_postings.csv')`
			`print(df.describe(include='all'))`
			`print(df.shape)`
			`x_train, x_val, x_test, y_train, y_val, y_test = split_dataset(df)`
			`print(x_train.shape, x_val.shape, x_test.shape)`
			`print(y_train.shape, y_val.shape, y_test.shape)`


			`if __name__ == '__main__':`
			`main()`