lab2 - data
This commit is contained in:
parent
659ee402f0
commit
fa74b171c4
2
.gitignore
vendored
2
.gitignore
vendored
@ -152,3 +152,5 @@ cython_debug/
|
|||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
/data/
|
||||||
|
|
||||||
|
38
lab2_data.py
Normal file
38
lab2_data.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
from kaggle import api
|
||||||
|
from pandas import read_csv, DataFrame
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
|
def download_and_save_dataset(dataset: str = 'shivamb/real-or-fake-fake-jobposting-prediction',
                              path: str = './data'):
    """Download a Kaggle dataset and unpack it into a local directory.

    Args:
        dataset: Identifier of the Kaggle dataset to fetch. Defaults to the
            fake job postings dataset used by this lab.
        path: Directory passed to the Kaggle client as the download target.
            Defaults to ``./data``.

    The function returns nothing; its only effect is the download itself.
    """
    # NOTE(review): authenticate() presumably reads the user's Kaggle
    # credentials (kaggle.json or environment variables) - confirm setup.
    api.authenticate()
    # unzip=True asks the client to extract the downloaded archive.
    api.dataset_download_files(dataset, path=path, unzip=True)
|
||||||
|
|
||||||
|
|
||||||
|
def split_dataset(data: DataFrame, *, train_ratio: float = 0.6,
                  validation_ratio: float = 0.2, test_ratio: float = 0.2,
                  random_state: int = 123):
    """Split a frame into train, validation and test features and targets.

    Every column except the last is treated as a feature; the last column is
    treated as the target and is kept as a single-column frame.

    Args:
        data: Frame whose last column holds the target.
        train_ratio: Fraction of rows assigned to the training set.
        validation_ratio: Fraction of rows assigned to the validation set.
        test_ratio: Fraction of rows assigned to the test set.
        random_state: Seed forwarded to both ``train_test_split`` calls so
            the split is reproducible.

    Returns:
        A tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If any ratio is not positive or the ratios do not sum
            to 1.
    """
    ratios = (train_ratio, validation_ratio, test_ratio)
    if any(ratio <= 0 for ratio in ratios):
        raise ValueError(
            'train_ratio, validation_ratio and test_ratio must all be positive, '
            'got {!r}'.format(ratios))
    # Tolerance absorbs float rounding such as 0.6 + 0.2 + 0.2.
    if abs(sum(ratios) - 1.0) > 1e-9:
        raise ValueError(
            'train_ratio, validation_ratio and test_ratio must sum to 1, '
            'got {!r}'.format(ratios))

    data_x, data_y = data.iloc[:, :-1], data.iloc[:, -1:]

    # First cut: keep the training share, hold out the rest for val + test.
    x_train, x_rest, y_train, y_rest = train_test_split(
        data_x, data_y, test_size=1 - train_ratio, random_state=random_state)

    # Second cut: divide the held-out rows between validation and test,
    # expressing the test share relative to the held-out part only.
    x_val, x_test, y_val, y_test = train_test_split(
        x_rest, y_rest,
        test_size=test_ratio / (test_ratio + validation_ratio),
        random_state=random_state)

    return x_train, x_val, x_test, y_train, y_val, y_test
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Load the job postings CSV and report its summary and split shapes.

    Reads ``./data/fake_job_postings.csv``, prints the ``describe`` summary
    for all columns and the frame's shape, then splits the frame with
    ``split_dataset`` and prints the shapes of the feature splits followed
    by the shapes of the target splits.
    """
    # The download step is left disabled; enable it when the CSV is missing:
    # download_and_save_dataset()
    postings = read_csv('./data/fake_job_postings.csv')

    summary = postings.describe(include='all')
    print(summary)
    print(postings.shape)

    # split_dataset returns three feature frames followed by three targets.
    splits = split_dataset(postings)
    feature_parts, target_parts = splits[:3], splits[3:]
    print(*(part.shape for part in feature_parts))
    print(*(part.shape for part in target_parts))
|
||||||
|
|
||||||
|
|
||||||
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user