#!/bin/bash
#
# Prepare the "real-or-fake-fake-jobposting-prediction" data set.
#
# Unpacks the archive found in the current directory, stores the CSV header
# line separately, shuffles the remaining lines, keeps the first $CUTOFF of
# them and splits those into test / dev / train files.
#
# Required environment:
#   CUTOFF           line count handed to `head -n`: how many shuffled lines
#                    to keep before splitting.
# Optional environment:
#   KAGGLE_USERNAME  only printed; the actual kaggle download is commented out.
#
# Inputs (current directory):
#   real-or-fake-fake-jobposting-prediction.zip
#   (presumably provides fake_job_postings.csv, which is read right after
#   unpacking - TODO confirm)
#
# Outputs (current directory):
#   column_titles.csv  first line of fake_job_postings.csv
#   data_test.csv      kept lines 1..2500
#   data_dev.csv       kept lines 2501..5000
#   data_train.csv     kept lines 5001..end
#
# NOTE(review): every step is line based, so this assumes one CSV record per
# line. Quoted fields containing newlines would be torn apart by the shuffle -
# verify against the data.

set -euo pipefail

readonly ARCHIVE='real-or-fake-fake-jobposting-prediction.zip'
readonly SOURCE_CSV='fake_job_postings.csv'
readonly TEST_ROWS=2500
readonly DEV_ROWS=2500

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the intermediate files produced while building the sets.
cleanup() {
  rm -f -- data.csv data_not_shuf.csv data_not_cutted.csv
}

main() {
  # Fail before touching any file: an unset CUTOFF used to turn the cut into
  # `head -n data_not_cutted.csv`, which silently produced empty sets.
  : "${CUTOFF:?CUTOFF must be set to the number of shuffled lines to keep}"

  local dev_end=$(( TEST_ROWS + DEV_ROWS ))

  echo "Download data from kaggle"
  # The variable is optional, so default to empty instead of tripping `set -u`.
  printf '%s\n' "${KAGGLE_USERNAME:-}"
  # kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction

  # The download above is disabled, so the archive has to be present already.
  [[ -f "${ARCHIVE}" ]] || die "missing archive: ${ARCHIVE}"
  unzip -o "${ARCHIVE}"

  echo "Save column titles"
  head -n 1 "${SOURCE_CSV}" > column_titles.csv

  # From here on intermediate files exist; drop them on every exit path.
  trap cleanup EXIT
  tail -n +2 "${SOURCE_CSV}" > data_not_shuf.csv

  echo "Create sets"
  shuf data_not_shuf.csv > data_not_cutted.csv
  head -n "${CUTOFF}" data_not_cutted.csv > data.csv

  sed -n "1,${TEST_ROWS}p" data.csv > data_test.csv
  sed -n "$(( TEST_ROWS + 1 )),${dev_end}p" data.csv > data_dev.csv
  # Whatever remains after the test and dev slices becomes the training set.
  tail -n "+$(( dev_end + 1 ))" data.csv > data_train.csv
}

main "$@"