This commit is contained in:
Mikołaj Pokrywka 2022-03-27 21:43:29 +02:00
parent 90c97b64d2
commit 3c80804774
11 changed files with 35775 additions and 4 deletions

1
Jenkinsfile vendored
View File

@ -28,6 +28,7 @@ pipeline {
steps {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
sh "./process_data.sh"
archiveArtifacts artifacts: "data_test.csv, data_dev.csv, data_train.csv"
}
}
}

1
column_titles.csv Normal file
View File

@ -0,0 +1 @@
job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
1 job_id title location department salary_range company_profile description requirements benefits telecommuting has_company_logo has_questions employment_type required_experience required_education industry function fraudulent

0
data.csv Normal file
View File

0
data_dev.csv Normal file
View File

17880
data_not_cutted.csv Normal file

File diff suppressed because one or more lines are too long

17880
data_not_shuf.csv Normal file

File diff suppressed because one or more lines are too long

0
data_test.csv Normal file
View File

0
data_train.csv Normal file
View File

View File

@ -1528,7 +1528,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.8.12"
}
},
"nbformat": 4,

View File

@ -1,4 +1,13 @@
#!/bin/bash
# Download the Kaggle "real or fake job posting" dataset and split it into
# test / dev / train CSV files in the current directory.
#
# Usage: ./process_data.sh [ROW_COUNT]
#   ROW_COUNT  Optional non-negative integer: how many shuffled rows to keep
#              in data.csv before splitting. When omitted, all rows are kept.
#
# Requires the `kaggle` CLI (credentials are read by that tool, e.g. from the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables), `unzip` and `shuf`.
#
# Outputs:
#   column_titles.csv    header line of the downloaded CSV
#   data_not_shuf.csv    all data rows, original order
#   data_not_cutted.csv  all data rows, shuffled
#   data.csv             first ROW_COUNT shuffled rows (or all of them)
#   data_test.csv        rows 1-2500 of data.csv
#   data_dev.csv         rows 2501-5000 of data.csv
#   data_train.csv       rows 5001 and onward of data.csv

# Abort on the first failing command or unset variable, so a failed download
# cannot silently produce empty data sets.
set -euo pipefail

# Validate the optional row count up front, before any network work is done.
row_count="${1:-}"
case "$row_count" in
    "")
        ;;  # no limit requested: keep every row
    *[!0-9]*)
        echo "error: ROW_COUNT must be a non-negative integer, got '$row_count'" >&2
        exit 1
        ;;
esac

echo "welcome"
ls
echo "this is the whole list of dir"

echo "Download data from kaggle"
kaggle datasets download -d shivamb/real-or-fake-fake-jobposting-prediction
unzip -o real-or-fake-fake-jobposting-prediction.zip

echo "Save column titles"
head -n 1 fake_job_postings.csv > column_titles.csv
tail -n +2 fake_job_postings.csv > data_not_shuf.csv

echo "Create sets"
shuf data_not_shuf.csv > data_not_cutted.csv
if [ -n "$row_count" ]; then
    head -n "$row_count" data_not_cutted.csv > data.csv
else
    # Without an explicit limit the whole shuffled data set is used; the
    # previous unquoted `head -n $1` failed here and left data.csv empty.
    cp data_not_cutted.csv data.csv
fi

# The split boundaries are fixed, so fewer than 5001 rows leaves the train
# set (and possibly the dev set) empty. Warn instead of failing so small
# experimental runs still work.
kept_rows=$(wc -l < data.csv)
if (( kept_rows <= 5000 )); then
    echo "warning: data.csv has only ${kept_rows} rows; data_train.csv will be empty" >&2
fi

sed -n '1,2500p' data.csv > data_test.csv
sed -n '2501,5000p' data.csv > data_dev.csv
tail -n +5001 data.csv > data_train.csv

Binary file not shown.