# NOTE(review): this patch had been flattened onto one line; its line structure
# is restored below with every token kept as-is. Text before the first
# "diff --git" header is commentary and is skipped by git apply / patch.
#
# Reconstruction assumptions (TODO confirm against the repository):
#   * Jenkinsfile context indentation was lost; 4-space nesting is assumed.
#     If it does not match, apply with `git apply --ignore-whitespace`.
#   * The data_prep.sh hunk header declares 12 old / 24 new lines, but only
#     11 / 23 survived flattening, so one blank context line is missing.
#     It is assumed to sit right after the shebang.
#
# Review notes on the added shell lines (not changed here, so the recorded
# index hashes stay meaningful):
#   * `head -n $CUTOFF` is unquoted and unchecked; with CUTOFF unset, head is
#     handed the file name as its line count. Prefer
#     `head -n "${CUTOFF:?CUTOFF must be set}"`.
#   * `test_set` holds the 60% count that is written to apps_train.csv; the
#     name is misleading (a name such as train_size would match its use).
#   * The kaggle download is commented out, so `unzip` depends on
#     google-play-store-apps.zip already being present in the workspace.
diff --git a/Jenkinsfile b/Jenkinsfile
index 81c566b..8670a93 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -13,7 +13,7 @@ pipeline {
             name: 'KAGGLE_KEY'
         )
         string(
-            defaultValue: '10',
+            defaultValue: '10841',
             description: 'cutoff parameter',
             name: 'CUTOFF'
         )
diff --git a/data_prep.sh b/data_prep.sh
index d8ace10..a6c66a4 100755
--- a/data_prep.sh
+++ b/data_prep.sh
@@ -1,12 +1,24 @@
 #!/bin/bash
 
-kaggle datasets download -d lava18/google-play-store-apps
+#kaggle datasets download -d lava18/google-play-store-apps
 unzip -o google-play-store-apps.zip
 sed -i '1d' googleplaystore.csv
-shuf googleplaystore.csv > apps_shuf.csv
+shuf googleplaystore.csv > apps_shuf_.csv
+head -n $CUTOFF apps_shuf_.csv > apps_shuf.csv
+
+total=$(wc -l apps_shuf.csv | awk '{print $1}')
+test_set=$(( (total*60+50) / 100 ))
+train_val_set=$(( (total*20+50) / 100 ))
+
 #cp apps_shuf.csv apps_shuf_copy.csv
-head -n 6505 apps_shuf.csv > apps_train.csv
-sed -i '1,6505d' apps_shuf.csv
-head -n 2168 apps_shuf.csv > apps_test.csv
-sed -i '1,2168d' apps_shuf.csv
-head -n 2168 apps_shuf.csv > apps_validate.csv
\ No newline at end of file
+
+head -n $test_set apps_shuf.csv > apps_train.csv
+lines="1,$test_set"
+sed -i "$lines"'d' apps_shuf.csv
+head -n $train_val_set apps_shuf.csv > apps_test.csv
+lines="1,$train_val_set"
+sed -i "$lines"'d' apps_shuf.csv
+head -n $train_val_set apps_shuf.csv > apps_validate.csv
+wc -l apps_train.csv
+wc -l apps_test.csv
+wc -l apps_validate.csv
\ No newline at end of file