Added new Jenkinsfile and Kaggle download script
This commit is contained in:
parent
68c3e170d0
commit
8143cba976
@ -1,30 +1,30 @@
|
||||
#!/bin/bash
# NOTE(review): this listing looks like a rendered diff in which removed and
# added lines are interleaved (separated by "|" / "||||" rows), so it is not a
# runnable script as shown — confirm against the repository before executing.
# Visible steps: fetch a Kaggle dataset, unpack it, shuffle and split the CSV
# into train/dev/test files, archive them, and remove intermediate files.
|
||||
|
||||
# Download and unpack
|
||||
# Install the kaggle package, which provides the `kaggle` command used below.
pip install kaggle
|
||||
|
||||
# Download the dataset archive (-d selects the dataset as owner/name).
# Presumably requires Kaggle API credentials in the environment — confirm.
kaggle datasets download -d thedevastator/1-5-million-beer-reviews-from-beer-advocate
|
||||
|
||||
# -o overwrites already-extracted files without prompting.
unzip -o 1-5-million-beer-reviews-from-beer-advocate.zip
|
||||
# CSV file the following steps operate on.
DATASET_FILE="beer_reviews.csv"
|
||||
|
||||
# Shuffle every line of the CSV; a header row, if present, is shuffled too.
shuf beer_reviews.csv > shuffled_dataset.csv
|
||||
# Keep only the first $1 lines ($1 = first positional argument, unquoted).
# Assumes $1 is always passed as a positive line count — TODO confirm caller.
head -n $1 $DATASET_FILE > cutoff_$DATASET_FILE
|
||||
|
||||
# Print the working directory and list its contents, including dotfiles.
pwd
|
||||
ls -a
|
||||
# Shuffle and split
|
||||
# Count the lines after the first one (tail -n +2 starts output at line 2),
# presumably to leave a header row out of the count — confirm.
total_lines=$(tail -n +2 cutoff_$DATASET_FILE | wc -l)
|
||||
|
||||
# NOTE(review): split treats its last argument as an output-name PREFIX, so
# this likely writes train.csvaa, train.csvab, ... rather than train.csv —
# confirm the intended output names.
split -l 80000 shuffled_dataset.csv train.csv
|
||||
# Same prefix behaviour applies here; this reads a file named train.csv.
split -l 10000 train.csv dev.csv
|
||||
# Rename the whole shuffled file to test.csv.
mv shuffled_dataset.csv test.csv
|
||||
# 80% and 10% of total_lines (integer division); test gets the remainder.
train_lines=$((total_lines * 80 / 100))
|
||||
dev_lines=$((total_lines * 10 / 100))
|
||||
test_lines=$((total_lines - train_lines - dev_lines))
|
||||
|
||||
# Save the first and the last 1000 lines of train.csv as separate samples.
head -n 1000 train.csv > train_head.csv
|
||||
tail -n 1000 train.csv > train_tail.csv
|
||||
# Shuffle the truncated file into shuffled.csv.
# NOTE(review): the first line is left out of total_lines above but is still
# part of this shuffle, so shuffled.csv has one more line than total_lines —
# confirm how a header row is meant to be handled.
shuf cutoff_$DATASET_FILE -o shuffled.csv
|
||||
|
||||
# Only when CUTOFF is non-empty: also save the first $CUTOFF lines of train.csv.
if [ -n "$CUTOFF" ]; then
|
||||
head -n "$CUTOFF" train.csv > train_cutoff.csv
|
||||
fi
|
||||
# First train_lines lines of the shuffled file become the training set.
head -n $train_lines shuffled.csv > train.csv
|
||||
# From the last (dev_lines + test_lines) lines, the first dev_lines go to dev.csv.
tail -n $((dev_lines + test_lines)) shuffled.csv | head -n $dev_lines > dev.csv
|
||||
# The last test_lines lines become the test set.
tail -n $test_lines shuffled.csv > test.csv
|
||||
|
||||
# Bundle the outputs into a gzip-compressed tarball.
# NOTE(review): train_cutoff.csv is only created when CUTOFF is set, so tar
# may report a missing file otherwise — confirm.
tar -czf artifacts.tar.gz train.csv dev.csv test.csv train_head.csv train_tail.csv train_cutoff.csv
|
||||
# Archive
|
||||
# Writes artifacts.tar.gz with the three split files; if the tar command
# earlier in this listing also ran, its archive is overwritten here.
tar -czf artifacts.tar.gz train.csv dev.csv test.csv
|
||||
|
||||
# Remove the extracted CSV and the first shuffled copy.
rm $DATASET_FILE shuffled_dataset.csv
|
||||
# Cleanup
|
||||
# Remove the truncated CSV and its shuffled copy.
rm cutoff_$DATASET_FILE shuffled.csv
|
||||
|
||||
# Print the name of the produced archive.
echo "artifacts.tar.gz"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user