#!/bin/bash
#
# Download a Kaggle dataset, shuffle its rows, cut the result into disjoint
# train/dev/test splits plus a few convenience samples, and bundle everything
# into artifacts.tar.gz in the current directory.
#
# Required environment:
#   DATASET_ID    Dataset identifier passed to `kaggle datasets download -d`.
#   DATASET_FILE  Archive extracted with `unzip -o`; removed at the end.
#
# Optional environment:
#   DATASET_CSV   File whose lines are shuffled after extraction.
#                 Defaults to $DATASET_FILE, which is what this script has
#                 always done.
#                 NOTE(review): if DATASET_FILE is the .zip, the default
#                 shuffles the archive itself; set DATASET_CSV to the
#                 extracted CSV in that case. A CSV header row, if present,
#                 is shuffled along with the data.
#   TRAIN_LINES   Number of lines in train.csv (default 80000).
#   DEV_LINES     Number of lines in dev.csv (default 10000).
#   CUTOFF        When non-empty, passed to `head -n` to write
#                 train_cutoff.csv from train.csv.
#
# Outputs:
#   train.csv        first TRAIN_LINES lines of the shuffled data
#   dev.csv          the next DEV_LINES lines
#   test.csv         every remaining line
#   train_head.csv   first 1000 lines of train.csv
#   train_tail.csv   last 1000 lines of train.csv
#   train_cutoff.csv only when CUTOFF is set
#   artifacts.tar.gz all of the above; its name is printed on stdout.

set -euo pipefail

readonly ARCHIVE_NAME="artifacts.tar.gz"
readonly SHUFFLED_FILE="shuffled_dataset.csv"

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Abort unless $2 (the value of the setting named $1) is a non-negative integer.
require_count() {
  local name=$1
  local value=$2
  [[ "$value" =~ ^[0-9]+$ ]] \
    || die "$name must be a non-negative integer, got '$value'"
}

main() {
  : "${DATASET_ID:?DATASET_ID must be set}"
  : "${DATASET_FILE:?DATASET_FILE must be set}"

  local dataset_csv=${DATASET_CSV:-$DATASET_FILE}
  local train_lines=${TRAIN_LINES:-80000}
  local dev_lines=${DEV_LINES:-10000}
  local cutoff=${CUTOFF:-}

  require_count TRAIN_LINES "$train_lines"
  require_count DEV_LINES "$dev_lines"

  pip install kaggle
  kaggle datasets download -d "$DATASET_ID"
  unzip -o "$DATASET_FILE"

  [[ -f "$dataset_csv" ]] || die "cannot find '$dataset_csv' after extraction"
  shuf -- "$dataset_csv" > "$SHUFFLED_FILE"

  # `split -l N in train.csv` would treat "train.csv" as an output *prefix*
  # (train.csvaa, train.csvab, ...), so the splits are cut explicitly.
  # Pre-create all three so short inputs still yield (empty) files for tar.
  : > train.csv
  : > dev.csv
  : > test.csv
  awk -v train="$train_lines" -v dev="$dev_lines" '
    NR <= train       { print > "train.csv"; next }
    NR <= train + dev { print > "dev.csv";   next }
                      { print > "test.csv" }
  ' "$SHUFFLED_FILE"

  head -n 1000 train.csv > train_head.csv
  tail -n 1000 train.csv > train_tail.csv

  local -a artifacts=(
    train.csv dev.csv test.csv train_head.csv train_tail.csv
  )

  # train_cutoff.csv only exists when CUTOFF was requested; archiving it
  # unconditionally would make tar fail on the missing member.
  if [[ -n "$cutoff" ]]; then
    head -n "$cutoff" train.csv > train_cutoff.csv
    artifacts+=(train_cutoff.csv)
  fi

  tar -czf "$ARCHIVE_NAME" "${artifacts[@]}"

  # -f: do not fail the run over an intermediate that is already gone.
  rm -f -- "$DATASET_FILE" "$SHUFFLED_FILE"

  printf '%s\n' "$ARCHIVE_NAME"
}

main "$@"