2024-03-20 17:26:55 +01:00
|
|
|
#!/bin/bash
|
2024-03-20 18:31:15 +01:00
|
|
|
# Download dataset from kaggle.
# Stop right away if the download fails: the following steps would
# otherwise run against a missing or stale breast-cancer-wisconsin-data.zip.
if ! kaggle datasets download -d uciml/breast-cancer-wisconsin-data; then
  echo "error: kaggle dataset download failed" >&2
  exit 1
fi
|
2024-03-20 18:31:15 +01:00
|
|
|
|
|
|
|
# Unzip dataset -> data.csv (-o overwrites files left over from an earlier
# run without prompting).
unzip -o breast-cancer-wisconsin-data.zip
unzip_status=$?
# unzip exits with 1 when it completed but emitted warnings; any higher
# status means extraction did not succeed, so there is nothing to process.
if [ "$unzip_status" -gt 1 ]; then
  echo "error: unzip failed with status $unzip_status" >&2
  exit 1
fi
|
2024-03-20 18:31:15 +01:00
|
|
|
|
|
|
|
# Strip the leading id field: keep everything from the second
# comma-separated field onward. Output goes to a scratch file first and
# replaces data.csv only when cut succeeded.
cut -d ',' -f 2- data.csv > data.csv.tmp \
  && mv data.csv.tmp data.csv
|
|
|
|
|
|
|
|
# Keep only the first $1 lines of data.csv; everything after that cutoff is
# dropped. The count is in lines, so a header line, if present, counts too.
# $1 must be a plain non-negative integer: with GNU head a value such as
# "-5" would instead mean "all but the last 5 lines", and an empty value
# makes head fail.
case "${1-}" in
  '' | *[!0-9]*)
    echo "usage: $0 <max_rows>" >&2
    exit 2
    ;;
esac
head -n "$1" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
|
|
|
|
|
|
|
|
# Get the number of rows to split: the requested count ($1), capped at the
# number of lines actually present in data.csv.

# rows_to_use REQUESTED FILE
#   Prints min(REQUESTED, line count of FILE) on stdout.
#   Returns 2 if REQUESTED is not a non-negative decimal integer and 1 if
#   FILE cannot be read; nothing is printed on stdout in either case.
rows_to_use() {
  local requested=$1 file=$2 available

  case "$requested" in
    '' | *[!0-9]*)
      echo "rows_to_use: row count must be a non-negative integer, got '$requested'" >&2
      return 2
      ;;
  esac
  # Force base 10 so a value such as "010" is not read as octal.
  requested=$((10#$requested))

  # Count once; the arithmetic expansion strips the padding some wc
  # implementations put in front of the number.
  available=$(wc -l < "$file") || return 1
  available=$((available))

  if ((requested > available)); then
    printf '%s\n' "$available"
  else
    printf '%s\n' "$requested"
  fi
}

# data_size is left empty when $1 is missing/invalid or data.csv is unreadable.
data_size=$(rows_to_use "${1-}" data.csv)
|
|
|
|
|
|
|
|
# Split data into train, dev, test (80% / 10% / 10%).

# split_dataset TOTAL [SRC]
#   Writes the first TOTAL lines of SRC (default: data.csv) into train.csv,
#   dev.csv and test.csv in the current directory, as three contiguous,
#   non-overlapping ranges in file order.
#   Sizes: train = TOTAL*8/10 (rounded down), dev = half of the remainder
#   (rounded down), test = whatever is left, so the three sizes always add
#   up to TOTAL and no row is dropped by integer rounding.
#   Returns 2 if TOTAL is not a non-negative decimal integer, 1 if SRC is
#   not readable; no output file is touched in either case.
# NOTE(review): lines are split as-is; if SRC starts with a header line it
# ends up in train.csv only - confirm that is what consumers expect.
split_dataset() {
  local total=$1 src=${2:-data.csv}

  case "$total" in
    '' | *[!0-9]*)
      echo "split_dataset: row count must be a non-negative integer, got '$total'" >&2
      return 2
      ;;
  esac
  if [ ! -r "$src" ]; then
    echo "split_dataset: cannot read $src" >&2
    return 1
  fi

  # Force base 10 so a value such as "010" is not read as octal.
  total=$((10#$total))
  # Multiply before dividing; dividing first would throw away up to 9 rows.
  local train_rows=$((total * 8 / 10))
  local holdout_rows=$((total - train_rows))
  local dev_rows=$((holdout_rows / 2))
  local test_rows=$((holdout_rows - dev_rows))

  head -n "$train_rows" "$src" > train.csv || return
  # tail -n +K starts output at line K, so each range begins right after
  # the previous one ends.
  tail -n "+$((train_rows + 1))" "$src" | head -n "$dev_rows" > dev.csv || return
  tail -n "+$((train_rows + dev_rows + 1))" "$src" | head -n "$test_rows" > test.csv
}

split_dataset "$data_size"
|