ium_464863/download_dataset.sh
2024-03-20 18:36:43 +01:00

29 lines
849 B
Bash

#!/bin/bash
# Download the uciml/breast-cancer-wisconsin-data dataset from Kaggle and
# split it into train.csv, dev.csv and test.csv in the current directory.
#
# Usage: download_dataset.sh CUTOFF
#   CUTOFF  non-negative integer; only the first CUTOFF lines of data.csv
#           are kept before splitting.
#
# Side effects: data.csv is rewritten in place (first column removed, then
# truncated to CUTOFF lines); train.csv, dev.csv and test.csv are overwritten.
set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

(( $# >= 1 )) || die "Usage: ${0##*/} CUTOFF"
[[ "$1" =~ ^[0-9]+$ ]] \
  || die "CUTOFF must be a non-negative integer, got: $1"
# Force base 10 so a value such as 010 is not read as octal.
cutoff=$(( 10#$1 ))

for tool in kaggle unzip; do
  command -v "$tool" >/dev/null || die "required tool not found: $tool"
done

# Do not leave a half-written temp file behind if any step below fails.
trap 'rm -f -- data.csv.tmp' EXIT

# Download dataset from kaggle
kaggle datasets download -d uciml/breast-cancer-wisconsin-data
# Unzip dataset -> data.csv
unzip -o breast-cancer-wisconsin-data.zip

# Keep only the first CUTOFF lines and remove the id column (first field).
# head runs before cut so that no pipeline stage exits early; otherwise the
# upstream command could die from SIGPIPE and trip `pipefail`.
head -n "$cutoff" data.csv | cut -d, -f2- > data.csv.tmp
mv -- data.csv.tmp data.csv

# Number of lines that survived the cutoff (never more than CUTOFF).
data_size=$(wc -l < data.csv)

# 80% train, remainder halved into dev and test. Multiply before dividing:
# `size / 10 * 8` truncates first and silently drops rows from the split.
train_size=$(( data_size * 8 / 10 ))
dev_size=$(( (data_size - train_size) / 2 ))

# Create/empty all three outputs up front so each file exists even when its
# partition turns out to be empty.
: > train.csv
: > dev.csv
: > test.csv

# Single pass over data.csv: every line lands in exactly one output file.
# NOTE(review): if data.csv starts with a CSV header line, that line counts
# toward CUTOFF and ends up only in train.csv; dev.csv and test.csv get no
# header. Confirm this is what the consumers of these files expect.
awk -v train="$train_size" -v dev="$dev_size" '
  NR <= train       { print > "train.csv"; next }
  NR <= train + dev { print > "dev.csv";   next }
                    { print > "test.csv" }
' data.csv