#!/bin/bash
#
# Download the Kaggle dataset "uciml/breast-cancer-wisconsin-data", drop its
# first CSV column, keep at most N leading lines, and split the result into
# train.csv (80%), dev.csv (10%) and test.csv (10%) in the current directory.
#
# Usage: <script> N
#   N  Maximum number of leading lines of data.csv to keep before splitting
#      (non-negative integer). Extra arguments are ignored.
#
# Requires the `kaggle` CLI and `unzip` on PATH.
#
# Files written in the current directory:
#   breast-cancer-wisconsin-data.zip, data.csv (overwritten in place),
#   train.csv, dev.csv, test.csv
#
# NOTE(review): the split is purely positional on the lines of data.csv. If
# data.csv starts with a header line (presumably it does; not verifiable from
# this script), that line counts toward N and ends up only in train.csv,
# leaving dev.csv and test.csv headerless. Confirm downstream readers expect
# this before changing it.

set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

(( $# >= 1 )) || die "usage: ${0##*/} N"
[[ $1 =~ ^[0-9]+$ ]] || die "N must be a non-negative integer, got: $1"
readonly max_lines=$1

command -v kaggle >/dev/null || die "kaggle CLI not found on PATH"
command -v unzip >/dev/null || die "unzip not found on PATH"

# Scratch space for intermediate files; removed on every exit path.
tmp_dir=$(mktemp -d) || die "mktemp failed"
trap 'rm -rf -- "$tmp_dir"' EXIT

# Download the archive and extract it; this script expects the archive to
# provide data.csv.
kaggle datasets download -d uciml/breast-cancer-wisconsin-data
unzip -o breast-cancer-wisconsin-data.zip
[[ -f data.csv ]] || die "data.csv not found after unzip"

# Keep only the first N lines and drop the first (id) column. `head` runs
# first so no pipeline stage is killed by SIGPIPE under `pipefail`; the result
# is the same as cutting first because `cut` works line by line.
head -n "$max_lines" data.csv | cut -d, -f2- > "$tmp_dir/data.csv"
mv -- "$tmp_dir/data.csv" data.csv

# data.csv was just truncated to at most N lines, so its line count already is
# min(N, original line count). The arithmetic expansion also strips the
# padding some `wc` implementations emit.
total=$(( $(wc -l < data.csv) ))

# Multiply before dividing so integer truncation does not discard rows, and
# derive the later partitions from what is left so that train + dev + test
# cover every line exactly once.
train_n=$(( total * 8 / 10 ))
holdout_n=$(( total - train_n ))
dev_n=$(( holdout_n / 2 ))   # test.csv receives the extra line when odd

head -n "$train_n" data.csv > train.csv
tail -n "+$(( train_n + 1 ))" data.csv > "$tmp_dir/holdout.csv"

head -n "$dev_n" "$tmp_dir/holdout.csv" > dev.csv
tail -n "+$(( dev_n + 1 ))" "$tmp_dir/holdout.csv" > test.csv