ium_464913/create-dataset.sh
2024-04-01 13:53:46 +02:00

42 lines
1.4 KiB
Bash

#!/bin/bash
# Install the Kaggle API
pip install kaggle
# Download the dataset from Kaggle
kaggle datasets download -d mlg-ulb/creditcardfraud
# Unzip the dataset
unzip -o creditcardfraud.zip
# Remove the zip file
rm creditcardfraud.zip
# Create a header file
head -n 1 creditcard.csv > creditcard_header.csv
# Remove the header from the dataset
tail -n +2 creditcard.csv > creditcard_no_header.csv
# Remove the original dataset
rm creditcard.csv
# Shuffle the dataset
shuf creditcard_no_header.csv > creditcard_shuf_no_header.csv
# Remove the unshuffled dataset
rm creditcard_no_header.csv
# Add the header back to the shuffled dataset
cat creditcard_header.csv creditcard_shuf_no_header.csv > creditcard_shuf.csv
# Split the dataset into training and testing
tail -n +10001 creditcard_shuf_no_header.csv > creditcard_train_no_header.csv
head -n 10000 creditcard_shuf_no_header.csv > creditcard_test_no_header.csv
# Add the header back to the training and testing datasets
cat creditcard_header.csv creditcard_train_no_header.csv > creditcard_train.csv
cat creditcard_header.csv creditcard_test_no_header.csv > creditcard_test.csv
# Remove the intermediate files
rm creditcard_header.csv creditcard_shuf_no_header.csv creditcard_train_no_header.csv creditcard_test_no_header.csv
# Create a directory for the data
mkdir -p data
# Move the datasets to the data directory
mv creditcard_shuf.csv creditcard_train.csv creditcard_test.csv data/