#!/bin/bash
#
# Download the Kaggle dataset "mlg-ulb/creditcardfraud" and prepare shuffled
# train/test CSV splits from it.
#
# Usage:
#   ./<this-script>            # 10000-row test split (default)
#   TEST_ROWS=5000 ./<this-script>
#
# Environment:
#   TEST_ROWS  Number of shuffled data rows placed in the test split
#              (default: 10000). The remaining rows form the training split.
#
# Outputs (relative to the current directory):
#   data/creditcard_shuf.csv   header + all rows in shuffled order
#   data/creditcard_test.csv   header + first TEST_ROWS shuffled rows
#   data/creditcard_train.csv  header + the remaining shuffled rows
#
# Requires: kaggle (CLI), unzip, shuf.
# NOTE(review): the kaggle CLI presumably needs API credentials configured
# beforehand -- confirm for the target environment.

set -euo pipefail

readonly DATASET='mlg-ulb/creditcardfraud'
readonly ARCHIVE_NAME='creditcardfraud.zip'
readonly CSV_NAME='creditcard.csv'
readonly OUT_DIR='data'

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

main() {
  local test_rows=${TEST_ROWS:-10000}
  local tool work_dir header rows total_rows

  for tool in kaggle unzip shuf; do
    command -v "$tool" >/dev/null || die "required tool not found: $tool"
  done

  [[ "$test_rows" =~ ^[1-9][0-9]*$ ]] \
    || die "TEST_ROWS must be a positive integer, got: $test_rows"

  # All intermediate files live in a private scratch directory that is
  # removed on every exit path, so a failed run leaves no partial files
  # behind and never clobbers a pre-existing creditcard.csv. It is created
  # under the current directory so the final mv stays on one filesystem.
  work_dir=$(mktemp -d "${PWD}/creditcard_prep.XXXXXX") \
    || die "could not create scratch directory"
  # Expand work_dir now: it is local to main and out of scope when EXIT fires.
  # shellcheck disable=SC2064
  trap "rm -rf -- '${work_dir}'" EXIT

  # Download the dataset from Kaggle and unpack it into the scratch directory.
  kaggle datasets download -d "$DATASET" -p "$work_dir"
  unzip -o "${work_dir}/${ARCHIVE_NAME}" -d "$work_dir"
  [[ -s "${work_dir}/${CSV_NAME}" ]] \
    || die "${CSV_NAME} missing or empty after unzip"

  # Separate the header line from the data rows and shuffle only the rows.
  header="${work_dir}/header.csv"
  rows="${work_dir}/shuffled_rows.csv"
  head -n 1 -- "${work_dir}/${CSV_NAME}" > "$header"
  tail -n +2 -- "${work_dir}/${CSV_NAME}" | shuf > "$rows"

  # Refuse to produce an empty training split.
  total_rows=$(wc -l < "$rows")
  (( test_rows < total_rows )) \
    || die "TEST_ROWS ($test_rows) must be smaller than the row count ($total_rows)"

  # Re-attach the header: full shuffled set, test split (first TEST_ROWS
  # rows) and training split (everything after them).
  cat -- "$header" "$rows" > "${work_dir}/creditcard_shuf.csv"
  {
    cat -- "$header"
    head -n "$test_rows" -- "$rows"
  } > "${work_dir}/creditcard_test.csv"
  {
    cat -- "$header"
    tail -n "+$(( test_rows + 1 ))" -- "$rows"
  } > "${work_dir}/creditcard_train.csv"

  # Publish the finished datasets only after every step above succeeded.
  mkdir -p -- "$OUT_DIR"
  mv -- \
    "${work_dir}/creditcard_shuf.csv" \
    "${work_dir}/creditcard_train.csv" \
    "${work_dir}/creditcard_test.csv" \
    "${OUT_DIR}/"
}

main "$@"