"""Prepare the `mstz/spambase` dataset as normalized train/val/test CSV files.

Usage: run as a script with an optional integer CUTOFF as the first
command-line argument. CUTOFF limits how many (shuffled) rows are kept; a
missing or non-positive CUTOFF keeps every row.

Steps performed at import/run time:
  1. Load the 'train' split of `mstz/spambase` into a pandas DataFrame.
  2. Shuffle the rows and keep the first CUTOFF of them.
  3. Split the rows 60% / 20% / 20% into train / validation / test.
  4. Fit a MinMaxScaler on the training rows only and apply it to all
     three splits (every column is passed to the scaler).
  5. Write `spambase.data.{train,val,test}.csv` to the current directory.
"""
# Import dependencies
import numpy as np
import pandas as pd
import sys

from datasets import load_dataset
from sklearn.preprocessing import MinMaxScaler

# Load dataset
data = load_dataset('mstz/spambase')
data = pd.DataFrame(data['train'])

# Read CUTOFF
# A missing argument is treated like 0, i.e. "use the whole dataset",
# instead of crashing with an IndexError.
cutoff = int(sys.argv[1]) if len(sys.argv) > 1 else 0
cutoff = cutoff if cutoff > 0 else len(data)

# Shuffle data
# NOTE: no random_state is passed, so the shuffle differs between runs.
data = data.sample(frac=1).head(cutoff)

# Split data
# Positional slicing replaces np.split(DataFrame, ...), which relies on the
# deprecated DataFrame.swapaxes; the split boundaries are unchanged.
train_end = int(.6 * len(data))
val_end = int(.8 * len(data))
train_data = data.iloc[:train_end]
val_data = data.iloc[train_end:val_end]
test_data = data.iloc[val_end:]

# The scaler cannot fit or transform an empty split; fail early with a
# message that names the actual problem.
if train_data.empty or val_data.empty or test_data.empty:
    raise ValueError(
        f'Cannot split {len(data)} row(s) into non-empty train/val/test '
        'sets; use a larger CUTOFF.'
    )

# Normalize data
# The scaler is fitted on the training rows only, so validation/test values
# may fall outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(train_data)

# Save data
# to_csv keeps its default index=True, so each file starts with an unnamed
# 0..n-1 index column (same layout as before).
splits = {'train': train_data, 'val': val_data, 'test': test_data}
for split_name, split_frame in splits.items():
    normalized = scaler.transform(split_frame)
    pd.DataFrame(normalized, columns=data.columns).to_csv(
        f'spambase.data.{split_name}.csv'
    )