inzynieria-uczenia-maszynowego/create-dataset/main.py
2023-04-19 20:33:37 +02:00

48 lines
1005 B
Python

# Import dependencies
import numpy as np
import pandas as pd
import sys
from datasets import load_dataset
from sklearn.preprocessing import MinMaxScaler
# Load dataset: fetch 'mstz/spambase' via datasets.load_dataset and turn its
# 'train' entry into a DataFrame.
data = load_dataset('mstz/spambase')
data = pd.DataFrame(data['train'])
# Read CUTOFF: the maximum number of rows to keep, taken from the first
# command-line argument. A missing argument is treated like a non-positive
# value, i.e. "keep every row", instead of crashing with IndexError.
cutoff = int(sys.argv[1]) if len(sys.argv) > 1 else 0
cutoff = cutoff if cutoff > 0 else len(data)
# Shuffle data (sample(frac=1) returns all rows in random order), then keep
# at most `cutoff` of them.
data = data.sample(frac=1).head(cutoff)
# Split data 60% / 20% / 20% into train / validation / test.
# Plain positional slicing is used instead of np.split(DataFrame, ...), which
# relies on the deprecated DataFrame.swapaxes in recent pandas releases; the
# boundaries and resulting partitions are the same.
train_end = int(.6 * len(data))
val_end = int(.8 * len(data))
train_data = data.iloc[:train_end]
val_data = data.iloc[train_end:val_end]
test_data = data.iloc[val_end:]
# Normalize data: the scaler is fitted on the training rows only and then
# applied unchanged to all three partitions.
# NOTE(review): every column is scaled, including any label column present
# in the dataset - confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(train_data)
normalized_train_data = scaler.transform(train_data)
normalized_val_data = scaler.transform(val_data)
normalized_test_data = scaler.transform(test_data)
# Save data: transform() returns plain arrays, so the column names are
# re-attached before writing. to_csv is called with its defaults, so the new
# 0..n-1 row index is written as the first column of each file.
pd.DataFrame(normalized_train_data, columns=data.columns).to_csv('spambase.data.train.csv')
pd.DataFrame(normalized_val_data, columns=data.columns).to_csv('spambase.data.val.csv')
pd.DataFrame(normalized_test_data, columns=data.columns).to_csv('spambase.data.test.csv')