2024-04-01 13:47:56 +02:00
|
|
|
import os
|
|
|
|
import pandas as pd
|
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
|
|
|
|
|
|
def load_data(name):
    """Read the CSV file at *name* and return it as a pandas DataFrame.

    Args:
        name: Path (or any other source accepted by ``pd.read_csv``) of the
            CSV file to load.

    Returns:
        The parsed ``pandas.DataFrame``, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(name)
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_data(df):
    """Rescale the ``Amount`` column of *df* with a ``StandardScaler``.

    The scaler is fitted on the ``Amount`` values of the frame it is given
    and the column is overwritten with the transformed values. The frame is
    modified in place; the object returned is the same *df* that was passed.

    Args:
        df: DataFrame containing an ``Amount`` column.

    Returns:
        *df*, with ``Amount`` replaced by its scaled values.
    """
    # The scaler expects a 2-D (n_samples, 1) array, hence the reshape.
    amount_column = df["Amount"].values.reshape(-1, 1)
    df["Amount"] = StandardScaler().fit_transform(amount_column)
    return df
|
|
|
|
|
|
|
|
|
2024-04-15 12:58:41 +02:00
|
|
|
def split_data(
    df, *, test_size=0.2, val_size=0.25, random_state=0, target="Class"
):
    """Split *df* into train, validation and test features and labels.

    The frame is first separated into features (every column except
    *target*) and labels (the *target* column). A test set is held out, and
    the remaining rows are then divided into training and validation sets.
    With the defaults, 20% of the rows go to the test set and 25% of the
    remaining 80% go to validation, i.e. roughly a 60/20/20 split.

    Args:
        df: DataFrame holding both the feature columns and the label column.
        test_size: Share of all rows held out for the test set.
        val_size: Share of the rows left after the test split that is
            assigned to the validation set.
        random_state: Seed passed to both ``train_test_split`` calls so the
            split is reproducible.
        target: Name of the label column.

    Returns:
        A tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``. The
        label parts are single-column DataFrames, not Series.

    Raises:
        KeyError: If *df* has no column named *target*. Without this check
            the split would silently produce label frames with no columns.
    """
    if target not in df.columns:
        raise KeyError(f"label column {target!r} not found in DataFrame")

    # Boolean column masks keep the labels as a DataFrame (not a Series),
    # which is what downstream code receives from this function.
    is_target = df.columns == target
    X = df.iloc[:, ~is_target]
    y = df.iloc[:, is_target]

    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_size, random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test
|
2024-04-01 13:47:56 +02:00
|
|
|
|
|
|
|
|
2024-04-15 12:58:41 +02:00
|
|
|
def save_data(
    df, X_train, X_val, X_test, y_train, y_val, y_test, out_dir="data"
):
    """Write the full frame and all six split parts to CSV files.

    Each object is written with ``to_csv(..., index=False)`` into *out_dir*
    under a fixed file name: ``creditcard.csv`` for *df*, and
    ``X_train.csv``, ``X_val.csv``, ``X_test.csv``, ``y_train.csv``,
    ``y_val.csv``, ``y_test.csv`` for the split parts. Existing files with
    those names are overwritten.

    Args:
        df: The complete DataFrame.
        X_train, X_val, X_test: Feature frames of the three splits.
        y_train, y_val, y_test: Label frames of the three splits.
        out_dir: Directory to write into. It is created (including parents)
            if it does not exist yet. Defaults to ``"data"``.
    """
    # Create the target directory up front so the function also works when
    # the caller has not prepared it.
    os.makedirs(out_dir, exist_ok=True)

    outputs = {
        "creditcard.csv": df,
        "X_train.csv": X_train,
        "X_val.csv": X_val,
        "X_test.csv": X_test,
        "y_train.csv": y_train,
        "y_val.csv": y_val,
        "y_test.csv": y_test,
    }
    for filename, frame in outputs.items():
        frame.to_csv(os.path.join(out_dir, filename), index=False)
|
2024-04-01 13:47:56 +02:00
|
|
|
|
|
|
|
|
|
|
|
def _clear_directory(path):
    """Delete every non-hidden entry directly inside *path*, keeping *path*.

    This mirrors what a shell ``rm -rf path/*`` does: names starting with a
    dot are left alone (``glob`` skips them just as the shell does),
    sub-directories are removed recursively, and a symlink is unlinked
    without touching what it points to.
    """
    import glob
    import shutil

    for entry in glob.glob(os.path.join(path, "*")):
        if os.path.isdir(entry) and not os.path.islink(entry):
            shutil.rmtree(entry)
        else:
            os.remove(entry)


def main():
    """Run the preprocessing pipeline end to end.

    Ensures the ``data`` directory exists, clears its previous contents,
    loads ``creditcard.csv`` from the current working directory, scales the
    ``Amount`` column, splits the rows into train/validation/test parts and
    writes everything to CSV files under ``data``.
    """
    os.makedirs("data", exist_ok=True)
    # Remove stale outputs in-process instead of shelling out to `rm`, so
    # the script does not depend on a POSIX shell and failures are not
    # silently ignored.
    _clear_directory("data")

    df = load_data("creditcard.csv")
    df = normalize_data(df)

    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df)
    save_data(df, X_train, X_val, X_test, y_train, y_val, y_test)
|
2024-04-01 17:53:47 +02:00
|
|
|
|
2024-04-01 13:47:56 +02:00
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|