diff --git a/.gitignore b/.gitignore index 4e8128b..06ccd0b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ creditcardfraud.zip creditcard.csv data -model/model.keras \ No newline at end of file +model/model.keras +stats_data \ No newline at end of file diff --git a/create-dataset.py b/create-dataset.py index 4903d69..b0ddd7c 100644 --- a/create-dataset.py +++ b/create-dataset.py @@ -1,9 +1,6 @@ import os import pandas as pd -import numpy as np - from sklearn.preprocessing import StandardScaler - from sklearn.model_selection import train_test_split @@ -18,72 +15,27 @@ def normalize_data(df): return df -def create_undersample_data(df): - # Determine the number of instances in the minority class - fraud_count = len(df[df.Class == 1]) - fraud_indices = np.array(df[df.Class == 1].index) - - # Select indices corresponding to majority class instances - normal_indices = df[df.Class == 0].index - - # Randomly sample the same number of instances from the majority class - random_normal_indices = np.random.choice(normal_indices, fraud_count, replace=False) - random_normal_indices = np.array(random_normal_indices) - - # Combine indices of both classes - undersample_indice = np.concatenate([fraud_indices, random_normal_indices]) - - # Undersample dataset - undersample_data = df.iloc[undersample_indice, :] - - X_undersample = undersample_data.iloc[:, undersample_data.columns != "Class"] - y_undersample = undersample_data.iloc[:, undersample_data.columns == "Class"] - - return undersample_data, X_undersample, y_undersample - - -def split_undersample_data(X_undersample, y_undersample): - X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = ( - train_test_split(X_undersample, y_undersample, test_size=0.3, random_state=0) - ) - - return ( - X_train_undersample, - X_test_undersample, - y_train_undersample, - y_test_undersample, - ) - - -def save_undersample_data( - undersample_data, - X_train_undersample, - X_test_undersample, - y_train_undersample, - y_test_undersample, -): - undersample_data.to_csv("data/undersample_data.csv", index=False) - X_train_undersample.to_csv("data/X_train_undersample.csv", index=False) - X_test_undersample.to_csv("data/X_test_undersample.csv", index=False) - y_train_undersample.to_csv("data/y_train_undersample.csv", index=False) - y_test_undersample.to_csv("data/y_test_undersample.csv", index=False) - - -def split_whole_data(df): +def split_data(df): X = df.iloc[:, df.columns != "Class"] y = df.iloc[:, df.columns == "Class"] X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.3, random_state=0 + X, y, test_size=0.2, random_state=0 ) - return X_train, X_test, y_train, y_test + X_train, X_val, y_train, y_val = train_test_split( + X_train, y_train, test_size=0.25, random_state=0 + ) + + return X_train, X_val, X_test, y_train, y_val, y_test -def save_whole_data(df, X_train, X_test, y_train, y_test): +def save_data(df, X_train, X_val, X_test, y_train, y_val, y_test): df.to_csv("data/creditcard.csv", index=False) X_train.to_csv("data/X_train.csv", index=False) + X_val.to_csv("data/X_val.csv", index=False) X_test.to_csv("data/X_test.csv", index=False) y_train.to_csv("data/y_train.csv", index=False) + y_val.to_csv("data/y_val.csv", index=False) y_test.to_csv("data/y_test.csv", index=False) @@ -94,20 +46,8 @@ def main(): df = load_data("creditcard.csv") df = normalize_data(df) - undersample_data, X_undersample, y_undersample = create_undersample_data(df) - X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = ( - split_undersample_data(X_undersample, y_undersample) - ) - save_undersample_data( - undersample_data, - X_train_undersample, - X_test_undersample, - y_train_undersample, - y_test_undersample, - ) - - X_train, X_test, y_train, y_test = split_whole_data(df) - save_whole_data(df, X_train, X_test, y_train, y_test) + X_train, X_val, X_test, y_train, y_val, y_test = split_data(df) + save_data(df, X_train, X_val, X_test, y_train, y_val, y_test) if __name__ == "__main__": diff --git a/dataset-stats.py b/dataset-stats.py index 7cb9d2a..b994488 100644 --- a/dataset-stats.py +++ b/dataset-stats.py @@ -7,16 +7,12 @@ def write_to_file(file_name): df = pd.read_csv("data/creditcard.csv") pd.set_option("display.max_columns", None) - undersample_data = pd.read_csv("data/undersample_data.csv") - X_test_undersample = pd.read_csv("data/X_test_undersample.csv") - y_test_undersample = pd.read_csv("data/y_test_undersample.csv") - X_train_undersample = pd.read_csv("data/X_train_undersample.csv") - y_train_undersample = pd.read_csv("data/y_train_undersample.csv") - - X_test = pd.read_csv("data/X_test.csv") - y_test = pd.read_csv("data/y_test.csv") X_train = pd.read_csv("data/X_train.csv") + X_val = pd.read_csv("data/X_val.csv") + X_test = pd.read_csv("data/X_test.csv") y_train = pd.read_csv("data/y_train.csv") + y_val = pd.read_csv("data/y_val.csv") + y_test = pd.read_csv("data/y_test.csv") with open("stats_data/" + file_name, "w") as f: sys.stdout = f @@ -41,24 +37,7 @@ def write_to_file(file_name): f.write("\n\n") - f.write("Size of undersampled dataset\n") - undersample_data.info() - - f.write("\n\n") - - f.write("Summary statistics of the undersampled dataset\n") - f.write(str(undersample_data.describe())) - - f.write("\n\n") - - f.write( - "Distribution of legitimate and fraudulent transactions in an undersampled dataset\n" - ) - f.write(str(undersample_data["Class"].value_counts())) - - f.write("\n\n") - - f.write("Statistical measures of the training dataset of whole data\n") + f.write("Statistical measures of the training dataset\n") pd.concat([X_train, y_train], axis=1).info() f.write("\n") f.write(str(pd.concat([X_train, y_train], axis=1).describe())) @@ -67,49 +46,22 @@ def write_to_file(file_name): f.write("\n\n") - f.write("Statistical measures of the test dataset of whole data\n") + f.write("Statistical measures of the validation dataset\n") + pd.concat([X_val, y_val], axis=1).info() + f.write("\n") + f.write(str(pd.concat([X_val, y_val], axis=1).describe())) + f.write("\n") + f.write(str(pd.concat([X_val, y_val], axis=1)["Class"].value_counts())) + + f.write("\n\n") + + f.write("Statistical measures of the test dataset\n") pd.concat([X_test, y_test], axis=1).info() f.write("\n") f.write(str(pd.concat([X_test, y_test], axis=1).describe())) f.write("\n") f.write(str(pd.concat([X_test, y_test], axis=1)["Class"].value_counts())) - f.write("\n\n") - - f.write("Statistical measures of the training dataset of undersampled data\n") - pd.concat([X_train_undersample, y_train_undersample], axis=1).info() - f.write("\n") - f.write( - str( - pd.concat([X_train_undersample, y_train_undersample], axis=1).describe() - ) - ) - f.write("\n") - f.write( - str( - pd.concat([X_train_undersample, y_train_undersample], axis=1)[ - "Class" - ].value_counts() - ) - ) - - f.write("\n\n") - - f.write("Statistical measures of the test dataset of undersampled data\n") - pd.concat([X_test_undersample, y_test_undersample], axis=1).info() - f.write("\n") - f.write( - str(pd.concat([X_test_undersample, y_test_undersample], axis=1).describe()) - ) - f.write("\n") - f.write( - str( - pd.concat([X_test_undersample, y_test_undersample], axis=1)[ - "Class" - ].value_counts() - ) - ) - sys.stdout = sys.__stdout__ diff --git a/train_model.py b/train_model.py index c77ecb5..6706bdc 100644 --- a/train_model.py +++ b/train_model.py @@ -10,12 +10,17 @@ import pandas as pd def main(): X_train = pd.read_csv("data/X_train.csv") + X_val = pd.read_csv("data/X_val.csv") y_train = pd.read_csv("data/y_train.csv") + y_val = pd.read_csv("data/y_val.csv") X_train = X_train.to_numpy() + X_val = X_val.to_numpy() y_train = y_train.to_numpy() + y_val = y_val.to_numpy() X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1) + X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1) model = Sequential( [ @@ -41,6 +46,7 @@ def main(): model.fit( X_train, y_train, + validation_data=(X_val, y_val), epochs=5, verbose=1, )