IUM_05
This commit is contained in:
parent
ffdbe0a365
commit
a6be9a7295
1
.gitignore
vendored
1
.gitignore
vendored
@ -2,3 +2,4 @@ creditcardfraud.zip
|
|||||||
creditcard.csv
|
creditcard.csv
|
||||||
data
|
data
|
||||||
model/model.keras
|
model/model.keras
|
||||||
|
stats_data
|
@ -1,9 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from sklearn.preprocessing import StandardScaler
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
@ -18,72 +15,27 @@ def normalize_data(df):
|
|||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def create_undersample_data(df):
|
def split_data(df):
|
||||||
# Determine the number of instances in the minority class
|
|
||||||
fraud_count = len(df[df.Class == 1])
|
|
||||||
fraud_indices = np.array(df[df.Class == 1].index)
|
|
||||||
|
|
||||||
# Select indices corresponding to majority class instances
|
|
||||||
normal_indices = df[df.Class == 0].index
|
|
||||||
|
|
||||||
# Randomly sample the same number of instances from the majority class
|
|
||||||
random_normal_indices = np.random.choice(normal_indices, fraud_count, replace=False)
|
|
||||||
random_normal_indices = np.array(random_normal_indices)
|
|
||||||
|
|
||||||
# Combine indices of both classes
|
|
||||||
undersample_indice = np.concatenate([fraud_indices, random_normal_indices])
|
|
||||||
|
|
||||||
# Undersample dataset
|
|
||||||
undersample_data = df.iloc[undersample_indice, :]
|
|
||||||
|
|
||||||
X_undersample = undersample_data.iloc[:, undersample_data.columns != "Class"]
|
|
||||||
y_undersample = undersample_data.iloc[:, undersample_data.columns == "Class"]
|
|
||||||
|
|
||||||
return undersample_data, X_undersample, y_undersample
|
|
||||||
|
|
||||||
|
|
||||||
def split_undersample_data(X_undersample, y_undersample):
|
|
||||||
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = (
|
|
||||||
train_test_split(X_undersample, y_undersample, test_size=0.3, random_state=0)
|
|
||||||
)
|
|
||||||
|
|
||||||
return (
|
|
||||||
X_train_undersample,
|
|
||||||
X_test_undersample,
|
|
||||||
y_train_undersample,
|
|
||||||
y_test_undersample,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def save_undersample_data(
|
|
||||||
undersample_data,
|
|
||||||
X_train_undersample,
|
|
||||||
X_test_undersample,
|
|
||||||
y_train_undersample,
|
|
||||||
y_test_undersample,
|
|
||||||
):
|
|
||||||
undersample_data.to_csv("data/undersample_data.csv", index=False)
|
|
||||||
X_train_undersample.to_csv("data/X_train_undersample.csv", index=False)
|
|
||||||
X_test_undersample.to_csv("data/X_test_undersample.csv", index=False)
|
|
||||||
y_train_undersample.to_csv("data/y_train_undersample.csv", index=False)
|
|
||||||
y_test_undersample.to_csv("data/y_test_undersample.csv", index=False)
|
|
||||||
|
|
||||||
|
|
||||||
def split_whole_data(df):
|
|
||||||
X = df.iloc[:, df.columns != "Class"]
|
X = df.iloc[:, df.columns != "Class"]
|
||||||
y = df.iloc[:, df.columns == "Class"]
|
y = df.iloc[:, df.columns == "Class"]
|
||||||
|
|
||||||
X_train, X_test, y_train, y_test = train_test_split(
|
X_train, X_test, y_train, y_test = train_test_split(
|
||||||
X, y, test_size=0.3, random_state=0
|
X, y, test_size=0.2, random_state=0
|
||||||
)
|
)
|
||||||
return X_train, X_test, y_train, y_test
|
X_train, X_val, y_train, y_val = train_test_split(
|
||||||
|
X_train, y_train, test_size=0.25, random_state=0
|
||||||
|
)
|
||||||
|
|
||||||
|
return X_train, X_val, X_test, y_train, y_val, y_test
|
||||||
|
|
||||||
|
|
||||||
def save_whole_data(df, X_train, X_test, y_train, y_test):
|
def save_data(df, X_train, X_val, X_test, y_train, y_val, y_test):
|
||||||
df.to_csv("data/creditcard.csv", index=False)
|
df.to_csv("data/creditcard.csv", index=False)
|
||||||
X_train.to_csv("data/X_train.csv", index=False)
|
X_train.to_csv("data/X_train.csv", index=False)
|
||||||
|
X_val.to_csv("data/X_val.csv", index=False)
|
||||||
X_test.to_csv("data/X_test.csv", index=False)
|
X_test.to_csv("data/X_test.csv", index=False)
|
||||||
y_train.to_csv("data/y_train.csv", index=False)
|
y_train.to_csv("data/y_train.csv", index=False)
|
||||||
|
y_val.to_csv("data/y_val.csv", index=False)
|
||||||
y_test.to_csv("data/y_test.csv", index=False)
|
y_test.to_csv("data/y_test.csv", index=False)
|
||||||
|
|
||||||
|
|
||||||
@ -94,20 +46,8 @@ def main():
|
|||||||
df = load_data("creditcard.csv")
|
df = load_data("creditcard.csv")
|
||||||
df = normalize_data(df)
|
df = normalize_data(df)
|
||||||
|
|
||||||
undersample_data, X_undersample, y_undersample = create_undersample_data(df)
|
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df)
|
||||||
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = (
|
save_data(df, X_train, X_val, X_test, y_train, y_val, y_test)
|
||||||
split_undersample_data(X_undersample, y_undersample)
|
|
||||||
)
|
|
||||||
save_undersample_data(
|
|
||||||
undersample_data,
|
|
||||||
X_train_undersample,
|
|
||||||
X_test_undersample,
|
|
||||||
y_train_undersample,
|
|
||||||
y_test_undersample,
|
|
||||||
)
|
|
||||||
|
|
||||||
X_train, X_test, y_train, y_test = split_whole_data(df)
|
|
||||||
save_whole_data(df, X_train, X_test, y_train, y_test)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -7,16 +7,12 @@ def write_to_file(file_name):
|
|||||||
df = pd.read_csv("data/creditcard.csv")
|
df = pd.read_csv("data/creditcard.csv")
|
||||||
pd.set_option("display.max_columns", None)
|
pd.set_option("display.max_columns", None)
|
||||||
|
|
||||||
undersample_data = pd.read_csv("data/undersample_data.csv")
|
|
||||||
X_test_undersample = pd.read_csv("data/X_test_undersample.csv")
|
|
||||||
y_test_undersample = pd.read_csv("data/y_test_undersample.csv")
|
|
||||||
X_train_undersample = pd.read_csv("data/X_train_undersample.csv")
|
|
||||||
y_train_undersample = pd.read_csv("data/y_train_undersample.csv")
|
|
||||||
|
|
||||||
X_test = pd.read_csv("data/X_test.csv")
|
|
||||||
y_test = pd.read_csv("data/y_test.csv")
|
|
||||||
X_train = pd.read_csv("data/X_train.csv")
|
X_train = pd.read_csv("data/X_train.csv")
|
||||||
|
X_val = pd.read_csv("data/X_val.csv")
|
||||||
|
X_test = pd.read_csv("data/X_test.csv")
|
||||||
y_train = pd.read_csv("data/y_train.csv")
|
y_train = pd.read_csv("data/y_train.csv")
|
||||||
|
y_val = pd.read_csv("data/y_val.csv")
|
||||||
|
y_test = pd.read_csv("data/y_test.csv")
|
||||||
|
|
||||||
with open("stats_data/" + file_name, "w") as f:
|
with open("stats_data/" + file_name, "w") as f:
|
||||||
sys.stdout = f
|
sys.stdout = f
|
||||||
@ -41,24 +37,7 @@ def write_to_file(file_name):
|
|||||||
|
|
||||||
f.write("\n\n")
|
f.write("\n\n")
|
||||||
|
|
||||||
f.write("Size of undersampled dataset\n")
|
f.write("Statistical measures of the training dataset\n")
|
||||||
undersample_data.info()
|
|
||||||
|
|
||||||
f.write("\n\n")
|
|
||||||
|
|
||||||
f.write("Summary statistics of the undersampled dataset\n")
|
|
||||||
f.write(str(undersample_data.describe()))
|
|
||||||
|
|
||||||
f.write("\n\n")
|
|
||||||
|
|
||||||
f.write(
|
|
||||||
"Distribution of legitimate and fraudulent transactions in an undersampled dataset\n"
|
|
||||||
)
|
|
||||||
f.write(str(undersample_data["Class"].value_counts()))
|
|
||||||
|
|
||||||
f.write("\n\n")
|
|
||||||
|
|
||||||
f.write("Statistical measures of the training dataset of whole data\n")
|
|
||||||
pd.concat([X_train, y_train], axis=1).info()
|
pd.concat([X_train, y_train], axis=1).info()
|
||||||
f.write("\n")
|
f.write("\n")
|
||||||
f.write(str(pd.concat([X_train, y_train], axis=1).describe()))
|
f.write(str(pd.concat([X_train, y_train], axis=1).describe()))
|
||||||
@ -67,49 +46,22 @@ def write_to_file(file_name):
|
|||||||
|
|
||||||
f.write("\n\n")
|
f.write("\n\n")
|
||||||
|
|
||||||
f.write("Statistical measures of the test dataset of whole data\n")
|
f.write("Statistical measures of the validation dataset\n")
|
||||||
|
pd.concat([X_val, y_val], axis=1).info()
|
||||||
|
f.write("\n")
|
||||||
|
f.write(str(pd.concat([X_val, y_val], axis=1).describe()))
|
||||||
|
f.write("\n")
|
||||||
|
f.write(str(pd.concat([X_val, y_val], axis=1)["Class"].value_counts()))
|
||||||
|
|
||||||
|
f.write("\n\n")
|
||||||
|
|
||||||
|
f.write("Statistical measures of the test dataset\n")
|
||||||
pd.concat([X_test, y_test], axis=1).info()
|
pd.concat([X_test, y_test], axis=1).info()
|
||||||
f.write("\n")
|
f.write("\n")
|
||||||
f.write(str(pd.concat([X_test, y_test], axis=1).describe()))
|
f.write(str(pd.concat([X_test, y_test], axis=1).describe()))
|
||||||
f.write("\n")
|
f.write("\n")
|
||||||
f.write(str(pd.concat([X_test, y_test], axis=1)["Class"].value_counts()))
|
f.write(str(pd.concat([X_test, y_test], axis=1)["Class"].value_counts()))
|
||||||
|
|
||||||
f.write("\n\n")
|
|
||||||
|
|
||||||
f.write("Statistical measures of the training dataset of undersampled data\n")
|
|
||||||
pd.concat([X_train_undersample, y_train_undersample], axis=1).info()
|
|
||||||
f.write("\n")
|
|
||||||
f.write(
|
|
||||||
str(
|
|
||||||
pd.concat([X_train_undersample, y_train_undersample], axis=1).describe()
|
|
||||||
)
|
|
||||||
)
|
|
||||||
f.write("\n")
|
|
||||||
f.write(
|
|
||||||
str(
|
|
||||||
pd.concat([X_train_undersample, y_train_undersample], axis=1)[
|
|
||||||
"Class"
|
|
||||||
].value_counts()
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
f.write("\n\n")
|
|
||||||
|
|
||||||
f.write("Statistical measures of the test dataset of undersampled data\n")
|
|
||||||
pd.concat([X_test_undersample, y_test_undersample], axis=1).info()
|
|
||||||
f.write("\n")
|
|
||||||
f.write(
|
|
||||||
str(pd.concat([X_test_undersample, y_test_undersample], axis=1).describe())
|
|
||||||
)
|
|
||||||
f.write("\n")
|
|
||||||
f.write(
|
|
||||||
str(
|
|
||||||
pd.concat([X_test_undersample, y_test_undersample], axis=1)[
|
|
||||||
"Class"
|
|
||||||
].value_counts()
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
sys.stdout = sys.__stdout__
|
sys.stdout = sys.__stdout__
|
||||||
|
|
||||||
|
|
||||||
|
@ -10,12 +10,17 @@ import pandas as pd
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
X_train = pd.read_csv("data/X_train.csv")
|
X_train = pd.read_csv("data/X_train.csv")
|
||||||
|
X_val = pd.read_csv("data/X_val.csv")
|
||||||
y_train = pd.read_csv("data/y_train.csv")
|
y_train = pd.read_csv("data/y_train.csv")
|
||||||
|
y_val = pd.read_csv("data/y_val.csv")
|
||||||
|
|
||||||
X_train = X_train.to_numpy()
|
X_train = X_train.to_numpy()
|
||||||
|
X_val = X_val.to_numpy()
|
||||||
y_train = y_train.to_numpy()
|
y_train = y_train.to_numpy()
|
||||||
|
y_val = y_val.to_numpy()
|
||||||
|
|
||||||
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
|
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
|
||||||
|
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)
|
||||||
|
|
||||||
model = Sequential(
|
model = Sequential(
|
||||||
[
|
[
|
||||||
@ -41,6 +46,7 @@ def main():
|
|||||||
model.fit(
|
model.fit(
|
||||||
X_train,
|
X_train,
|
||||||
y_train,
|
y_train,
|
||||||
|
validation_data=(X_val, y_val),
|
||||||
epochs=5,
|
epochs=5,
|
||||||
verbose=1,
|
verbose=1,
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user