120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
import os
|
|
import pandas as pd
|
|
|
|
|
|
def _info_text(frame):
    """Return the summary that ``frame.info()`` prints, captured as a string."""
    import io  # function-scope import keeps this block self-contained

    captured = io.StringIO()
    # DataFrame.info() prints to ``buf`` (stdout by default) and returns None,
    # so str(frame.info()) would yield the literal text "None".
    frame.info(buf=captured)
    return captured.getvalue().rstrip("\n")


def _write_split_section(f, title, features, labels):
    """Write info, summary statistics and class counts for one data split."""
    # Build the combined frame once instead of once per statistic.
    combined = pd.concat([features, labels], axis=1)
    f.write(title + "\n")
    f.write(_info_text(combined))
    f.write("\n")
    f.write(str(combined.describe()))
    f.write("\n")
    f.write(str(combined["Class"].value_counts()))


def write_to_file(file_name):
    """Write descriptive statistics of the datasets under ``data/`` to a report.

    Reads the full dataset, the undersampled dataset and the train/test
    splits of both from CSV files in ``data/``, then writes missing-value
    counts, ``info()`` summaries, ``describe()`` tables and ``Class`` value
    counts to ``stats_data/<file_name>``.

    As a side effect the pandas option ``display.max_columns`` is set to
    ``None`` so that wide tables are rendered with all of their columns.

    Args:
        file_name: Name of the report file created inside ``stats_data/``.
            The directory itself must already exist.

    Raises:
        FileNotFoundError: If an input CSV or the ``stats_data`` directory
            is missing.
        KeyError: If a dataset has no ``"Class"`` column.
    """
    df = pd.read_csv("data/creditcard.csv")
    pd.set_option("display.max_columns", None)

    undersample_data = pd.read_csv("data/undersample_data.csv")
    X_test_undersample = pd.read_csv("data/X_test_undersample.csv")
    y_test_undersample = pd.read_csv("data/y_test_undersample.csv")
    X_train_undersample = pd.read_csv("data/X_train_undersample.csv")
    y_train_undersample = pd.read_csv("data/y_train_undersample.csv")

    X_test = pd.read_csv("data/X_test.csv")
    y_test = pd.read_csv("data/y_test.csv")
    X_train = pd.read_csv("data/X_train.csv")
    y_train = pd.read_csv("data/y_train.csv")

    with open("stats_data/" + file_name, "w", encoding="utf-8") as f:
        f.write("Check missing values\n")
        f.write(str(df.isnull().sum()))
        f.write("\n\n")

        f.write("Size of the dataset\n")
        f.write(_info_text(df))
        f.write("\n\n")

        f.write("Summary statistics\n")
        f.write(str(df.describe()))
        f.write("\n\n")

        f.write("Distribution of legitimate and fraudulent transactions\n")
        f.write(str(df["Class"].value_counts()))
        f.write("\n\n")

        f.write("Size of undersampled dataset\n")
        f.write(_info_text(undersample_data))
        f.write("\n\n")

        f.write("Summary statistics of the undersampled dataset\n")
        f.write(str(undersample_data.describe()))
        f.write("\n\n")

        f.write(
            "Distribution of legitimate and fraudulent transactions in an undersampled dataset\n"
        )
        f.write(str(undersample_data["Class"].value_counts()))
        f.write("\n\n")

        _write_split_section(
            f,
            "Statistical measures of the training dataset of whole data",
            X_train,
            y_train,
        )
        f.write("\n\n")

        _write_split_section(
            f,
            "Statistical measures of the test dataset of whole data",
            X_test,
            y_test,
        )
        f.write("\n\n")

        _write_split_section(
            f,
            "Statistical measures of the training dataset of undersampled data",
            X_train_undersample,
            y_train_undersample,
        )
        f.write("\n\n")

        # The final section is intentionally not followed by a blank line.
        _write_split_section(
            f,
            "Statistical measures of the test dataset of undersampled data",
            X_test_undersample,
            y_test_undersample,
        )
|
|
|
|
|
|
def main():
    """Ensure the ``stats_data`` directory exists, then write ``stats.txt``."""
    output_dir = "stats_data"
    report_name = "stats.txt"

    # exist_ok=True makes repeated runs safe when the directory is present.
    os.makedirs(output_dir, exist_ok=True)
    write_to_file(report_name)
|
|
|
|
|
|
# Run the report generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|