import io
import os

import pandas as pd


def _info_text(frame):
    """Return the summary that ``frame.info()`` produces, as a string.

    ``DataFrame.info()`` prints to stdout and returns ``None``, so wrapping
    it in ``str(...)`` yields the literal text "None". Passing a buffer
    captures the real output instead.
    """
    buffer = io.StringIO()
    frame.info(buf=buffer)
    # info() ends its output with a newline; strip it so the caller
    # controls the spacing between report sections.
    return buffer.getvalue().rstrip("\n")


def _split_report(features, labels):
    """Return info, describe and "Class" counts for one features/labels pair.

    The two frames are concatenated column-wise once, and the three
    summaries are joined with single newlines.
    """
    combined = pd.concat([features, labels], axis=1)
    return "\n".join(
        [
            _info_text(combined),
            str(combined.describe()),
            str(combined["Class"].value_counts()),
        ]
    )


def write_to_file(file_name):
    """Write a plain-text statistics report to ``stats_data/<file_name>``.

    Reads the full dataset, the undersampled dataset and the four
    train/test splits from CSV files under ``data/``, then writes missing
    value counts, ``info()`` output, summary statistics and "Class" value
    counts for each of them.

    Args:
        file_name: Name of the report file, appended to ``"stats_data/"``.

    Raises:
        FileNotFoundError: If one of the input CSV files does not exist.
        KeyError: If a dataset has no "Class" column.
    """
    # NOTE: this changes a process-wide pandas display option and is kept
    # from the original; it lets describe() render every column.
    pd.set_option("display.max_columns", None)

    # All inputs are loaded before the output file is opened, so a missing
    # CSV does not leave an empty report behind.
    df = pd.read_csv("data/creditcard.csv")
    undersample_data = pd.read_csv("data/undersample_data.csv")
    X_test_undersample = pd.read_csv("data/X_test_undersample.csv")
    y_test_undersample = pd.read_csv("data/y_test_undersample.csv")
    X_train_undersample = pd.read_csv("data/X_train_undersample.csv")
    y_train_undersample = pd.read_csv("data/y_train_undersample.csv")
    X_test = pd.read_csv("data/X_test.csv")
    y_test = pd.read_csv("data/y_test.csv")
    X_train = pd.read_csv("data/X_train.csv")
    y_train = pd.read_csv("data/y_train.csv")

    sections = [
        ("Check missing values", str(df.isnull().sum())),
        ("Size of the dataset", _info_text(df)),
        ("Summary statistics", str(df.describe())),
        (
            "Distribution of legitimate and fraudulent transactions",
            str(df["Class"].value_counts()),
        ),
        ("Size of undersampled dataset", _info_text(undersample_data)),
        (
            "Summary statistics of the undersampled dataset",
            str(undersample_data.describe()),
        ),
        (
            "Distribution of legitimate and fraudulent transactions"
            " in an undersampled dataset",
            str(undersample_data["Class"].value_counts()),
        ),
        (
            "Statistical measures of the training dataset of whole data",
            _split_report(X_train, y_train),
        ),
        (
            "Statistical measures of the test dataset of whole data",
            _split_report(X_test, y_test),
        ),
        (
            "Statistical measures of the training dataset of undersampled data",
            _split_report(X_train_undersample, y_train_undersample),
        ),
        (
            "Statistical measures of the test dataset of undersampled data",
            _split_report(X_test_undersample, y_test_undersample),
        ),
    ]

    # Same layout as before: heading, newline, body; sections separated by
    # one blank line; no trailing newline after the last section.
    report = "\n\n".join(f"{title}\n{body}" for title, body in sections)

    out_path = "stats_data/" + file_name
    # Create the target directory here so the function also works when it
    # is called directly rather than through main().
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(report)


def main():
    """Create the ``stats_data`` directory and write ``stats.txt`` into it."""
    os.makedirs("stats_data", exist_ok=True)
    write_to_file("stats.txt")


if __name__ == "__main__":
    main()