ium_464913/dataset-stats.py

76 lines
2.1 KiB
Python
Raw Normal View History

2024-04-01 19:14:34 +02:00
import os
2024-04-01 19:39:09 +02:00
import sys
2024-04-01 19:14:34 +02:00
import pandas as pd
def _write_split_stats(f, title, features, labels):
    """Write the info table, summary statistics and class counts of one split.

    Args:
        f: Writable text stream that receives the report section.
        title: Heading line for the section (written followed by a newline).
        features: DataFrame holding the feature columns of the split.
        labels: DataFrame holding the label column(s) of the split; after
            concatenation the combined frame must contain a "Class" column.
    """
    # Concatenate once and reuse the result for all three reports.
    combined = pd.concat([features, labels], axis=1)
    f.write(title + "\n")
    # ``buf`` makes ``info`` write straight into the report, so the
    # process-wide ``sys.stdout`` never has to be swapped out.
    combined.info(buf=f)
    f.write("\n")
    f.write(str(combined.describe()))
    f.write("\n")
    f.write(str(combined["Class"].value_counts()))


def write_to_file(file_name):
    """Write descriptive statistics of the dataset and its splits to a file.

    Reads ``data/creditcard.csv`` and the ``data/X_*.csv`` / ``data/y_*.csv``
    files for the train, val and test splits, then writes a text report to
    ``stats_data/<file_name>``. The report covers missing-value counts,
    ``DataFrame.info`` output, ``describe`` statistics and the value counts
    of the "Class" column for the full dataset and for every split.

    All input files are read before the output file is opened, so a missing
    input does not leave a partially written report behind. ``sys.stdout`` is
    never reassigned: it stays untouched even when an error occurs part-way
    through, and any redirection installed by the caller is preserved.

    Args:
        file_name: Name of the report file created inside ``stats_data``
            (the directory must already exist).

    Raises:
        FileNotFoundError: If an input CSV or the ``stats_data`` directory
            is missing.
        KeyError: If a frame lacks the "Class" column.
    """
    df = pd.read_csv("data/creditcard.csv")
    # Global pandas option: print every column instead of eliding the middle.
    pd.set_option("display.max_columns", None)

    sections = [
        (
            title,
            pd.read_csv(f"data/X_{split}.csv"),
            pd.read_csv(f"data/y_{split}.csv"),
        )
        for title, split in (
            ("Statistical measures of the training dataset", "train"),
            ("Statistical measures of the validation dataset", "val"),
            ("Statistical measures of the test dataset", "test"),
        )
    ]

    with open("stats_data/" + file_name, "w") as f:
        f.write("Check missing values\n")
        f.write(str(df.isnull().sum()))
        f.write("\n\n")
        f.write("Size of the dataset\n")
        df.info(buf=f)
        f.write("\n\n")
        f.write("Summary statistics\n")
        f.write(str(df.describe()))
        f.write("\n\n")
        f.write("Distribution of legitimate and fraudulent transactions\n")
        f.write(str(df["Class"].value_counts()))
        for title, features, labels in sections:
            # Blank line between sections; none after the last one.
            f.write("\n\n")
            _write_split_stats(f, title, features, labels)
2024-04-01 19:14:34 +02:00
def main():
    """Recreate the ``stats_data`` directory contents and write the report.

    Ensures ``stats_data`` exists, removes its previous contents and then
    writes a fresh ``stats.txt`` via ``write_to_file``.

    The cleanup is done with the standard library instead of shelling out to
    ``rm -rf stats_data/*``, so it also works where no POSIX shell is
    available and failures surface as exceptions rather than an ignored exit
    status. Entries whose name starts with a dot are left in place, matching
    what the shell glob ``*`` selected.

    Raises:
        OSError: If an entry in ``stats_data`` cannot be removed.
    """
    # Function-scope import keeps this block self-contained; the file's
    # top-level import section is left untouched.
    import shutil

    os.makedirs("stats_data", exist_ok=True)

    # Materialise the listing first so entries are not deleted while the
    # directory iterator is still open.
    with os.scandir("stats_data") as listing:
        entries = list(listing)

    for entry in entries:
        if entry.name.startswith("."):
            continue
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            # Regular files and symlinks (including links to directories).
            os.remove(entry.path)

    write_to_file("stats.txt")


if __name__ == "__main__":
    main()