From 76154a9157a71bada3a48d260e5d3759982fc63d Mon Sep 17 00:00:00 2001
From: Sheaza
Date: Tue, 2 Apr 2024 18:31:57 +0200
Subject: [PATCH] create python scripts for stats and downloading dataset

---
 get_dataset.py     | 27 +++++++++++++++++++++++++++
 stats/get_stats.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 get_dataset.py
 create mode 100644 stats/get_stats.py

diff --git a/get_dataset.py b/get_dataset.py
new file mode 100644
index 0000000..0543413
--- /dev/null
+++ b/get_dataset.py
@@ -0,0 +1,27 @@
+"""Download the student-performance dataset, preprocess and split it.
+
+Writes ``dataset.csv`` (the whole preprocessed set), ``df_train.csv`` and
+``df_test.csv`` to the current working directory.
+"""
+import opendatasets as od
+import pandas as pd
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+
+od.download("https://www.kaggle.com/datasets/nikhil7280/student-performance-multiple-linear-regression/code")
+data = pd.read_csv("student-performance-multiple-linear-regression/Student_Performance.csv")
+print(data.head())
+data.drop_duplicates(inplace=True)
+# Encode the Yes/No answers as 1/0 so the column is numeric.
+data["Extracurricular Activities"] = data["Extracurricular Activities"].replace({'Yes': 1, 'No': 0})
+
+# Standardize every column except the last one.
+# NOTE(review): the scaler is fitted on the full dataset before the split,
+# so test rows influence the scaling of the training rows -- confirm that
+# this is intended.
+data[data.columns[:-1]] = preprocessing.StandardScaler().fit_transform(data[data.columns[:-1]])
+print(data.head())
+df_train, df_test = train_test_split(data, test_size=0.2, random_state=21, shuffle=True)
+data.to_csv("dataset.csv", index=False)
+df_train.to_csv("df_train.csv", index=False)
+df_test.to_csv("df_test.csv", index=False)
diff --git a/stats/get_stats.py b/stats/get_stats.py
new file mode 100644
index 0000000..43fba39
--- /dev/null
+++ b/stats/get_stats.py
@@ -0,0 +1,43 @@
+"""Write sizes, summary statistics and output-variable counts to a report.
+
+Reads ``dataset.csv``, ``df_train.csv`` and ``df_test.csv`` from the current
+working directory and writes the report to ``stats.txt``.
+"""
+import pandas as pd
+
+
+# Lift pandas' display limits so the frames and series formatted below are
+# not truncated in the report.
+pd.set_option('display.max_columns', None)
+pd.set_option('display.max_rows', None)
+
+
+df = pd.read_csv('./dataset.csv')
+df_train = pd.read_csv('./df_train.csv')
+df_test = pd.read_csv('./df_test.csv')
+
+
+# The report contains non-ASCII (Polish) text, so the encoding is set
+# explicitly instead of relying on the platform default.
+with open('stats.txt', 'w', encoding='utf-8') as f:
+    f.write(f"Wielkość całego zbioru:\n {df.shape}\n")
+    f.write(f"Wielkość treningowego zbioru:\n {df_train.shape}\n")
+    f.write(f"Wielkość testowego zbioru:\n {df_test.shape}\n")
+
+    f.write("\nStatystyki dla całego zbioru:\n")
+    f.write(f"{df.describe()}\n")
+
+    f.write("\nStatystyki dla treningowego zbioru:\n")
+    f.write(f"{df_train.describe()}\n")
+
+    f.write("\nStatystyki dla testowego zbioru:\n")
+    f.write(f"{df_test.describe()}\n")
+
+    f.write("\nRozkład zmiennej wyjściowej dla całości:\n")
+    f.write(f"{df['Performance Index'].value_counts()}\n")
+
+    f.write("\nRozkład zmiennej wyjściowej dla zbioru treningowego:\n")
+    f.write(f"{df_train['Performance Index'].value_counts()}\n")
+
+    f.write("\nRozkład zmiennej wyjściowej dla zbioru testowego:\n")
+    f.write(f"{df_test['Performance Index'].value_counts()}\n")