create python scripts for stats and downloading dataset
This commit is contained in:
parent
bec0f944b2
commit
76154a9157
18
get_dataset.py
Normal file
18
get_dataset.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
import opendatasets as od
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn import preprocessing
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
# NOTE(review): the URL points at the dataset's "/code" tab; it is kept
# unchanged here -- confirm opendatasets resolves it like the bare dataset URL.
DATASET_URL = "https://www.kaggle.com/datasets/nikhil7280/student-performance-multiple-linear-regression/code"
RAW_CSV_PATH = "student-performance-multiple-linear-regression/Student_Performance.csv"
ACTIVITIES_COLUMN = "Extracurricular Activities"


def clean_dataset(data):
    """Return a de-duplicated copy of *data* with the Yes/No column encoded.

    Duplicate rows are dropped and the "Extracurricular Activities" column is
    mapped to 1 ("Yes") / 0 ("No"). The input frame is left untouched.

    Raises:
        ValueError: if the column holds a non-missing value other than
            "Yes" or "No".
    """
    cleaned = data.drop_duplicates().copy()
    raw = cleaned[ACTIVITIES_COLUMN]
    # `map` yields an integer column directly; `replace` on an object column
    # depends on implicit downcasting, which recent pandas versions deprecate.
    encoded = raw.map({"Yes": 1, "No": 0})
    # `map` turns unknown labels into NaN; fail loudly instead of letting
    # them slip through as missing values.
    unexpected = encoded.isna() & raw.notna()
    if unexpected.any():
        raise ValueError(
            f"Unexpected values in {ACTIVITIES_COLUMN!r}: "
            f"{sorted(raw[unexpected].astype(str).unique())}"
        )
    cleaned[ACTIVITIES_COLUMN] = encoded
    return cleaned


def scale_features(data):
    """Standardize every column except the last one, in place, and return *data*.

    The last column is left unscaled.
    """
    feature_columns = data.columns[:-1]
    # NOTE(review): the scaler is fit on the full dataset before the
    # train/test split, so test-set statistics influence the training
    # features. Consider fitting on the training split only.
    data[feature_columns] = preprocessing.StandardScaler().fit_transform(data[feature_columns])
    return data


def main():
    """Download the dataset, preprocess it and write the full/train/test CSVs."""
    od.download(DATASET_URL)
    data = pd.read_csv(RAW_CSV_PATH)
    print(data.head())

    data = scale_features(clean_dataset(data))
    print(data.head())

    df_train, df_test = train_test_split(data, test_size=0.2, random_state=21, shuffle=True)

    data.to_csv("dataset.csv", index=False)
    df_train.to_csv("df_train.csv", index=False)
    df_test.to_csv("df_test.csv", index=False)


if __name__ == "__main__":
    main()
|
||||||
|
|
34
stats/get_stats.py
Normal file
34
stats/get_stats.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
STATS_PATH = 'stats.txt'
TARGET_COLUMN = 'Performance Index'


def build_report(df, df_train, df_test):
    """Return the statistics report for the full, train and test frames.

    The report lists each frame's shape, its `describe()` summary and the
    value counts of the "Performance Index" column, in that order.
    """
    # Lift pandas' row/column display limits only while rendering, so that
    # `describe()` and `value_counts()` are printed without truncation.
    with pd.option_context('display.max_columns', None, 'display.max_rows', None):
        sections = [
            f"Wielkość całego zbioru:\n {df.shape}\n",
            f"Wielkość treningowego zbioru:\n {df_train.shape}\n",
            f"Wielkość testowego zbioru:\n {df_test.shape}\n",
            "\nStatystyki dla całego zbioru:\n",
            f"{df.describe()}\n",
            "\nStatystyki dla treningowego zbioru:\n",
            f"{df_train.describe()}\n",
            "\nStatystyki dla testowego zbioru:\n",
            f"{df_test.describe()}\n",
            "\nRozkład zmiennej wyjściowej dla całości:\n",
            f"{df[TARGET_COLUMN].value_counts()}\n",
            "\nRozkład zmiennej wyjściowej dla zbioru treningowego:\n",
            f"{df_train[TARGET_COLUMN].value_counts()}\n",
            "\nRozkład zmiennej wyjściowej dla zbioru testowego:\n",
            f"{df_test[TARGET_COLUMN].value_counts()}\n",
        ]
    return "".join(sections)


def main():
    """Read the three dataset CSVs from the working directory and write stats.txt."""
    df = pd.read_csv('./dataset.csv')
    df_train = pd.read_csv('./df_train.csv')
    df_test = pd.read_csv('./df_test.csv')

    report = build_report(df, df_train, df_test)
    # The report contains non-ASCII (Polish) text, so the encoding is pinned
    # instead of depending on the platform default.
    with open(STATS_PATH, 'w', encoding='utf-8') as f:
        f.write(report)


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user