From 2b8f8919ed96ffaf0aab96dd909c33a7d5304b6f Mon Sep 17 00:00:00 2001
From: pi4
Date: Mon, 17 Apr 2023 22:46:22 +0200
Subject: [PATCH] 2nd commit

---
 skrypt.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 skrypt.py

diff --git a/skrypt.py b/skrypt.py
new file mode 100644
index 0000000..5bd04e6
--- /dev/null
+++ b/skrypt.py
@@ -0,0 +1,77 @@
+import os
+import zipfile
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+
+
+# Download and extract the dataset (requires a configured kaggle CLI).
+os.system("kaggle datasets download -d imoore/7k-indian-famous-women")
+with zipfile.ZipFile("7k-indian-famous-women.zip", "r") as zip_ref:
+    zip_ref.extractall("7k-indian-famous-women")
+
+
+# Locate the first CSV file anywhere in the extracted directory tree.
+csv_file = None
+for root, dirs, files in os.walk("7k-indian-famous-women"):
+    for file in files:
+        if file.endswith(".csv"):
+            csv_file = os.path.join(root, file)
+            break
+    if csv_file is not None:
+        break
+if csv_file is None:
+    raise FileNotFoundError("CSV file not found in the extracted dataset")
+
+
+data = pd.read_csv(csv_file)
+
+
+# Clean before splitting and scaling: MinMaxScaler rejects NaN values,
+# and deduplicating the full frame keeps the three splits disjoint.
+def clean_data(df):
+    df.dropna(inplace=True)
+    df.drop_duplicates(inplace=True)
+
+
+clean_data(data)
+
+
+# 60/20/20 train/dev/test split: 40% is held out, then halved into dev and test.
+train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)
+dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
+
+
+def print_stats(df, name):
+    print(f"\nStatistics for {name}:")
+    print(f"Set size: {len(df)}")
+    for col in df.columns:
+        if df[col].dtype != "object":
+            print(f"\nColumn: {col}")
+            print(f"Minimum: {df[col].min()}")
+            print(f"Maximum: {df[col].max()}")
+            print(f"Mean: {df[col].mean()}")
+            print(f"Standard deviation: {df[col].std()}")
+            print(f"Median: {df[col].median()}")
+
+
+print_stats(data, "full dataset")
+print_stats(train_data, "training set")
+print_stats(dev_data, "validation set")
+print_stats(test_data, "test set")
+
+
+# Fit the scaler on the training set only and reuse it for dev and test,
+# so no statistics leak from the held-out splits.
+def normalize_data(train_df, dev_df, test_df):
+    numeric_cols = train_df.select_dtypes(include=[np.number]).columns
+    if len(numeric_cols) == 0:
+        return
+    scaler = MinMaxScaler()
+    train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
+    dev_df[numeric_cols] = scaler.transform(dev_df[numeric_cols])
+    test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])
+
+
+normalize_data(train_data, dev_data, test_data)
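--
A minimal usage sketch, assuming the patch is saved as
0001-2nd-commit.patch (the filename git format-patch would derive from
the subject line) and that the kaggle CLI is configured with an API
token in ~/.kaggle/kaggle.json:

    pip install kaggle pandas scikit-learn
    git am 0001-2nd-commit.patch
    python skrypt.py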