import os
import zipfile

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Download and extract the dataset (requires the Kaggle CLI with API credentials configured).
os.system("kaggle datasets download -d gpreda/covid-world-vaccination-progress")
with zipfile.ZipFile("covid-world-vaccination-progress.zip", "r") as zip_ref:
    zip_ref.extractall("country_vaccinations")

# Locate the first CSV file in the extracted archive.
csv_file = None
for root, dirs, files in os.walk("country_vaccinations"):
    for file in files:
        if file.endswith(".csv"):
            csv_file = os.path.join(root, file)
            break
    if csv_file is not None:
        break
if csv_file is None:
    raise FileNotFoundError("CSV file not found in the extracted dataset")

data = pd.read_csv(csv_file)


def clean_data(df):
    """Drop rows with missing values and duplicate rows, in place."""
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)


# Clean once, before splitting, so all three subsets are derived from the same
# cleaned data and no separate per-split cleaning is needed.
clean_data(data)

# 60/20/20 train/dev/test split.
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)
dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)


def print_stats(df, name):
    """Print the size of a subset and basic statistics for each numeric column."""
    print(f"\nStatistics for {name}:")
    print(f"Set size: {len(df)}")
    for col in df.columns:
        if df[col].dtype != "object":
            print(f"\nFeature: {col}")
            print(f"Minimum: {df[col].min()}")
            print(f"Maximum: {df[col].max()}")
            print(f"Mean: {df[col].mean()}")
            print(f"Standard deviation: {df[col].std()}")
            print(f"Median: {df[col].median()}")


print_stats(data, "full dataset")
print_stats(train_data, "training set")
print_stats(dev_data, "validation set")
print_stats(test_data, "test set")

# Fit the scaler on the training set only and reuse it for dev and test, so all
# three splits share one scale and no information leaks from dev/test into
# the fitted parameters.
numeric_cols = [col for col in train_data.columns if train_data[col].dtype != "object"]
scaler = MinMaxScaler()
train_data[numeric_cols] = scaler.fit_transform(train_data[numeric_cols])
dev_data[numeric_cols] = scaler.transform(dev_data[numeric_cols])
test_data[numeric_cols] = scaler.transform(test_data[numeric_cols])

# Make sure the output directory exists before saving.
os.makedirs("results", exist_ok=True)
train_data.to_csv("./results/train_data.csv", index=False)
dev_data.to_csv("./results/dev_data.csv", index=False)
test_data.to_csv("./results/test_data.csv", index=False)
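
# --- Optional sanity check (a minimal sketch, not part of the pipeline above) ---
# Reloads the saved splits and reports the overall numeric min/max per file.
# Training-set values should lie in [0, 1] after Min-Max scaling; dev/test
# values may fall slightly outside that range because the scaler was fitted on
# the training set only. Assumes the save step above has already run.
for name in ("train_data", "dev_data", "test_data"):
    df = pd.read_csv(f"./results/{name}.csv")
    numeric = df.select_dtypes(include="number")
    print(f"{name}: rows={len(df)}, "
          f"min={numeric.min().min():.4f}, max={numeric.max().max():.4f}")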