2023-04-17 22:46:22 +02:00
|
|
|
import os
|
2023-04-19 21:24:41 +02:00
|
|
|
|
2023-04-17 22:46:22 +02:00
|
|
|
import zipfile
|
2023-04-19 21:24:41 +02:00
|
|
|
|
2023-04-17 22:46:22 +02:00
|
|
|
import pandas as pd
|
2023-04-19 21:24:41 +02:00
|
|
|
|
2023-04-17 22:46:22 +02:00
|
|
|
import numpy as np
|
2023-04-19 21:24:41 +02:00
|
|
|
|
2023-04-17 22:46:22 +02:00
|
|
|
from sklearn.model_selection import train_test_split
|
2023-04-19 21:24:41 +02:00
|
|
|
|
2023-04-17 22:46:22 +02:00
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
2023-04-19 21:24:41 +02:00
|
|
|
# Download the dataset archive with the Kaggle CLI.  os.system() reports
# failures only through its return value, so check it: otherwise a failed
# download would go unnoticed and a stale archive from an earlier run
# could be extracted instead.
exit_status = os.system("kaggle datasets download -d gpreda/covid-world-vaccination-progress")
if exit_status != 0:
    raise RuntimeError(
        f"kaggle dataset download failed (exit status {exit_status})"
    )

# Unpack the downloaded archive into the "country_vaccinations" directory.
with zipfile.ZipFile("covid-world-vaccination-progress.zip", "r") as zip_ref:
    zip_ref.extractall("country_vaccinations")
|
2023-04-17 22:46:22 +02:00
|
|
|
|
|
|
|
# Locate the first CSV file inside the extracted archive.  A plain ``break``
# would only leave the inner loop over file names, letting the directory walk
# continue and a CSV from a later directory overwrite the match; next() on a
# generator stops at the very first hit.
csv_file = next(
    (
        os.path.join(root, file)
        for root, _dirs, files in os.walk("country_vaccinations")
        for file in files
        if file.endswith(".csv")
    ),
    None,
)

# Fail early with a clear message when the archive contained no CSV at all.
if csv_file is None:
    raise FileNotFoundError("CSV file not found in the extracted dataset")
|
|
|
|
|
|
|
|
# Load the CSV located above into a DataFrame.
data = pd.read_csv(csv_file)

# Two-stage split with one shared seed: 40 % of the rows are held out first,
# then that held-out part is halved into the dev and test sets.
SPLIT_SEED = 42
train_data, temp_data = train_test_split(
    data, test_size=0.4, random_state=SPLIT_SEED
)
dev_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=SPLIT_SEED
)
|
|
|
|
|
|
|
|
|
|
|
|
def print_stats(df, name):
    """Print the row count of *df* and summary statistics per numeric column.

    For every numeric column the minimum, maximum, mean, standard deviation
    and median are printed; non-numeric columns are skipped.

    Args:
        df: DataFrame to describe.
        name: Label shown in the header line of the report.
    """
    print(f"\nStatystyki dla {name}:")
    print(f"Wielkość zbioru: {len(df)}")

    for col in df.columns:
        series = df[col]
        # Comparing the dtype against "object" is not enough: pandas' dedicated
        # string dtype and categoricals are not "object" either, and mean()
        # raises on them.  Ask pandas directly whether the column is numeric.
        if not pd.api.types.is_numeric_dtype(series):
            continue

        print(f"\nParametr: {col}")
        print(f"Minimum: {series.min()}")
        print(f"Maksimum: {series.max()}")
        print(f"Średnia: {series.mean()}")
        print(f"Odchylenie standardowe: {series.std()}")
        print(f"Mediana: {series.median()}")
|
|
|
|
|
|
|
|
|
|
|
|
# Report statistics for the complete dataset first, then for each split.
for frame, label in (
    (data, "Cały zbiór"),
    (train_data, "Zbiór treningowy"),
    (dev_data, "Zbiór walidacyjny"),
    (test_data, "Zbiór testowy"),
):
    print_stats(frame, label)
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_data(df, columns):
    """Min-max scale the numeric columns of *df* in place.

    Args:
        df: DataFrame to modify.
        columns: Column labels to consider; non-numeric ones are left as is.

    Returns:
        None. The scaled values are written back into *df*.
    """
    # MinMaxScaler refuses input without any rows, so there is nothing to do.
    if df.empty:
        return

    scaler = MinMaxScaler()
    for col in columns:
        # A dtype != "object" test would also hand string-dtype or categorical
        # columns to the scaler; only genuinely numeric columns are scaled.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        # The scaler is re-fitted for every column, so each one is scaled by
        # its own minimum and maximum.
        df[col] = scaler.fit_transform(df[[col]])
|
|
|
|
|
2023-04-19 21:24:41 +02:00
|
|
|
|
2023-04-17 22:46:22 +02:00
|
|
|
# Scale every column of each split in place.
# NOTE(review): normalize_data builds a fresh scaler per call, so dev and test
# are scaled by their own min/max rather than the training set's - confirm
# this is intended.
for split in (train_data, dev_data, test_data):
    normalize_data(split, split.columns)
|
|
|
|
|
|
|
|
|
|
|
|
def clean_data(df):
    """Remove incomplete and repeated rows from *df* in place.

    Rows containing at least one missing value are dropped first; of the
    remaining rows, only the first occurrence of each duplicate is kept.

    Args:
        df: DataFrame to clean; it is modified directly.

    Returns:
        None.
    """
    # Step 1: discard every row that has a missing value in any column.
    df.dropna(axis=0, how="any", inplace=True)
    # Step 2: discard exact duplicates, keeping the first occurrence.
    df.drop_duplicates(keep="first", inplace=True)
|
2023-04-19 21:24:41 +02:00
|
|
|
|
2023-04-17 22:46:22 +02:00
|
|
|
|
|
|
|
# Drop incomplete and duplicate rows from the full dataset and every split.
for frame_to_clean in (data, train_data, dev_data, test_data):
    clean_data(frame_to_clean)
|
2023-04-20 18:37:09 +02:00
|
|
|
|
|
|
|
|
2023-04-20 20:34:41 +02:00
|
|
|
# to_csv() does not create missing parent directories, so make sure the
# output directory exists before writing the three splits.
os.makedirs("./results", exist_ok=True)

# Persist each split without the DataFrame index column.
train_data.to_csv("./results/train_data.csv", index=False)
dev_data.to_csv("./results/dev_data.csv", index=False)
test_data.to_csv("./results/test_data.csv", index=False)
|