2021-04-08 21:58:41 +02:00
|
|
|
import zipfile
|
|
|
|
import numpy as np
|
|
|
|
import pandas as pd
|
2021-04-08 23:02:23 +02:00
|
|
|
import wget
|
2021-04-08 21:58:41 +02:00
|
|
|
from sklearn import preprocessing
|
|
|
|
|
2021-04-08 22:59:17 +02:00
|
|
|
# Location of the raw dataset in the course repository.
url = 'https://git.wmi.amu.edu.pl/s434804/ium_434804/raw/branch/master/country_vaccinations.csv'

# Fetch the CSV into the working directory; bar=None switches off wget's
# progress bar.
wget.download(
    url,
    out='country_vaccinations.csv',
    bar=None,
)
|
2021-04-08 21:58:41 +02:00
|
|
|
|
|
|
|
df = pd.read_csv('country_vaccinations.csv')

# Split the data into train/validate/test partitions (6:2:2).
# sample(frac=1) returns every row in random order; no random_state is given,
# so the partitions differ from run to run.
shuffled = df.sample(frac=1)

# Cut points are truncated to whole row counts.
train_end = int(.6 * len(df))
validate_end = int(.8 * len(df))

# Positional slices replace np.split(): np.split on a DataFrame goes through
# DataFrame.swapaxes, which pandas has deprecated. The boundaries are the same.
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]
|
|
|
|
|
2021-05-14 22:48:23 +02:00
|
|
|
# Write each partition to its own CSV file in the working directory.
# to_csv is called with its defaults, so the row index is written as well.
for csv_name, partition in (
    ("train.csv", train),
    ("validate.csv", validate),
    ("test.csv", test),
):
    partition.to_csv(csv_name)
|
2021-04-08 21:58:41 +02:00
|
|
|
# Print the number of elements in each data frame.
# DataFrame.size counts cells (rows x columns), not rows.
for label, frame in (
    ("Whole set size", df),
    ("Train set size: ", train),
    ("Validate set size: ", validate),
    ("Test set size: ", test),
):
    # Pad the label to 20 characters so the numbers line up in one column.
    print(label.ljust(20), frame.size)
|
|
|
|
|
|
|
|
# Summary statistics for every column (numeric and non-numeric alike).
# The result has to be printed explicitly: a bare expression statement is
# evaluated and discarded when the file runs as a script.
print(df.describe(include='all'))
|
|
|
|
|
|
|
|
# For every column, draw a bar chart of how often each distinct value occurs,
# then print the column name followed by the object returned by plot().
for col in df.columns:
    value_frequencies = df[col].value_counts()
    column = value_frequencies.plot(kind="bar", figsize=(30, 10))
    print("\n", col)
    print(column)
|
|
|
|
|
|
|
|
# Min-max normalisation of the numeric (float64) columns.
# Select the float64 columns once and reuse the labels for both the values
# and the write-back.
numeric_columns = df.select_dtypes(include='float64').columns
numeric_values = df[numeric_columns].values  # numeric values only

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(numeric_values)

# Reuse df's own index: assigning a DataFrame aligns rows by label, so a fresh
# 0..n-1 index would silently produce NaN whenever df's index is not the
# default one.
df_normalized = pd.DataFrame(x_scaled, columns=numeric_columns, index=df.index)

# Replace the raw values with the normalised ones in the original frame;
# non-numeric columns are left untouched.
df[numeric_columns] = df_normalized
|
|
|
|
|
|
|
|
# Drop rows that contain NaN in any field. dropna() returns a new frame
# instead of modifying df in place, so the result must be bound back to df.
df = df.dropna()
|