780 KiB
780 KiB
# !pip install kaggle
# !pip install pandas
# !kaggle datasets download -d gpreda/covid-world-vaccination-progress
Downloading covid-world-vaccination-progress.zip to E:\Na studia\Magisterka\Inżynieria uczenia maszynowego\IUM_434804
0%| | 0.00/160k [00:00<?, ?B/s] 100%|##########| 160k/160k [00:00<00:00, 1.20MB/s] 100%|##########| 160k/160k [00:00<00:00, 1.19MB/s]
import zipfile
# Unpack the downloaded Kaggle archive into the current working directory.
archive_name = 'covid-world-vaccination-progress.zip'
with zipfile.ZipFile(archive_name, 'r') as archive:
    archive.extractall(path=".")
import numpy as np
import pandas as pd
df = pd.read_csv('country_vaccinations.csv')

# Split the data into train/validate/test subsets (6:2:2).
# NOTE: the shuffle is unseeded, so the row assignment differs between runs.
shuffled = df.sample(frac=1)
train_end = int(.6 * len(df))
validate_end = int(.8 * len(df))
# Slice positionally with .iloc instead of np.split(): NumPy splits a DataFrame
# through DataFrame.swapaxes, which pandas has deprecated. The resulting three
# frames are the same as the ones np.split() produced.
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Print the number of elements in each frame.
# DataFrame.size counts cells (rows x columns), not rows.
print("Whole set size".ljust(20), df.size)
print("Train set size: ".ljust(20), train.size)
print("Validate set size: ".ljust(20), validate.size)
print("Test set size: ".ljust(20), test.size)
Whole set size 110055 Train set size: 66030 Validate set size: 22005 Test set size: 22020
# Summary statistics for every column, non-numeric ones included.
df.describe(include="all")
country | iso_code | date | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations_raw | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | vaccines | source_name | source_website | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 7337 | 7337 | 7337 | 4.552000e+03 | 4.053000e+03 | 2.749000e+03 | 3.830000e+03 | 7.150000e+03 | 4552.000000 | 4053.000000 | 2749.000000 | 7150.000000 | 7337 | 7337 | 7337 |
unique | 150 | 150 | 97 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 25 | 91 | 145 |
top | Canada | GBR | 2021-03-09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Moderna, Oxford/AstraZeneca, Pfizer/BioNTech | Ministry of Health | https://coronavirus.data.gov.uk/details/health... |
freq | 96 | 96 | 129 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1798 | 2329 | 480 |
mean | NaN | NaN | NaN | 2.361121e+06 | 1.918598e+06 | 7.999520e+05 | 8.744129e+04 | 5.825144e+04 | 9.398541 | 7.237774 | 3.361342 | 2675.625594 | NaN | NaN | NaN |
std | NaN | NaN | NaN | 8.421579e+06 | 6.249484e+06 | 3.230805e+06 | 2.693155e+05 | 1.992295e+05 | 16.995766 | 11.614673 | 7.262965 | 4229.243670 | NaN | NaN | NaN |
min | NaN | NaN | NaN | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 | 1.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | NaN | NaN |
25% | NaN | NaN | NaN | 3.741475e+04 | 3.457400e+04 | 1.799500e+04 | 2.732000e+03 | 9.882500e+02 | 0.717500 | 0.720000 | 0.370000 | 355.000000 | NaN | NaN | NaN |
50% | NaN | NaN | NaN | 2.536690e+05 | 2.334230e+05 | 9.966600e+04 | 1.365700e+04 | 5.952500e+03 | 3.465000 | 3.050000 | 1.360000 | 1247.000000 | NaN | NaN | NaN |
75% | NaN | NaN | NaN | 1.195748e+06 | 9.467810e+05 | 4.625030e+05 | 5.718200e+04 | 2.680500e+04 | 10.080000 | 7.890000 | 3.000000 | 3026.750000 | NaN | NaN | NaN |
max | NaN | NaN | NaN | 1.183138e+08 | 7.723006e+07 | 4.193463e+07 | 4.575496e+06 | 2.541597e+06 | 151.860000 | 88.790000 | 63.070000 | 54264.000000 | NaN | NaN | NaN |
# Draw a bar chart of value frequencies for each column, then print the
# column name followed by the Axes object that pandas returned for the chart.
for name in df.columns:
    frequencies = df[name].value_counts()
    axes = frequencies.plot(kind="bar", figsize=(30, 10))
    print("\n", name)
    print(axes)
country AxesSubplot(0.125,0.125;0.775x0.755) iso_code AxesSubplot(0.125,0.125;0.775x0.755) date AxesSubplot(0.125,0.125;0.775x0.755) total_vaccinations AxesSubplot(0.125,0.125;0.775x0.755) people_vaccinated AxesSubplot(0.125,0.125;0.775x0.755) people_fully_vaccinated AxesSubplot(0.125,0.125;0.775x0.755) daily_vaccinations_raw AxesSubplot(0.125,0.125;0.775x0.755) daily_vaccinations AxesSubplot(0.125,0.125;0.775x0.755) total_vaccinations_per_hundred AxesSubplot(0.125,0.125;0.775x0.755) people_vaccinated_per_hundred AxesSubplot(0.125,0.125;0.775x0.755) people_fully_vaccinated_per_hundred AxesSubplot(0.125,0.125;0.775x0.755) daily_vaccinations_per_million AxesSubplot(0.125,0.125;0.775x0.755) vaccines AxesSubplot(0.125,0.125;0.775x0.755) source_name AxesSubplot(0.125,0.125;0.775x0.755) source_website AxesSubplot(0.125,0.125;0.775x0.755)
# !pip install scikit-learn
from sklearn import preprocessing
# Min-max normalisation of the numeric (float64) columns of df.
# NOTE(review): the scaler is fitted on the whole frame, and the train/validate/
# test frames created earlier are not rescaled here - confirm this is intended.

# Select the float64 columns once so the values and their labels stay in sync.
numeric_columns = df.select_dtypes(include='float64').columns
numeric_values = df[numeric_columns].values  # numeric values only
min_max_scaler = preprocessing.MinMaxScaler()
# Fix: the original passed the undefined name `values` here, which raises
# NameError; the array built above is the intended input.
x_scaled = min_max_scaler.fit_transform(numeric_values)
# Reuse df's index so the write-back below aligns row for row.
df_normalized = pd.DataFrame(x_scaled, columns=numeric_columns, index=df.index)
# Overwrite the raw numeric columns of the original frame with the scaled ones.
df[numeric_columns] = df_normalized
country | iso_code | date | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations_raw | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | vaccines | source_name | source_website | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | AFG | 2021-02-22 | 0.0 | 0.0 | NaN | NaN | NaN | 0.0 | 0.0 | NaN | NaN | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
1 | Afghanistan | AFG | 2021-02-23 | NaN | NaN | NaN | NaN | 0.000537 | NaN | NaN | NaN | 0.000645 | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
2 | Afghanistan | AFG | 2021-02-24 | NaN | NaN | NaN | NaN | 0.000537 | NaN | NaN | NaN | 0.000645 | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
3 | Afghanistan | AFG | 2021-02-25 | NaN | NaN | NaN | NaN | 0.000537 | NaN | NaN | NaN | 0.000645 | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
4 | Afghanistan | AFG | 2021-02-26 | NaN | NaN | NaN | NaN | 0.000537 | NaN | NaN | NaN | 0.000645 | Oxford/AstraZeneca | Government of Afghanistan | http://www.xinhuanet.com/english/asiapacific/2... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7332 | Zimbabwe | ZWE | 2021-03-15 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Sinopharm/Beijing | Ministry of Health | https://twitter.com/MoHCCZim/status/1373023610... |
7333 | Zimbabwe | ZWE | 2021-03-16 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Sinopharm/Beijing | Ministry of Health | https://twitter.com/MoHCCZim/status/1373023610... |
7334 | Zimbabwe | ZWE | 2021-03-17 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Sinopharm/Beijing | Ministry of Health | https://twitter.com/MoHCCZim/status/1373023610... |
7335 | Zimbabwe | ZWE | 2021-03-18 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Sinopharm/Beijing | Ministry of Health | https://twitter.com/MoHCCZim/status/1373023610... |
7336 | Zimbabwe | ZWE | 2021-03-19 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Sinopharm/Beijing | Ministry of Health | https://twitter.com/MoHCCZim/status/1373023610... |
7337 rows × 15 columns
# Show the rows that contain no NaN in any column. The result is not assigned
# back, so df itself still holds the incomplete rows.
df.dropna(axis=0, how="any")
country | iso_code | date | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations_raw | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | vaccines | source_name | source_website | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
46 | Albania | ALB | 2021-02-02 | 0.000027 | 0.000033 | 0.000015 | 0.000295 | 0.000100 | 0.000751 | 0.000906 | 0.000344 | 0.001622 | Pfizer/BioNTech | Ministry of Health | https://shendetesia.gov.al/covid19-ministria-e... |
234 | Antigua and Barbuda | ATG | 2021-03-13 | 0.002351 | 0.003385 | 0.000409 | 0.003888 | 0.004605 | 0.004030 | 0.006229 | 0.000688 | 0.004773 | Oxford/AstraZeneca | Ministry of Health | https://www.facebook.com/investingforwellness/... |
235 | Antigua and Barbuda | ATG | 2021-03-14 | 0.002474 | 0.003454 | 0.000629 | 0.003033 | 0.004431 | 0.004235 | 0.006342 | 0.001033 | 0.004589 | Oxford/AstraZeneca | Ministry of Health | https://www.facebook.com/investingforwellness/... |
236 | Antigua and Barbuda | ATG | 2021-03-15 | 0.002548 | 0.003514 | 0.000730 | 0.001849 | 0.004376 | 0.004371 | 0.006455 | 0.001033 | 0.004533 | Oxford/AstraZeneca | Ministry of Health | https://www.facebook.com/investingforwellness/... |
237 | Argentina | ARG | 2020-12-29 | 0.002583 | 0.003530 | 0.000800 | 0.000865 | 0.004069 | 0.004440 | 0.006569 | 0.001205 | 0.004220 | Oxford/AstraZeneca, Sinopharm/Beijing, Sputnik V | Ministry of Health | http://datos.salud.gob.ar/dataset/vacunas-cont... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6965 | United Arab Emirates | ARE | 2021-03-10 | 0.011805 | 0.014719 | 0.006252 | 0.008788 | 0.010273 | 0.289051 | 0.389468 | 0.136465 | 0.152606 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... | National Emergency Crisis and Disaster Managem... | http://covid19.ncema.gov.ae/en |
6966 | United Arab Emirates | ARE | 2021-03-11 | 0.012128 | 0.015115 | 0.006437 | 0.007986 | 0.011229 | 0.296974 | 0.400000 | 0.140423 | 0.166814 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... | National Emergency Crisis and Disaster Managem... | http://covid19.ncema.gov.ae/en |
6967 | United Arab Emirates | ARE | 2021-03-12 | 0.012272 | 0.015243 | 0.006608 | 0.003560 | 0.011531 | 0.300526 | 0.403398 | 0.144209 | 0.171292 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... | National Emergency Crisis and Disaster Managem... | http://covid19.ncema.gov.ae/en |
6968 | United Arab Emirates | ARE | 2021-03-13 | 0.012499 | 0.015473 | 0.006826 | 0.005609 | 0.011996 | 0.306058 | 0.409400 | 0.149028 | 0.178221 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... | National Emergency Crisis and Disaster Managem... | http://covid19.ncema.gov.ae/en |
6969 | United Arab Emirates | ARE | 2021-03-14 | 0.012796 | 0.015709 | 0.007232 | 0.007341 | 0.012412 | 0.313367 | 0.415629 | 0.157804 | 0.184395 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... | National Emergency Crisis and Disaster Managem... | http://covid19.ncema.gov.ae/en |
2367 rows × 15 columns