ium_434804/Zadanie_1.ipynb
2021-03-20 18:07:33 +01:00

780 KiB
Raw Blame History

# !pip install kaggle
# !pip install pandas
# !kaggle datasets download -d gpreda/covid-world-vaccination-progress
Downloading covid-world-vaccination-progress.zip to E:\Na studia\Magisterka\Inżynieria uczenia maszynowego\IUM_434804

  0%|          | 0.00/160k [00:00<?, ?B/s]
100%|##########| 160k/160k [00:00<00:00, 1.20MB/s]
100%|##########| 160k/160k [00:00<00:00, 1.19MB/s]
import zipfile
# Unpack the downloaded Kaggle archive into the current working directory.
with zipfile.ZipFile(file='covid-world-vaccination-progress.zip', mode='r') as archive:
    archive.extractall(path=".")
import numpy as np
import pandas as pd
# Load the extracted vaccination records into a DataFrame.
df = pd.read_csv(filepath_or_buffer='country_vaccinations.csv')
# Split the data into train/validate/test (6:2:2) after shuffling the rows.
# Positional slicing is used instead of np.split, because np.split on a
# DataFrame goes through DataFrame.swapaxes, which is deprecated in recent
# pandas releases.
# NOTE: the shuffle is unseeded, so every run produces a different split;
# pass random_state=<int> to sample() if a reproducible split is needed.
shuffled = df.sample(frac=1)
train_end = int(.6 * len(df))
validate_end = int(.8 * len(df))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]
# Print the number of rows (samples) in each frame. DataFrame.size would
# report rows * columns, i.e. the number of cells, not the set size.
print("Whole set size".ljust(20), len(df))
print("Train set size: ".ljust(20), len(train))
print("Validate set size: ".ljust(20), len(validate))
print("Test set size: ".ljust(20), len(test))
Whole set size       110055
Train set size:      66030
Validate set size:   22005
Test set size:       22020
df.describe(include='all')
country iso_code date total_vaccinations people_vaccinated people_fully_vaccinated daily_vaccinations_raw daily_vaccinations total_vaccinations_per_hundred people_vaccinated_per_hundred people_fully_vaccinated_per_hundred daily_vaccinations_per_million vaccines source_name source_website
count 7337 7337 7337 4.552000e+03 4.053000e+03 2.749000e+03 3.830000e+03 7.150000e+03 4552.000000 4053.000000 2749.000000 7150.000000 7337 7337 7337
unique 150 150 97 NaN NaN NaN NaN NaN NaN NaN NaN NaN 25 91 145
top Canada GBR 2021-03-09 NaN NaN NaN NaN NaN NaN NaN NaN NaN Moderna, Oxford/AstraZeneca, Pfizer/BioNTech Ministry of Health https://coronavirus.data.gov.uk/details/health...
freq 96 96 129 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1798 2329 480
mean NaN NaN NaN 2.361121e+06 1.918598e+06 7.999520e+05 8.744129e+04 5.825144e+04 9.398541 7.237774 3.361342 2675.625594 NaN NaN NaN
std NaN NaN NaN 8.421579e+06 6.249484e+06 3.230805e+06 2.693155e+05 1.992295e+05 16.995766 11.614673 7.262965 4229.243670 NaN NaN NaN
min NaN NaN NaN 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 1.000000e+00 0.000000 0.000000 0.000000 0.000000 NaN NaN NaN
25% NaN NaN NaN 3.741475e+04 3.457400e+04 1.799500e+04 2.732000e+03 9.882500e+02 0.717500 0.720000 0.370000 355.000000 NaN NaN NaN
50% NaN NaN NaN 2.536690e+05 2.334230e+05 9.966600e+04 1.365700e+04 5.952500e+03 3.465000 3.050000 1.360000 1247.000000 NaN NaN NaN
75% NaN NaN NaN 1.195748e+06 9.467810e+05 4.625030e+05 5.718200e+04 2.680500e+04 10.080000 7.890000 3.000000 3026.750000 NaN NaN NaN
max NaN NaN NaN 1.183138e+08 7.723006e+07 4.193463e+07 4.575496e+06 2.541597e+06 151.860000 88.790000 63.070000 54264.000000 NaN NaN NaN
# For every column, draw a bar chart of how often each distinct value occurs,
# then print the column name followed by the Axes object returned by pandas.
# NOTE(review): Series.plot is called without an explicit `ax`, so successive
# charts may end up drawn onto the same Axes - confirm, and create a separate
# figure per column if that is the case.
for column_name in df.columns:
    frequencies = df[column_name].value_counts()
    axes = frequencies.plot(kind="bar", figsize=(30, 10))
    print("\n", column_name)
    print(axes)
 country
AxesSubplot(0.125,0.125;0.775x0.755)

 iso_code
AxesSubplot(0.125,0.125;0.775x0.755)

 date
AxesSubplot(0.125,0.125;0.775x0.755)

 total_vaccinations
AxesSubplot(0.125,0.125;0.775x0.755)

 people_vaccinated
AxesSubplot(0.125,0.125;0.775x0.755)

 people_fully_vaccinated
AxesSubplot(0.125,0.125;0.775x0.755)

 daily_vaccinations_raw
AxesSubplot(0.125,0.125;0.775x0.755)

 daily_vaccinations
AxesSubplot(0.125,0.125;0.775x0.755)

 total_vaccinations_per_hundred
AxesSubplot(0.125,0.125;0.775x0.755)

 people_vaccinated_per_hundred
AxesSubplot(0.125,0.125;0.775x0.755)

 people_fully_vaccinated_per_hundred
AxesSubplot(0.125,0.125;0.775x0.755)

 daily_vaccinations_per_million
AxesSubplot(0.125,0.125;0.775x0.755)

 vaccines
AxesSubplot(0.125,0.125;0.775x0.755)

 source_name
AxesSubplot(0.125,0.125;0.775x0.755)

 source_website
AxesSubplot(0.125,0.125;0.775x0.755)
# !pip install scikit-learn
from sklearn import preprocessing
# Min-max normalization of the numeric (float64) columns.
numeric_frame = df.select_dtypes(include='float64')  # numeric columns only
numeric_values = numeric_frame.values
numeric_columns = numeric_frame.columns
min_max_scaler = preprocessing.MinMaxScaler()
# Fit on the selected numeric values (the original passed an undefined name
# `values` here, which raises NameError on a fresh kernel).
x_scaled = min_max_scaler.fit_transform(numeric_values)
# Reuse df's index so the column assignment below aligns row-for-row even
# when df does not carry a default RangeIndex.
df_normalized = pd.DataFrame(x_scaled, columns=numeric_columns, index=df.index)
# Overwrite the raw numeric columns of the original frame with their
# normalized counterparts; non-numeric columns are left untouched.
for col in numeric_columns:
    df[col] = df_normalized[col]
country iso_code date total_vaccinations people_vaccinated people_fully_vaccinated daily_vaccinations_raw daily_vaccinations total_vaccinations_per_hundred people_vaccinated_per_hundred people_fully_vaccinated_per_hundred daily_vaccinations_per_million vaccines source_name source_website
0 Afghanistan AFG 2021-02-22 0.0 0.0 NaN NaN NaN 0.0 0.0 NaN NaN Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
1 Afghanistan AFG 2021-02-23 NaN NaN NaN NaN 0.000537 NaN NaN NaN 0.000645 Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
2 Afghanistan AFG 2021-02-24 NaN NaN NaN NaN 0.000537 NaN NaN NaN 0.000645 Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
3 Afghanistan AFG 2021-02-25 NaN NaN NaN NaN 0.000537 NaN NaN NaN 0.000645 Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
4 Afghanistan AFG 2021-02-26 NaN NaN NaN NaN 0.000537 NaN NaN NaN 0.000645 Oxford/AstraZeneca Government of Afghanistan http://www.xinhuanet.com/english/asiapacific/2...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7332 Zimbabwe ZWE 2021-03-15 NaN NaN NaN NaN NaN NaN NaN NaN NaN Sinopharm/Beijing Ministry of Health https://twitter.com/MoHCCZim/status/1373023610...
7333 Zimbabwe ZWE 2021-03-16 NaN NaN NaN NaN NaN NaN NaN NaN NaN Sinopharm/Beijing Ministry of Health https://twitter.com/MoHCCZim/status/1373023610...
7334 Zimbabwe ZWE 2021-03-17 NaN NaN NaN NaN NaN NaN NaN NaN NaN Sinopharm/Beijing Ministry of Health https://twitter.com/MoHCCZim/status/1373023610...
7335 Zimbabwe ZWE 2021-03-18 NaN NaN NaN NaN NaN NaN NaN NaN NaN Sinopharm/Beijing Ministry of Health https://twitter.com/MoHCCZim/status/1373023610...
7336 Zimbabwe ZWE 2021-03-19 NaN NaN NaN NaN NaN NaN NaN NaN NaN Sinopharm/Beijing Ministry of Health https://twitter.com/MoHCCZim/status/1373023610...

7337 rows × 15 columns

# Show only the rows that contain no NaN in any column. dropna returns a new
# frame; the result is not assigned, so `df` itself is left unchanged.
df.dropna(axis=0, how="any")
country iso_code date total_vaccinations people_vaccinated people_fully_vaccinated daily_vaccinations_raw daily_vaccinations total_vaccinations_per_hundred people_vaccinated_per_hundred people_fully_vaccinated_per_hundred daily_vaccinations_per_million vaccines source_name source_website
46 Albania ALB 2021-02-02 0.000027 0.000033 0.000015 0.000295 0.000100 0.000751 0.000906 0.000344 0.001622 Pfizer/BioNTech Ministry of Health https://shendetesia.gov.al/covid19-ministria-e...
234 Antigua and Barbuda ATG 2021-03-13 0.002351 0.003385 0.000409 0.003888 0.004605 0.004030 0.006229 0.000688 0.004773 Oxford/AstraZeneca Ministry of Health https://www.facebook.com/investingforwellness/...
235 Antigua and Barbuda ATG 2021-03-14 0.002474 0.003454 0.000629 0.003033 0.004431 0.004235 0.006342 0.001033 0.004589 Oxford/AstraZeneca Ministry of Health https://www.facebook.com/investingforwellness/...
236 Antigua and Barbuda ATG 2021-03-15 0.002548 0.003514 0.000730 0.001849 0.004376 0.004371 0.006455 0.001033 0.004533 Oxford/AstraZeneca Ministry of Health https://www.facebook.com/investingforwellness/...
237 Argentina ARG 2020-12-29 0.002583 0.003530 0.000800 0.000865 0.004069 0.004440 0.006569 0.001205 0.004220 Oxford/AstraZeneca, Sinopharm/Beijing, Sputnik V Ministry of Health http://datos.salud.gob.ar/dataset/vacunas-cont...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6965 United Arab Emirates ARE 2021-03-10 0.011805 0.014719 0.006252 0.008788 0.010273 0.289051 0.389468 0.136465 0.152606 Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... National Emergency Crisis and Disaster Managem... http://covid19.ncema.gov.ae/en
6966 United Arab Emirates ARE 2021-03-11 0.012128 0.015115 0.006437 0.007986 0.011229 0.296974 0.400000 0.140423 0.166814 Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... National Emergency Crisis and Disaster Managem... http://covid19.ncema.gov.ae/en
6967 United Arab Emirates ARE 2021-03-12 0.012272 0.015243 0.006608 0.003560 0.011531 0.300526 0.403398 0.144209 0.171292 Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... National Emergency Crisis and Disaster Managem... http://covid19.ncema.gov.ae/en
6968 United Arab Emirates ARE 2021-03-13 0.012499 0.015473 0.006826 0.005609 0.011996 0.306058 0.409400 0.149028 0.178221 Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... National Emergency Crisis and Disaster Managem... http://covid19.ncema.gov.ae/en
6969 United Arab Emirates ARE 2021-03-14 0.012796 0.015709 0.007232 0.007341 0.012412 0.313367 0.415629 0.157804 0.184395 Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... National Emergency Crisis and Disaster Managem... http://covid19.ncema.gov.ae/en

2367 rows × 15 columns