ium_434784/UM1.ipynb
2021-03-22 00:03:37 +01:00

265 KiB
Raw Blame History

import sys
!{sys.executable} -m pip install kaggle
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install matplotlib
Requirement already satisfied: kaggle in /home/maciej/.local/lib/python3.8/site-packages (1.5.12)
Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)
Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)
Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0)
Requirement already satisfied: tqdm in /home/maciej/.local/lib/python3.8/site-packages (from kaggle) (4.59.0)
Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)
Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3)
Requirement already satisfied: python-slugify in /home/maciej/.local/lib/python3.8/site-packages (from kaggle) (4.0.1)
Requirement already satisfied: text-unidecode>=1.3 in /home/maciej/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: pandas in /home/maciej/.local/lib/python3.8/site-packages (1.2.3)
Requirement already satisfied: numpy>=1.16.5 in /home/maciej/.local/lib/python3.8/site-packages (from pandas) (1.20.1)
Requirement already satisfied: pytz>=2017.3 in /usr/lib/python3/dist-packages (from pandas) (2019.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/lib/python3/dist-packages (from pandas) (2.7.3)
Requirement already satisfied: sklearn in /home/maciej/.local/lib/python3.8/site-packages (0.0)
Requirement already satisfied: scikit-learn in /home/maciej/.local/lib/python3.8/site-packages (from sklearn) (0.24.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/maciej/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (2.1.0)
Requirement already satisfied: joblib>=0.11 in /home/maciej/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (1.0.1)
Requirement already satisfied: numpy>=1.13.3 in /home/maciej/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (1.20.1)
Requirement already satisfied: scipy>=0.19.1 in /home/maciej/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (1.6.1)
Collecting matplotlib
  Downloading matplotlib-3.3.4-cp38-cp38-manylinux1_x86_64.whl (11.6 MB)
     |████████████████████████████████| 11.6 MB 39 kB/s  eta 0:00:01    |██████████▏                     | 3.7 MB 5.2 MB/s eta 0:00:02     |██████████████▏                 | 5.1 MB 5.2 MB/s eta 0:00:02     |██████████████▋                 | 5.3 MB 5.2 MB/s eta 0:00:02     |████████████████████████▏       | 8.7 MB 2.1 MB/s eta 0:00:02     |████████████████████████████    | 10.1 MB 2.1 MB/s eta 0:00:01
[?25hCollecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Requirement already satisfied: python-dateutil>=2.1 in /usr/lib/python3/dist-packages (from matplotlib) (2.7.3)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.1-cp38-cp38-manylinux1_x86_64.whl (1.2 MB)
     |████████████████████████████████| 1.2 MB 9.9 MB/s eta 0:00:01
[?25hRequirement already satisfied: pillow>=6.2.0 in /usr/lib/python3/dist-packages (from matplotlib) (7.0.0)
Requirement already satisfied: numpy>=1.15 in /home/maciej/.local/lib/python3.8/site-packages (from matplotlib) (1.20.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /home/maciej/.local/lib/python3.8/site-packages (from matplotlib) (2.4.7)
Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib) (1.14.0)
Installing collected packages: cycler, kiwisolver, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.3.1 matplotlib-3.3.4
# Zadanie 1
!kaggle datasets download -d szamil/who-suicide-statistics
Downloading who-suicide-statistics.zip to /home/maciej/Desktop/INZ/ium_434784

  0%|                                                | 0.00/304k [00:00<?, ?B/s]

100%|████████████████████████████████████████| 304k/304k [00:00<00:00, 17.9MB/s]
import pandas as pd
# Load the downloaded CSV into a DataFrame.
sc = pd.read_csv('who_suicide_statistics.csv')
# Bare expression: the notebook renders the frame as the cell output.
sc
country year sex age suicides_no population
0 Albania 1985 female 15-24 years NaN 277900.0
1 Albania 1985 female 25-34 years NaN 246800.0
2 Albania 1985 female 35-54 years NaN 267500.0
3 Albania 1985 female 5-14 years NaN 298300.0
4 Albania 1985 female 55-74 years NaN 138700.0
... ... ... ... ... ... ...
43771 Zimbabwe 1990 male 25-34 years 150.0 NaN
43772 Zimbabwe 1990 male 35-54 years 132.0 NaN
43773 Zimbabwe 1990 male 5-14 years 6.0 NaN
43774 Zimbabwe 1990 male 55-74 years 74.0 NaN
43775 Zimbabwe 1990 male 75+ years 13.0 NaN

43776 rows × 6 columns

# Task 2
# Split into 3 subsets in a 6:2:2 ratio (train : validate : test).
import numpy as np
# Shuffle all rows once; the fixed seed keeps the split reproducible.
shuffled = sc.sample(frac=1, random_state=42)
train_end = int(.6 * len(sc))
validate_end = int(.8 * len(sc))
# Positional slices select the same rows np.split() would, without going
# through the deprecated DataFrame.swapaxes. .copy() makes every subset an
# independent frame, so later in-place operations do not act on a slice of
# the shuffled frame.
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()
# Task 3
import matplotlib.pyplot as plt
# Report the number of rows in each subset. DataFrame.size would return
# rows * columns, which overstates the subset sizes by the column count.
print("Train set: ", len(train))
print("Validate set: ", len(validate))
print("Test set: ", len(test))
# Summary statistics for every column of the training subset.
print(train.describe(include='all'))
# Number of training rows per country.
print(train.country.value_counts())
Train set:  157590
Validate set:  52530
Test set:  52536
                         country          year     sex          age  \
count                      26265  26265.000000   26265        26265   
unique                       141           NaN       2            6   
top     United States of America           NaN  female  55-74 years   
freq                         298           NaN   13170         4420   
mean                         NaN   1998.562688     NaN          NaN   
std                          NaN     10.310004     NaN          NaN   
min                          NaN   1979.000000     NaN          NaN   
25%                          NaN   1990.000000     NaN          NaN   
50%                          NaN   1999.000000     NaN          NaN   
75%                          NaN   2007.000000     NaN          NaN   
max                          NaN   2016.000000     NaN          NaN   

         suicides_no    population  
count   24919.000000  2.298300e+04  
unique           NaN           NaN  
top              NaN           NaN  
freq             NaN           NaN  
mean      194.504113  1.684849e+06  
std       789.159429  3.667651e+06  
min         0.000000  2.780000e+02  
25%         1.000000  8.678000e+04  
50%        14.000000  3.861960e+05  
75%        93.000000  1.333594e+06  
max     22338.000000  4.380521e+07  
United States of America    298
Sweden                      292
Romania                     292
Hungary                     289
Iceland                     283
                           ... 
Cabo Verde                   10
Iraq                          9
Monaco                        9
Macau                         8
Zimbabwe                      6
Name: country, Length: 141, dtype: int64
# Task 3
# Validation subset: summary statistics for every column, then the number
# of rows per country, printed one after the other.
print(
    validate.describe(include='all'),
    validate['country'].value_counts(),
    sep='\n',
)
          country         year   sex         age   suicides_no    population
count        8755  8755.000000  8755        8755   8299.000000  7.707000e+03
unique        141          NaN     2           6           NaN           NaN
top     Mauritius          NaN  male  5-14 years           NaN           NaN
freq          108          NaN  4461        1506           NaN           NaN
mean          NaN  1998.390520   NaN         NaN    197.230389  1.640237e+06
std           NaN    10.441815   NaN         NaN    880.620233  3.628585e+06
min           NaN  1979.000000   NaN         NaN      0.000000  2.590000e+02
25%           NaN  1989.000000   NaN         NaN      1.000000  8.303000e+04
50%           NaN  1999.000000   NaN         NaN     13.000000  3.798980e+05
75%           NaN  2007.500000   NaN         NaN     90.000000  1.307090e+06
max           NaN  2016.000000   NaN         NaN  21706.000000  4.324090e+07
Mauritius                108
Hong Kong SAR            106
United Kingdom           106
Russian Federation       103
Belgium                  103
                        ... 
Tunisia                    3
Iran (Islamic Rep of)      3
Macau                      2
Iraq                       2
Cabo Verde                 1
Name: country, Length: 141, dtype: int64
# Task 3
# Test subset: summary statistics for every column, then the number of
# rows per country, printed one after the other.
print(
    test.describe(include='all'),
    test['country'].value_counts(),
    sep='\n',
)
          country         year     sex        age   suicides_no    population
count        8756  8756.000000    8756       8756   8302.000000  7.626000e+03
unique        141          NaN       2          6           NaN           NaN
top     Lithuania          NaN  female  75+ years           NaN           NaN
freq          102          NaN    4424       1501           NaN           NaN
mean          NaN  1998.433760     NaN        NaN    185.833775  1.625640e+06
std           NaN    10.320908     NaN        NaN    749.047182  3.604071e+06
min           NaN  1979.000000     NaN        NaN      0.000000  2.790000e+02
25%           NaN  1990.000000     NaN        NaN      1.000000  8.113700e+04
50%           NaN  1999.000000     NaN        NaN     13.000000  3.660465e+05
75%           NaN  2007.000000     NaN        NaN     87.000000  1.241382e+06
max           NaN  2016.000000     NaN        NaN  17355.000000  4.299788e+07
Lithuania       102
Denmark         102
Israel          100
Luxembourg      100
Ireland          99
               ... 
Saudi Arabia      3
Zimbabwe          2
Macau             2
Cabo Verde        1
Iraq              1
Name: country, Length: 141, dtype: int64
# Bar chart of the number of training rows per country. Series.value_counts()
# replaces the deprecated top-level pd.value_counts().
train['country'].value_counts().plot.bar()
<AxesSubplot:>
# Bar chart of the number of validation rows per country. Series.value_counts()
# replaces the deprecated top-level pd.value_counts().
validate['country'].value_counts().plot.bar()
<AxesSubplot:>
# Bar chart of the number of test rows per country. Series.value_counts()
# replaces the deprecated top-level pd.value_counts().
test['country'].value_counts().plot.bar()
<AxesSubplot:>
# Bare expression: the notebook renders the training subset as the cell output.
train
country year sex age suicides_no population
10289 Cuba 1993 female 75+ years 84.0 208800.0
18495 Hungary 2004 female 5-14 years 2.0 544457.0
1930 Aruba 1987 male 55-74 years 0.0 3118.0
20315 Italy 2001 male 75+ years 560.0 1675192.0
15269 Georgia 1993 female 75+ years NaN 133600.0
... ... ... ... ... ... ...
35206 Singapore 1981 male 55-74 years 18.0 108600.0
33416 Saint Kitts and Nevis 1987 male 35-54 years 0.0 NaN
7622 Bulgaria 2011 female 35-54 years 41.0 1036483.0
37277 Suriname 1982 female 75+ years 1.0 3100.0
13448 El Salvador 2014 male 35-54 years 85.0 586412.0

26265 rows × 6 columns

# Task 4
# It seems to me that my dataset does not require any data normalization.
# Task 5
# Per-column count of missing values, printed for the train, validate and
# test subsets in that order.
print(
    *(subset.isnull().sum() for subset in (train, validate, test)),
    sep='\n',
)
country           0
year              0
sex               0
age               0
suicides_no    1346
population     3282
dtype: int64
country           0
year              0
sex               0
age               0
suicides_no     456
population     1048
dtype: int64
country           0
year              0
sex               0
age               0
suicides_no     454
population     1130
dtype: int64
# Remove, in place, every row that has at least one missing value from each
# of the three subsets, then print the cleaned frames in the same order.
for subset in (train, validate, test):
    subset.dropna(inplace=True)
print(train, validate, test, sep='\n')
             country  year     sex          age  suicides_no  population
10289           Cuba  1993  female    75+ years         84.0    208800.0
18495        Hungary  2004  female   5-14 years          2.0    544457.0
1930           Aruba  1987    male  55-74 years          0.0      3118.0
20315          Italy  2001    male    75+ years        560.0   1675192.0
23505     Luxembourg  1984    male   5-14 years          0.0     22100.0
...              ...   ...     ...          ...          ...         ...
18031  Hong Kong SAR  2002    male  25-34 years        145.0    487800.0
35206      Singapore  1981    male  55-74 years         18.0    108600.0
7622        Bulgaria  2011  female  35-54 years         41.0   1036483.0
37277       Suriname  1982  female    75+ years          1.0      3100.0
13448    El Salvador  2014    male  35-54 years         85.0    586412.0

[21637 rows x 6 columns]
            country  year     sex          age  suicides_no  population
19952        Israel  2009    male  35-54 years         91.0    836965.0
36424  South Africa  2001  female  55-74 years          6.0   2053745.0
23461    Luxembourg  1981  female  25-34 years          3.0     28300.0
16512       Grenada  2009  female  15-24 years          0.0     11815.0
12873       Ecuador  2015    male   5-14 years         35.0   1569519.0
...             ...   ...     ...          ...          ...         ...
7523       Bulgaria  2002    male    75+ years        181.0    198560.0
42715       Uruguay  2009    male  25-34 years         79.0    238754.0
36799         Spain  1995    male  25-34 years        398.0   3196300.0
1559        Armenia  1986    male    75+ years          2.0     29000.0
13313   El Salvador  2003  female    75+ years          1.0     71062.0

[7251 rows x 6 columns]
                   country  year     sex          age  suicides_no  population
13528              Estonia  1988  female  55-74 years         40.0    169100.0
25017            Mauritius  1991    male   5-14 years          0.0    103900.0
19317              Ireland  1992    male   5-14 years          3.0    339800.0
7928                Canada  1999    male  35-54 years       1442.0   4743615.0
2107                 Aruba  2011    male  25-34 years          0.0      5440.0
...                    ...   ...     ...          ...          ...         ...
37194            Sri Lanka  2001    male  15-24 years        508.0   1811743.0
16850            Guatemala  1984  female  35-54 years          0.0    596000.0
6265                Brazil  1984  female  25-34 years        233.0  10566400.0
860    Antigua and Barbuda  1995    male  35-54 years          0.0      7809.0
15795              Germany  2011  female   5-14 years          9.0   3641215.0

[7172 rows x 6 columns]