265 KiB
265 KiB
import sys
!{sys.executable} -m pip install kaggle
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install matplotlib
Requirement already satisfied: kaggle in /home/maciej/.local/lib/python3.8/site-packages (1.5.12) Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28) Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0) Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0) Requirement already satisfied: tqdm in /home/maciej/.local/lib/python3.8/site-packages (from kaggle) (4.59.0) Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8) Requirement already satisfied: python-dateutil in /usr/lib/python3/dist-packages (from kaggle) (2.7.3) Requirement already satisfied: python-slugify in /home/maciej/.local/lib/python3.8/site-packages (from kaggle) (4.0.1) Requirement already satisfied: text-unidecode>=1.3 in /home/maciej/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3) Requirement already satisfied: pandas in /home/maciej/.local/lib/python3.8/site-packages (1.2.3) Requirement already satisfied: numpy>=1.16.5 in /home/maciej/.local/lib/python3.8/site-packages (from pandas) (1.20.1) Requirement already satisfied: pytz>=2017.3 in /usr/lib/python3/dist-packages (from pandas) (2019.3) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/lib/python3/dist-packages (from pandas) (2.7.3) Requirement already satisfied: sklearn in /home/maciej/.local/lib/python3.8/site-packages (0.0) Requirement already satisfied: scikit-learn in /home/maciej/.local/lib/python3.8/site-packages (from sklearn) (0.24.1) Requirement already satisfied: threadpoolctl>=2.0.0 in /home/maciej/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (2.1.0) Requirement already satisfied: joblib>=0.11 in /home/maciej/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (1.0.1) Requirement already satisfied: numpy>=1.13.3 in /home/maciej/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) 
(1.20.1) Requirement already satisfied: scipy>=0.19.1 in /home/maciej/.local/lib/python3.8/site-packages (from scikit-learn->sklearn) (1.6.1) Collecting matplotlib Downloading matplotlib-3.3.4-cp38-cp38-manylinux1_x86_64.whl (11.6 MB) [K |████████████████████████████████| 11.6 MB 39 kB/s eta 0:00:01 |██████████▏ | 3.7 MB 5.2 MB/s eta 0:00:02 |██████████████▏ | 5.1 MB 5.2 MB/s eta 0:00:02 |██████████████▋ | 5.3 MB 5.2 MB/s eta 0:00:02 |████████████████████████▏ | 8.7 MB 2.1 MB/s eta 0:00:02 |████████████████████████████ | 10.1 MB 2.1 MB/s eta 0:00:01 [?25hCollecting cycler>=0.10 Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB) Requirement already satisfied: python-dateutil>=2.1 in /usr/lib/python3/dist-packages (from matplotlib) (2.7.3) Collecting kiwisolver>=1.0.1 Downloading kiwisolver-1.3.1-cp38-cp38-manylinux1_x86_64.whl (1.2 MB) [K |████████████████████████████████| 1.2 MB 9.9 MB/s eta 0:00:01 [?25hRequirement already satisfied: pillow>=6.2.0 in /usr/lib/python3/dist-packages (from matplotlib) (7.0.0) Requirement already satisfied: numpy>=1.15 in /home/maciej/.local/lib/python3.8/site-packages (from matplotlib) (1.20.1) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /home/maciej/.local/lib/python3.8/site-packages (from matplotlib) (2.4.7) Requirement already satisfied: six in /usr/lib/python3/dist-packages (from cycler>=0.10->matplotlib) (1.14.0) Installing collected packages: cycler, kiwisolver, matplotlib Successfully installed cycler-0.10.0 kiwisolver-1.3.1 matplotlib-3.3.4
# Zadanie 1
!kaggle datasets download -d szamil/who-suicide-statistics
Downloading who-suicide-statistics.zip to /home/maciej/Desktop/INZ/ium_434784 0%| | 0.00/304k [00:00<?, ?B/s] 100%|████████████████████████████████████████| 304k/304k [00:00<00:00, 17.9MB/s]
import pandas as pd

# Load the downloaded CSV into a DataFrame; the trailing bare expression
# makes the notebook render the table.
sc = pd.read_csv(filepath_or_buffer='who_suicide_statistics.csv')
sc
country | year | sex | age | suicides_no | population | |
---|---|---|---|---|---|---|
0 | Albania | 1985 | female | 15-24 years | NaN | 277900.0 |
1 | Albania | 1985 | female | 25-34 years | NaN | 246800.0 |
2 | Albania | 1985 | female | 35-54 years | NaN | 267500.0 |
3 | Albania | 1985 | female | 5-14 years | NaN | 298300.0 |
4 | Albania | 1985 | female | 55-74 years | NaN | 138700.0 |
... | ... | ... | ... | ... | ... | ... |
43771 | Zimbabwe | 1990 | male | 25-34 years | 150.0 | NaN |
43772 | Zimbabwe | 1990 | male | 35-54 years | 132.0 | NaN |
43773 | Zimbabwe | 1990 | male | 5-14 years | 6.0 | NaN |
43774 | Zimbabwe | 1990 | male | 55-74 years | 74.0 | NaN |
43775 | Zimbabwe | 1990 | male | 75+ years | 13.0 | NaN |
43776 rows × 6 columns
# Task 2
# Split into 3 subsets in a 6:2:2 ratio (train : validate : test).
import numpy as np

# Shuffle all rows once with a fixed seed so the split is reproducible.
shuffled = sc.sample(frac=1, random_state=42)
train_end = int(.6 * len(sc))
validate_end = int(.8 * len(sc))
# Positional slicing replaces np.split, which relies on the deprecated
# DataFrame.swapaxes. The boundaries are the same, so each subset holds the
# same rows as before. .copy() gives every subset its own data, so later
# in-place cleaning does not operate on a slice of `shuffled`.
train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()
# Task 3: report how many rows each subset holds, then summarise the training subset.
import matplotlib.pyplot as plt

# len() counts rows (samples); DataFrame.size would report rows x columns instead.
print("Train set: ", len(train))
print("Validate set: ", len(validate))
print("Test set: ", len(test))
# include='all' adds the non-numeric columns to the summary table.
print(train.describe(include='all'))
# Number of rows per country in the training subset.
print(train.country.value_counts())
Train set: 157590 Validate set: 52530 Test set: 52536 country year sex age \ count 26265 26265.000000 26265 26265 unique 141 NaN 2 6 top United States of America NaN female 55-74 years freq 298 NaN 13170 4420 mean NaN 1998.562688 NaN NaN std NaN 10.310004 NaN NaN min NaN 1979.000000 NaN NaN 25% NaN 1990.000000 NaN NaN 50% NaN 1999.000000 NaN NaN 75% NaN 2007.000000 NaN NaN max NaN 2016.000000 NaN NaN suicides_no population count 24919.000000 2.298300e+04 unique NaN NaN top NaN NaN freq NaN NaN mean 194.504113 1.684849e+06 std 789.159429 3.667651e+06 min 0.000000 2.780000e+02 25% 1.000000 8.678000e+04 50% 14.000000 3.861960e+05 75% 93.000000 1.333594e+06 max 22338.000000 4.380521e+07 United States of America 298 Sweden 292 Romania 292 Hungary 289 Iceland 283 ... Cabo Verde 10 Iraq 9 Monaco 9 Macau 8 Zimbabwe 6 Name: country, Length: 141, dtype: int64
# Task 3: summary statistics and per-country row counts for the validation subset.
validate_summary = validate.describe(include='all')
validate_country_counts = validate['country'].value_counts()
print(validate_summary)
print(validate_country_counts)
country year sex age suicides_no population count 8755 8755.000000 8755 8755 8299.000000 7.707000e+03 unique 141 NaN 2 6 NaN NaN top Mauritius NaN male 5-14 years NaN NaN freq 108 NaN 4461 1506 NaN NaN mean NaN 1998.390520 NaN NaN 197.230389 1.640237e+06 std NaN 10.441815 NaN NaN 880.620233 3.628585e+06 min NaN 1979.000000 NaN NaN 0.000000 2.590000e+02 25% NaN 1989.000000 NaN NaN 1.000000 8.303000e+04 50% NaN 1999.000000 NaN NaN 13.000000 3.798980e+05 75% NaN 2007.500000 NaN NaN 90.000000 1.307090e+06 max NaN 2016.000000 NaN NaN 21706.000000 4.324090e+07 Mauritius 108 Hong Kong SAR 106 United Kingdom 106 Russian Federation 103 Belgium 103 ... Tunisia 3 Iran (Islamic Rep of) 3 Macau 2 Iraq 2 Cabo Verde 1 Name: country, Length: 141, dtype: int64
# Task 3: summary statistics and per-country row counts for the test subset.
test_summary = test.describe(include='all')
test_country_counts = test['country'].value_counts()
print(test_summary)
print(test_country_counts)
country year sex age suicides_no population count 8756 8756.000000 8756 8756 8302.000000 7.626000e+03 unique 141 NaN 2 6 NaN NaN top Lithuania NaN female 75+ years NaN NaN freq 102 NaN 4424 1501 NaN NaN mean NaN 1998.433760 NaN NaN 185.833775 1.625640e+06 std NaN 10.320908 NaN NaN 749.047182 3.604071e+06 min NaN 1979.000000 NaN NaN 0.000000 2.790000e+02 25% NaN 1990.000000 NaN NaN 1.000000 8.113700e+04 50% NaN 1999.000000 NaN NaN 13.000000 3.660465e+05 75% NaN 2007.000000 NaN NaN 87.000000 1.241382e+06 max NaN 2016.000000 NaN NaN 17355.000000 4.299788e+07 Lithuania 102 Denmark 102 Israel 100 Luxembourg 100 Ireland 99 ... Saudi Arabia 3 Zimbabwe 2 Macau 2 Cabo Verde 1 Iraq 1 Name: country, Length: 141, dtype: int64
# Bar chart of rows per country in the training subset.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
train['country'].value_counts().plot.bar()
<AxesSubplot:>
# Bar chart of rows per country in the validation subset.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
validate['country'].value_counts().plot.bar()
<AxesSubplot:>
# Bar chart of rows per country in the test subset.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
test['country'].value_counts().plot.bar()
<AxesSubplot:>
# Bare expression: the notebook renders the training subset as a table.
train
country | year | sex | age | suicides_no | population | |
---|---|---|---|---|---|---|
10289 | Cuba | 1993 | female | 75+ years | 84.0 | 208800.0 |
18495 | Hungary | 2004 | female | 5-14 years | 2.0 | 544457.0 |
1930 | Aruba | 1987 | male | 55-74 years | 0.0 | 3118.0 |
20315 | Italy | 2001 | male | 75+ years | 560.0 | 1675192.0 |
15269 | Georgia | 1993 | female | 75+ years | NaN | 133600.0 |
... | ... | ... | ... | ... | ... | ... |
35206 | Singapore | 1981 | male | 55-74 years | 18.0 | 108600.0 |
33416 | Saint Kitts and Nevis | 1987 | male | 35-54 years | 0.0 | NaN |
7622 | Bulgaria | 2011 | female | 35-54 years | 41.0 | 1036483.0 |
37277 | Suriname | 1982 | female | 75+ years | 1.0 | 3100.0 |
13448 | El Salvador | 2014 | male | 35-54 years | 85.0 | 586412.0 |
26265 rows × 6 columns
# Task 4
# I believe no data normalization is required for my dataset.
# Task 5: count the missing values per column in each subset.
for subset in (train, validate, test):
    print(subset.isnull().sum())
country 0 year 0 sex 0 age 0 suicides_no 1346 population 3282 dtype: int64 country 0 year 0 sex 0 age 0 suicides_no 456 population 1048 dtype: int64 country 0 year 0 sex 0 age 0 suicides_no 454 population 1130 dtype: int64
# Remove every row that has a missing value from each subset, in place.
for subset in (train, validate, test):
    subset.dropna(inplace=True)

# Show the cleaned subsets.
for subset in (train, validate, test):
    print(subset)
country year sex age suicides_no population 10289 Cuba 1993 female 75+ years 84.0 208800.0 18495 Hungary 2004 female 5-14 years 2.0 544457.0 1930 Aruba 1987 male 55-74 years 0.0 3118.0 20315 Italy 2001 male 75+ years 560.0 1675192.0 23505 Luxembourg 1984 male 5-14 years 0.0 22100.0 ... ... ... ... ... ... ... 18031 Hong Kong SAR 2002 male 25-34 years 145.0 487800.0 35206 Singapore 1981 male 55-74 years 18.0 108600.0 7622 Bulgaria 2011 female 35-54 years 41.0 1036483.0 37277 Suriname 1982 female 75+ years 1.0 3100.0 13448 El Salvador 2014 male 35-54 years 85.0 586412.0 [21637 rows x 6 columns] country year sex age suicides_no population 19952 Israel 2009 male 35-54 years 91.0 836965.0 36424 South Africa 2001 female 55-74 years 6.0 2053745.0 23461 Luxembourg 1981 female 25-34 years 3.0 28300.0 16512 Grenada 2009 female 15-24 years 0.0 11815.0 12873 Ecuador 2015 male 5-14 years 35.0 1569519.0 ... ... ... ... ... ... ... 7523 Bulgaria 2002 male 75+ years 181.0 198560.0 42715 Uruguay 2009 male 25-34 years 79.0 238754.0 36799 Spain 1995 male 25-34 years 398.0 3196300.0 1559 Armenia 1986 male 75+ years 2.0 29000.0 13313 El Salvador 2003 female 75+ years 1.0 71062.0 [7251 rows x 6 columns] country year sex age suicides_no population 13528 Estonia 1988 female 55-74 years 40.0 169100.0 25017 Mauritius 1991 male 5-14 years 0.0 103900.0 19317 Ireland 1992 male 5-14 years 3.0 339800.0 7928 Canada 1999 male 35-54 years 1442.0 4743615.0 2107 Aruba 2011 male 25-34 years 0.0 5440.0 ... ... ... ... ... ... ... 37194 Sri Lanka 2001 male 15-24 years 508.0 1811743.0 16850 Guatemala 1984 female 35-54 years 0.0 596000.0 6265 Brazil 1984 female 25-34 years 233.0 10566400.0 860 Antigua and Barbuda 1995 male 35-54 years 0.0 7809.0 15795 Germany 2011 female 5-14 years 9.0 3641215.0 [7172 rows x 6 columns]