40 KiB
40 KiB
!pip install --user kaggle
Requirement already satisfied: kaggle in /home/osboxes/.local/lib/python3.8/site-packages (1.5.12) Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0) Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8) Requirement already satisfied: python-dateutil in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (2.8.2) Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28) Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0) Requirement already satisfied: tqdm in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (4.63.0) Requirement already satisfied: python-slugify in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (6.1.1) Requirement already satisfied: text-unidecode>=1.3 in /home/osboxes/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)
!pip install --user pandas
Requirement already satisfied: pandas in /home/osboxes/.local/lib/python3.8/site-packages (1.4.1) Requirement already satisfied: python-dateutil>=2.8.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (2022.1) Requirement already satisfied: numpy>=1.18.5; platform_machine != "aarch64" and platform_machine != "arm64" and python_version < "3.10" in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (1.22.3) Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->pandas) (1.14.0)
!mkdir ~/.kaggle
mkdir: cannot create directory ‘/home/osboxes/.kaggle’: File exists
!cp /home/osboxes/Downloads/kaggle.json /home/osboxes/.kaggle/kaggle.json
!kaggle datasets download -d slehkyi/extended-football-stats-for-european-leagues-xg
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/osboxes/.kaggle/kaggle.json' Downloading extended-football-stats-for-european-leagues-xg.zip to /home/osboxes/jupyter_dir/notebooks/IUM 73%|███████████████████████████▋ | 1.00M/1.37M [00:00<00:00, 4.92MB/s] 100%|██████████████████████████████████████| 1.37M/1.37M [00:00<00:00, 6.55MB/s]
!unzip -o extended-football-stats-for-european-leagues-xg.zip
Archive: extended-football-stats-for-european-leagues-xg.zip inflating: understat.com.csv inflating: understat_per_game.csv
!pip install --user seaborn
Collecting seaborn Downloading seaborn-0.11.2-py3-none-any.whl (292 kB) [K |████████████████████████████████| 292 kB 2.0 MB/s eta 0:00:01 [?25hRequirement already satisfied: pandas>=0.23 in /home/osboxes/.local/lib/python3.8/site-packages (from seaborn) (1.4.1) Requirement already satisfied: numpy>=1.15 in /home/osboxes/.local/lib/python3.8/site-packages (from seaborn) (1.22.3) Collecting matplotlib>=2.2 Downloading matplotlib-3.5.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.3 MB) [K |████████████████████████████████| 11.3 MB 5.7 MB/s eta 0:00:01 |██████████████████████ | 7.7 MB 5.7 MB/s eta 0:00:01 |█████████████████████████ | 8.8 MB 5.7 MB/s eta 0:00:01 [?25hCollecting scipy>=1.0 Downloading scipy-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.6 MB) [K |████████████████████████████████| 41.6 MB 27 kB/s eta 0:00:011 |███▊ | 4.9 MB 5.0 MB/s eta 0:00:08 |██████ | 7.8 MB 7.9 MB/s eta 0:00:05 |██████████████████ | 23.3 MB 4.5 MB/s eta 0:00:05 |██████████████████▎ | 23.7 MB 4.5 MB/s eta 0:00:04 |█████████████████████▍ | 27.8 MB 10.0 MB/s eta 0:00:02 |███████████████████████████▏ | 35.3 MB 7.2 MB/s eta 0:00:01 |████████████████████████████▍ | 36.9 MB 7.2 MB/s eta 0:00:01 [?25hRequirement already satisfied: python-dateutil>=2.8.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas>=0.23->seaborn) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas>=0.23->seaborn) (2022.1) Collecting fonttools>=4.22.0 Downloading fonttools-4.31.1-py3-none-any.whl (899 kB) [K |████████████████████████████████| 899 kB 3.2 MB/s eta 0:00:01 [?25hRequirement already satisfied: pillow>=6.2.0 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (7.0.0) Requirement already satisfied: pyparsing>=2.2.1 in /home/osboxes/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (3.0.7) Collecting kiwisolver>=1.0.1 Downloading kiwisolver-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.2 MB) [K |████████████████████████████████| 1.2 MB 12.4 MB/s eta 0:00:01 [?25hRequirement already satisfied: packaging>=20.0 in /home/osboxes/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (21.3) Collecting cycler>=0.10 Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB) Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->pandas>=0.23->seaborn) (1.14.0) Installing collected packages: fonttools, kiwisolver, cycler, matplotlib, scipy, seaborn Successfully installed cycler-0.11.0 fonttools-4.31.1 kiwisolver-1.4.0 matplotlib-3.5.1 scipy-1.8.0 seaborn-0.11.2
!head -n 5 understat.csv
,,position,team,matches,wins,draws,loses,scored,missed,pts,xG,xG_diff,npxG,xGA,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff La_liga,2014,1,Barcelona,38,30,4,4,110,21,94,102.98015200000002,-7.019847999999982,97.77721200000002,28.44429270000001,7.444292700000009,24.727906700000005,73.04930530000001,5.683534703382723,16.367592989090525,489,114,94.08129999999998,0.0812999999999846 La_liga,2014,2,Real Madrid,38,30,2,6,118,38,92,95.76624299999999,-22.23375700000001,86.10389499999998,42.607198000000004,4.607198000000004,38.890805,47.213090000000015,10.209085456325049,12.929510106152211,351,153,81.7489,-10.251099999999994 La_liga,2014,3,Atletico Madrid,38,23,9,6,67,29,78,57.047670000000004,-9.952329999999996,52.588007999999995,29.069107100000004,0.06910710000000364,26.839271100000005,25.748736900000008,8.982028430893806,9.237090640679776,197,123,73.13530000000003,-4.864699999999971 La_liga,2014,4,Valencia,38,22,11,5,70,32,77,55.06250000000001,-14.937499999999993,49.703978,39.392571999999994,7.392571999999994,33.44647700000001,16.257500999999998,8.709827299105736,7.870224725817145,203,172,63.7068,-13.293199999999999
import pandas as pd
understat = pd.read_csv('understat.csv')
understat
league | year | position | team | matches | wins | draws | loses | scored | missed | ... | xGA | xGA_diff | npxGA | npxGD | ppda_coef | oppda_coef | deep | deep_allowed | xpts | xpts_diff | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | La_liga | 2014 | 1 | Barcelona | 38 | 30 | 4 | 4 | 110 | 21 | ... | 28.444293 | 7.444293 | 24.727907 | 73.049305 | 5.683535 | 16.367593 | 489 | 114 | 94.0813 | 0.0813 |
1 | La_liga | 2014 | 2 | Real Madrid | 38 | 30 | 2 | 6 | 118 | 38 | ... | 42.607198 | 4.607198 | 38.890805 | 47.213090 | 10.209085 | 12.929510 | 351 | 153 | 81.7489 | -10.2511 |
2 | La_liga | 2014 | 3 | Atletico Madrid | 38 | 23 | 9 | 6 | 67 | 29 | ... | 29.069107 | 0.069107 | 26.839271 | 25.748737 | 8.982028 | 9.237091 | 197 | 123 | 73.1353 | -4.8647 |
3 | La_liga | 2014 | 4 | Valencia | 38 | 22 | 11 | 5 | 70 | 32 | ... | 39.392572 | 7.392572 | 33.446477 | 16.257501 | 8.709827 | 7.870225 | 203 | 172 | 63.7068 | -13.2932 |
4 | La_liga | 2014 | 5 | Sevilla | 38 | 23 | 7 | 8 | 71 | 45 | ... | 47.862742 | 2.862742 | 41.916529 | 20.178070 | 8.276148 | 9.477805 | 305 | 168 | 67.3867 | -8.6133 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
679 | RFPL | 2019 | 12 | PFC Sochi | 30 | 8 | 9 | 13 | 40 | 39 | ... | 38.850259 | -0.149741 | 32.780898 | -0.096048 | 12.838079 | 10.562327 | 175 | 206 | 38.6587 | 5.6587 |
680 | RFPL | 2019 | 13 | FK Akhmat | 30 | 7 | 10 | 13 | 27 | 46 | ... | 40.626196 | -5.373804 | 38.363370 | -10.495864 | 11.199502 | 10.806357 | 124 | 206 | 36.5424 | 5.5424 |
681 | RFPL | 2019 | 14 | Krylya Sovetov Samara | 30 | 8 | 7 | 15 | 33 | 40 | ... | 42.980693 | 2.980693 | 37.550114 | -7.777201 | 11.949903 | 10.080858 | 103 | 215 | 36.3363 | 5.3363 |
682 | RFPL | 2019 | 15 | FC Tambov | 30 | 9 | 4 | 17 | 37 | 41 | ... | 39.747938 | -1.252062 | 34.468003 | -12.231948 | 14.666049 | 9.192768 | 150 | 270 | 29.2413 | -1.7587 |
683 | RFPL | 2019 | 16 | FC Orenburg | 30 | 7 | 6 | 17 | 28 | 52 | ... | 37.169797 | -14.830203 | 32.644130 | 0.201339 | 12.830908 | 9.464581 | 153 | 215 | 39.2364 | 12.2364 |
684 rows × 24 columns
understat.describe(include='all')
league | year | position | team | matches | wins | draws | loses | scored | missed | ... | xGA | xGA_diff | npxGA | npxGD | ppda_coef | oppda_coef | deep | deep_allowed | xpts | xpts_diff | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 684 | 684.000000 | 684.000000 | 684 | 684.000000 | 684.000000 | 684.000000 | 684.000000 | 684.000000 | 684.000000 | ... | 684.000000 | 684.000000 | 684.000000 | 6.840000e+02 | 684.000000 | 684.000000 | 684.000000 | 684.000000 | 684.000000 | 684.000000 |
unique | 6 | NaN | NaN | 168 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
top | La_liga | NaN | NaN | Barcelona | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
freq | 120 | NaN | NaN | 6 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
mean | NaN | 2016.500000 | 10.061404 | NaN | 35.935673 | 13.434211 | 9.067251 | 13.434211 | 48.190058 | 48.190058 | ... | 47.064744 | -1.125315 | 42.902596 | -4.155221e-17 | 10.911784 | 10.911772 | 208.676901 | 208.676901 | 49.539598 | 0.169715 |
std | NaN | 1.709075 | 5.580165 | NaN | 3.203487 | 5.880962 | 2.941824 | 5.510278 | 17.605374 | 13.866509 | ... | 11.781399 | 6.663632 | 11.002013 | 1.929269e+01 | 2.521398 | 3.301410 | 83.888073 | 54.713624 | 13.559213 | 7.156998 |
min | NaN | 2014.000000 | 1.000000 | NaN | 27.000000 | 2.000000 | 2.000000 | 1.000000 | 13.000000 | 15.000000 | ... | 16.838674 | -29.175087 | 16.084399 | -4.220877e+01 | 5.683535 | 4.394458 | 76.000000 | 83.000000 | 17.907700 | -24.721600 |
25% | NaN | 2015.000000 | 5.000000 | NaN | 34.000000 | 9.000000 | 7.000000 | 9.000000 | 36.000000 | 38.000000 | ... | 38.916186 | -5.698828 | 35.474606 | -1.325816e+01 | 9.090617 | 8.809866 | 151.750000 | 170.000000 | 39.466550 | -4.498400 |
50% | NaN | 2016.500000 | 10.000000 | NaN | 38.000000 | 12.000000 | 9.000000 | 14.000000 | 45.000000 | 48.000000 | ... | 47.310924 | -0.918895 | 43.031911 | -3.127901e+00 | 10.562543 | 10.347047 | 188.000000 | 205.000000 | 47.102100 | 0.116050 |
75% | NaN | 2018.000000 | 15.000000 | NaN | 38.000000 | 16.000000 | 11.000000 | 17.000000 | 56.000000 | 58.000000 | ... | 54.834899 | 3.381834 | 50.263465 | 9.740049e+00 | 12.434874 | 12.187434 | 242.000000 | 246.250000 | 56.942025 | 4.912775 |
max | NaN | 2019.000000 | 20.000000 | NaN | 38.000000 | 32.000000 | 18.000000 | 29.000000 | 118.000000 | 94.000000 | ... | 88.432186 | 16.370737 | 78.535447 | 7.304931e+01 | 21.896752 | 30.468113 | 582.000000 | 375.000000 | 94.380000 | 23.047500 |
11 rows × 24 columns
!wc -l understat.csv
685 understat.csv
!head -n 5 understat.csv
,,position,team,matches,wins,draws,loses,scored,missed,pts,xG,xG_diff,npxG,xGA,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff La_liga,2014,1,Barcelona,38,30,4,4,110,21,94,102.98015200000002,-7.019847999999982,97.77721200000002,28.44429270000001,7.444292700000009,24.727906700000005,73.04930530000001,5.683534703382723,16.367592989090525,489,114,94.08129999999998,0.0812999999999846 La_liga,2014,2,Real Madrid,38,30,2,6,118,38,92,95.76624299999999,-22.23375700000001,86.10389499999998,42.607198000000004,4.607198000000004,38.890805,47.213090000000015,10.209085456325049,12.929510106152211,351,153,81.7489,-10.251099999999994 La_liga,2014,3,Atletico Madrid,38,23,9,6,67,29,78,57.047670000000004,-9.952329999999996,52.588007999999995,29.069107100000004,0.06910710000000364,26.839271100000005,25.748736900000008,8.982028430893806,9.237090640679776,197,123,73.13530000000003,-4.864699999999971 La_liga,2014,4,Valencia,38,22,11,5,70,32,77,55.06250000000001,-14.937499999999993,49.703978,39.392571999999994,7.392571999999994,33.44647700000001,16.257500999999998,8.709827299105736,7.870224725817145,203,172,63.7068,-13.293199999999999
!cut -f 5 -d "," understat.csv | sort | uniq -c
2 27 18 28 96 30 108 34 10 37 450 38 1 matches
! grep -P "^$" -n understat.csv