{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ad6b7dc7", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: kaggle in /home/osboxes/.local/lib/python3.8/site-packages (1.5.12)\n", "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)\n", "Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)\n", "Requirement already satisfied: python-dateutil in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (2.8.2)\n", "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)\n", "Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0)\n", "Requirement already satisfied: tqdm in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (4.63.0)\n", "Requirement already satisfied: python-slugify in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n", "Requirement already satisfied: text-unidecode>=1.3 in /home/osboxes/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n" ] } ], "source": [ "!pip install --user kaggle" ] }, { "cell_type": "code", "execution_count": 2, "id": "4ab2c14f", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pandas in /home/osboxes/.local/lib/python3.8/site-packages (1.4.1)\r\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (2.8.2)\r\n", "Requirement already satisfied: pytz>=2020.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (2022.1)\r\n", "Requirement already satisfied: numpy>=1.18.5; platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\" in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (1.22.3)\r\n", "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->pandas) (1.14.0)\r\n" ] } ], "source": [ "!pip install --user pandas" ] }, { "cell_type": "code", "execution_count": 4, "id": "c0597767", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mkdir: cannot create directory ‘/home/osboxes/.kaggle’: File exists\r\n" ] } ], "source": [ "!mkdir ~/.kaggle" ] }, { "cell_type": "code", "execution_count": 6, "id": "2465b1e9", "metadata": {}, "outputs": [], "source": [ "!cp /home/osboxes/Downloads/kaggle.json /home/osboxes/.kaggle/kaggle.json" ] }, { "cell_type": "code", "execution_count": 7, "id": "faa7e821", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/osboxes/.kaggle/kaggle.json'\n", "Downloading extended-football-stats-for-european-leagues-xg.zip to /home/osboxes/jupyter_dir/notebooks/IUM\n", " 73%|███████████████████████████▋ | 1.00M/1.37M [00:00<00:00, 4.92MB/s]\n", "100%|██████████████████████████████████████| 1.37M/1.37M [00:00<00:00, 6.55MB/s]\n" ] } ], "source": [ "!kaggle datasets download -d slehkyi/extended-football-stats-for-european-leagues-xg" ] }, { "cell_type": "code", "execution_count": 12, "id": "d5b18a91", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: extended-football-stats-for-european-leagues-xg.zip\n", " inflating: understat.com.csv \n", " inflating: understat_per_game.csv \n" ] } ], "source": [ "!unzip -o extended-football-stats-for-european-leagues-xg.zip" ] }, { "cell_type": "code", "execution_count": 9, "id": "0283db51", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting seaborn\n", " Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)\n", "\u001b[K |████████████████████████████████| 292 kB 2.0 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: pandas>=0.23 in /home/osboxes/.local/lib/python3.8/site-packages (from seaborn) (1.4.1)\n", "Requirement already satisfied: numpy>=1.15 in /home/osboxes/.local/lib/python3.8/site-packages (from seaborn) (1.22.3)\n", "Collecting matplotlib>=2.2\n", " Downloading matplotlib-3.5.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.3 MB)\n", "\u001b[K |████████████████████████████████| 11.3 MB 5.7 MB/s eta 0:00:01 |██████████████████████ | 7.7 MB 5.7 MB/s eta 0:00:01 |█████████████████████████ | 8.8 MB 5.7 MB/s eta 0:00:01\n", "\u001b[?25hCollecting scipy>=1.0\n", " Downloading scipy-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.6 MB)\n", "\u001b[K |████████████████████████████████| 41.6 MB 27 kB/s eta 0:00:011 |███▊ | 4.9 MB 5.0 MB/s eta 0:00:08 |██████ | 7.8 MB 7.9 MB/s eta 0:00:05 |██████████████████ | 23.3 MB 4.5 MB/s eta 0:00:05 |██████████████████▎ | 23.7 MB 4.5 MB/s eta 0:00:04 |█████████████████████▍ | 27.8 MB 10.0 MB/s eta 0:00:02 |███████████████████████████▏ | 35.3 MB 7.2 MB/s eta 0:00:01 |████████████████████████████▍ | 36.9 MB 7.2 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas>=0.23->seaborn) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas>=0.23->seaborn) (2022.1)\n", "Collecting fonttools>=4.22.0\n", " Downloading fonttools-4.31.1-py3-none-any.whl (899 kB)\n", "\u001b[K |████████████████████████████████| 899 kB 3.2 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: pillow>=6.2.0 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (7.0.0)\n", "Requirement already satisfied: pyparsing>=2.2.1 in /home/osboxes/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (3.0.7)\n", "Collecting kiwisolver>=1.0.1\n", " Downloading kiwisolver-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.2 MB)\n", "\u001b[K |████████████████████████████████| 1.2 MB 12.4 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /home/osboxes/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (21.3)\n", "Collecting cycler>=0.10\n", " Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)\n", "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->pandas>=0.23->seaborn) (1.14.0)\n", "Installing collected packages: fonttools, kiwisolver, cycler, matplotlib, scipy, seaborn\n", "Successfully installed cycler-0.11.0 fonttools-4.31.1 kiwisolver-1.4.0 matplotlib-3.5.1 scipy-1.8.0 seaborn-0.11.2\n" ] } ], "source": [ "!pip install --user seaborn" ] }, { "cell_type": "code", "execution_count": 13, "id": "2cd1e392", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ",,position,team,matches,wins,draws,loses,scored,missed,pts,xG,xG_diff,npxG,xGA,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff\r\n", "La_liga,2014,1,Barcelona,38,30,4,4,110,21,94,102.98015200000002,-7.019847999999982,97.77721200000002,28.44429270000001,7.444292700000009,24.727906700000005,73.04930530000001,5.683534703382723,16.367592989090525,489,114,94.08129999999998,0.0812999999999846\r\n", "La_liga,2014,2,Real Madrid,38,30,2,6,118,38,92,95.76624299999999,-22.23375700000001,86.10389499999998,42.607198000000004,4.607198000000004,38.890805,47.213090000000015,10.209085456325049,12.929510106152211,351,153,81.7489,-10.251099999999994\r\n", "La_liga,2014,3,Atletico Madrid,38,23,9,6,67,29,78,57.047670000000004,-9.952329999999996,52.588007999999995,29.069107100000004,0.06910710000000364,26.839271100000005,25.748736900000008,8.982028430893806,9.237090640679776,197,123,73.13530000000003,-4.864699999999971\r\n", "La_liga,2014,4,Valencia,38,22,11,5,70,32,77,55.06250000000001,-14.937499999999993,49.703978,39.392571999999994,7.392571999999994,33.44647700000001,16.257500999999998,8.709827299105736,7.870224725817145,203,172,63.7068,-13.293199999999999\r\n" ] } ], "source": [ "!head -n 5 understat.csv" ] }, { "cell_type": "code", "execution_count": 10, "id": "12a3ddce", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>league</th>\n", " <th>year</th>\n", " <th>position</th>\n", " <th>team</th>\n", " <th>matches</th>\n", " <th>wins</th>\n", " <th>draws</th>\n", " <th>loses</th>\n", " <th>scored</th>\n", " <th>missed</th>\n", " <th>...</th>\n", " <th>xGA</th>\n", " <th>xGA_diff</th>\n", " <th>npxGA</th>\n", " <th>npxGD</th>\n", " <th>ppda_coef</th>\n", " <th>oppda_coef</th>\n", " <th>deep</th>\n", " <th>deep_allowed</th>\n", " <th>xpts</th>\n", " <th>xpts_diff</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>La_liga</td>\n", " <td>2014</td>\n", " <td>1</td>\n", " <td>Barcelona</td>\n", " <td>38</td>\n", " <td>30</td>\n", " <td>4</td>\n", " <td>4</td>\n", " <td>110</td>\n", " <td>21</td>\n", " <td>...</td>\n", " <td>28.444293</td>\n", " <td>7.444293</td>\n", " <td>24.727907</td>\n", " <td>73.049305</td>\n", " <td>5.683535</td>\n", " <td>16.367593</td>\n", " <td>489</td>\n", " <td>114</td>\n", " <td>94.0813</td>\n", " <td>0.0813</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>La_liga</td>\n", " <td>2014</td>\n", " <td>2</td>\n", " <td>Real Madrid</td>\n", " <td>38</td>\n", " <td>30</td>\n", " <td>2</td>\n", " <td>6</td>\n", " <td>118</td>\n", " <td>38</td>\n", " <td>...</td>\n", " <td>42.607198</td>\n", " <td>4.607198</td>\n", " <td>38.890805</td>\n", " <td>47.213090</td>\n", " <td>10.209085</td>\n", " <td>12.929510</td>\n", " <td>351</td>\n", " <td>153</td>\n", " <td>81.7489</td>\n", " <td>-10.2511</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>La_liga</td>\n", " <td>2014</td>\n", " <td>3</td>\n", " <td>Atletico Madrid</td>\n", " <td>38</td>\n", " <td>23</td>\n", " <td>9</td>\n", " <td>6</td>\n", " <td>67</td>\n", " <td>29</td>\n", " <td>...</td>\n", " <td>29.069107</td>\n", " <td>0.069107</td>\n", " <td>26.839271</td>\n", " <td>25.748737</td>\n", " <td>8.982028</td>\n", " <td>9.237091</td>\n", " <td>197</td>\n", " <td>123</td>\n", " <td>73.1353</td>\n", " <td>-4.8647</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>La_liga</td>\n", " <td>2014</td>\n", " <td>4</td>\n", " <td>Valencia</td>\n", " <td>38</td>\n", " <td>22</td>\n", " <td>11</td>\n", " <td>5</td>\n", " <td>70</td>\n", " <td>32</td>\n", " <td>...</td>\n", " <td>39.392572</td>\n", " <td>7.392572</td>\n", " <td>33.446477</td>\n", " <td>16.257501</td>\n", " <td>8.709827</td>\n", " <td>7.870225</td>\n", " <td>203</td>\n", " <td>172</td>\n", " <td>63.7068</td>\n", " <td>-13.2932</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>La_liga</td>\n", " <td>2014</td>\n", " <td>5</td>\n", " <td>Sevilla</td>\n", " <td>38</td>\n", " <td>23</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>71</td>\n", " <td>45</td>\n", " <td>...</td>\n", " <td>47.862742</td>\n", " <td>2.862742</td>\n", " <td>41.916529</td>\n", " <td>20.178070</td>\n", " <td>8.276148</td>\n", " <td>9.477805</td>\n", " <td>305</td>\n", " <td>168</td>\n", " <td>67.3867</td>\n", " <td>-8.6133</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>679</th>\n", " <td>RFPL</td>\n", " <td>2019</td>\n", " <td>12</td>\n", " <td>PFC Sochi</td>\n", " <td>30</td>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>13</td>\n", " <td>40</td>\n", " <td>39</td>\n", " <td>...</td>\n", " <td>38.850259</td>\n", " <td>-0.149741</td>\n", " <td>32.780898</td>\n", " <td>-0.096048</td>\n", " <td>12.838079</td>\n", " <td>10.562327</td>\n", " <td>175</td>\n", " <td>206</td>\n", " <td>38.6587</td>\n", " <td>5.6587</td>\n", " </tr>\n", " <tr>\n", " <th>680</th>\n", " <td>RFPL</td>\n", " <td>2019</td>\n", " <td>13</td>\n", " <td>FK Akhmat</td>\n", " <td>30</td>\n", " <td>7</td>\n", " <td>10</td>\n", " <td>13</td>\n", " <td>27</td>\n", " <td>46</td>\n", " <td>...</td>\n", " <td>40.626196</td>\n", " <td>-5.373804</td>\n", " <td>38.363370</td>\n", " <td>-10.495864</td>\n", " <td>11.199502</td>\n", " <td>10.806357</td>\n", " <td>124</td>\n", " <td>206</td>\n", " <td>36.5424</td>\n", " <td>5.5424</td>\n", " </tr>\n", " <tr>\n", " <th>681</th>\n", " <td>RFPL</td>\n", " <td>2019</td>\n", " <td>14</td>\n", " <td>Krylya Sovetov Samara</td>\n", " <td>30</td>\n", " <td>8</td>\n", " <td>7</td>\n", " <td>15</td>\n", " <td>33</td>\n", " <td>40</td>\n", " <td>...</td>\n", " <td>42.980693</td>\n", " <td>2.980693</td>\n", " <td>37.550114</td>\n", " <td>-7.777201</td>\n", " <td>11.949903</td>\n", " <td>10.080858</td>\n", " <td>103</td>\n", " <td>215</td>\n", " <td>36.3363</td>\n", " <td>5.3363</td>\n", " </tr>\n", " <tr>\n", " <th>682</th>\n", " <td>RFPL</td>\n", " <td>2019</td>\n", " <td>15</td>\n", " <td>FC Tambov</td>\n", " <td>30</td>\n", " <td>9</td>\n", " <td>4</td>\n", " <td>17</td>\n", " <td>37</td>\n", " <td>41</td>\n", " <td>...</td>\n", " <td>39.747938</td>\n", " <td>-1.252062</td>\n", " <td>34.468003</td>\n", " <td>-12.231948</td>\n", " <td>14.666049</td>\n", " <td>9.192768</td>\n", " <td>150</td>\n", " <td>270</td>\n", " <td>29.2413</td>\n", " <td>-1.7587</td>\n", " </tr>\n", " <tr>\n", " <th>683</th>\n", " <td>RFPL</td>\n", " <td>2019</td>\n", " <td>16</td>\n", " <td>FC Orenburg</td>\n", " <td>30</td>\n", " <td>7</td>\n", " <td>6</td>\n", " <td>17</td>\n", " <td>28</td>\n", " <td>52</td>\n", " <td>...</td>\n", " <td>37.169797</td>\n", " <td>-14.830203</td>\n", " <td>32.644130</td>\n", " <td>0.201339</td>\n", " <td>12.830908</td>\n", " <td>9.464581</td>\n", " <td>153</td>\n", " <td>215</td>\n", " <td>39.2364</td>\n", " <td>12.2364</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>684 rows × 24 columns</p>\n", "</div>" ], "text/plain": [ " league year position team matches wins draws \\\n", "0 La_liga 2014 1 Barcelona 38 30 4 \n", "1 La_liga 2014 2 Real Madrid 38 30 2 \n", "2 La_liga 2014 3 Atletico Madrid 38 23 9 \n", "3 La_liga 2014 4 Valencia 38 22 11 \n", "4 La_liga 2014 5 Sevilla 38 23 7 \n", ".. ... ... ... ... ... ... ... \n", "679 RFPL 2019 12 PFC Sochi 30 8 9 \n", "680 RFPL 2019 13 FK Akhmat 30 7 10 \n", "681 RFPL 2019 14 Krylya Sovetov Samara 30 8 7 \n", "682 RFPL 2019 15 FC Tambov 30 9 4 \n", "683 RFPL 2019 16 FC Orenburg 30 7 6 \n", "\n", " loses scored missed ... xGA xGA_diff npxGA npxGD \\\n", "0 4 110 21 ... 28.444293 7.444293 24.727907 73.049305 \n", "1 6 118 38 ... 42.607198 4.607198 38.890805 47.213090 \n", "2 6 67 29 ... 29.069107 0.069107 26.839271 25.748737 \n", "3 5 70 32 ... 39.392572 7.392572 33.446477 16.257501 \n", "4 8 71 45 ... 47.862742 2.862742 41.916529 20.178070 \n", ".. ... ... ... ... ... ... ... ... \n", "679 13 40 39 ... 38.850259 -0.149741 32.780898 -0.096048 \n", "680 13 27 46 ... 40.626196 -5.373804 38.363370 -10.495864 \n", "681 15 33 40 ... 42.980693 2.980693 37.550114 -7.777201 \n", "682 17 37 41 ... 39.747938 -1.252062 34.468003 -12.231948 \n", "683 17 28 52 ... 37.169797 -14.830203 32.644130 0.201339 \n", "\n", " ppda_coef oppda_coef deep deep_allowed xpts xpts_diff \n", "0 5.683535 16.367593 489 114 94.0813 0.0813 \n", "1 10.209085 12.929510 351 153 81.7489 -10.2511 \n", "2 8.982028 9.237091 197 123 73.1353 -4.8647 \n", "3 8.709827 7.870225 203 172 63.7068 -13.2932 \n", "4 8.276148 9.477805 305 168 67.3867 -8.6133 \n", ".. ... ... ... ... ... ... \n", "679 12.838079 10.562327 175 206 38.6587 5.6587 \n", "680 11.199502 10.806357 124 206 36.5424 5.5424 \n", "681 11.949903 10.080858 103 215 36.3363 5.3363 \n", "682 14.666049 9.192768 150 270 29.2413 -1.7587 \n", "683 12.830908 9.464581 153 215 39.2364 12.2364 \n", "\n", "[684 rows x 24 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "understat = pd.read_csv('understat.csv')\n", "understat" ] }, { "cell_type": "code", "execution_count": 11, "id": "e969975f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>league</th>\n", " <th>year</th>\n", " <th>position</th>\n", " <th>team</th>\n", " <th>matches</th>\n", " <th>wins</th>\n", " <th>draws</th>\n", " <th>loses</th>\n", " <th>scored</th>\n", " <th>missed</th>\n", " <th>...</th>\n", " <th>xGA</th>\n", " <th>xGA_diff</th>\n", " <th>npxGA</th>\n", " <th>npxGD</th>\n", " <th>ppda_coef</th>\n", " <th>oppda_coef</th>\n", " <th>deep</th>\n", " <th>deep_allowed</th>\n", " <th>xpts</th>\n", " <th>xpts_diff</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>684</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>...</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>6.840000e+02</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " <td>684.000000</td>\n", " </tr>\n", " <tr>\n", " <th>unique</th>\n", " <td>6</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>168</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>top</th>\n", " <td>La_liga</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>Barcelona</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>freq</th>\n", " <td>120</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>6</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>NaN</td>\n", " <td>2016.500000</td>\n", " <td>10.061404</td>\n", " <td>NaN</td>\n", " <td>35.935673</td>\n", " <td>13.434211</td>\n", " <td>9.067251</td>\n", " <td>13.434211</td>\n", " <td>48.190058</td>\n", " <td>48.190058</td>\n", " <td>...</td>\n", " <td>47.064744</td>\n", " <td>-1.125315</td>\n", " <td>42.902596</td>\n", " <td>-4.155221e-17</td>\n", " <td>10.911784</td>\n", " <td>10.911772</td>\n", " <td>208.676901</td>\n", " <td>208.676901</td>\n", " <td>49.539598</td>\n", " <td>0.169715</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>NaN</td>\n", " <td>1.709075</td>\n", " <td>5.580165</td>\n", " <td>NaN</td>\n", " <td>3.203487</td>\n", " <td>5.880962</td>\n", " <td>2.941824</td>\n", " <td>5.510278</td>\n", " <td>17.605374</td>\n", " <td>13.866509</td>\n", " <td>...</td>\n", " <td>11.781399</td>\n", " <td>6.663632</td>\n", " <td>11.002013</td>\n", " <td>1.929269e+01</td>\n", " <td>2.521398</td>\n", " <td>3.301410</td>\n", " <td>83.888073</td>\n", " <td>54.713624</td>\n", " <td>13.559213</td>\n", " <td>7.156998</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>NaN</td>\n", " <td>2014.000000</td>\n", " <td>1.000000</td>\n", " <td>NaN</td>\n", " <td>27.000000</td>\n", " <td>2.000000</td>\n", " <td>2.000000</td>\n", " <td>1.000000</td>\n", " <td>13.000000</td>\n", " <td>15.000000</td>\n", " <td>...</td>\n", " <td>16.838674</td>\n", " <td>-29.175087</td>\n", " <td>16.084399</td>\n", " <td>-4.220877e+01</td>\n", " <td>5.683535</td>\n", " <td>4.394458</td>\n", " <td>76.000000</td>\n", " <td>83.000000</td>\n", " <td>17.907700</td>\n", " <td>-24.721600</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>NaN</td>\n", " <td>2015.000000</td>\n", " <td>5.000000</td>\n", " <td>NaN</td>\n", " <td>34.000000</td>\n", " <td>9.000000</td>\n", " <td>7.000000</td>\n", " <td>9.000000</td>\n", " <td>36.000000</td>\n", " <td>38.000000</td>\n", " <td>...</td>\n", " <td>38.916186</td>\n", " <td>-5.698828</td>\n", " <td>35.474606</td>\n", " <td>-1.325816e+01</td>\n", " <td>9.090617</td>\n", " <td>8.809866</td>\n", " <td>151.750000</td>\n", " <td>170.000000</td>\n", " <td>39.466550</td>\n", " <td>-4.498400</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>NaN</td>\n", " <td>2016.500000</td>\n", " <td>10.000000</td>\n", " <td>NaN</td>\n", " <td>38.000000</td>\n", " <td>12.000000</td>\n", " <td>9.000000</td>\n", " <td>14.000000</td>\n", " <td>45.000000</td>\n", " <td>48.000000</td>\n", " <td>...</td>\n", " <td>47.310924</td>\n", " <td>-0.918895</td>\n", " <td>43.031911</td>\n", " <td>-3.127901e+00</td>\n", " <td>10.562543</td>\n", " <td>10.347047</td>\n", " <td>188.000000</td>\n", " <td>205.000000</td>\n", " <td>47.102100</td>\n", " <td>0.116050</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>NaN</td>\n", " <td>2018.000000</td>\n", " <td>15.000000</td>\n", " <td>NaN</td>\n", " <td>38.000000</td>\n", " <td>16.000000</td>\n", " <td>11.000000</td>\n", " <td>17.000000</td>\n", " <td>56.000000</td>\n", " <td>58.000000</td>\n", " <td>...</td>\n", " <td>54.834899</td>\n", " <td>3.381834</td>\n", " <td>50.263465</td>\n", " <td>9.740049e+00</td>\n", " <td>12.434874</td>\n", " <td>12.187434</td>\n", " <td>242.000000</td>\n", " <td>246.250000</td>\n", " <td>56.942025</td>\n", " <td>4.912775</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>NaN</td>\n", " <td>2019.000000</td>\n", " <td>20.000000</td>\n", " <td>NaN</td>\n", " <td>38.000000</td>\n", " <td>32.000000</td>\n", " <td>18.000000</td>\n", " <td>29.000000</td>\n", " <td>118.000000</td>\n", " <td>94.000000</td>\n", " <td>...</td>\n", " <td>88.432186</td>\n", " <td>16.370737</td>\n", " <td>78.535447</td>\n", " <td>7.304931e+01</td>\n", " <td>21.896752</td>\n", " <td>30.468113</td>\n", " <td>582.000000</td>\n", " <td>375.000000</td>\n", " <td>94.380000</td>\n", " <td>23.047500</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>11 rows × 24 columns</p>\n", "</div>" ], "text/plain": [ " league year position team matches wins \\\n", "count 684 684.000000 684.000000 684 684.000000 684.000000 \n", "unique 6 NaN NaN 168 NaN NaN \n", "top La_liga NaN NaN Barcelona NaN NaN \n", "freq 120 NaN NaN 6 NaN NaN \n", "mean NaN 2016.500000 10.061404 NaN 35.935673 13.434211 \n", "std NaN 1.709075 5.580165 NaN 3.203487 5.880962 \n", "min NaN 2014.000000 1.000000 NaN 27.000000 2.000000 \n", "25% NaN 2015.000000 5.000000 NaN 34.000000 9.000000 \n", "50% NaN 2016.500000 10.000000 NaN 38.000000 12.000000 \n", "75% NaN 2018.000000 15.000000 NaN 38.000000 16.000000 \n", "max NaN 2019.000000 20.000000 NaN 38.000000 32.000000 \n", "\n", " draws loses scored missed ... xGA \\\n", "count 684.000000 684.000000 684.000000 684.000000 ... 684.000000 \n", "unique NaN NaN NaN NaN ... NaN \n", "top NaN NaN NaN NaN ... NaN \n", "freq NaN NaN NaN NaN ... NaN \n", "mean 9.067251 13.434211 48.190058 48.190058 ... 47.064744 \n", "std 2.941824 5.510278 17.605374 13.866509 ... 11.781399 \n", "min 2.000000 1.000000 13.000000 15.000000 ... 16.838674 \n", "25% 7.000000 9.000000 36.000000 38.000000 ... 38.916186 \n", "50% 9.000000 14.000000 45.000000 48.000000 ... 47.310924 \n", "75% 11.000000 17.000000 56.000000 58.000000 ... 54.834899 \n", "max 18.000000 29.000000 118.000000 94.000000 ... 88.432186 \n", "\n", " xGA_diff npxGA npxGD ppda_coef oppda_coef \\\n", "count 684.000000 684.000000 6.840000e+02 684.000000 684.000000 \n", "unique NaN NaN NaN NaN NaN \n", "top NaN NaN NaN NaN NaN \n", "freq NaN NaN NaN NaN NaN \n", "mean -1.125315 42.902596 -4.155221e-17 10.911784 10.911772 \n", "std 6.663632 11.002013 1.929269e+01 2.521398 3.301410 \n", "min -29.175087 16.084399 -4.220877e+01 5.683535 4.394458 \n", "25% -5.698828 35.474606 -1.325816e+01 9.090617 8.809866 \n", "50% -0.918895 43.031911 -3.127901e+00 10.562543 10.347047 \n", "75% 3.381834 50.263465 9.740049e+00 12.434874 12.187434 \n", "max 16.370737 78.535447 7.304931e+01 21.896752 30.468113 \n", "\n", " deep deep_allowed xpts xpts_diff \n", "count 684.000000 684.000000 684.000000 684.000000 \n", "unique NaN NaN NaN NaN \n", "top NaN NaN NaN NaN \n", "freq NaN NaN NaN NaN \n", "mean 208.676901 208.676901 49.539598 0.169715 \n", "std 83.888073 54.713624 13.559213 7.156998 \n", "min 76.000000 83.000000 17.907700 -24.721600 \n", "25% 151.750000 170.000000 39.466550 -4.498400 \n", "50% 188.000000 205.000000 47.102100 0.116050 \n", "75% 242.000000 246.250000 56.942025 4.912775 \n", "max 582.000000 375.000000 94.380000 23.047500 \n", "\n", "[11 rows x 24 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "understat.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 6, "id": "136925a1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "685 understat.csv\r\n" ] } ], "source": [ "!wc -l understat.csv" ] }, { "cell_type": "code", "execution_count": 7, "id": "9478eaa2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ",,position,team,matches,wins,draws,loses,scored,missed,pts,xG,xG_diff,npxG,xGA,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff\r\n", "La_liga,2014,1,Barcelona,38,30,4,4,110,21,94,102.98015200000002,-7.019847999999982,97.77721200000002,28.44429270000001,7.444292700000009,24.727906700000005,73.04930530000001,5.683534703382723,16.367592989090525,489,114,94.08129999999998,0.0812999999999846\r\n", "La_liga,2014,2,Real Madrid,38,30,2,6,118,38,92,95.76624299999999,-22.23375700000001,86.10389499999998,42.607198000000004,4.607198000000004,38.890805,47.213090000000015,10.209085456325049,12.929510106152211,351,153,81.7489,-10.251099999999994\r\n", "La_liga,2014,3,Atletico Madrid,38,23,9,6,67,29,78,57.047670000000004,-9.952329999999996,52.588007999999995,29.069107100000004,0.06910710000000364,26.839271100000005,25.748736900000008,8.982028430893806,9.237090640679776,197,123,73.13530000000003,-4.864699999999971\r\n", "La_liga,2014,4,Valencia,38,22,11,5,70,32,77,55.06250000000001,-14.937499999999993,49.703978,39.392571999999994,7.392571999999994,33.44647700000001,16.257500999999998,8.709827299105736,7.870224725817145,203,172,63.7068,-13.293199999999999\r\n" ] } ], "source": [ "!head -n 5 understat.csv" ] }, { "cell_type": "code", "execution_count": 8, "id": "60448bf7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2 27\r\n", " 18 28\r\n", " 96 30\r\n", " 108 34\r\n", " 10 37\r\n", " 450 38\r\n", " 1 matches\r\n" ] } ], "source": [ "!cut -f 5 -d \",\" understat.csv | sort | uniq -c" ] }, { "cell_type": "code", "execution_count": 9, "id": "f7668054", "metadata": {}, "outputs": [], "source": [ "! grep -P \"^$\" -n understat.csv" ] }, { "cell_type": "code", "execution_count": null, "id": "cba7e932", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }