{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ad6b7dc7",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: kaggle in /home/osboxes/.local/lib/python3.8/site-packages (1.5.12)\n",
      "Requirement already satisfied: six>=1.10 in /usr/lib/python3/dist-packages (from kaggle) (1.14.0)\n",
      "Requirement already satisfied: urllib3 in /usr/lib/python3/dist-packages (from kaggle) (1.25.8)\n",
      "Requirement already satisfied: python-dateutil in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (2.8.2)\n",
      "Requirement already satisfied: certifi in /usr/lib/python3/dist-packages (from kaggle) (2019.11.28)\n",
      "Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from kaggle) (2.22.0)\n",
      "Requirement already satisfied: tqdm in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (4.63.0)\n",
      "Requirement already satisfied: python-slugify in /home/osboxes/.local/lib/python3.8/site-packages (from kaggle) (6.1.1)\n",
      "Requirement already satisfied: text-unidecode>=1.3 in /home/osboxes/.local/lib/python3.8/site-packages (from python-slugify->kaggle) (1.3)\n"
     ]
    }
   ],
   "source": [
    "%pip install --user kaggle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4ab2c14f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pandas in /home/osboxes/.local/lib/python3.8/site-packages (1.4.1)\r\n",
      "Requirement already satisfied: python-dateutil>=2.8.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (2.8.2)\r\n",
      "Requirement already satisfied: pytz>=2020.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (2022.1)\r\n",
      "Requirement already satisfied: numpy>=1.18.5; platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\" in /home/osboxes/.local/lib/python3.8/site-packages (from pandas) (1.22.3)\r\n",
      "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->pandas) (1.14.0)\r\n"
     ]
    }
   ],
   "source": [
    "%pip install --user pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c0597767",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: cannot create directory ‘/home/osboxes/.kaggle’: File exists\r\n"
     ]
    }
   ],
   "source": [
    "# -p makes the cell safe to re-run: no error when the directory already exists.\n",
    "!mkdir -p ~/.kaggle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2465b1e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the Kaggle API token for the current user (no hardcoded home directory).\n",
    "!cp ~/Downloads/kaggle.json ~/.kaggle/kaggle.json\n",
    "# Restrict the key to its owner; the kaggle CLI warns when others can read it.\n",
    "!chmod 600 ~/.kaggle/kaggle.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "faa7e821",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/osboxes/.kaggle/kaggle.json'\n",
      "Downloading extended-football-stats-for-european-leagues-xg.zip to /home/osboxes/jupyter_dir/notebooks/IUM\n",
      " 73%|███████████████████████████▋          | 1.00M/1.37M [00:00<00:00, 4.92MB/s]\n",
      "100%|██████████████████████████████████████| 1.37M/1.37M [00:00<00:00, 6.55MB/s]\n"
     ]
    }
   ],
   "source": [
    "!kaggle datasets download slehkyi/extended-football-stats-for-european-leagues-xg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d5b18a91",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Archive:  extended-football-stats-for-european-leagues-xg.zip\n",
      "  inflating: understat.com.csv       \n",
      "  inflating: understat_per_game.csv  \n"
     ]
    }
   ],
   "source": [
    "!unzip -o extended-football-stats-for-european-leagues-xg.zip\n",
    "# The archive ships understat.com.csv, while the following cells read understat.csv.\n",
    "!cp understat.com.csv understat.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "0283db51",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting seaborn\n",
      "  Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)\n",
      "\u001b[K     |████████████████████████████████| 292 kB 2.0 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: pandas>=0.23 in /home/osboxes/.local/lib/python3.8/site-packages (from seaborn) (1.4.1)\n",
      "Requirement already satisfied: numpy>=1.15 in /home/osboxes/.local/lib/python3.8/site-packages (from seaborn) (1.22.3)\n",
      "Collecting matplotlib>=2.2\n",
      "  Downloading matplotlib-3.5.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.3 MB)\n",
      "\u001b[K     |████████████████████████████████| 11.3 MB 5.7 MB/s eta 0:00:01     |██████████████████████          | 7.7 MB 5.7 MB/s eta 0:00:01     |█████████████████████████       | 8.8 MB 5.7 MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting scipy>=1.0\n",
      "  Downloading scipy-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.6 MB)\n",
      "\u001b[K     |████████████████████████████████| 41.6 MB 27 kB/s  eta 0:00:011   |███▊                            | 4.9 MB 5.0 MB/s eta 0:00:08     |██████                          | 7.8 MB 7.9 MB/s eta 0:00:05     |██████████████████              | 23.3 MB 4.5 MB/s eta 0:00:05     |██████████████████▎             | 23.7 MB 4.5 MB/s eta 0:00:04     |█████████████████████▍          | 27.8 MB 10.0 MB/s eta 0:00:02     |███████████████████████████▏    | 35.3 MB 7.2 MB/s eta 0:00:01     |████████████████████████████▍   | 36.9 MB 7.2 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas>=0.23->seaborn) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /home/osboxes/.local/lib/python3.8/site-packages (from pandas>=0.23->seaborn) (2022.1)\n",
      "Collecting fonttools>=4.22.0\n",
      "  Downloading fonttools-4.31.1-py3-none-any.whl (899 kB)\n",
      "\u001b[K     |████████████████████████████████| 899 kB 3.2 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: pillow>=6.2.0 in /usr/lib/python3/dist-packages (from matplotlib>=2.2->seaborn) (7.0.0)\n",
      "Requirement already satisfied: pyparsing>=2.2.1 in /home/osboxes/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (3.0.7)\n",
      "Collecting kiwisolver>=1.0.1\n",
      "  Downloading kiwisolver-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.2 MB)\n",
      "\u001b[K     |████████████████████████████████| 1.2 MB 12.4 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /home/osboxes/.local/lib/python3.8/site-packages (from matplotlib>=2.2->seaborn) (21.3)\n",
      "Collecting cycler>=0.10\n",
      "  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)\n",
      "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->pandas>=0.23->seaborn) (1.14.0)\n",
      "Installing collected packages: fonttools, kiwisolver, cycler, matplotlib, scipy, seaborn\n",
      "Successfully installed cycler-0.11.0 fonttools-4.31.1 kiwisolver-1.4.0 matplotlib-3.5.1 scipy-1.8.0 seaborn-0.11.2\n"
     ]
    }
   ],
   "source": [
    "%pip install --user seaborn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2cd1e392",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ",,position,team,matches,wins,draws,loses,scored,missed,pts,xG,xG_diff,npxG,xGA,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff\r\n",
      "La_liga,2014,1,Barcelona,38,30,4,4,110,21,94,102.98015200000002,-7.019847999999982,97.77721200000002,28.44429270000001,7.444292700000009,24.727906700000005,73.04930530000001,5.683534703382723,16.367592989090525,489,114,94.08129999999998,0.0812999999999846\r\n",
      "La_liga,2014,2,Real Madrid,38,30,2,6,118,38,92,95.76624299999999,-22.23375700000001,86.10389499999998,42.607198000000004,4.607198000000004,38.890805,47.213090000000015,10.209085456325049,12.929510106152211,351,153,81.7489,-10.251099999999994\r\n",
      "La_liga,2014,3,Atletico Madrid,38,23,9,6,67,29,78,57.047670000000004,-9.952329999999996,52.588007999999995,29.069107100000004,0.06910710000000364,26.839271100000005,25.748736900000008,8.982028430893806,9.237090640679776,197,123,73.13530000000003,-4.864699999999971\r\n",
      "La_liga,2014,4,Valencia,38,22,11,5,70,32,77,55.06250000000001,-14.937499999999993,49.703978,39.392571999999994,7.392571999999994,33.44647700000001,16.257500999999998,8.709827299105736,7.870224725817145,203,172,63.7068,-13.293199999999999\r\n"
     ]
    }
   ],
   "source": [
    "!head --lines=5 understat.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "12a3ddce",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>league</th>\n",
       "      <th>year</th>\n",
       "      <th>position</th>\n",
       "      <th>team</th>\n",
       "      <th>matches</th>\n",
       "      <th>wins</th>\n",
       "      <th>draws</th>\n",
       "      <th>loses</th>\n",
       "      <th>scored</th>\n",
       "      <th>missed</th>\n",
       "      <th>...</th>\n",
       "      <th>xGA</th>\n",
       "      <th>xGA_diff</th>\n",
       "      <th>npxGA</th>\n",
       "      <th>npxGD</th>\n",
       "      <th>ppda_coef</th>\n",
       "      <th>oppda_coef</th>\n",
       "      <th>deep</th>\n",
       "      <th>deep_allowed</th>\n",
       "      <th>xpts</th>\n",
       "      <th>xpts_diff</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>La_liga</td>\n",
       "      <td>2014</td>\n",
       "      <td>1</td>\n",
       "      <td>Barcelona</td>\n",
       "      <td>38</td>\n",
       "      <td>30</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>110</td>\n",
       "      <td>21</td>\n",
       "      <td>...</td>\n",
       "      <td>28.444293</td>\n",
       "      <td>7.444293</td>\n",
       "      <td>24.727907</td>\n",
       "      <td>73.049305</td>\n",
       "      <td>5.683535</td>\n",
       "      <td>16.367593</td>\n",
       "      <td>489</td>\n",
       "      <td>114</td>\n",
       "      <td>94.0813</td>\n",
       "      <td>0.0813</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>La_liga</td>\n",
       "      <td>2014</td>\n",
       "      <td>2</td>\n",
       "      <td>Real Madrid</td>\n",
       "      <td>38</td>\n",
       "      <td>30</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>118</td>\n",
       "      <td>38</td>\n",
       "      <td>...</td>\n",
       "      <td>42.607198</td>\n",
       "      <td>4.607198</td>\n",
       "      <td>38.890805</td>\n",
       "      <td>47.213090</td>\n",
       "      <td>10.209085</td>\n",
       "      <td>12.929510</td>\n",
       "      <td>351</td>\n",
       "      <td>153</td>\n",
       "      <td>81.7489</td>\n",
       "      <td>-10.2511</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>La_liga</td>\n",
       "      <td>2014</td>\n",
       "      <td>3</td>\n",
       "      <td>Atletico Madrid</td>\n",
       "      <td>38</td>\n",
       "      <td>23</td>\n",
       "      <td>9</td>\n",
       "      <td>6</td>\n",
       "      <td>67</td>\n",
       "      <td>29</td>\n",
       "      <td>...</td>\n",
       "      <td>29.069107</td>\n",
       "      <td>0.069107</td>\n",
       "      <td>26.839271</td>\n",
       "      <td>25.748737</td>\n",
       "      <td>8.982028</td>\n",
       "      <td>9.237091</td>\n",
       "      <td>197</td>\n",
       "      <td>123</td>\n",
       "      <td>73.1353</td>\n",
       "      <td>-4.8647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>La_liga</td>\n",
       "      <td>2014</td>\n",
       "      <td>4</td>\n",
       "      <td>Valencia</td>\n",
       "      <td>38</td>\n",
       "      <td>22</td>\n",
       "      <td>11</td>\n",
       "      <td>5</td>\n",
       "      <td>70</td>\n",
       "      <td>32</td>\n",
       "      <td>...</td>\n",
       "      <td>39.392572</td>\n",
       "      <td>7.392572</td>\n",
       "      <td>33.446477</td>\n",
       "      <td>16.257501</td>\n",
       "      <td>8.709827</td>\n",
       "      <td>7.870225</td>\n",
       "      <td>203</td>\n",
       "      <td>172</td>\n",
       "      <td>63.7068</td>\n",
       "      <td>-13.2932</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>La_liga</td>\n",
       "      <td>2014</td>\n",
       "      <td>5</td>\n",
       "      <td>Sevilla</td>\n",
       "      <td>38</td>\n",
       "      <td>23</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>71</td>\n",
       "      <td>45</td>\n",
       "      <td>...</td>\n",
       "      <td>47.862742</td>\n",
       "      <td>2.862742</td>\n",
       "      <td>41.916529</td>\n",
       "      <td>20.178070</td>\n",
       "      <td>8.276148</td>\n",
       "      <td>9.477805</td>\n",
       "      <td>305</td>\n",
       "      <td>168</td>\n",
       "      <td>67.3867</td>\n",
       "      <td>-8.6133</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>679</th>\n",
       "      <td>RFPL</td>\n",
       "      <td>2019</td>\n",
       "      <td>12</td>\n",
       "      <td>PFC Sochi</td>\n",
       "      <td>30</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>13</td>\n",
       "      <td>40</td>\n",
       "      <td>39</td>\n",
       "      <td>...</td>\n",
       "      <td>38.850259</td>\n",
       "      <td>-0.149741</td>\n",
       "      <td>32.780898</td>\n",
       "      <td>-0.096048</td>\n",
       "      <td>12.838079</td>\n",
       "      <td>10.562327</td>\n",
       "      <td>175</td>\n",
       "      <td>206</td>\n",
       "      <td>38.6587</td>\n",
       "      <td>5.6587</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>680</th>\n",
       "      <td>RFPL</td>\n",
       "      <td>2019</td>\n",
       "      <td>13</td>\n",
       "      <td>FK Akhmat</td>\n",
       "      <td>30</td>\n",
       "      <td>7</td>\n",
       "      <td>10</td>\n",
       "      <td>13</td>\n",
       "      <td>27</td>\n",
       "      <td>46</td>\n",
       "      <td>...</td>\n",
       "      <td>40.626196</td>\n",
       "      <td>-5.373804</td>\n",
       "      <td>38.363370</td>\n",
       "      <td>-10.495864</td>\n",
       "      <td>11.199502</td>\n",
       "      <td>10.806357</td>\n",
       "      <td>124</td>\n",
       "      <td>206</td>\n",
       "      <td>36.5424</td>\n",
       "      <td>5.5424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>681</th>\n",
       "      <td>RFPL</td>\n",
       "      <td>2019</td>\n",
       "      <td>14</td>\n",
       "      <td>Krylya Sovetov Samara</td>\n",
       "      <td>30</td>\n",
       "      <td>8</td>\n",
       "      <td>7</td>\n",
       "      <td>15</td>\n",
       "      <td>33</td>\n",
       "      <td>40</td>\n",
       "      <td>...</td>\n",
       "      <td>42.980693</td>\n",
       "      <td>2.980693</td>\n",
       "      <td>37.550114</td>\n",
       "      <td>-7.777201</td>\n",
       "      <td>11.949903</td>\n",
       "      <td>10.080858</td>\n",
       "      <td>103</td>\n",
       "      <td>215</td>\n",
       "      <td>36.3363</td>\n",
       "      <td>5.3363</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>682</th>\n",
       "      <td>RFPL</td>\n",
       "      <td>2019</td>\n",
       "      <td>15</td>\n",
       "      <td>FC Tambov</td>\n",
       "      <td>30</td>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>17</td>\n",
       "      <td>37</td>\n",
       "      <td>41</td>\n",
       "      <td>...</td>\n",
       "      <td>39.747938</td>\n",
       "      <td>-1.252062</td>\n",
       "      <td>34.468003</td>\n",
       "      <td>-12.231948</td>\n",
       "      <td>14.666049</td>\n",
       "      <td>9.192768</td>\n",
       "      <td>150</td>\n",
       "      <td>270</td>\n",
       "      <td>29.2413</td>\n",
       "      <td>-1.7587</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>683</th>\n",
       "      <td>RFPL</td>\n",
       "      <td>2019</td>\n",
       "      <td>16</td>\n",
       "      <td>FC Orenburg</td>\n",
       "      <td>30</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>17</td>\n",
       "      <td>28</td>\n",
       "      <td>52</td>\n",
       "      <td>...</td>\n",
       "      <td>37.169797</td>\n",
       "      <td>-14.830203</td>\n",
       "      <td>32.644130</td>\n",
       "      <td>0.201339</td>\n",
       "      <td>12.830908</td>\n",
       "      <td>9.464581</td>\n",
       "      <td>153</td>\n",
       "      <td>215</td>\n",
       "      <td>39.2364</td>\n",
       "      <td>12.2364</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>684 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      league  year  position                   team  matches  wins  draws  \\\n",
       "0    La_liga  2014         1              Barcelona       38    30      4   \n",
       "1    La_liga  2014         2            Real Madrid       38    30      2   \n",
       "2    La_liga  2014         3        Atletico Madrid       38    23      9   \n",
       "3    La_liga  2014         4               Valencia       38    22     11   \n",
       "4    La_liga  2014         5                Sevilla       38    23      7   \n",
       "..       ...   ...       ...                    ...      ...   ...    ...   \n",
       "679     RFPL  2019        12              PFC Sochi       30     8      9   \n",
       "680     RFPL  2019        13              FK Akhmat       30     7     10   \n",
       "681     RFPL  2019        14  Krylya Sovetov Samara       30     8      7   \n",
       "682     RFPL  2019        15              FC Tambov       30     9      4   \n",
       "683     RFPL  2019        16            FC Orenburg       30     7      6   \n",
       "\n",
       "     loses  scored  missed  ...        xGA   xGA_diff      npxGA      npxGD  \\\n",
       "0        4     110      21  ...  28.444293   7.444293  24.727907  73.049305   \n",
       "1        6     118      38  ...  42.607198   4.607198  38.890805  47.213090   \n",
       "2        6      67      29  ...  29.069107   0.069107  26.839271  25.748737   \n",
       "3        5      70      32  ...  39.392572   7.392572  33.446477  16.257501   \n",
       "4        8      71      45  ...  47.862742   2.862742  41.916529  20.178070   \n",
       "..     ...     ...     ...  ...        ...        ...        ...        ...   \n",
       "679     13      40      39  ...  38.850259  -0.149741  32.780898  -0.096048   \n",
       "680     13      27      46  ...  40.626196  -5.373804  38.363370 -10.495864   \n",
       "681     15      33      40  ...  42.980693   2.980693  37.550114  -7.777201   \n",
       "682     17      37      41  ...  39.747938  -1.252062  34.468003 -12.231948   \n",
       "683     17      28      52  ...  37.169797 -14.830203  32.644130   0.201339   \n",
       "\n",
       "     ppda_coef  oppda_coef  deep  deep_allowed     xpts  xpts_diff  \n",
       "0     5.683535   16.367593   489           114  94.0813     0.0813  \n",
       "1    10.209085   12.929510   351           153  81.7489   -10.2511  \n",
       "2     8.982028    9.237091   197           123  73.1353    -4.8647  \n",
       "3     8.709827    7.870225   203           172  63.7068   -13.2932  \n",
       "4     8.276148    9.477805   305           168  67.3867    -8.6133  \n",
       "..         ...         ...   ...           ...      ...        ...  \n",
       "679  12.838079   10.562327   175           206  38.6587     5.6587  \n",
       "680  11.199502   10.806357   124           206  36.5424     5.5424  \n",
       "681  11.949903   10.080858   103           215  36.3363     5.3363  \n",
       "682  14.666049    9.192768   150           270  29.2413    -1.7587  \n",
       "683  12.830908    9.464581   153           215  39.2364    12.2364  \n",
       "\n",
       "[684 rows x 24 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# The raw CSV header leaves its first two column names blank, which pandas loads as\n",
    "# 'Unnamed: 0' / 'Unnamed: 1'. Label them explicitly; rename() ignores keys that are\n",
    "# absent, so this is a no-op when the header already carries these names.\n",
    "understat = pd.read_csv('understat.csv').rename(\n",
    "    columns={'Unnamed: 0': 'league', 'Unnamed: 1': 'year'}\n",
    ")\n",
    "understat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e969975f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>league</th>\n",
       "      <th>year</th>\n",
       "      <th>position</th>\n",
       "      <th>team</th>\n",
       "      <th>matches</th>\n",
       "      <th>wins</th>\n",
       "      <th>draws</th>\n",
       "      <th>loses</th>\n",
       "      <th>scored</th>\n",
       "      <th>missed</th>\n",
       "      <th>...</th>\n",
       "      <th>xGA</th>\n",
       "      <th>xGA_diff</th>\n",
       "      <th>npxGA</th>\n",
       "      <th>npxGD</th>\n",
       "      <th>ppda_coef</th>\n",
       "      <th>oppda_coef</th>\n",
       "      <th>deep</th>\n",
       "      <th>deep_allowed</th>\n",
       "      <th>xpts</th>\n",
       "      <th>xpts_diff</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>684</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>6.840000e+02</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "      <td>684.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>168</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>La_liga</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Barcelona</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>120</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2016.500000</td>\n",
       "      <td>10.061404</td>\n",
       "      <td>NaN</td>\n",
       "      <td>35.935673</td>\n",
       "      <td>13.434211</td>\n",
       "      <td>9.067251</td>\n",
       "      <td>13.434211</td>\n",
       "      <td>48.190058</td>\n",
       "      <td>48.190058</td>\n",
       "      <td>...</td>\n",
       "      <td>47.064744</td>\n",
       "      <td>-1.125315</td>\n",
       "      <td>42.902596</td>\n",
       "      <td>-4.155221e-17</td>\n",
       "      <td>10.911784</td>\n",
       "      <td>10.911772</td>\n",
       "      <td>208.676901</td>\n",
       "      <td>208.676901</td>\n",
       "      <td>49.539598</td>\n",
       "      <td>0.169715</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1.709075</td>\n",
       "      <td>5.580165</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.203487</td>\n",
       "      <td>5.880962</td>\n",
       "      <td>2.941824</td>\n",
       "      <td>5.510278</td>\n",
       "      <td>17.605374</td>\n",
       "      <td>13.866509</td>\n",
       "      <td>...</td>\n",
       "      <td>11.781399</td>\n",
       "      <td>6.663632</td>\n",
       "      <td>11.002013</td>\n",
       "      <td>1.929269e+01</td>\n",
       "      <td>2.521398</td>\n",
       "      <td>3.301410</td>\n",
       "      <td>83.888073</td>\n",
       "      <td>54.713624</td>\n",
       "      <td>13.559213</td>\n",
       "      <td>7.156998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2014.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>13.000000</td>\n",
       "      <td>15.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>16.838674</td>\n",
       "      <td>-29.175087</td>\n",
       "      <td>16.084399</td>\n",
       "      <td>-4.220877e+01</td>\n",
       "      <td>5.683535</td>\n",
       "      <td>4.394458</td>\n",
       "      <td>76.000000</td>\n",
       "      <td>83.000000</td>\n",
       "      <td>17.907700</td>\n",
       "      <td>-24.721600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2015.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>36.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>38.916186</td>\n",
       "      <td>-5.698828</td>\n",
       "      <td>35.474606</td>\n",
       "      <td>-1.325816e+01</td>\n",
       "      <td>9.090617</td>\n",
       "      <td>8.809866</td>\n",
       "      <td>151.750000</td>\n",
       "      <td>170.000000</td>\n",
       "      <td>39.466550</td>\n",
       "      <td>-4.498400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2016.500000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>45.000000</td>\n",
       "      <td>48.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>47.310924</td>\n",
       "      <td>-0.918895</td>\n",
       "      <td>43.031911</td>\n",
       "      <td>-3.127901e+00</td>\n",
       "      <td>10.562543</td>\n",
       "      <td>10.347047</td>\n",
       "      <td>188.000000</td>\n",
       "      <td>205.000000</td>\n",
       "      <td>47.102100</td>\n",
       "      <td>0.116050</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2018.000000</td>\n",
       "      <td>15.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>17.000000</td>\n",
       "      <td>56.000000</td>\n",
       "      <td>58.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>54.834899</td>\n",
       "      <td>3.381834</td>\n",
       "      <td>50.263465</td>\n",
       "      <td>9.740049e+00</td>\n",
       "      <td>12.434874</td>\n",
       "      <td>12.187434</td>\n",
       "      <td>242.000000</td>\n",
       "      <td>246.250000</td>\n",
       "      <td>56.942025</td>\n",
       "      <td>4.912775</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2019.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>32.000000</td>\n",
       "      <td>18.000000</td>\n",
       "      <td>29.000000</td>\n",
       "      <td>118.000000</td>\n",
       "      <td>94.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>88.432186</td>\n",
       "      <td>16.370737</td>\n",
       "      <td>78.535447</td>\n",
       "      <td>7.304931e+01</td>\n",
       "      <td>21.896752</td>\n",
       "      <td>30.468113</td>\n",
       "      <td>582.000000</td>\n",
       "      <td>375.000000</td>\n",
       "      <td>94.380000</td>\n",
       "      <td>23.047500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>11 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         league         year    position       team     matches        wins  \\\n",
       "count       684   684.000000  684.000000        684  684.000000  684.000000   \n",
       "unique        6          NaN         NaN        168         NaN         NaN   \n",
       "top     La_liga          NaN         NaN  Barcelona         NaN         NaN   \n",
       "freq        120          NaN         NaN          6         NaN         NaN   \n",
       "mean        NaN  2016.500000   10.061404        NaN   35.935673   13.434211   \n",
       "std         NaN     1.709075    5.580165        NaN    3.203487    5.880962   \n",
       "min         NaN  2014.000000    1.000000        NaN   27.000000    2.000000   \n",
       "25%         NaN  2015.000000    5.000000        NaN   34.000000    9.000000   \n",
       "50%         NaN  2016.500000   10.000000        NaN   38.000000   12.000000   \n",
       "75%         NaN  2018.000000   15.000000        NaN   38.000000   16.000000   \n",
       "max         NaN  2019.000000   20.000000        NaN   38.000000   32.000000   \n",
       "\n",
       "             draws       loses      scored      missed  ...         xGA  \\\n",
       "count   684.000000  684.000000  684.000000  684.000000  ...  684.000000   \n",
       "unique         NaN         NaN         NaN         NaN  ...         NaN   \n",
       "top            NaN         NaN         NaN         NaN  ...         NaN   \n",
       "freq           NaN         NaN         NaN         NaN  ...         NaN   \n",
       "mean      9.067251   13.434211   48.190058   48.190058  ...   47.064744   \n",
       "std       2.941824    5.510278   17.605374   13.866509  ...   11.781399   \n",
       "min       2.000000    1.000000   13.000000   15.000000  ...   16.838674   \n",
       "25%       7.000000    9.000000   36.000000   38.000000  ...   38.916186   \n",
       "50%       9.000000   14.000000   45.000000   48.000000  ...   47.310924   \n",
       "75%      11.000000   17.000000   56.000000   58.000000  ...   54.834899   \n",
       "max      18.000000   29.000000  118.000000   94.000000  ...   88.432186   \n",
       "\n",
       "          xGA_diff       npxGA         npxGD   ppda_coef  oppda_coef  \\\n",
       "count   684.000000  684.000000  6.840000e+02  684.000000  684.000000   \n",
       "unique         NaN         NaN           NaN         NaN         NaN   \n",
       "top            NaN         NaN           NaN         NaN         NaN   \n",
       "freq           NaN         NaN           NaN         NaN         NaN   \n",
       "mean     -1.125315   42.902596 -4.155221e-17   10.911784   10.911772   \n",
       "std       6.663632   11.002013  1.929269e+01    2.521398    3.301410   \n",
       "min     -29.175087   16.084399 -4.220877e+01    5.683535    4.394458   \n",
       "25%      -5.698828   35.474606 -1.325816e+01    9.090617    8.809866   \n",
       "50%      -0.918895   43.031911 -3.127901e+00   10.562543   10.347047   \n",
       "75%       3.381834   50.263465  9.740049e+00   12.434874   12.187434   \n",
       "max      16.370737   78.535447  7.304931e+01   21.896752   30.468113   \n",
       "\n",
       "              deep  deep_allowed        xpts   xpts_diff  \n",
       "count   684.000000    684.000000  684.000000  684.000000  \n",
       "unique         NaN           NaN         NaN         NaN  \n",
       "top            NaN           NaN         NaN         NaN  \n",
       "freq           NaN           NaN         NaN         NaN  \n",
       "mean    208.676901    208.676901   49.539598    0.169715  \n",
       "std      83.888073     54.713624   13.559213    7.156998  \n",
       "min      76.000000     83.000000   17.907700  -24.721600  \n",
       "25%     151.750000    170.000000   39.466550   -4.498400  \n",
       "50%     188.000000    205.000000   47.102100    0.116050  \n",
       "75%     242.000000    246.250000   56.942025    4.912775  \n",
       "max     582.000000    375.000000   94.380000   23.047500  \n",
       "\n",
       "[11 rows x 24 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics for every column, categorical ones included.\n",
    "understat.describe(include=\"all\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "136925a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "685 understat.csv\r\n"
     ]
    }
   ],
   "source": [
    "# Line count of the raw CSV; the header line is part of the total.\n",
    "!wc -l understat.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9478eaa2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ",,position,team,matches,wins,draws,loses,scored,missed,pts,xG,xG_diff,npxG,xGA,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff\r\n",
      "La_liga,2014,1,Barcelona,38,30,4,4,110,21,94,102.98015200000002,-7.019847999999982,97.77721200000002,28.44429270000001,7.444292700000009,24.727906700000005,73.04930530000001,5.683534703382723,16.367592989090525,489,114,94.08129999999998,0.0812999999999846\r\n",
      "La_liga,2014,2,Real Madrid,38,30,2,6,118,38,92,95.76624299999999,-22.23375700000001,86.10389499999998,42.607198000000004,4.607198000000004,38.890805,47.213090000000015,10.209085456325049,12.929510106152211,351,153,81.7489,-10.251099999999994\r\n",
      "La_liga,2014,3,Atletico Madrid,38,23,9,6,67,29,78,57.047670000000004,-9.952329999999996,52.588007999999995,29.069107100000004,0.06910710000000364,26.839271100000005,25.748736900000008,8.982028430893806,9.237090640679776,197,123,73.13530000000003,-4.864699999999971\r\n",
      "La_liga,2014,4,Valencia,38,22,11,5,70,32,77,55.06250000000001,-14.937499999999993,49.703978,39.392571999999994,7.392571999999994,33.44647700000001,16.257500999999998,8.709827299105736,7.870224725817145,203,172,63.7068,-13.293199999999999\r\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first five raw lines of the file (header plus four records).\n",
    "!head -n 5 understat.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "60448bf7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      2 27\r\n",
      "     18 28\r\n",
      "     96 30\r\n",
      "    108 34\r\n",
      "     10 37\r\n",
      "    450 38\r\n"
     ]
    }
   ],
   "source": [
    "# Tally the values of field 5 (`matches`), skipping the header line so its\n",
    "# column name is not counted as a value; `sort -n` orders the values numerically.\n",
    "!tail -n +2 understat.csv | cut -d \",\" -f 5 | sort -n | uniq -c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f7668054",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search for empty lines in the CSV; no output means none were found.\n",
    "!grep -n '^$' understat.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cba7e932",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}