{ "cells": [ { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "All data: 376884\n", "Train size: 226125\n", "Test size: 75384\n", "Validate size: 75375\n", " date home_team away_team home_score away_score tournament \\\n", "count 41876 41876 41876 41876.000000 41876.000000 41876 \n", "unique 15232 308 306 NaN NaN 112 \n", "top 2012-02-29 brazil uruguay NaN NaN friendly \n", "freq 66 570 543 NaN NaN 17136 \n", "mean NaN NaN NaN 1.744293 1.186503 NaN \n", "std NaN NaN NaN 1.752248 1.403053 NaN \n", "min NaN NaN NaN 0.000000 0.000000 NaN \n", "25% NaN NaN NaN 1.000000 0.000000 NaN \n", "50% NaN NaN NaN 1.000000 1.000000 NaN \n", "75% NaN NaN NaN 2.000000 2.000000 NaN \n", "max NaN NaN NaN 31.000000 21.000000 NaN \n", "\n", " city country neutral \n", "count 41876 41876 41876 \n", "unique 2026 266 2 \n", "top kuala lumpur united states False \n", "freq 589 1160 31557 \n", "mean NaN NaN NaN \n", "std NaN NaN NaN \n", "min NaN NaN NaN \n", "25% NaN NaN NaN \n", "50% NaN NaN NaN \n", "75% NaN NaN NaN \n", "max NaN NaN NaN \n", "376884\n", "\n", " date\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n", "\n", " home_team\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n", "\n", " away_team\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n", "\n", " home_score\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n", "\n", " away_score\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n", "\n", " tournament\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n", "\n", " city\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n", "\n", " country\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n", "\n", " neutral\n", "AxesSubplot(0.125,0.125;0.775x0.755)\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEICAYAAABfz4NwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAASmklEQVR4nO3df6zddX3H8eeLVrFTYfy4ENYW20mTCWxW6bouJIuz2+h0WTGD7JIpXdKkhtREM2MCZpvuRxfYokSSQYLBUQgTOtTQTNgkRWNcGHghzFKw80YQajtaAbH+AG1974/zufH0cnp/tvdcOM9H8s35nvf3+/ne9zdp87rf7+d7zk1VIUnSCf1uQJI0PxgIkiTAQJAkNQaCJAkwECRJjYEgSQJgYb8bmKnTTz+9li1b1u82JOkV5aGHHvpeVQ312vaKDYRly5YxMjLS7zYk6RUlyXeOts1bRpIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1LxiP5j2SrHsyi/2u4VXlSevfne/W5BetbxCkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSmkkDIcnrkjyY5H+S7EryN61+apJ7k3yrvZ7SNeaqJKNJdie5qKt+QZKdbdt1SdLqJya5o9UfSLLsOJyrJGkCU7lCeAl4Z1W9FVgJrEuyBrgS2FFVK4Ad7T1JzgWGgfOAdcD1SRa0Y90AbAJWtGVdq28Enq+qc4BrgWtmf2qSpOmYNBCq44ft7WvaUsB6YGurbwUubuvrgdur6qWqegIYBVYnOQs4qarur6oCbhk3ZuxYdwJrx64eJElzY0pzCEkWJHkE2A/cW1UPAGdW1T6A9npG230x8HTX8D2ttritj68fMaaqDgEvAKfN4HwkSTM0pUCoqsNVtRJYQue3/fMn2L3Xb/Y1QX2iMUceONmUZCTJyIEDBybpWpI0HdN6yqiqvg98hc69/2fabSDa6/622x5gadewJcDeVl/So37EmCQLgZOB53r8/BuralVVrRoaGppO65KkSUzlKaOhJL/c1hcBvwd8E9gObGi7bQDuauvbgeH25NByOpPHD7bbSgeTrGnzA5ePGzN2rEuA+9o8gyRpjkzl207PAra2J4VOALZV1b8nuR/YlmQj8BRwKUBV7UqyDXgMOARsrqrD7VhXADcDi4B72gJwE3BrklE6VwbDx+LkJElTN2kgVNU3gLf1qD8LrD3KmC3Alh71EeBl8w9V9SItUCRJ/eEnlSVJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKmZNBCSLE3y5SSPJ9mV5IOt/vEk303ySFve1TXmqiSjSXYnuairfkGSnW3bdUnS6icmuaPVH0iy7DicqyRpAlO5QjgEfLiq3gKsATYnObdtu7aqVrblboC2bRg4D1gHXJ9kQdv/BmATsKIt61p9I/B8VZ0DXAtcM/tTkyRNx6SBUFX7qurhtn4QeBxYPMGQ9cDtVfVSVT0BjAKrk5wFnFRV91dVAbcAF3eN2drW7wTWjl09SJLmxrTmENqtnLcBD7TSB5J8I8lnkpzSaouBp7uG7Wm1xW19fP2IMVV1CHgBOK3Hz9+UZCTJyIEDB6bTuiRpElMOhCRvAD4HfKiqfkDn9s+bgZXAPuATY7v2GF4T1Ccac2Sh6saqWlVVq4aGhqbauiRpCqYUCEleQycMbquqzwNU1TNVdbiqfg58Gljddt8DLO0avgTY2+pLetSPGJNkIXAy8NxMTkiSNDNTecoowE3A41X1ya76WV27vQd4tK1vB4bbk0PL6UweP1hV+4CDSda0Y14O3NU1ZkNbvwS4r80zSJLmyMIp7HMh8D5gZ5JHWu2jwGVJVtK5tfMk8H6AqtqVZBvwGJ0nlDZX1eE27grgZmARcE9boBM4tyYZpXNlMDybk5IkTd+kgVBVX6P3Pf67JxizBdjSoz4CnN+j/iJw6WS9SJKOHz+pLEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJmEIgJFma5MtJHk+yK8kHW/3UJPcm+VZ7PaVrzFVJRpPsTnJRV/2CJDvbtuuSpNVPTHJHqz+QZNlxOFdJ0gSmcoVwCPhwVb0FWANsTnIucCWwo6pWADvae9q2YeA8YB1wfZIF7Vg3AJuAFW1Z1+obgeer6hzgWuCaY3BukqRpmDQQqmpfVT3c1g8CjwOLgfXA1rbbVuDitr4euL2qXqqqJ4BRYHWSs4CTqur+qirglnFjxo51J7B27OpBkjQ3pjWH0G7lvA14ADizqvZBJzSAM9pui4Gnu4btabXFbX18/YgxVXUIeAE4bTq9SZJmZ8qBkOQNwOeAD1XVDybatUetJqhPNGZ8D5uSjCQZOXDgwGQtS5KmYUqBkOQ1dMLgtqr6fCs/024D0V73t/oeYGnX8CXA3lZf0qN+xJgkC4GTgefG91FVN1bVqqpaNTQ0NJXWJUlTNJWnjALcBDxeVZ/s2rQd2NDWNwB3ddWH25NDy+lMHj/YbisdTLKmHfPycWPGjnUJcF+bZ5AkzZGFU9jnQuB9wM4kj7TaR4GrgW1JNgJPAZcCVNWuJNuAx+g8obS5qg63cVcANwOLgHvaAp3AuTXJKJ0rg+HZnZYkabomDYSq+hq97/EDrD3KmC3Alh71EeD8HvUXaYEiSeoPP6ksSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJzaSBkOQzSfYnebSr9vEk303ySFve1bXtqiSjSXYnuairfkGSnW3bdUnS6icmuaPVH0iy7BifoyRpCqZyhXAzsK5H/dqqWtmWuwGSnAsMA+e1MdcnWdD2vwHYBKxoy9gxNwLPV9U5wLXANTM8F0nSLEwaCFX1VeC5KR5vPXB7Vb1UVU8Ao8DqJGcBJ1XV/VVVwC3AxV1jtrb1O4G1Y1cPkqS5M5s5hA8k+Ua7pXRKqy0Gnu7aZ0+rLW7r4+tHjKmqQ8ALwGmz6EuSNAMzDYQbgDcDK4F9wCdavddv9jVBfaIxL5NkU5KRJCMHDhyYVsOSpInNKBCq6pmqOlxVPwc+Daxum/YAS7t2XQLsbfUlPepHjEmyEDiZo9yiqqobq2pVVa0aGhqaSeuSpKOYUSC0OYEx7wHGnkDaDgy3J4eW05k8frCq9gEHk6xp8wOXA3d1jdnQ1i8B7mvzDJKkObRwsh2SfBZ4B3B6kj3Ax4B3JFlJ59bOk8D7AapqV5JtwGPAIWBzVR1uh7qCzhNLi4B72gJwE3BrklE6VwbDx+C8JEnTNGkgVNVlPco3TbD/FmBLj/oIcH6P+ovApZP1IUk6vvyksiQJMBAkSY2BIEkCDARJUmMgSJKAKTxlJOnVadmVX+x3C68qT1797n63MGteIUiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUTBoIST6TZH+SR7tqpya5N8m32uspXduuSjKaZHeSi7rqFyTZ2bZdlyStfmKSO1r9gSTLjvE5SpKmYCpXCDcD68bVrgR2VNUKYEd7T5JzgWHgvDbm+iQL2pgbgE3AiraMHXMj8HxVnQNcC1wz05ORJM3cpIFQVV8FnhtXXg9sbetbgYu76rdX1UtV9QQwCqxOchZwUlXdX1UF3DJuzNix7gTWjl09SJLmzkznEM6sqn0A7fWMVl8MPN21355WW9zWx9ePGFNVh4AXgNNm2JckaYaO9aRyr9/sa4L6RGNefvBkU5KRJCMHDhyYYYuSpF5mGgjPtNtAtNf9rb4HWNq13xJgb6sv6VE/YkyShcDJvPwWFQBVdWNVraqqVUNDQzNsXZLUy0wDYTuwoa1vAO7qqg+3J4eW05k8frDdVjqYZE2bH7h83JixY10C3NfmGSRJc2jhZDsk+SzwDuD0JHuAjwFXA9uSbASeAi4FqKpdSbYBjwGHgM1Vdbgd6go6TywtAu5pC8BNwK1JRulcGQwfkzOTJE3LpIFQVZcdZdPao+y/BdjSoz4CnN+j/iItUCRJ/eMnlSVJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKmZVSAkeTLJziSPJBlptVOT3JvkW+31lK79r0oymmR3kou66he044wmuS5JZtOXJGn6jsUVwu9W1cqqWtXeXwnsqKoVwI72niTnAsPAecA64PokC9qYG4BNwIq2rDsGfUmSpuF43DJaD2xt61uBi7vqt1fVS1X1BDAKrE5yFnBSVd1fVQXc0jVGkjRHZhsIBXwpyUNJNrXamVW1D6C9ntHqi4Gnu8buabXFbX18XZI0hxbOcvyFVbU3yRnAvUm+OcG+veYFaoL6yw/QCZ1NAGefffZ0e5UkTWBWVwhVtbe97ge+AKwGnmm3gWiv+9vue4ClXcOXAHtbfUmPeq+fd2NVraqqVUNDQ7NpXZI0zowDIcnrk7xxbB34A+BRYDuwoe22AbirrW8HhpOcmGQ5ncnjB9ttpYNJ1rSniy7vGiNJmiOzuWV0JvCF9oToQuBfq+o/knwd2JZkI/AUcClAVe1Ksg14DDgEbK6qw+1YVwA3A4uAe9oiSZpDMw6Eqvo28NYe9WeBtUcZswXY0qM+Apw/014kSbPnJ5UlSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpmTeBkGRdkt1JRpNc2e9+JGnQzItASLIA+GfgD4FzgcuSnNvfriRpsMyLQABWA6NV9e2q+ilwO7C+zz1J0kCZL4GwGHi66/2eVpMkzZGF/W6gSY9avWynZBOwqb39YZLdx7WrwXI68L1+NzGZXNPvDtQH/ts8tt50tA3zJRD2AEu73i8B9o7fqapuBG6cq6YGSZKRqlrV7z6k8fy3OXfmyy2jrwMrkixP8lpgGNje554kaaDMiyuEqjqU5APAfwILgM9U1a4+tyVJA2VeBAJAVd0N3N3vPgaYt+I0X/lvc46k6mVzt5KkATRf5hAkSX1mIEiSAANB0jyTjvcm+ev2/uwkq/vd1yAwEAZYkl9K8ldJPt3er0jyR/3uSwPveuC3gcva+4N0vutMx5mBMNj+BXiJzn8+6HxA8O/7144EwG9V1WbgRYCqeh54bX9bGgwGwmB7c1X9I/AzgKr6Cb2/RkSaSz9r34BcAEmGgJ/3t6XBYCAMtp8mWcQv/uO9mc4Vg9RP1wFfAM5IsgX4GvAP/W1pMPg5hAGW5PeBv6TzNyi+BFwI/HlVfaWffUlJfg1YS+eKdUdVPd7nlgaCgTDgkpwGrKHzH++/q2ref6ukXt2SnN2rXlVPzXUvg8ZAGGBJLgQeqaofJXkv8HbgU1X1nT63pgGWZCed25gBXgcsB3ZX1Xl9bWwAOIcw2G4AfpzkrcBHgO8At/S3JQ26qvr1qvqN9rqCzl9U/Fq/+xoEBsJgO1SdS8T1wHVV9SngjX3uSTpCVT0M/Ga/+xgE8+bbTtUXB5NcBbwX+J32qN9r+tyTBlySv+h6ewKdW5kH+tTOQPEKYbD9KZ3HTDdW1f/R+TvW/9TfliTe2LWcCHyRzlWsjjMnlSXNG+0q9eqq+ki/exlE3jIaQEkO0j6MNn4TUFV10hy3JJFkYfvriW/vdy+DykAYQFXlxLHmowfpzBc8kmQ78G/Aj8Y2VtXn+9XYoDAQRJIz6DzvDfgBIPXdqcCzwDv5xecRCjAQjjMDYYAl+WPgE8CvAPuBNwGPA34ASP1wRnvC6FF+EQRjnOycAz5lNNj+js7XVvxvVS2n890x/9XfljTAFgBvaMsbu9bHFh1nXiEMtp9V1bNJTkhyQlV9Ock1/W5KA2tfVf1tv5sYZAbCYPt+kjcAXwVuS7IfONTnnjS4/FscfebnEAZQkrOr6qkkrwd+QufW4Z8BJwO3VdWzfW1QAynJqVX1XL/7GGQGwgBK8nBVvb2tf66q/qTfPUnqPyeVB1P3pfmv9q0LSfOKgTCY6ijrkgaYt4wGUJLDdD4BGmAR8OOxTfjVFdLAMhAkSYC3jCRJjYEgSQIMBElSYyBIkgADQZLU/D9Olzuv8JG2MwAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn import preprocessing\n", "import kaggle\n", "\n", "kaggle.api.authenticate()\n", "\n", "kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True)\n", "\n", "results = pd.read_csv('results.csv')\n", "\n", "#brak wierszy z NaN\n", "results.dropna()\n", "\n", "#normalizacja itp\n", "for collumn in ['home_team', 'away_team', 'tournament', 'city', 'country']:\n", " results[collumn] = results[collumn].str.lower()\n", " \n", "# Podział zbioru 6:1:1\n", "train, test = train_test_split(results, test_size= 1 - 0.6)\n", "\n", "valid, test = train_test_split(test, test_size=0.5) \n", "\n", "print(\"All data: \", results.size)\n", "print(\"Train size: \", train.size)\n", "print(\"Test size: \", test.size)\n", "print(\"Validate size: \", valid.size)\n", "print(results.describe(include='all'))\n", "\n", "# sprawdzenie czy cały dataset oraz podział na podzbiory jest równy\n", "print(train.size+test.size+valid.size)\n", "\n", "for col in results.columns:\n", " column = results[col].value_counts().plot(kind=\"bar\")\n", " print(\"\\n\", col)\n", " print(column)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }