153 lines
11 KiB
Plaintext
153 lines
11 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 69,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"All data: 376884\n",
|
||
|
"Train size: 226125\n",
|
||
|
"Test size: 75384\n",
|
||
|
"Validate size: 75375\n",
|
||
|
" date home_team away_team home_score away_score tournament \\\n",
|
||
|
"count 41876 41876 41876 41876.000000 41876.000000 41876 \n",
|
||
|
"unique 15232 308 306 NaN NaN 112 \n",
|
||
|
"top 2012-02-29 brazil uruguay NaN NaN friendly \n",
|
||
|
"freq 66 570 543 NaN NaN 17136 \n",
|
||
|
"mean NaN NaN NaN 1.744293 1.186503 NaN \n",
|
||
|
"std NaN NaN NaN 1.752248 1.403053 NaN \n",
|
||
|
"min NaN NaN NaN 0.000000 0.000000 NaN \n",
|
||
|
"25% NaN NaN NaN 1.000000 0.000000 NaN \n",
|
||
|
"50% NaN NaN NaN 1.000000 1.000000 NaN \n",
|
||
|
"75% NaN NaN NaN 2.000000 2.000000 NaN \n",
|
||
|
"max NaN NaN NaN 31.000000 21.000000 NaN \n",
|
||
|
"\n",
|
||
|
" city country neutral \n",
|
||
|
"count 41876 41876 41876 \n",
|
||
|
"unique 2026 266 2 \n",
|
||
|
"top kuala lumpur united states False \n",
|
||
|
"freq 589 1160 31557 \n",
|
||
|
"mean NaN NaN NaN \n",
|
||
|
"std NaN NaN NaN \n",
|
||
|
"min NaN NaN NaN \n",
|
||
|
"25% NaN NaN NaN \n",
|
||
|
"50% NaN NaN NaN \n",
|
||
|
"75% NaN NaN NaN \n",
|
||
|
"max NaN NaN NaN \n",
|
||
|
"376884\n",
|
||
|
"\n",
|
||
|
" date\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
|
||
|
"\n",
|
||
|
" home_team\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
|
||
|
"\n",
|
||
|
" away_team\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
|
||
|
"\n",
|
||
|
" home_score\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
|
||
|
"\n",
|
||
|
" away_score\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
|
||
|
"\n",
|
||
|
" tournament\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
|
||
|
"\n",
|
||
|
" city\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
|
||
|
"\n",
|
||
|
" country\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
|
||
|
"\n",
|
||
|
" neutral\n",
|
||
|
"AxesSubplot(0.125,0.125;0.775x0.755)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEICAYAAABfz4NwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAASmklEQVR4nO3df6zddX3H8eeLVrFTYfy4ENYW20mTCWxW6bouJIuz2+h0WTGD7JIpXdKkhtREM2MCZpvuRxfYokSSQYLBUQgTOtTQTNgkRWNcGHghzFKw80YQajtaAbH+AG1974/zufH0cnp/tvdcOM9H8s35nvf3+/ne9zdp87rf7+d7zk1VIUnSCf1uQJI0PxgIkiTAQJAkNQaCJAkwECRJjYEgSQJgYb8bmKnTTz+9li1b1u82JOkV5aGHHvpeVQ312vaKDYRly5YxMjLS7zYk6RUlyXeOts1bRpIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1LxiP5j2SrHsyi/2u4VXlSevfne/W5BetbxCkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSmkkDIcnrkjyY5H+S7EryN61+apJ7k3yrvZ7SNeaqJKNJdie5qKt+QZKdbdt1SdLqJya5o9UfSLLsOJyrJGkCU7lCeAl4Z1W9FVgJrEuyBrgS2FFVK4Ad7T1JzgWGgfOAdcD1SRa0Y90AbAJWtGVdq28Enq+qc4BrgWtmf2qSpOmYNBCq44ft7WvaUsB6YGurbwUubuvrgdur6qWqegIYBVYnOQs4qarur6oCbhk3ZuxYdwJrx64eJElzY0pzCEkWJHkE2A/cW1UPAGdW1T6A9npG230x8HTX8D2ttritj68fMaaqDgEvAKfN4HwkSTM0pUCoqsNVtRJYQue3/fMn2L3Xb/Y1QX2iMUceONmUZCTJyIEDBybpWpI0HdN6yqiqvg98hc69/2fabSDa6/622x5gadewJcDeVl/So37EmCQLgZOB53r8/BuralVVrRoaGppO65KkSUzlKaOhJL/c1hcBvwd8E9gObGi7bQDuauvbgeH25NByOpPHD7bbSgeTrGnzA5ePGzN2rEuA+9o8gyRpjkzl207PAra2J4VOALZV1b8nuR/YlmQj8BRwKUBV7UqyDXgMOARsrqrD7VhXADcDi4B72gJwE3BrklE6VwbDx+LkJElTN2kgVNU3gLf1qD8LrD3KmC3Alh71EeBl8w9V9SItUCRJ/eEnlSVJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKmZNBCSLE3y5SSPJ9mV5IOt/vEk303ySFve1TXmqiSjSXYnuairfkGSnW3bdUnS6icmuaPVH0iy7DicqyRpAlO5QjgEfLiq3gKsATYnObdtu7aqVrblboC2bRg4D1gHXJ9kQdv/BmATsKIt61p9I/B8VZ0DXAtcM/tTkyRNx6SBUFX7qurhtn4QeBxYPMGQ9cDtVfVSVT0BjAKrk5wFnFRV91dVAbcAF3eN2drW7wTWjl09SJLmxrTmENqtnLcBD7TSB5J8I8lnkpzSaouBp7uG7Wm1xW19fP2IMVV1CHgBOK3Hz9+UZCTJyIEDB6bTuiRpElMOhCRvAD4HfKiqfkDn9s+bgZXAPuATY7v2GF4T1Ccac2Sh6saqWlVVq4aGhqbauiRpCqYUCEleQycMbquqzwNU1TNVdbiqfg58Gljddt8DLO0avgTY2+pLetSPGJNkIXAy8NxMTkiSNDNTecoowE3A41X1ya76WV27vQd4tK1vB4bbk0PL6UweP1hV+4CDSda0Y14O3NU1ZkNbvwS4r80zSJLmyMIp7HMh8D5gZ5JHWu2jwGVJVtK5tfMk8H6AqtqVZBvwGJ0nlDZX1eE27grgZmARcE9boBM4tyYZpXNlMDybk5IkTd+kgVBVX6P3Pf67JxizBdjSoz4CnN+j/iJw6WS9SJKOHz+pLEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJmEIgJFma5MtJHk+yK8kHW/3UJPcm+VZ7PaVrzFVJRpPsTnJRV/2CJDvbtuuSpNVPTHJHqz+QZNlxOFdJ0gSmcoVwCPhwVb0FWANsTnIucCWwo6pWADvae9q2YeA8YB1wfZIF7Vg3AJuAFW1Z1+obgeer6hzgWuCaY3BukqRpmDQQqmpfVT3c1g8CjwOLgfXA1rbbVuDitr4euL2qXqqqJ4BRYHWSs4CTqur+qirglnFjxo51J7B27OpBkjQ3pjWH0G7lvA14ADizqvZBJzSAM9pui4Gnu4btabXFbX18/YgxVXUIeAE4bTq9SZJmZ8qBkOQNwOeAD1XVDybatUetJqhPNGZ8D5uSjCQZOXDgwGQtS5KmYUqBkOQ1dMLgtqr6fCs/024D0V73t/oeYGnX8CXA3lZf0qN+xJgkC4GTgefG91FVN1bVqqpaNTQ0NJXWJUlTNJWnjALcBDxeVZ/s2rQd2NDWNwB3ddWH25NDy+lMHj/YbisdTLKmHfPycWPGjnUJcF+bZ5AkzZGFU9jnQuB9wM4kj7TaR4GrgW1JNgJPAZcCVNWuJNuAx+g8obS5qg63cVcANwOLgHvaAp3AuTXJKJ0rg+HZnZYkabomDYSq+hq97/EDrD3KmC3Alh71EeD8HvUXaYEiSeoPP6ksSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJzaSBkOQzSfYnebSr9vEk303ySFve1bXtqiSjSXYnuairfkGSnW3bdUnS6icmuaPVH0iy7BifoyRpCqZyhXAzsK5H/dqqWtmWuwGSnAsMA+e1MdcnWdD2vwHYBKxoy9gxNwLPV9U5wLXANTM8F0nSLEwaCFX1VeC5KR5vPXB7Vb1UVU8Ao8DqJGcBJ1XV/VVVwC3AxV1jtrb1O4G1Y1cPkqS5M5s5hA8k+Ua7pXRKqy0Gnu7aZ0+rLW7r4+tHjKmqQ8ALwGmz6EuSNAMzDYQbgDcDK4F9wCdavddv9jVBfaIxL5NkU5KRJCMHDhyYVsOSpInNKBCq6pmqOlxVPwc+Daxum/YAS7t2XQLsbfUlPepHjEmyEDiZo9yiqqobq2pVVa0aGhqaSeuSpKOYUSC0OYEx7wHGnkDaDgy3J4eW05k8frCq9gEHk6xp8wOXA3d1jdnQ1i8B7mvzDJKkObRwsh2SfBZ4B3B6kj3Ax4B3JFlJ59bOk8D7AapqV5JtwGPAIWBzVR1uh7qCzhNLi4B72gJwE3BrklE6VwbDx+C8JEnTNGkgVNVlPco3TbD/FmBLj/oIcH6P+ovApZP1IUk6vvyksiQJMBAkSY2BIEkCDARJUmMgSJKAKTxlJOnVadmVX+x3C68qT1797n63MGteIUiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUTBoIST6TZH+SR7tqpya5N8m32uspXduuSjKaZHeSi7rqFyTZ2bZdlyStfmKSO1r9gSTLjvE5SpKmYCpXCDcD68bVrgR2VNUKYEd7T5JzgWHgvDbm+iQL2pgbgE3AiraMHXMj8HxVnQNcC1wz05ORJM3cpIFQVV8FnhtXXg9sbetbgYu76rdX1UtV9QQwCqxOchZwUlXdX1UF3DJuzNix7gTWjl09SJLmzkznEM6sqn0A7fWMVl8MPN21355WW9zWx9ePGFNVh4AXgNNm2JckaYaO9aRyr9/sa4L6RGNefvBkU5KRJCMHDhyYYYuSpF5
|
||
|
"text/plain": [
|
||
|
"<Figure size 432x288 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {
|
||
|
"needs_background": "light"
|
||
|
},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"from sklearn import preprocessing\n",
|
||
|
"import kaggle\n",
|
||
|
"\n",
|
||
|
"kaggle.api.authenticate()\n",
|
||
|
"\n",
|
||
|
"kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True)\n",
|
||
|
"\n",
|
||
|
"results = pd.read_csv('results.csv')\n",
|
||
|
"\n",
|
||
|
"#brak wierszy z NaN\n",
|
||
|
"results.dropna()\n",
|
||
|
"\n",
|
||
|
"#normalizacja itp\n",
|
||
|
"for collumn in ['home_team', 'away_team', 'tournament', 'city', 'country']:\n",
|
||
|
" results[collumn] = results[collumn].str.lower()\n",
|
||
|
" \n",
|
||
|
"# Podział zbioru 6:1:1\n",
|
||
|
"train, test = train_test_split(results, test_size= 1 - 0.6)\n",
|
||
|
"\n",
|
||
|
"valid, test = train_test_split(test, test_size=0.5) \n",
|
||
|
"\n",
|
||
|
"print(\"All data: \", results.size)\n",
|
||
|
"print(\"Train size: \", train.size)\n",
|
||
|
"print(\"Test size: \", test.size)\n",
|
||
|
"print(\"Validate size: \", valid.size)\n",
|
||
|
"print(results.describe(include='all'))\n",
|
||
|
"\n",
|
||
|
"# sprawdzenie czy cały dataset oraz podział na podzbiory jest równy\n",
|
||
|
"print(train.size+test.size+valid.size)\n",
|
||
|
"\n",
|
||
|
"for col in results.columns:\n",
|
||
|
" column = results[col].value_counts().plot(kind=\"bar\")\n",
|
||
|
" print(\"\\n\", col)\n",
|
||
|
" print(column)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.8.5"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 4
|
||
|
}
|