ium_434732/skrypt.ipynb

153 lines
11 KiB
Plaintext
Raw Normal View History

2021-03-21 16:18:50 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All data: 376884\n",
"Train size: 226125\n",
"Test size: 75384\n",
"Validate size: 75375\n",
" date home_team away_team home_score away_score tournament \\\n",
"count 41876 41876 41876 41876.000000 41876.000000 41876 \n",
"unique 15232 308 306 NaN NaN 112 \n",
"top 2012-02-29 brazil uruguay NaN NaN friendly \n",
"freq 66 570 543 NaN NaN 17136 \n",
"mean NaN NaN NaN 1.744293 1.186503 NaN \n",
"std NaN NaN NaN 1.752248 1.403053 NaN \n",
"min NaN NaN NaN 0.000000 0.000000 NaN \n",
"25% NaN NaN NaN 1.000000 0.000000 NaN \n",
"50% NaN NaN NaN 1.000000 1.000000 NaN \n",
"75% NaN NaN NaN 2.000000 2.000000 NaN \n",
"max NaN NaN NaN 31.000000 21.000000 NaN \n",
"\n",
" city country neutral \n",
"count 41876 41876 41876 \n",
"unique 2026 266 2 \n",
"top kuala lumpur united states False \n",
"freq 589 1160 31557 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
"min NaN NaN NaN \n",
"25% NaN NaN NaN \n",
"50% NaN NaN NaN \n",
"75% NaN NaN NaN \n",
"max NaN NaN NaN \n",
"376884\n",
"\n",
" date\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
"\n",
" home_team\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
"\n",
" away_team\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
"\n",
" home_score\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
"\n",
" away_score\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
"\n",
" tournament\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
"\n",
" city\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
"\n",
" country\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n",
"\n",
" neutral\n",
"AxesSubplot(0.125,0.125;0.775x0.755)\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEICAYAAABfz4NwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAASmklEQVR4nO3df6zddX3H8eeLVrFTYfy4ENYW20mTCWxW6bouJIuz2+h0WTGD7JIpXdKkhtREM2MCZpvuRxfYokSSQYLBUQgTOtTQTNgkRWNcGHghzFKw80YQajtaAbH+AG1974/zufH0cnp/tvdcOM9H8s35nvf3+/ne9zdp87rf7+d7zk1VIUnSCf1uQJI0PxgIkiTAQJAkNQaCJAkwECRJjYEgSQJgYb8bmKnTTz+9li1b1u82JOkV5aGHHvpeVQ312vaKDYRly5YxMjLS7zYk6RUlyXeOts1bRpIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1LxiP5j2SrHsyi/2u4VXlSevfne/W5BetbxCkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSmkkDIcnrkjyY5H+S7EryN61+apJ7k3yrvZ7SNeaqJKNJdie5qKt+QZKdbdt1SdLqJya5o9UfSLLsOJyrJGkCU7lCeAl4Z1W9FVgJrEuyBrgS2FFVK4Ad7T1JzgWGgfOAdcD1SRa0Y90AbAJWtGVdq28Enq+qc4BrgWtmf2qSpOmYNBCq44ft7WvaUsB6YGurbwUubuvrgdur6qWqegIYBVYnOQs4qarur6oCbhk3ZuxYdwJrx64eJElzY0pzCEkWJHkE2A/cW1UPAGdW1T6A9npG230x8HTX8D2ttritj68fMaaqDgEvAKfN4HwkSTM0pUCoqsNVtRJYQue3/fMn2L3Xb/Y1QX2iMUceONmUZCTJyIEDBybpWpI0HdN6yqiqvg98hc69/2fabSDa6/622x5gadewJcDeVl/So37EmCQLgZOB53r8/BuralVVrRoaGppO65KkSUzlKaOhJL/c1hcBvwd8E9gObGi7bQDuauvbgeH25NByOpPHD7bbSgeTrGnzA5ePGzN2rEuA+9o8gyRpjkzl207PAra2J4VOALZV1b8nuR/YlmQj8BRwKUBV7UqyDXgMOARsrqrD7VhXADcDi4B72gJwE3BrklE6VwbDx+LkJElTN2kgVNU3gLf1qD8LrD3KmC3Alh71EeBl8w9V9SItUCRJ/eEnlSVJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKmZNBCSLE3y5SSPJ9mV5IOt/vEk303ySFve1TXmqiSjSXYnuairfkGSnW3bdUnS6icmuaPVH0iy7DicqyRpAlO5QjgEfLiq3gKsATYnObdtu7aqVrblboC2bRg4D1gHXJ9kQdv/BmATsKIt61p9I/B8VZ0DXAtcM/tTkyRNx6SBUFX7qurhtn4QeBxYPMGQ9cDtVfVSVT0BjAKrk5wFnFRV91dVAbcAF3eN2drW7wTWjl09SJLmxrTmENqtnLcBD7TSB5J8I8lnkpzSaouBp7uG7Wm1xW19fP2IMVV1CHgBOK3Hz9+UZCTJyIEDB6bTuiRpElMOhCRvAD4HfKiqfkDn9s+bgZXAPuATY7v2GF4T1Ccac2Sh6saqWlVVq4aGhqbauiRpCqYUCEleQycMbquqzwNU1TNVdbiqfg58Gljddt8DLO0avgTY2+pLetSPGJNkIXAy8NxMTkiSNDNTecoowE3A41X1ya76WV27vQd4tK1vB4bbk0PL6UweP1hV+4CDSda0Y14O3NU1ZkNbvwS4r80zSJLmyMIp7HMh8D5gZ5JHWu2jwGVJVtK5tfMk8H6AqtqVZBvwGJ0nlDZX1eE27g
rgZmARcE9boBM4tyYZpXNlMDybk5IkTd+kgVBVX6P3Pf67JxizBdjSoz4CnN+j/iJw6WS9SJKOHz+pLEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJmEIgJFma5MtJHk+yK8kHW/3UJPcm+VZ7PaVrzFVJRpPsTnJRV/2CJDvbtuuSpNVPTHJHqz+QZNlxOFdJ0gSmcoVwCPhwVb0FWANsTnIucCWwo6pWADvae9q2YeA8YB1wfZIF7Vg3AJuAFW1Z1+obgeer6hzgWuCaY3BukqRpmDQQqmpfVT3c1g8CjwOLgfXA1rbbVuDitr4euL2qXqqqJ4BRYHWSs4CTqur+qirglnFjxo51J7B27OpBkjQ3pjWH0G7lvA14ADizqvZBJzSAM9pui4Gnu4btabXFbX18/YgxVXUIeAE4bTq9SZJmZ8qBkOQNwOeAD1XVDybatUetJqhPNGZ8D5uSjCQZOXDgwGQtS5KmYUqBkOQ1dMLgtqr6fCs/024D0V73t/oeYGnX8CXA3lZf0qN+xJgkC4GTgefG91FVN1bVqqpaNTQ0NJXWJUlTNJWnjALcBDxeVZ/s2rQd2NDWNwB3ddWH25NDy+lMHj/YbisdTLKmHfPycWPGjnUJcF+bZ5AkzZGFU9jnQuB9wM4kj7TaR4GrgW1JNgJPAZcCVNWuJNuAx+g8obS5qg63cVcANwOLgHvaAp3AuTXJKJ0rg+HZnZYkabomDYSq+hq97/EDrD3KmC3Alh71EeD8HvUXaYEiSeoPP6ksSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJzaSBkOQzSfYnebSr9vEk303ySFve1bXtqiSjSXYnuairfkGSnW3bdUnS6icmuaPVH0iy7BifoyRpCqZyhXAzsK5H/dqqWtmWuwGSnAsMA+e1MdcnWdD2vwHYBKxoy9gxNwLPV9U5wLXANTM8F0nSLEwaCFX1VeC5KR5vPXB7Vb1UVU8Ao8DqJGcBJ1XV/VVVwC3AxV1jtrb1O4G1Y1cPkqS5M5s5hA8k+Ua7pXRKqy0Gnu7aZ0+rLW7r4+tHjKmqQ8ALwGmz6EuSNAMzDYQbgDcDK4F9wCdavddv9jVBfaIxL5NkU5KRJCMHDhyYVsOSpInNKBCq6pmqOlxVPwc+Daxum/YAS7t2XQLsbfUlPepHjEmyEDiZo9yiqqobq2pVVa0aGhqaSeuSpKOYUSC0OYEx7wHGnkDaDgy3J4eW05k8frCq9gEHk6xp8wOXA3d1jdnQ1i8B7mvzDJKkObRwsh2SfBZ4B3B6kj3Ax4B3JFlJ59bOk8D7AapqV5JtwGPAIWBzVR1uh7qCzhNLi4B72gJwE3BrklE6VwbDx+C8JEnTNGkgVNVlPco3TbD/FmBLj/oIcH6P+ovApZP1IUk6vvyksiQJMBAkSY2BIEkCDARJUmMgSJKAKTxlJOnVadmVX+x3C68qT1797n63MGteIUiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUTBoIST6TZH+SR7tqpya5N8m32uspXduuSjKaZHeSi7rqFyTZ2bZdlyStfmKSO1r9gSTLjvE5SpKmYCpXCDcD68bVrgR2VNUKYEd7T5JzgWHgvDbm+iQL2pgbgE3AiraMHXMj8HxVnQNcC1wz05ORJM3cpIFQVV8FnhtXXg9sbetbgYu76rdX1UtV9QQwCqxOchZwUlXdX1UF3DJuzNix7gTWjl09SJLmzkznEM
6sqn0A7fWMVl8MPN21355WW9zWx9ePGFNVh4AXgNNm2JckaYaO9aRyr9/sa4L6RGNefvBkU5KRJCMHDhyYYYuSpF5
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import preprocessing\n",
"import kaggle\n",
"\n",
"# Fixed seed so the train/validation/test split is identical on every run.\n",
"SEED = 42\n",
"# Number of most frequent values drawn per column in the bar charts.\n",
"TOP_N = 20\n",
"\n",
"# Download and unpack the dataset into the working directory\n",
"# (requires Kaggle API credentials to be configured).\n",
"kaggle.api.authenticate()\n",
"kaggle.api.dataset_download_files('martj42/international-football-results-from-1872-to-2017', path='.', unzip=True)\n",
"\n",
"results = pd.read_csv('results.csv')\n",
"\n",
"# Drop rows containing NaN. dropna() returns a new frame, so the result has to be\n",
"# assigned back; a bare call leaves `results` unchanged.\n",
"results = results.dropna().reset_index(drop=True)\n",
"\n",
"# Normalisation: lower-case the free-text columns.\n",
"for column in ['home_team', 'away_team', 'tournament', 'city', 'country']:\n",
"    results[column] = results[column].str.lower()\n",
"\n",
"# Split the data 6:2:2 (train : validation : test): first 60% / 40%,\n",
"# then the 40% remainder is halved.\n",
"train, test = train_test_split(results, test_size=1 - 0.6, random_state=SEED)\n",
"valid, test = train_test_split(test, test_size=0.5, random_state=SEED)\n",
"\n",
"# Report row counts; DataFrame.size would give rows * columns instead.\n",
"print(\"All data: \", len(results))\n",
"print(\"Train size: \", len(train))\n",
"print(\"Test size: \", len(test))\n",
"print(\"Validate size: \", len(valid))\n",
"print(results.describe(include='all'))\n",
"\n",
"# Sanity check: the three subsets together must cover the whole dataset.\n",
"print(len(train) + len(test) + len(valid))\n",
"assert len(train) + len(test) + len(valid) == len(results)\n",
"\n",
"# One bar chart per column, limited to the TOP_N most frequent values.\n",
"# Plotting through a DataFrame opens a new figure each time, whereas Series.plot\n",
"# reuses the current axes and would stack every column onto a single chart.\n",
"for col in results.columns:\n",
"    top_counts = results[col].value_counts().head(TOP_N)\n",
"    top_counts.to_frame().plot(kind=\"bar\", legend=False, title=col)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}