{ "cells": [ { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: matches.zip\n", " creating: matches/\n", " inflating: matches/international_matches.csv \n" ] } ], "source": [ "!unzip matches.zip" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Projekt do przewidywania, wygranego zespołu w meczu na podstawie fifa points\n", "## Używamy datasetu:\n", "1. Wyniki meczów podczas fifa world cup (1993-2022)" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "# points = pd.read_csv('ranking/fifa_ranking-2022-10-06.csv')\n", "matches = pd.read_csv('matches/international_matches.csv')\n" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "# points = points[[\"country_full\", \"total_points\", \"previous_points\", \"rank_date\"]]\n", "matches = matches[[\"date\", \"home_team\", \"away_team\", \"home_team_fifa_rank\",\"away_team_fifa_rank\", \"home_team_score\", \"away_team_score\", \"home_team_result\"]]" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "# points" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datehome_teamaway_teamhome_team_fifa_rankaway_team_fifa_rankhome_team_scoreaway_team_scorehome_team_result
01993-08-08BoliviaUruguay592231Win
11993-08-08BrazilMexico81411Draw
21993-08-08EcuadorVenezuela359450Win
31993-08-08GuineaSierra Leone658610Win
41993-08-08ParaguayArgentina67513Lose
...........................
239162022-06-14MoldovaAndorra18015321Win
239172022-06-14LiechtensteinLatvia19213502Lose
239182022-06-14ChileGhana286000Lose
239192022-06-14JapanTunisia233503Lose
239202022-06-14Korea RepublicEgypt293241Win
\n", "

23921 rows × 8 columns

\n", "
" ], "text/plain": [ " date home_team away_team home_team_fifa_rank \\\n", "0 1993-08-08 Bolivia Uruguay 59 \n", "1 1993-08-08 Brazil Mexico 8 \n", "2 1993-08-08 Ecuador Venezuela 35 \n", "3 1993-08-08 Guinea Sierra Leone 65 \n", "4 1993-08-08 Paraguay Argentina 67 \n", "... ... ... ... ... \n", "23916 2022-06-14 Moldova Andorra 180 \n", "23917 2022-06-14 Liechtenstein Latvia 192 \n", "23918 2022-06-14 Chile Ghana 28 \n", "23919 2022-06-14 Japan Tunisia 23 \n", "23920 2022-06-14 Korea Republic Egypt 29 \n", "\n", " away_team_fifa_rank home_team_score away_team_score home_team_result \n", "0 22 3 1 Win \n", "1 14 1 1 Draw \n", "2 94 5 0 Win \n", "3 86 1 0 Win \n", "4 5 1 3 Lose \n", "... ... ... ... ... \n", "23916 153 2 1 Win \n", "23917 135 0 2 Lose \n", "23918 60 0 0 Lose \n", "23919 35 0 3 Lose \n", "23920 32 4 1 Win \n", "\n", "[23921 rows x 8 columns]" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matches" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Na początku zrobimy naiwne założenia, zespół który ma więcej fifa points wygrywa, jeśli mają tyle samo zakładamy remis" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.21809288909326532\n" ] } ], "source": [ "p_true = 0\n", "p_false = 0\n", "for i, m in matches.iterrows():\n", " if m[\"home_team_fifa_rank\"] > m[\"away_team_fifa_rank\"] and m[\"home_team_result\"] == \"Win\":\n", " p_true +=1\n", " elif m[\"home_team_fifa_rank\"] < m[\"away_team_fifa_rank\"] and m[\"home_team_result\"] == \"Lose\":\n", " p_true +=1\n", " elif m[\"home_team_fifa_rank\"] == m[\"away_team_fifa_rank\"] and m[\"home_team_result\"] == \"Draw\":\n", " p_true +=1\n", " else:\n", " p_false +=1\n", "print(\"Accuracy: \", p_true/(p_true + p_false))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Teraz za pomocą regresji logistycznej i fifa points będziemy przewidywać wygrany zespół" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "matches = matches[[\"home_team_fifa_rank\", \"away_team_fifa_rank\", \"home_team_result\"]]" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_26917/2591599491.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 2 if x == 'Win' else x)\n" ] } ], "source": [ "matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 2 if x == 'Win' else x)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_26917/1297296633.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 1 if x == 'Draw' else x)\n" ] } ], "source": [ "matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 1 if x == 'Draw' else x)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_26917/2245446894.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 0 if x == 'Lose' else x)\n" ] } ], "source": [ "matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 0 if x == 'Lose' else x)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
home_team_fifa_rankaway_team_fifa_rankhome_team_result
059222
18141
235942
365862
46750
............
239161801532
239171921350
2391828600
2391923350
2392029322
\n", "

23921 rows × 3 columns

\n", "
" ], "text/plain": [ " home_team_fifa_rank away_team_fifa_rank home_team_result\n", "0 59 22 2\n", "1 8 14 1\n", "2 35 94 2\n", "3 65 86 2\n", "4 67 5 0\n", "... ... ... ...\n", "23916 180 153 2\n", "23917 192 135 0\n", "23918 28 60 0\n", "23919 23 35 0\n", "23920 29 32 2\n", "\n", "[23921 rows x 3 columns]" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matches" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "\n", "X = matches[[\"home_team_fifa_rank\", \"away_team_fifa_rank\"]]\n", "Y = matches[\"home_team_result\"]\n", "data = np.array(matches)\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5674174829974418" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.datasets import load_iris\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "clf = LogisticRegression(random_state=1).fit(X_train, y_train)\n", "clf.score(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "y_pred = clf.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5744869521155308" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import accuracy_score\n", "accuracy_score(y_test, y_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dzięki zastostosowaniu regresji logistycznej uzyskaliśmy accuracy 57%, biorąc pod uwagę, że przewidywaliśmy 3 klasy wygraną, remis i przegraną nie jest to najgorszy wynik\n", "\n", "### Ranking fifa points wydaję się być dobry baselinem do modelów predykcyjnych" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.12 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 2 }