log regression using fifa points

This commit is contained in:
Mikołaj Pokrywka 2022-12-19 19:12:11 +01:00
commit 4ad691fa07
3 changed files with 637 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
matches
ranking

BIN
matches.zip Normal file

Binary file not shown.

635
predict.ipynb Normal file
View File

@ -0,0 +1,635 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: matches.zip\n",
" creating: matches/\n",
" inflating: matches/international_matches.csv \n"
]
}
],
"source": [
"!unzip matches.zip"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Używamy datasetu:\n",
"1. Wyniki meczów podczas fifa world cup (1993-2022)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"# points = pd.read_csv('ranking/fifa_ranking-2022-10-06.csv')\n",
"matches = pd.read_csv('matches/international_matches.csv')\n"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"# points = points[[\"country_full\", \"total_points\", \"previous_points\", \"rank_date\"]]\n",
"matches = matches[[\"date\", \"home_team\", \"away_team\", \"home_team_fifa_rank\",\"away_team_fifa_rank\", \"home_team_score\", \"away_team_score\", \"home_team_result\"]]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"# points"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>home_team</th>\n",
" <th>away_team</th>\n",
" <th>home_team_fifa_rank</th>\n",
" <th>away_team_fifa_rank</th>\n",
" <th>home_team_score</th>\n",
" <th>away_team_score</th>\n",
" <th>home_team_result</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1993-08-08</td>\n",
" <td>Bolivia</td>\n",
" <td>Uruguay</td>\n",
" <td>59</td>\n",
" <td>22</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Win</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1993-08-08</td>\n",
" <td>Brazil</td>\n",
" <td>Mexico</td>\n",
" <td>8</td>\n",
" <td>14</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Draw</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1993-08-08</td>\n",
" <td>Ecuador</td>\n",
" <td>Venezuela</td>\n",
" <td>35</td>\n",
" <td>94</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>Win</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1993-08-08</td>\n",
" <td>Guinea</td>\n",
" <td>Sierra Leone</td>\n",
" <td>65</td>\n",
" <td>86</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>Win</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1993-08-08</td>\n",
" <td>Paraguay</td>\n",
" <td>Argentina</td>\n",
" <td>67</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Lose</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23916</th>\n",
" <td>2022-06-14</td>\n",
" <td>Moldova</td>\n",
" <td>Andorra</td>\n",
" <td>180</td>\n",
" <td>153</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Win</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23917</th>\n",
" <td>2022-06-14</td>\n",
" <td>Liechtenstein</td>\n",
" <td>Latvia</td>\n",
" <td>192</td>\n",
" <td>135</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>Lose</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23918</th>\n",
" <td>2022-06-14</td>\n",
" <td>Chile</td>\n",
" <td>Ghana</td>\n",
" <td>28</td>\n",
" <td>60</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>Lose</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23919</th>\n",
" <td>2022-06-14</td>\n",
" <td>Japan</td>\n",
" <td>Tunisia</td>\n",
" <td>23</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Lose</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23920</th>\n",
" <td>2022-06-14</td>\n",
" <td>Korea Republic</td>\n",
" <td>Egypt</td>\n",
" <td>29</td>\n",
" <td>32</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>Win</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>23921 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" date home_team away_team home_team_fifa_rank \\\n",
"0 1993-08-08 Bolivia Uruguay 59 \n",
"1 1993-08-08 Brazil Mexico 8 \n",
"2 1993-08-08 Ecuador Venezuela 35 \n",
"3 1993-08-08 Guinea Sierra Leone 65 \n",
"4 1993-08-08 Paraguay Argentina 67 \n",
"... ... ... ... ... \n",
"23916 2022-06-14 Moldova Andorra 180 \n",
"23917 2022-06-14 Liechtenstein Latvia 192 \n",
"23918 2022-06-14 Chile Ghana 28 \n",
"23919 2022-06-14 Japan Tunisia 23 \n",
"23920 2022-06-14 Korea Republic Egypt 29 \n",
"\n",
" away_team_fifa_rank home_team_score away_team_score home_team_result \n",
"0 22 3 1 Win \n",
"1 14 1 1 Draw \n",
"2 94 5 0 Win \n",
"3 86 1 0 Win \n",
"4 5 1 3 Lose \n",
"... ... ... ... ... \n",
"23916 153 2 1 Win \n",
"23917 135 0 2 Lose \n",
"23918 60 0 0 Lose \n",
"23919 35 0 3 Lose \n",
"23920 32 4 1 Win \n",
"\n",
"[23921 rows x 8 columns]"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matches"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Na początku zrobimy naiwne założenie, że zespoł, który ma więszy ranking fifa to zakładamy, że wygra"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.21809288909326532\n"
]
}
],
"source": [
"p_true = 0\n",
"p_false = 0\n",
"for i, m in matches.iterrows():\n",
" if m[\"home_team_fifa_rank\"] > m[\"away_team_fifa_rank\"] and m[\"home_team_result\"] == \"Win\":\n",
" p_true +=1\n",
" elif m[\"home_team_fifa_rank\"] < m[\"away_team_fifa_rank\"] and m[\"home_team_result\"] == \"Lose\":\n",
" p_true +=1\n",
" elif m[\"home_team_fifa_rank\"] == m[\"away_team_fifa_rank\"] and m[\"home_team_result\"] == \"Draw\":\n",
" p_true +=1\n",
" else:\n",
" p_false +=1\n",
"print(\"Accuracy: \", p_true/(p_true + p_false))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Teraz za pomocą regresji logistycznej na podstawie home_team_fifa_rank\taway_team_fifa_rank\t"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"matches = matches[[\"home_team_fifa_rank\", \"away_team_fifa_rank\", \"home_team_result\"]]"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_26917/2591599491.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 2 if x == 'Win' else x)\n"
]
}
],
"source": [
"matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 2 if x == 'Win' else x)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_26917/1297296633.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 1 if x == 'Draw' else x)\n"
]
}
],
"source": [
"matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 1 if x == 'Draw' else x)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_26917/2245446894.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 0 if x == 'Lose' else x)\n"
]
}
],
"source": [
"matches[\"home_team_result\"] = matches[\"home_team_result\"].apply(lambda x: 0 if x == 'Lose' else x)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>home_team_fifa_rank</th>\n",
" <th>away_team_fifa_rank</th>\n",
" <th>home_team_result</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>59</td>\n",
" <td>22</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8</td>\n",
" <td>14</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>35</td>\n",
" <td>94</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>65</td>\n",
" <td>86</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>67</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23916</th>\n",
" <td>180</td>\n",
" <td>153</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23917</th>\n",
" <td>192</td>\n",
" <td>135</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23918</th>\n",
" <td>28</td>\n",
" <td>60</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23919</th>\n",
" <td>23</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23920</th>\n",
" <td>29</td>\n",
" <td>32</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>23921 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" home_team_fifa_rank away_team_fifa_rank home_team_result\n",
"0 59 22 2\n",
"1 8 14 1\n",
"2 35 94 2\n",
"3 65 86 2\n",
"4 67 5 0\n",
"... ... ... ...\n",
"23916 180 153 2\n",
"23917 192 135 0\n",
"23918 28 60 0\n",
"23919 23 35 0\n",
"23920 29 32 2\n",
"\n",
"[23921 rows x 3 columns]"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matches"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X = matches[[\"home_team_fifa_rank\", \"away_team_fifa_rank\"]]\n",
"Y = matches[\"home_team_result\"]\n",
"data = np.array(matches)\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5674174829974418"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.datasets import load_iris\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"clf = LogisticRegression(random_state=1).fit(X_train, y_train)\n",
"clf.score(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"y_pred = clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5744869521155308"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"accuracy_score(y_test, y_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dzięki zastostosowaniu regresji logistycznej uzyskaliśmy accuracy 57%, biorąc pod uwagę, że przewidywaliśmy 3 klasy wygraną, remis i przegraną nie jest to najgorszy wynik\n",
"\n",
"### Ranking fifa points wydaję się być dobry baselinem do modelów predykcyjnych"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8.12 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}