19 KiB
19 KiB
import pandas as pd
!unzip matches.zip
Archive: matches.zip creating: matches/ inflating: matches/international_matches.csv
Projekt do przewidywania zwycięskiej drużyny w meczu na podstawie pozycji drużyn w rankingu FIFA.
Używamy datasetu:
- Wyniki meczów międzynarodowych z datasetu FIFA World Cup (1993–2022)
# Columns of the match results file that the rest of the notebook works with.
# NOTE: the separate FIFA points file (ranking/fifa_ranking-2022-10-06.csv)
# is not loaded; only the rank columns of the matches file are used.
match_columns = [
    "date",
    "home_team",
    "away_team",
    "home_team_fifa_rank",
    "away_team_fifa_rank",
    "home_team_score",
    "away_team_score",
    "home_team_result",
]

# Read the full CSV, then narrow it down to the selected columns.
matches = pd.read_csv('matches/international_matches.csv')[match_columns]

# Display the resulting frame as the cell output.
matches
date | home_team | away_team | home_team_fifa_rank | away_team_fifa_rank | home_team_score | away_team_score | home_team_result | |
---|---|---|---|---|---|---|---|---|
0 | 1993-08-08 | Bolivia | Uruguay | 59 | 22 | 3 | 1 | Win |
1 | 1993-08-08 | Brazil | Mexico | 8 | 14 | 1 | 1 | Draw |
2 | 1993-08-08 | Ecuador | Venezuela | 35 | 94 | 5 | 0 | Win |
3 | 1993-08-08 | Guinea | Sierra Leone | 65 | 86 | 1 | 0 | Win |
4 | 1993-08-08 | Paraguay | Argentina | 67 | 5 | 1 | 3 | Lose |
... | ... | ... | ... | ... | ... | ... | ... | ... |
23916 | 2022-06-14 | Moldova | Andorra | 180 | 153 | 2 | 1 | Win |
23917 | 2022-06-14 | Liechtenstein | Latvia | 192 | 135 | 0 | 2 | Lose |
23918 | 2022-06-14 | Chile | Ghana | 28 | 60 | 0 | 0 | Lose |
23919 | 2022-06-14 | Japan | Tunisia | 23 | 35 | 0 | 3 | Lose |
23920 | 2022-06-14 | Korea Republic | Egypt | 29 | 32 | 4 | 1 | Win |
23921 rows × 8 columns
Na początku przyjmujemy naiwne założenie: wygrywa zespół, który jest wyżej w rankingu FIFA (ma mniejszy numer pozycji); jeśli oba zespoły mają tę samą pozycję, zakładamy remis.
# Naive baseline: predict the home team's result from the FIFA ranks alone.
# In the FIFA ranking a LOWER rank number means a STRONGER team (rank 1 is
# the best), so the home team is predicted to win when its rank number is
# smaller than the away team's, to lose when it is larger, and to draw when
# both rank numbers are equal.
home_rank = matches["home_team_fifa_rank"]
away_rank = matches["away_team_fifa_rank"]

naive_prediction = pd.Series("Draw", index=matches.index, dtype=object)
naive_prediction.loc[home_rank < away_rank] = "Win"
naive_prediction.loc[home_rank > away_rank] = "Lose"

# One vectorised comparison replaces the row-by-row iterrows() loop.
is_correct = naive_prediction == matches["home_team_result"]

# p_true / p_false keep their original meaning: number of correct and
# incorrect naive predictions.
p_true = int(is_correct.sum())
p_false = int(len(matches) - p_true)

# Guard against an empty frame so the cell cannot raise ZeroDivisionError.
total_matches = p_true + p_false
print("Accuracy: ", p_true / total_matches if total_matches else float("nan"))
Accuracy: 0.21809288909326532
Teraz za pomocą regresji logistycznej i pozycji w rankingu FIFA będziemy przewidywać wynik meczu z perspektywy gospodarzy (wygrana, remis lub porażka).
# Keep only the two model inputs and the label. The explicit .copy() makes
# `matches` an independent frame, so the column assignment below no longer
# triggers pandas' SettingWithCopyWarning.
matches = matches[
    ["home_team_fifa_rank", "away_team_fifa_rank", "home_team_result"]
].copy()

# Encode the home team's result as an integer class label in a single pass:
# Win -> 2, Draw -> 1, Lose -> 0. Any value that is not listed is passed
# through unchanged, exactly as the three separate apply() calls did.
result_codes = {"Win": 2, "Draw": 1, "Lose": 0}
matches["home_team_result"] = matches["home_team_result"].map(
    lambda result: result_codes.get(result, result)
)

# Display the encoded frame as the cell output.
matches
home_team_fifa_rank | away_team_fifa_rank | home_team_result | |
---|---|---|---|
0 | 59 | 22 | 2 |
1 | 8 | 14 | 1 |
2 | 35 | 94 | 2 |
3 | 65 | 86 | 2 |
4 | 67 | 5 | 0 |
... | ... | ... | ... |
23916 | 180 | 153 | 2 |
23917 | 192 | 135 | 0 |
23918 | 28 | 60 | 0 |
23919 | 23 | 35 | 0 |
23920 | 29 | 32 | 2 |
23921 rows × 3 columns
import numpy as np
from sklearn.model_selection import train_test_split
# Feature matrix: the FIFA rank of the home team and of the away team.
X = matches.loc[:, ["home_team_fifa_rank", "away_team_fifa_rank"]]
# Target vector: the home team's result column.
Y = matches.loc[:, "home_team_result"]

# Plain NumPy copy of the whole frame (not used by the lines below).
data = np.array(matches)

# Hold out 33% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
split_parts = train_test_split(X, Y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
# Build the classifier first, then fit it on the training split
# (fit() returns the estimator itself, so `clf` is the fitted model).
clf = LogisticRegression(random_state=1)
clf.fit(X_train, y_train)

# Mean accuracy on the training data, shown as the cell output.
clf.score(X_train, y_train)
0.5674174829974418
from sklearn.metrics import accuracy_score

# Predict the encoded home-team result for the held-out test rows.
y_pred = clf.predict(X_test)

# Fraction of test rows whose predicted label matches the true label,
# shown as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred)
0.5744869521155308