Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import precision_score, recall_score, accuracy_score
import time
Loading the data
df = pd.read_csv('data4.csv')
y = pd.DataFrame(df['isGoal'])
X = df.drop(['isGoal'], axis=1)
X.head()
match_minute | match_second | position_x | position_y | play_type | BodyPart | Number_Intervening_Opponents | Number_Intervening_Teammates | Interference_on_Shooter | outcome | ... | Interference_on_Shooter_Code | distance_to_goalM | distance_to_centerM | angle | isFoot | isHead | header_distance_to_goalM | High | Low | Medium | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 29 | 54 | 23.69 | 4.99 | Open Play | Left | 4 | 2 | Medium | Missed | ... | 2 | 24.212265 | 5.001769 | 11.922004 | 1 | 0 | 0.000000 | 0 | 0 | 1 |
1 | 11 | 33 | 28.93 | -11.22 | Open Play | Left | 4 | 1 | Low | Missed | ... | 1 | 31.039134 | 11.246462 | 21.243463 | 1 | 0 | 0.000000 | 0 | 1 | 0 |
2 | 61 | 25 | 9.98 | -5.24 | Open Play | Head | 3 | 1 | High | Missed | ... | 3 | 11.277751 | 5.252358 | 27.757313 | 0 | 1 | 11.277751 | 1 | 0 | 0 |
3 | 73 | 45 | 4.49 | -5.74 | Open Play | Right | 2 | 0 | Low | Missed | ... | 1 | 7.298171 | 5.753538 | 52.031899 | 1 | 0 | 0.000000 | 0 | 1 | 0 |
4 | 44 | 40 | 7.98 | -12.97 | Open Play | Right | 1 | 0 | Medium | Saved | ... | 2 | 15.254368 | 13.000590 | 58.457635 | 1 | 0 | 0.000000 | 0 | 0 | 1 |
5 rows × 29 columns
y.head()
isGoal | |
---|---|
0 | 0 |
1 | 0 |
2 | 0 |
3 | 0 |
4 | 0 |
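Before any modelling it helps to glance at the overall class balance; the notebook only counts classes on the training split later. A minimal sketch, not part of the original notebook:

print(y['isGoal'].value_counts())            # counts of misses (0) vs. goals (1)
print(f"Overall goal rate: {y['isGoal'].mean():.3f}")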
Data preparation
The following features were selected:
- the shooter's x coordinate,
- the shooter's y coordinate,
- distance to the goal,
- angle to the goal,
- match minute,
- number of opponents in front of the ball,
- number of teammates in front of the ball,
- body part used (foot/head).
X.columns
Index(['match_minute', 'match_second', 'position_x', 'position_y', 'play_type', 'BodyPart', 'Number_Intervening_Opponents', 'Number_Intervening_Teammates', 'Interference_on_Shooter', 'outcome', 'position_xM', 'position_yM', 'position_xM_r', 'position_yM_r', 'position_xM_std', 'position_yM_std', 'position_xM_std_r', 'position_yM_std_r', 'BodyPartCode', 'Interference_on_Shooter_Code', 'distance_to_goalM', 'distance_to_centerM', 'angle', 'isFoot', 'isHead', 'header_distance_to_goalM', 'High', 'Low', 'Medium'], dtype='object')
X_extracted = X[['position_x',
'position_y',
'distance_to_goalM',
'angle',
'match_minute',
'Number_Intervening_Opponents',
'Number_Intervening_Teammates',
'isFoot',
'isHead']].copy()  # .copy() avoids pandas' SettingWithCopyWarning on the assignments below
X_extracted['isFoot'] = X_extracted['isFoot'].astype('category')
X_extracted['isHead'] = X_extracted['isHead'].astype('category')
X_extracted.head()
position_x | position_y | distance_to_goalM | angle | match_minute | Number_Intervening_Opponents | Number_Intervening_Teammates | isFoot | isHead | |
---|---|---|---|---|---|---|---|---|---|
0 | 23.69 | 4.99 | 24.212265 | 11.922004 | 29 | 4 | 2 | 1 | 0 |
1 | 28.93 | -11.22 | 31.039134 | 21.243463 | 11 | 4 | 1 | 1 | 0 |
2 | 9.98 | -5.24 | 11.277751 | 27.757313 | 61 | 3 | 1 | 0 | 1 |
3 | 4.49 | -5.74 | 7.298171 | 52.031899 | 73 | 2 | 0 | 1 | 0 |
4 | 7.98 | -12.97 | 15.254368 | 58.457635 | 44 | 1 | 0 | 1 | 0 |
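A quick sanity check, not in the original notebook, to confirm that the two body-part flags were indeed converted to the categorical dtype that enable_categorical expects:

print(X_extracted.dtypes[['isFoot', 'isHead']])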
Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_extracted, y, test_size=0.2, random_state=1)
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
count_class_0, count_class_1 = y_train.value_counts()
print('Non-goal shots in the training set: ', count_class_0)
print('Goals in the training set: ', count_class_1)
Non-goal shots in the training set: 7226
Goals in the training set: 906
# Class-imbalance ratio in the training data (non-goal shots / goals)
scale_pos_weight = count_class_0 / count_class_1
scale_pos_weight
7.975717439293598
Model training
import xgboost as xgb
from xgboost import XGBClassifier
# Define the XGBoost model
xgb_model = XGBClassifier(enable_categorical=True, tree_method='hist', objective='binary:logistic')
# Define the hyperparameter grid for XGBoost
param_grid_xgb = {'learning_rate': [0.01, 0.001, 0.0001],
'max_depth': [3, 5, 7, 8, 9],
'n_estimators': [100, 150, 200, 250, 300],
'scale_pos_weight': [1, scale_pos_weight]}
start_time = time.time()
# Perform nested cross-validation with grid search
grid_xg = GridSearchCV(xgb_model, param_grid=param_grid_xgb, cv=cv_inner, scoring='f1', n_jobs=-1)
scores_xg = cross_val_score(grid_xg, X_train, y_train, cv=cv_outer, scoring='f1', n_jobs=-1)
# Fit the best model on the entire training set
grid_xg.fit(X_train, y_train)
best_xgb_model = grid_xg.best_estimator_
# Stopping the timer
stop_time = time.time()
# Training Time
xgb_training_time = stop_time - start_time
# Print the best parameters and training time
print("Best parameters: ", grid_xg.best_params_)
print (f"Model Training Time: {xgb_training_time:.3f} seconds")
Best parameters: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'scale_pos_weight': 7.975717439293598}
Model Training Time: 677.443 seconds
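The outer-loop scores from the nested cross-validation (scores_xg) are computed above but never reported. A short follow-up sketch to summarise them:

# Summarise the outer-loop F1 scores from the nested cross-validation
print(f"Nested CV F1: {scores_xg.mean():.3f} +/- {scores_xg.std():.3f}")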
Model evaluation
Training data
# Confusion Matrix for Training Data
cm_train_xg = confusion_matrix(y_train, best_xgb_model.predict(X_train))
ax = sns.heatmap(cm_train_xg, annot=True, cmap='BuPu', fmt='g', linewidths=1.5)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Train Set')
[Figure: confusion-matrix heatmap for the training set]
# Classification report for training data
print (classification_report(y_train, best_xgb_model.predict(X_train)))
              precision    recall  f1-score   support

           0       0.94      0.84      0.88      7226
           1       0.30      0.56      0.39       906

    accuracy                           0.81      8132
   macro avg       0.62      0.70      0.64      8132
weighted avg       0.87      0.81      0.83      8132
# xgb.to_graphviz(best_xgb_model, num_trees=1)
Test data
# Evaluate the performance of the best model on the testing set
y_pred_xgb = best_xgb_model.predict(X_test)
# Confusion Matrix for Testing Data
cm_test_xgb = confusion_matrix(y_test, y_pred_xgb)
ax = sns.heatmap(cm_test_xgb, annot=True, cmap='Blues', fmt='g', linewidths=1.5)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Test Set')
[Figure: confusion-matrix heatmap for the test set]
# Classification report for testing data
print (classification_report(y_test, y_pred_xgb))
              precision    recall  f1-score   support

           0       0.93      0.84      0.88      1797
           1       0.30      0.50      0.37       236

    accuracy                           0.80      2033
   macro avg       0.61      0.67      0.63      2033
weighted avg       0.85      0.80      0.82      2033
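With a positive class this rare, the default 0.5 decision threshold is not necessarily the best operating point. A sketch, not part of the original notebook, that inspects the precision/recall trade-off across thresholds using the predicted probabilities:

from sklearn.metrics import precision_recall_curve

proba_test = best_xgb_model.predict_proba(X_test)[:, 1]
prec, rec, thr = precision_recall_curve(y_test['isGoal'], proba_test)

# precision/recall arrays have one more element than thresholds, hence [:-1]
plt.plot(thr, prec[:-1], label='precision')
plt.plot(thr, rec[:-1], label='recall')
plt.xlabel('Decision threshold')
plt.ylabel('Score')
plt.legend()
plt.show()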
print(f'The test set contains {len(y_test)} shots, of which {y_test.sum()["isGoal"]} were goals.')
print(f'The accuracy of classifying whether a shot is a goal or not is {best_xgb_model.score(X_test, y_test):.2f}.')
print(f'The classifier achieved a ROC-AUC of {roc_auc_score(y_test, best_xgb_model.predict_proba(X_test)[:, 1]):.2f}.')
The test set contains 2033 shots, of which 236 were goals.
The accuracy of classifying whether a shot is a goal or not is 0.80.
The classifier achieved a ROC-AUC of 0.75.
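Since this is effectively an expected-goals (xG) model, the per-shot probabilities from predict_proba are often more useful than the hard 0/1 predictions. A minimal sketch (the column name 'xG' is my own choice, not from the notebook):

xg_test = X_test.copy()
xg_test['xG'] = best_xgb_model.predict_proba(X_test)[:, 1]   # predicted goal probability
xg_test['isGoal'] = y_test['isGoal'].values                  # actual outcome
print(xg_test[['distance_to_goalM', 'angle', 'xG', 'isGoal']].head())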
# Plot feature importance
xgb.plot_importance(best_xgb_model)
plt.show()
xgb.plot_importance(best_xgb_model, importance_type='gain', xlabel='Gain')
plt.show()
xgb.plot_importance(best_xgb_model, importance_type='weight', xlabel='Weight')
plt.show()
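For a tabular view instead of the three plots, the gain-based importances can also be pulled directly from the booster; a sketch using the feature names of X_extracted:

# Tabulate gain-based feature importances, largest first
gain = best_xgb_model.get_booster().get_score(importance_type='gain')
print(pd.Series(gain, name='gain').sort_values(ascending=False))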
Summary
prec_xgb_train = precision_score(y_train, best_xgb_model.predict(X_train))
prec_xgb_test = precision_score(y_test, y_pred_xgb)
rec_xgb_train = recall_score(y_train, best_xgb_model.predict(X_train))
rec_xgb_test = recall_score(y_test, y_pred_xgb)
acc_xgb_train = accuracy_score(y_train, best_xgb_model.predict(X_train))
acc_xgb_test = accuracy_score(y_test, y_pred_xgb)
train_time = xgb_training_time/60
# Create a DataFrame of summary results
summary_df = pd.DataFrame({'Model Name':['XG Boost'],
'Training Accuracy': acc_xgb_train,
'Training Precision': prec_xgb_train,
'Training Recall':rec_xgb_train,
'Testing Accuracy': acc_xgb_test,
'Testing Precision': prec_xgb_test,
'Testing Recall':rec_xgb_test,
'Training Time (mins)': train_time})
summary_df.set_index('Model Name', inplace=True)
# Displaying summary of results
summary_df.style.format(precision=3).set_properties(**{'font-weight': 'bold',
'border': '2.0px solid grey', 'color': 'white'})
Model Name | Training Accuracy | Training Precision | Training Recall | Testing Accuracy | Testing Precision | Testing Recall | Training Time (mins)
---|---|---|---|---|---|---|---
XG Boost | 0.806 | 0.300 | 0.556 | 0.803 | 0.296 | 0.504 | 11.291
Saving the model
from joblib import dump
dump(best_xgb_model, 'xgboost.joblib')
['xgboost.joblib']
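joblib works, but it ties the saved artifact to the pickled class layout of the installed libraries. XGBoost also provides its own version-portable serialization; a sketch with a hypothetical filename:

# Native XGBoost serialization (JSON), portable across XGBoost versions
best_xgb_model.save_model('xgboost_model.json')   # hypothetical filename

restored = XGBClassifier()
restored.load_model('xgboost_model.json')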
Loading the model
from joblib import load
model2 = load('xgboost.joblib')
model2.get_params()
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': True, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.001, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 3, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 7.975717439293598, 'subsample': None, 'tree_method': 'hist', 'validate_parameters': None, 'verbosity': None}
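A quick sanity check, not in the original notebook, that the reloaded model reproduces the predictions of the model it was saved from:

# The joblib round trip should preserve the fitted model exactly
assert np.array_equal(model2.predict(X_test), y_pred_xgb)
print('Reloaded model reproduces the original test-set predictions.')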