fantastyczne_gole/notebooks/xgboost_dla_xG.ipynb


Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import precision_score, recall_score, accuracy_score
import time

Loading the data

df = pd.read_csv('data4.csv')
y = pd.DataFrame(df['isGoal'])
X = df.drop(['isGoal'], axis=1)
X.head()
match_minute match_second position_x position_y play_type BodyPart Number_Intervening_Opponents Number_Intervening_Teammates Interference_on_Shooter outcome ... Interference_on_Shooter_Code distance_to_goalM distance_to_centerM angle isFoot isHead header_distance_to_goalM High Low Medium
0 29 54 23.69 4.99 Open Play Left 4 2 Medium Missed ... 2 24.212265 5.001769 11.922004 1 0 0.000000 0 0 1
1 11 33 28.93 -11.22 Open Play Left 4 1 Low Missed ... 1 31.039134 11.246462 21.243463 1 0 0.000000 0 1 0
2 61 25 9.98 -5.24 Open Play Head 3 1 High Missed ... 3 11.277751 5.252358 27.757313 0 1 11.277751 1 0 0
3 73 45 4.49 -5.74 Open Play Right 2 0 Low Missed ... 1 7.298171 5.753538 52.031899 1 0 0.000000 0 1 0
4 44 40 7.98 -12.97 Open Play Right 1 0 Medium Saved ... 2 15.254368 13.000590 58.457635 1 0 0.000000 0 0 1

5 rows × 29 columns

y.head()
isGoal
0 0
1 0
2 0
3 0
4 0

Data preparation

Selected features included in the model (a sketch of how the derived distance and angle could be computed from the coordinates follows this list):

  • the shooter's x coordinate,
  • the shooter's y coordinate,
  • distance to the goal,
  • angle to the goal,
  • match minute,
  • number of opposing players in front of the ball,
  • number of teammates in front of the ball,
  • body part used for the shot.
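The exact preprocessing that produced distance_to_goalM and angle is not shown in this notebook. The sketch below is an assumption that reproduces values close to the X.head() preview: distance as the Euclidean distance from the shot location to the goal centre, and angle as the deviation (in degrees) from shooting straight at the goal. Small differences come from unit conversions apparently applied upstream to the y coordinate.

import numpy as np

def shot_geometry(x, y):
    # Euclidean distance from the shot location to the centre of the goal (origin)
    distance = np.sqrt(x ** 2 + y ** 2)
    # Deviation (in degrees) from shooting straight at the goal along the x axis
    angle = np.degrees(np.arctan2(np.abs(y), x))
    return distance, angle

print(shot_geometry(4.49, -5.74))   # roughly (7.29, 52.0); the preview above shows 7.298 and 52.03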
X.columns
Index(['match_minute', 'match_second', 'position_x', 'position_y', 'play_type',
       'BodyPart', 'Number_Intervening_Opponents',
       'Number_Intervening_Teammates', 'Interference_on_Shooter', 'outcome',
       'position_xM', 'position_yM', 'position_xM_r', 'position_yM_r',
       'position_xM_std', 'position_yM_std', 'position_xM_std_r',
       'position_yM_std_r', 'BodyPartCode', 'Interference_on_Shooter_Code',
       'distance_to_goalM', 'distance_to_centerM', 'angle', 'isFoot', 'isHead',
       'header_distance_to_goalM', 'High', 'Low', 'Medium'],
      dtype='object')
X_extracted = X[['position_x', 
                 'position_y',
                 'distance_to_goalM', 
                 'angle', 
                 'match_minute', 
                 'Number_Intervening_Opponents', 
                 'Number_Intervening_Teammates', 
                 'isFoot', 
                 'isHead']].copy()  # .copy() avoids SettingWithCopyWarning on the assignments below
X_extracted['isFoot'] = X_extracted['isFoot'].astype('category')
X_extracted['isHead'] = X_extracted['isHead'].astype('category')
X_extracted.head()
position_x position_y distance_to_goalM angle match_minute Number_Intervening_Opponents Number_Intervening_Teammates isFoot isHead
0 23.69 4.99 24.212265 11.922004 29 4 2 1 0
1 28.93 -11.22 31.039134 21.243463 11 4 1 1 0
2 9.98 -5.24 11.277751 27.757313 61 3 1 0 1
3 4.49 -5.74 7.298171 52.031899 73 2 0 1 0
4 7.98 -12.97 15.254368 58.457635 44 1 0 1 0

Splitting the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X_extracted, y, test_size=0.2, random_state=1)
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
count_class_0, count_class_1 = y_train['isGoal'].value_counts().sort_index()  # class 0 first, regardless of frequency
print('Shots without a goal in the training set: ', count_class_0)
print('Goals in the training set: ', count_class_1)
Shots without a goal in the training set:  7226
Goals in the training set:  906
# Class imbalance in training data

scale_pos_weight = count_class_0 / count_class_1
scale_pos_weight
7.975717439293598
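scale_pos_weight is the ratio of negative to positive examples in the training set; XGBoost multiplies the loss contribution of the positive (goal) class by this value to compensate for the imbalance. An order-independent equivalent of the computation above (a small sketch, not in the original notebook):

neg = int((y_train['isGoal'] == 0).sum())
pos = int((y_train['isGoal'] == 1).sum())
assert np.isclose(neg / pos, scale_pos_weight)   # about 7.98 non-goal shots per goal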

Model training

from xgboost import XGBClassifier
# Define the xgboost model
xgb_model = XGBClassifier(enable_categorical=True, tree_method='hist', objective='binary:logistic')
# Defining the hyper-parameter grid for XG Boost
param_grid_xgb = {'learning_rate': [0.01, 0.001, 0.0001],
              'max_depth': [3, 5, 7, 8, 9],
              'n_estimators': [100, 150, 200, 250, 300],
              'scale_pos_weight': [1, scale_pos_weight]}
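# A rough count of the work this nested search implies (not in the original notebook):
# 3 learning rates x 5 depths x 5 n_estimators x 2 scale_pos_weight values = 150 combinations,
# each fitted on 5 inner folds; the outer 5-fold loop below then repeats the whole search,
# which explains the ~11-minute training time reported further down.
n_param_combinations = 3 * 5 * 5 * 2
fits_per_grid_search = n_param_combinations * 5           # inner CV folds
approx_total_fits = fits_per_grid_search * (5 + 1)        # 5 outer folds + the final grid_xg.fit
print(n_param_combinations, fits_per_grid_search, approx_total_fits)   # 150, 750, 4500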
start_time = time.time()
# Perform nested cross-validation with grid search

grid_xg = GridSearchCV(xgb_model, param_grid=param_grid_xgb, cv=cv_inner, scoring='f1', n_jobs=-1)
scores_xg = cross_val_score(grid_xg, X_train, y_train, cv=cv_outer, scoring='f1', n_jobs=-1)
# Fit the best model on the entire training set
grid_xg.fit(X_train, y_train)
best_xgb_model = grid_xg.best_estimator_
# Stopping the timer
stop_time = time.time()

# Training Time
xgb_training_time = stop_time - start_time
# Print the best parameters and training time
print("Best parameters: ", grid_xg.best_params_)
print (f"Model Training Time: {xgb_training_time:.3f} seconds")
Best parameters:  {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'scale_pos_weight': 7.975717439293598}
Model Training Time: 677.443 seconds
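scores_xg from the outer cross-validation loop is computed above but never reported. A short sketch to summarise it (assuming the training cell above has been run):

print(f"Nested CV F1: {scores_xg.mean():.3f} +/- {scores_xg.std():.3f}")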

Model evaluation

Training data

# Confusion Matrix for Training Data
cm_train_xg = confusion_matrix(y_train, best_xgb_model.predict(X_train))

ax = sns.heatmap(cm_train_xg, annot=True, cmap='BuPu', fmt='g', linewidth=1.5)

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Train Set')
Text(0.5, 1.0, 'Confusion Matrix - Train Set')
# Classification report for training data
print (classification_report(y_train, best_xgb_model.predict(X_train)))
              precision    recall  f1-score   support

           0       0.94      0.84      0.88      7226
           1       0.30      0.56      0.39       906

    accuracy                           0.81      8132
   macro avg       0.62      0.70      0.64      8132
weighted avg       0.87      0.81      0.83      8132

# xgb.to_graphviz(best_xgb_model, num_trees=1)

Test data

# Evaluate the performance of the best model on the testing set
y_pred_xgb = best_xgb_model.predict(X_test)

# Confusion Matrix for Testing Data
cm_test_xgb = confusion_matrix(y_test, y_pred_xgb)

ax = sns.heatmap(cm_test_xgb, annot=True, cmap='Blues', fmt='g', linewidth=1.5)

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Test Set')
Text(0.5, 1.0, 'Confusion Matrix - Test Set')
# Classification report for testing data
print (classification_report(y_test, y_pred_xgb))
              precision    recall  f1-score   support

           0       0.93      0.84      0.88      1797
           1       0.30      0.50      0.37       236

    accuracy                           0.80      2033
   macro avg       0.61      0.67      0.63      2033
weighted avg       0.85      0.80      0.82      2033

print(f'The test set contains {len(y_test)} shots, of which {y_test.sum()["isGoal"]} were goals.')
print(f'Classification accuracy (goal vs. no goal) is {best_xgb_model.score(X_test, y_test):.2f}.')
print(f'The classifier achieved a ROC-AUC of {roc_auc_score(y_test, best_xgb_model.predict_proba(X_test)[:, 1]):.2f}.')
The test set contains 2033 shots, of which 236 were goals.
Classification accuracy (goal vs. no goal) is 0.80.
The classifier achieved a ROC-AUC of 0.75.
# Plot feature importance
import xgboost as xgb  # module-level import needed for plot_importance (only XGBClassifier was imported above)
xgb.plot_importance(best_xgb_model)
plt.show()
xgb.plot_importance(best_xgb_model, importance_type='gain', xlabel='Gain')
plt.show()
xgb.plot_importance(best_xgb_model, importance_type='weight', xlabel='Weight')
plt.show()
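Since the model is meant as an xG estimator, the quantity of interest is predict_proba rather than the hard class label. As an additional, illustrative sanity check (not in the original notebook), the per-shot probabilities can be summed over the test set and compared with the 236 actual goals. Note that with scale_pos_weight applied the predicted probabilities are inflated, so this total will typically overshoot and the model would need calibration to yield well-scaled xG values.

xg_test = best_xgb_model.predict_proba(X_test)[:, 1]    # per-shot expected-goal estimates
print(f"Sum of predicted xG on the test set: {xg_test.sum():.1f}")
print(f"Actual goals on the test set:        {int(y_test['isGoal'].sum())}")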

Summary

prec_xgb_train = precision_score(y_train, best_xgb_model.predict(X_train))
prec_xgb_test = precision_score(y_test, y_pred_xgb)
rec_xgb_train = recall_score(y_train, best_xgb_model.predict(X_train))
rec_xgb_test = recall_score(y_test, y_pred_xgb)
acc_xgb_train = accuracy_score(y_train, best_xgb_model.predict(X_train))
acc_xgb_test = accuracy_score(y_test, y_pred_xgb)
train_time = xgb_training_time/60
# Create a dataframe summarising the results
summary_df = pd.DataFrame({'Model Name':['XG Boost'],
                          'Training Accuracy': acc_xgb_train, 
                          'Training Precision': prec_xgb_train,
                          'Training Recall':rec_xgb_train,
                          'Testing Accuracy': acc_xgb_test, 
                          'Testing Precision': prec_xgb_test,
                          'Testing Recall':rec_xgb_test,
                          'Training Time (mins)': train_time})

summary_df.set_index('Model Name', inplace=True)
# Displaying summary of results
summary_df.style.format(precision =3).set_properties(**{'font-weight': 'bold',
            'border': '2.0px solid grey','color': 'white'})
  Training Accuracy Training Precision Training Recall Testing Accuracy Testing Precision Testing Recall Training Time (mins)
Model Name              
XG Boost 0.806 0.300 0.556 0.803 0.296 0.504 11.291

Saving the model

from joblib import dump
dump(best_xgb_model, 'xgboost.joblib') 
['xgboost.joblib']

Loading the model

from joblib import load

model2 = load('xgboost.joblib')
model2.get_params()
{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': True,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.001,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 3,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': 7.975717439293598,
 'subsample': None,
 'tree_method': 'hist',
 'validate_parameters': None,
 'verbosity': None}
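
Finally, a minimal usage sketch (not part of the original notebook) showing how the reloaded model could score a single shot. The feature values below are made up for illustration; the column names, order, and category dtypes must match those used during training.

new_shot = pd.DataFrame({'position_x': [10.0],
                         'position_y': [-3.0],
                         'distance_to_goalM': [10.44],
                         'angle': [16.7],
                         'match_minute': [55],
                         'Number_Intervening_Opponents': [2],
                         'Number_Intervening_Teammates': [1],
                         'isFoot': [1],
                         'isHead': [0]})
# Declare both categories explicitly so the category codes line up with the training data
cat_01 = pd.CategoricalDtype(categories=[0, 1])
new_shot['isFoot'] = new_shot['isFoot'].astype(cat_01)
new_shot['isHead'] = new_shot['isHead'].astype(cat_01)
print(f"Estimated xG for this shot: {model2.predict_proba(new_shot)[0, 1]:.3f}")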