fantastyczne_gole/notebooks/xgboost_dla_xG.ipynb

364 KiB
Raw Blame History

Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
import xgboost
import time
from joblib import dump, load
import os

Load the data

# Read final_data.csv from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='final_data.csv')
# Show the column labels that were loaded (notebook cell output).
df.columns
Index(['minute', 'position_name', 'shot_body_part_name', 'shot_technique_name',
       'shot_type_name', 'shot_first_time', 'shot_one_on_one',
       'shot_aerial_won', 'shot_open_goal', 'shot_follows_dribble',
       'shot_redirect', 'x1', 'y1', 'number_of_players_opponents',
       'number_of_players_teammates', 'is_goal', 'angle', 'distance',
       'x_player_opponent_Goalkeeper', 'x_player_opponent_8',
       'x_player_opponent_1', 'x_player_opponent_2', 'x_player_opponent_3',
       'x_player_teammate_1', 'x_player_opponent_4', 'x_player_opponent_5',
       'x_player_opponent_6', 'x_player_teammate_2', 'x_player_opponent_9',
       'x_player_opponent_10', 'x_player_opponent_11', 'x_player_teammate_3',
       'x_player_teammate_4', 'x_player_teammate_5', 'x_player_teammate_6',
       'x_player_teammate_7', 'x_player_teammate_8', 'x_player_teammate_9',
       'x_player_teammate_10', 'y_player_opponent_Goalkeeper',
       'y_player_opponent_8', 'y_player_opponent_1', 'y_player_opponent_2',
       'y_player_opponent_3', 'y_player_teammate_1', 'y_player_opponent_4',
       'y_player_opponent_5', 'y_player_opponent_6', 'y_player_teammate_2',
       'y_player_opponent_9', 'y_player_opponent_10', 'y_player_opponent_11',
       'y_player_teammate_3', 'y_player_teammate_4', 'y_player_teammate_5',
       'y_player_teammate_6', 'y_player_teammate_7', 'y_player_teammate_8',
       'y_player_teammate_9', 'y_player_teammate_10', 'x_player_opponent_7',
       'y_player_opponent_7', 'x_player_teammate_Goalkeeper',
       'y_player_teammate_Goalkeeper'],
      dtype='object')
# Preview the first rows of the loaded frame (notebook cell output).
df.head()
minute position_name shot_body_part_name shot_technique_name shot_type_name shot_first_time shot_one_on_one shot_aerial_won shot_open_goal shot_follows_dribble ... y_player_teammate_5 y_player_teammate_6 y_player_teammate_7 y_player_teammate_8 y_player_teammate_9 y_player_teammate_10 x_player_opponent_7 y_player_opponent_7 x_player_teammate_Goalkeeper y_player_teammate_Goalkeeper
0 0 Right Center Forward Right Foot Normal Open Play False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 5 Right Center Forward Left Foot Normal Open Play False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 5 Center Midfield Right Foot Half Volley Open Play True False False False False ... 48.9 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 5 Left Center Midfield Right Foot Normal Open Play False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 5 Right Center Back Left Foot Normal Open Play True False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 64 columns

Data preparation

from sklearn.preprocessing import OrdinalEncoder

# Disabled experiment, kept for reference: cast the count-like columns to float.
# count_like = ['minute', 'number_of_players_opponents', 'number_of_players_teammates']
# df[count_like] = df[count_like].astype(float)

# Text-valued columns that are replaced by integer codes. Keep this order:
# the fitted encoder stores its per-column categories in the order it was
# fitted with, and the encoder is persisted for later reuse.
text_columns = [
    'position_name',
    'shot_technique_name',
    'shot_type_name',
    'shot_body_part_name',
]

# Fit a single encoder over all four columns and overwrite them with the codes.
enc = OrdinalEncoder()
df[text_columns] = enc.fit_transform(df[text_columns])

# Store the codes (and the minute) as plain integers.
integer_columns = ['minute', *text_columns]
df[integer_columns] = df[integer_columns].astype(int)

# Show the transformed frame (notebook cell output).
df
minute position_name shot_body_part_name shot_technique_name shot_type_name shot_first_time shot_one_on_one shot_aerial_won shot_open_goal shot_follows_dribble ... y_player_teammate_5 y_player_teammate_6 y_player_teammate_7 y_player_teammate_8 y_player_teammate_9 y_player_teammate_10 x_player_opponent_7 y_player_opponent_7 x_player_teammate_Goalkeeper y_player_teammate_Goalkeeper
0 0 18 3 4 3 False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 5 18 1 4 3 False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 5 4 3 2 3 True False False False False ... 48.9 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 5 10 3 4 3 False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 5 17 1 4 3 True False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
38337 61 3 3 4 3 True False False False False ... 47.3 42.6 54.7 NaN NaN NaN 21.3 50.9 NaN NaN
38338 66 0 3 4 3 True False False False False ... 47.6 39.4 43.1 NaN NaN NaN 19.0 45.8 NaN NaN
38339 73 3 1 6 3 True False False False False ... 48.9 48.1 41.1 NaN NaN NaN 21.7 29.6 NaN NaN
38340 75 13 1 4 3 False False False False False ... 29.1 33.6 40.9 NaN NaN NaN 21.2 32.4 NaN NaN
38341 90 3 3 4 3 False False False False False ... 62.6 51.0 66.7 NaN NaN NaN 23.0 45.5 NaN NaN

38342 rows × 64 columns

# Write the fitted encoder to disk. Note: despite the file name this is the
# OrdinalEncoder fitted above, not a LabelEncoder.
dump(value=enc, filename='labelEncoder.joblib')
['labelEncoder.joblib']
# Reload the encoder that was just written to disk. `enc2` is only referenced
# by the disabled snippets below.
enc2 = load('labelEncoder.joblib')

# --- Disabled: decode the four encoded columns back to their labels with the reloaded encoder ---
# df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']] = enc2.inverse_transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name',
#     'shot_body_part_name']])

# --- Disabled: re-encode the same columns with the reloaded encoder ---
# df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']] = enc2.transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']])

# --- Disabled: decode with the original (in-memory) encoder ---
# enc.inverse_transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name',
#     'shot_body_part_name']])

# --- Disabled alternative: one LabelEncoder per column instead of a single OrdinalEncoder ---
# ############### NEW ################
# from sklearn.preprocessing import LabelEncoder

# le_position_name = LabelEncoder()
# le_shot_technique_name = LabelEncoder()
# le_shot_type_name = LabelEncoder()
# le_shot_body_part_name = LabelEncoder()

# df['position_name'] = le_position_name.fit_transform(df['position_name'])
# df['shot_technique_name'] = le_shot_technique_name.fit_transform(df['shot_technique_name'])
# df['shot_type_name'] = le_shot_type_name.fit_transform(df['shot_type_name'])
# df['shot_body_part_name'] = le_shot_body_part_name.fit_transform(df['shot_body_part_name'])
# Give the discrete features the pandas 'category' dtype; the classifier
# below is created with enable_categorical=True and consumes them as such.
categorical_columns = [
    'minute',
    'position_name',
    'shot_technique_name',
    'shot_type_name',
    'number_of_players_opponents',
    'number_of_players_teammates',
    'shot_body_part_name',
]
df[categorical_columns] = df[categorical_columns].astype('category')
# Separate the predictors (X) from the binary target (y). y stays a
# one-column DataFrame because later cells index it by column name.
y = pd.DataFrame(df['is_goal'])
X = df.drop(['is_goal'], axis=1)

# Hold out 20% of the rows as a test set; the seed is fixed for reproducibility.
# NOTE(review): the split is not stratified on the (imbalanced) target --
# consider stratify=y; left unchanged here so existing results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Stratified, shuffled 5-fold splitter handed to the grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Count each class by its label. Unpacking value_counts() positionally would
# silently swap the two numbers if goals ever outnumbered non-goals, because
# value_counts() orders its result by frequency, not by label.
is_goal_train = y_train['is_goal'].astype(bool)
count_class_0 = (~is_goal_train).sum()
count_class_1 = is_goal_train.sum()

# count_class_0 is the number of shots that did NOT end in a goal (the total
# number of shots attempted is count_class_0 + count_class_1).
print('Shots without a goal in the training set:', count_class_0)

# Display the count of successful goals in the training set
print('Goals scored in the training set:', count_class_1)
Shots attempted in the training set: 27085
Goals scored in the training set: 3588
# Ratio of negative (no goal) to positive (goal) training examples. It is
# offered to the grid search as one candidate value for XGBoost's
# scale_pos_weight parameter.
scale_pos_weight = count_class_0 / count_class_1
print(f' Class imbalance in training data: {scale_pos_weight:.3f}')
 Class imbalance in training data: 7.549

Training XGBoost model

# Base estimator. enable_categorical=True with tree_method='hist' lets XGBoost
# consume the pandas 'category' columns directly.
xgb_model = xgboost.XGBClassifier(enable_categorical=True, tree_method='hist', objective='binary:logistic')

# Hyper-parameter grid. Only scale_pos_weight is actually searched:
# no re-weighting (1) versus the observed negative/positive ratio.
param_grid_xgb = {'learning_rate': [0.01],
                  'max_depth': [3],
                  'n_estimators': [300],
                  'scale_pos_weight': [1, scale_pos_weight]}

# perf_counter() is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()

# Select the candidate on the quality of its predicted probabilities.
# 'neg_mean_squared_error' would score the thresholded 0/1 output of
# predict(), i.e. the misclassification rate, which says little about an
# expected-goals (probability) model on imbalanced data. The Brier score is
# the mean squared error of predict_proba against the 0/1 outcome.
grid_xg = GridSearchCV(xgb_model, param_grid=param_grid_xgb, cv=cv, scoring='neg_brier_score', n_jobs=-1)

# Run the cross-validated search. The target is passed as a 1-D array, the
# shape scikit-learn estimators and scorers expect. With GridSearchCV's
# default refit=True the best configuration is then refitted on the whole
# training set.
grid_xg.fit(X_train, y_train.to_numpy().ravel())

# Estimator refitted with the best parameters found
best_xgb_model = grid_xg.best_estimator_

# Stopping the timer
stop_time = time.perf_counter()

# Wall-clock duration of search + refit, in seconds
xgb_training_time = stop_time - start_time

# Print the best parameters and training time
print("Best parameters: ", grid_xg.best_params_)
print(f"Model Training Time: {xgb_training_time:.3f} seconds")
Best parameters:  {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'scale_pos_weight': 1}
Model Training Time: 16.402 seconds

Model evaluation

Training set

# Class predictions of the tuned model for the training rows.
y_pred_train = best_xgb_model.predict(X_train)

# Actual-vs-predicted counts on the training data, drawn as an annotated heatmap.
cm_train_xg = confusion_matrix(y_train, y_pred_train)
ax = sns.heatmap(
    cm_train_xg,
    annot=True,
    cmap='BuPu',
    fmt='g',
    linewidth=1.5,
)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix - Train Set')
plt.show()

Test set

# Class predictions of the tuned model for the held-out test rows.
y_pred_test = best_xgb_model.predict(X_test)

# Actual-vs-predicted counts on the testing data, drawn as an annotated heatmap.
cm_test_xgb = confusion_matrix(y_test, y_pred_test)
ax = sns.heatmap(
    cm_test_xgb,
    annot=True,
    cmap='Blues',
    fmt='g',
    linewidth=1.5,
)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix - Test Set')
plt.show()

# Report the size of the test set and how many of its shots were goals.
print(f'The test dataset contains {len(y_test)} shots, with {y_test.sum()["is_goal"]} of them being goals.')
The test dataset contains 7669 shots, with 914 of them being goals.

Feature importance

# Draw one importance chart per importance type, each in its own figure:
# (importance type, x-axis label, number of top features to show).
for importance_kind, axis_label, top_n in (('gain', 'Gain', 20), ('weight', 'Weight', 30)):
    xgboost.plot_importance(
        best_xgb_model,
        importance_type=importance_kind,
        xlabel=axis_label,
        max_num_features=top_n,
    )
    plt.show()

Summary

# Score the predicted goal probabilities (the xG values) rather than the
# thresholded 0/1 labels from predict(): error metrics on hard labels only
# restate the misclassification rate and give an R2 close to zero.
# Column 1 of predict_proba is the probability of the positive class.
y_true_train = y_train['is_goal'].astype(float)
y_true_test = y_test['is_goal'].astype(float)
xg_train = best_xgb_model.predict_proba(X_train)[:, 1]
xg_test = best_xgb_model.predict_proba(X_test)[:, 1]

# MAE, RMSE and R2 for the training and test sets. RMSE is taken as the
# square root of MSE explicitly: the `squared=False` shortcut of
# mean_squared_error is deprecated since scikit-learn 1.4.
mae_train = mean_absolute_error(y_true_train, xg_train)
rmse_train = mean_squared_error(y_true_train, xg_train) ** 0.5
r2_train = r2_score(y_true_train, xg_train)

mae_test = mean_absolute_error(y_true_test, xg_test)
rmse_test = mean_squared_error(y_true_test, xg_test) ** 0.5
r2_test = r2_score(y_true_test, xg_test)

# One-row summary table, indexed by model name
summary_df = pd.DataFrame({'Model Name': ['XG Boost'],
                           'Training MAE': mae_train,
                           'Training RMSE': rmse_train,
                           'Training R2': r2_train,
                           'Testing MAE': mae_test,
                           'Testing RMSE': rmse_test,
                           'Testing R2': r2_test,
                           'Training Time (mins)': xgb_training_time / 60})
summary_df.set_index('Model Name', inplace=True)

# Display the table with 5-decimal formatting and the notebook's cell styling
summary_df.style.format(precision=5).set_properties(**{'font-weight': 'bold',
            'border': '2.0px solid grey', 'color': 'white'})
  Training MAE Training RMSE Training R2 Testing MAE Testing RMSE Testing R2 Training Time (mins)
Model Name              
XG Boost 0.10309 0.32107 0.00199 0.10549 0.32479 -0.00488 0.27337

Keeping the xgboost model

# Write the tuned model to 'xgboost.json' via XGBoost's own serializer.
best_xgb_model.save_model(fname='xgboost.json')

# List the estimator's parameters (notebook cell output).
best_xgb_model.get_params()
{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': True,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.01,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 3,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 300,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': 1,
 'subsample': None,
 'tree_method': 'hist',
 'validate_parameters': None,
 'verbosity': None}
# Disabled alternative: persist / restore the estimator with joblib instead of
# XGBoost's save_model / load_model.
# dump(best_xgb_model, 'xgboost.joblib') 

# # Load the model
# model = load('xgboost.joblib')

# Class probabilities for every row of X (training and test rows together).
# Shown as a reference to compare against the reloaded model's output.
best_xgb_model.predict_proba(X)
array([[0.9334918 , 0.06650815],
       [0.9150994 , 0.0849006 ],
       [0.9478227 , 0.05217729],
       ...,
       [0.77691543, 0.2230846 ],
       [0.9318629 , 0.06813709],
       [0.95634604, 0.04365399]], dtype=float32)
# Round-trip check: restore the saved model into a fresh classifier and
# predict on the same rows; the probabilities should match those of
# best_xgb_model.
new_xgb_model = xgboost.XGBClassifier()
new_xgb_model.load_model(fname='xgboost.json')
new_xgb_model.predict_proba(X)
array([[0.9334918 , 0.06650815],
       [0.9150994 , 0.0849006 ],
       [0.9478227 , 0.05217729],
       ...,
       [0.77691543, 0.2230846 ],
       [0.9318629 , 0.06813709],
       [0.95634604, 0.04365399]], dtype=float32)