fantastyczne_gole/notebooks/xgboost_dla_xG.ipynb

Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
import xgboost
import time
from joblib import dump, load
import os

Load the data

df = pd.read_csv('final_data_new.csv')
df.columns
Index(['minute', 'position_name', 'shot_body_part_name', 'shot_technique_name',
       'shot_type_name', 'shot_first_time', 'shot_one_on_one',
       'shot_aerial_won', 'shot_open_goal', 'shot_follows_dribble',
       'shot_redirect', 'x1', 'y1', 'number_of_players_opponents',
       'number_of_players_teammates', 'is_goal', 'angle', 'distance',
       'x_player_opponent_Goalkeeper', 'x_player_opponent_8',
       'x_player_opponent_1', 'x_player_opponent_2', 'x_player_opponent_3',
       'x_player_teammate_1', 'x_player_opponent_4', 'x_player_opponent_5',
       'x_player_opponent_6', 'x_player_teammate_2', 'x_player_opponent_9',
       'x_player_opponent_10', 'x_player_opponent_11', 'x_player_teammate_3',
       'x_player_teammate_4', 'x_player_teammate_5', 'x_player_teammate_6',
       'x_player_teammate_7', 'x_player_teammate_8', 'x_player_teammate_9',
       'x_player_teammate_10', 'y_player_opponent_Goalkeeper',
       'y_player_opponent_8', 'y_player_opponent_1', 'y_player_opponent_2',
       'y_player_opponent_3', 'y_player_teammate_1', 'y_player_opponent_4',
       'y_player_opponent_5', 'y_player_opponent_6', 'y_player_teammate_2',
       'y_player_opponent_9', 'y_player_opponent_10', 'y_player_opponent_11',
       'y_player_teammate_3', 'y_player_teammate_4', 'y_player_teammate_5',
       'y_player_teammate_6', 'y_player_teammate_7', 'y_player_teammate_8',
       'y_player_teammate_9', 'y_player_teammate_10', 'x_player_opponent_7',
       'y_player_opponent_7', 'x_player_teammate_Goalkeeper',
       'y_player_teammate_Goalkeeper'],
      dtype='object')
df.head()
minute position_name shot_body_part_name shot_technique_name shot_type_name shot_first_time shot_one_on_one shot_aerial_won shot_open_goal shot_follows_dribble ... y_player_teammate_5 y_player_teammate_6 y_player_teammate_7 y_player_teammate_8 y_player_teammate_9 y_player_teammate_10 x_player_opponent_7 y_player_opponent_7 x_player_teammate_Goalkeeper y_player_teammate_Goalkeeper
0 2 Left Center Forward Right Foot Half Volley Open Play True False False False False ... NaN NaN NaN NaN NaN NaN 26.6 53.1 NaN NaN
1 5 Left Back Left Foot Volley Open Play True False False False False ... 20.6 32.8 NaN NaN NaN NaN 23.8 31.2 NaN NaN
2 15 Left Center Forward Left Foot Normal Open Play False False False False False ... 29.0 NaN NaN NaN NaN NaN 29.6 55.3 NaN NaN
3 16 Center Forward Head Normal Open Play False False True False False ... NaN NaN NaN NaN NaN NaN 26.7 60.4 NaN NaN
4 18 Right Center Forward Right Foot Normal Open Play False False False False False ... 27.9 31.4 33.4 NaN NaN NaN 16.9 40.1 NaN NaN

5 rows × 64 columns

Data preparation

# df[['minute', 
#     'number_of_players_opponents', 
#     'number_of_players_teammates']] = df[['minute', 
#                                           'number_of_players_opponents', 
#                                           'number_of_players_teammates']].astype(float)
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

df[['position_name', 
    'shot_technique_name', 
    'shot_type_name', 
    'shot_body_part_name']] = enc.fit_transform(df[['position_name', 
                                  'shot_technique_name', 
                                  'shot_type_name',
                                  'shot_body_part_name']])
df[['minute', 
    'position_name', 
    'shot_technique_name', 
    'shot_type_name', 
    'shot_body_part_name']] = df[['minute', 
        'position_name', 
    'shot_technique_name', 
    'shot_type_name', 
    'shot_body_part_name']].astype(int)
df
minute position_name shot_body_part_name shot_technique_name shot_type_name shot_first_time shot_one_on_one shot_aerial_won shot_open_goal shot_follows_dribble ... y_player_teammate_5 y_player_teammate_6 y_player_teammate_7 y_player_teammate_8 y_player_teammate_9 y_player_teammate_10 x_player_opponent_7 y_player_opponent_7 x_player_teammate_Goalkeeper y_player_teammate_Goalkeeper
0 2 9 3 2 3 True False False False False ... NaN NaN NaN NaN NaN NaN 26.6 53.1 NaN NaN
1 5 7 1 6 3 True False False False False ... 20.6 32.8 NaN NaN NaN NaN 23.8 31.2 NaN NaN
2 15 9 1 4 3 False False False False False ... 29.0 NaN NaN NaN NaN NaN 29.6 55.3 NaN NaN
3 16 3 0 4 3 False False True False False ... NaN NaN NaN NaN NaN NaN 26.7 60.4 NaN NaN
4 18 18 3 4 3 False False False False False ... 27.9 31.4 33.4 NaN NaN NaN 16.9 40.1 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
82816 79 0 3 2 3 True False False False False ... 30.9 NaN NaN NaN NaN NaN 30.8 40.3 NaN NaN
82817 80 20 3 4 3 False False False False False ... 60.2 NaN NaN NaN NaN NaN 31.9 47.7 NaN NaN
82818 82 0 3 4 3 True False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
82819 84 21 3 4 3 False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
82820 88 8 1 2 3 False False False False False ... NaN NaN NaN NaN NaN NaN 20.0 44.5 NaN NaN

82821 rows × 64 columns

dump(enc,'labelEncoder.joblib')
['labelEncoder.joblib']
enc2 = load('labelEncoder.joblib')
# df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']] = enc2.inverse_transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name',
#     'shot_body_part_name']])

# df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']] = enc2.transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']])
# enc.inverse_transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name',
#     'shot_body_part_name']])
# ############### NEW ################
# from sklearn.preprocessing import LabelEncoder

# le_posiotion_name = LabelEncoder()
# le_shot_technique_name = LabelEncoder()
# le_shot_type_name = LabelEncoder()
# le_shot_body_part_name = LabelEncoder()

# df['position_name'] = le_posiotion_name.fit_transform(df['position_name'])
# df['shot_technique_name'] = le_shot_technique_name.fit_transform(df['shot_technique_name'])
# df['shot_type_name'] = le_shot_type_name.fit_transform(df['shot_type_name'])
# df['shot_body_part_name'] = le_shot_body_part_name.fit_transform(df['shot_body_part_name'])
# Change the type of categorical features to 'category' 
df[['position_name', 
    'shot_technique_name', 
    'shot_type_name', 
    'number_of_players_opponents', 
    'number_of_players_teammates', 
    'shot_body_part_name']] = df[['position_name', 
                                  'shot_technique_name', 
                                  'shot_type_name', 
                                  'number_of_players_opponents', 
                                  'number_of_players_teammates', 
                                  'shot_body_part_name']].astype('category')
# Splitting the dataset into features (X) and the target variable (y)
y = pd.DataFrame(df['is_goal'])
X = df.drop(['is_goal'], axis=1)

# Splitting the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create cross-validation 
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
count_class_0, count_class_1 = y_train.value_counts()

# Display the count of non-goal shots in the training set
print('Non-goal shots in the training set:', count_class_0)

# Display the count of goals scored in the training set
print('Goals scored in the training set:', count_class_1)
Non-goal shots in the training set: 58970
Goals scored in the training set: 7286
# Class imbalance in the training data (non-goal shots per goal)
scale_pos_weight = count_class_0 / count_class_1
print(f'Class imbalance in training data: {scale_pos_weight:.3f}')
Class imbalance in training data: 8.094
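The ratio above (non-goal shots per goal, about 8.09) is the value XGBoost expects in its scale_pos_weight parameter. As a minimal sketch of how it can be used on its own, outside the grid search below, it can be passed straight to the classifier so that goal examples are up-weighted during training:

# Sketch only: weight the positive (goal) class by the imbalance ratio computed above
weighted_xgb = xgboost.XGBClassifier(enable_categorical=True,
                                     tree_method='hist',
                                     objective='binary:logistic',
                                     scale_pos_weight=scale_pos_weight)
# weighted_xgb.fit(X_train, y_train)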

Training the XGBoost model

# Define the xgboost model
xgb_model = xgboost.XGBClassifier(enable_categorical=True, tree_method='hist', objective='binary:logistic')
# Defining the hyper-parameter grid for XGBoost
param_grid_xgb = {'learning_rate': [0.01, 0.001, 0.0001],
                  'max_depth': [3, 5, 7, 8, 9],
                  'n_estimators': [100, 150, 200, 250, 300],
                  'scale_pos_weight': [1, scale_pos_weight]}
# Starting the timer
start_time = time.time()

# Perform grid search with cross-validation
grid_xg = GridSearchCV(xgb_model, param_grid=param_grid_xgb, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the grid search; the best model is then refit on the entire training set
grid_xg.fit(X_train, y_train)

# Retrieve the best estimator found by the grid search
best_xgb_model = grid_xg.best_estimator_

# Stopping the timer
stop_time = time.time()

# Training Time
xgb_training_time = stop_time - start_time
# Print the best parameters and training time
print("Best parameters: ", grid_xg.best_params_)
print (f"Model Training Time: {xgb_training_time:.3f} seconds")
Best parameters:  {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'scale_pos_weight': 1}
Model Training Time: 1393.345 seconds
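To see how the individual hyper-parameter combinations fared, the fitted grid search can also be inspected through its cv_results_ attribute; a small sketch, assuming the grid_xg object from above:

# Collect the cross-validation results and list the best-ranked parameter combinations
cv_results = pd.DataFrame(grid_xg.cv_results_).sort_values('rank_test_score')
print(cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head())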

Model evaluation

Training set

# Evaluate the model on training set
y_pred_train = best_xgb_model.predict(X_train)

# Confusion Matrix for Training Data
cm_train_xg = confusion_matrix(y_train, y_pred_train)
ax = sns.heatmap(cm_train_xg, annot=True, cmap='BuPu', fmt='g', linewidths=1.5)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Train Set')
plt.show()

Test set

# Evaluate the model on test set
y_pred_test = best_xgb_model.predict(X_test)

# Confusion Matrix for Test Data
cm_test_xgb = confusion_matrix(y_test, y_pred_test)
ax = sns.heatmap(cm_test_xgb, annot=True, cmap='Blues', fmt='g', linewidths=1.5)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Test Set')
plt.show()
# Number of goals in test set
print(f'The test dataset contains {len(y_test)} shots, with {y_test.sum()["is_goal"]} of them being goals.')
The test dataset contains 16565 shots, with 1891 of them being goals.
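The confusion matrices can be summarised with per-class precision, recall and F1; a short sketch using scikit-learn's classification_report (an addition to the original evaluation, not part of it):

from sklearn.metrics import classification_report

# Precision, recall and F1 for both classes on the test set
print(classification_report(y_test, y_pred_test, target_names=['no goal', 'goal']))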

Feature importance

# Plot feature importance with Gain
xgboost.plot_importance(best_xgb_model, importance_type='gain', xlabel='Gain', max_num_features=20)
plt.show()
# Plot feature importance with Weight
xgboost.plot_importance(best_xgb_model, importance_type='weight', xlabel='Weight', max_num_features=30)
plt.show()
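The same importances can be read off programmatically from the underlying booster, for example to list the top features by gain (a sketch using the standard xgboost Booster API):

# Gain-based importance as a {feature_name: score} dictionary
gain_importance = best_xgb_model.get_booster().get_score(importance_type='gain')
for feature, score in sorted(gain_importance.items(), key=lambda item: item[1], reverse=True)[:10]:
    print(f'{feature}: {score:.2f}')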

Summary

# Calculating MAE, RMSE and R2 for training and test sets 
mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
r2_train = r2_score(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
r2_test = r2_score(y_test, y_pred_test)
# Create a dataframe of summary results
summary_df = pd.DataFrame({'Model Name':['XG Boost'],
                          'Training MAE': mae_train, 
                          'Training RMSE': rmse_train,
                          'Training R2':r2_train,
                          'Testing MAE': mae_test, 
                          'Testing RMSE': rmse_test,
                          'Testing R2':r2_test,
                          'Training Time (mins)': xgb_training_time/60})
summary_df.set_index('Model Name', inplace=True)

# Displaying summary of results
summary_df.style.format(precision=5).set_properties(**{'font-weight': 'bold',
            'border': '2.0px solid grey', 'color': 'white'})
  Training MAE Training RMSE Training R2 Testing MAE Testing RMSE Testing R2 Training Time (mins)
Model Name              
XG Boost 0.09646 0.31058 0.01446 0.09919 0.31494 0.01918 23.22242
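Since the model is ultimately used for xG probabilities rather than hard 0/1 predictions, probability-based metrics such as log loss and the Brier score are also worth reporting; a sketch on the test set, not part of the original summary:

from sklearn.metrics import log_loss, brier_score_loss

# Predicted probability of a goal for each shot in the test set
y_proba_test = best_xgb_model.predict_proba(X_test)[:, 1]

print(f"Test log loss:    {log_loss(y_test['is_goal'], y_proba_test):.5f}")
print(f"Test Brier score: {brier_score_loss(y_test['is_goal'], y_proba_test):.5f}")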

Saving and loading the XGBoost model

# Save the model
best_xgb_model.save_model('xgboost.json')

best_xgb_model.get_params()
{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': True,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.01,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 5,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 300,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': 1,
 'subsample': None,
 'tree_method': 'hist',
 'validate_parameters': None,
 'verbosity': None}
# Save the model
# dump(best_xgb_model, 'xgboost.joblib') 

# # Load the model
# model = load('xgboost.joblib')
best_xgb_model.predict_proba(X)
array([[0.9021414 , 0.09785861],
       [0.9434396 , 0.05656038],
       [0.9602713 , 0.0397287 ],
       ...,
       [0.8207403 , 0.17925973],
       [0.88015527, 0.11984473],
       [0.9733864 , 0.02661358]], dtype=float32)
new_xgb_model = xgboost.XGBClassifier()
new_xgb_model.load_model('xgboost.json')
new_xgb_model.predict_proba(X)
array([[0.9021414 , 0.09785861],
       [0.9434396 , 0.05656038],
       [0.9602713 , 0.0397287 ],
       ...,
       [0.8207403 , 0.17925973],
       [0.88015527, 0.11984473],
       [0.9733864 , 0.02661358]], dtype=float32)
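As a final illustration, these probabilities would typically be attached back to the data as an xG column (a sketch, assuming the feature frame X is still row-aligned with df):

# Probability of the positive class (goal) is the xG value for each shot
df['xG'] = new_xgb_model.predict_proba(X)[:, 1]
df[['minute', 'distance', 'angle', 'is_goal', 'xG']].head()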