fantastyczne_gole/notebooks/xgboost_dla_xG.ipynb
2023-12-28 23:45:45 +01:00

367 KiB
Raw Permalink Blame History

Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
import xgboost
import time
from joblib import dump, load

Load the data

# Load the dataset from 'final_data.txt' and show the column names it provides
df = pd.read_csv(filepath_or_buffer='final_data.txt')
df.columns
Index(['minute', 'position_name', 'shot_body_part_name', 'shot_technique_name',
       'shot_type_name', 'shot_first_time', 'shot_one_on_one',
       'shot_aerial_won', 'shot_deflected', 'shot_open_goal',
       'shot_follows_dribble', 'shot_redirect', 'x1', 'y1',
       'number_of_players_opponents', 'number_of_players_teammates', 'is_goal',
       'angle', 'distance', 'x_player_opponent_Goalkeeper',
       'x_player_opponent_8', 'x_player_opponent_1', 'x_player_opponent_2',
       'x_player_opponent_3', 'x_player_teammate_1', 'x_player_opponent_4',
       'x_player_opponent_5', 'x_player_opponent_6', 'x_player_teammate_2',
       'x_player_opponent_9', 'x_player_opponent_10', 'x_player_opponent_11',
       'x_player_teammate_3', 'x_player_teammate_4', 'x_player_teammate_5',
       'x_player_teammate_6', 'x_player_teammate_7', 'x_player_teammate_8',
       'x_player_teammate_9', 'x_player_teammate_10',
       'y_player_opponent_Goalkeeper', 'y_player_opponent_8',
       'y_player_opponent_1', 'y_player_opponent_2', 'y_player_opponent_3',
       'y_player_teammate_1', 'y_player_opponent_4', 'y_player_opponent_5',
       'y_player_opponent_6', 'y_player_teammate_2', 'y_player_opponent_9',
       'y_player_opponent_10', 'y_player_opponent_11', 'y_player_teammate_3',
       'y_player_teammate_4', 'y_player_teammate_5', 'y_player_teammate_6',
       'y_player_teammate_7', 'y_player_teammate_8', 'y_player_teammate_9',
       'y_player_teammate_10', 'x_player_opponent_7', 'y_player_opponent_7',
       'x_player_teammate_Goalkeeper', 'y_player_teammate_Goalkeeper',
       'shot_kick_off'],
      dtype='object')
# Preview the first five rows of the dataset
df.head(n=5)
minute position_name shot_body_part_name shot_technique_name shot_type_name shot_first_time shot_one_on_one shot_aerial_won shot_deflected shot_open_goal ... y_player_teammate_6 y_player_teammate_7 y_player_teammate_8 y_player_teammate_9 y_player_teammate_10 x_player_opponent_7 y_player_opponent_7 x_player_teammate_Goalkeeper y_player_teammate_Goalkeeper shot_kick_off
0 0 Right Center Forward Right Foot Normal Open Play False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN False
1 5 Right Center Forward Left Foot Normal Open Play False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN False
2 5 Center Midfield Right Foot Half Volley Open Play True False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN False
3 5 Left Center Midfield Right Foot Normal Open Play False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN False
4 5 Right Center Back Left Foot Normal Open Play True False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN False

5 rows × 66 columns

Data preparation

# Columns treated as categorical features; they are cast to the pandas 'category' dtype
categorical_columns = [
    'position_name',
    'shot_technique_name',
    'shot_type_name',
    'number_of_players_opponents',
    'number_of_players_teammates',
    'shot_body_part_name',
]
df[categorical_columns] = df[categorical_columns].astype('category')

# Target (y) is the 'is_goal' column kept as a one-column DataFrame;
# features (X) are every remaining column
y = df['is_goal'].to_frame()
X = df.drop(columns=['is_goal'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 5-fold stratified, shuffled cross-validation splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Look the class counts up by label instead of unpacking value_counts() by position:
# value_counts() is ordered by frequency, so positional unpacking is only correct
# while class 0 happens to be the majority class.
class_counts = y_train['is_goal'].astype(int).value_counts().reindex([0, 1], fill_value=0)
count_class_0 = int(class_counts.loc[0])
count_class_1 = int(class_counts.loc[1])

# Display the count of shots that did not end in a goal in the training set
# (this is the class-0 count, not the total number of shots attempted)
print('Shots without a goal in the training set:', count_class_0)

# Display the count of successful goals in the training set
print('Goals scored in the training set:', count_class_1)
Shots attempted in the training set: 27085
Goals scored in the training set: 3588
# Ratio of class-0 to class-1 samples in the training data
scale_pos_weight = count_class_0 / count_class_1
imbalance_message = f' Class imbalance in training data: {scale_pos_weight:.3f}'
print(imbalance_message)
 Class imbalance in training data: 7.549

Training XGBoost model

# Base XGBoost classifier: logistic objective, histogram tree method,
# categorical-feature support switched on
xgb_model = xgboost.XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    enable_categorical=True,
)

# Hyper-parameter grid explored by the grid search; 'scale_pos_weight' compares
# no re-weighting (1) against the class ratio measured on the training data
param_grid_xgb = dict(
    learning_rate=[0.01, 0.001, 0.0001],
    max_depth=[3, 5, 7, 8, 9],
    n_estimators=[100, 150, 200, 250, 300],
    scale_pos_weight=[1, scale_pos_weight],
)
# Starting the timer (perf_counter is monotonic, so the measured duration is not
# distorted by system clock adjustments during the long-running search)
start_time = time.perf_counter()

# Grid search with cross-validation. The Brier score is the mean squared error of the
# predicted goal probability; 'neg_mean_squared_error' would instead score the hard
# 0/1 labels returned by predict(), which for a classifier is just the error rate.
grid_xg = GridSearchCV(xgb_model, param_grid=param_grid_xgb, cv=cv, scoring='neg_brier_score', n_jobs=-1)

# Run the search; with GridSearchCV's default refit=True the best parameter
# combination is then refitted on the entire training set
grid_xg.fit(X_train, y_train)

# Take the refitted model that uses the best parameters
best_xgb_model = grid_xg.best_estimator_

# Stopping the timer
stop_time = time.perf_counter()

# Training time in seconds (grid search plus the final refit)
xgb_training_time = stop_time - start_time
# Print the best parameters and training time
print("Best parameters: ", grid_xg.best_params_)
print(f"Model Training Time: {xgb_training_time:.3f} seconds")
Best parameters:  {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'scale_pos_weight': 1}
Model Training Time: 1916.225 seconds

Model evaluation

Training set

# Class predictions of the tuned model for the training rows
y_pred_train = best_xgb_model.predict(X_train)

# Training-set confusion matrix, drawn as an annotated heatmap
cm_train_xg = confusion_matrix(y_train, y_pred_train)
ax = sns.heatmap(cm_train_xg, annot=True, cmap='BuPu', fmt='g', linewidth=1.5)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix - Train Set')
plt.show()

Test set

# Class predictions of the tuned model for the held-out test rows
y_pred_test = best_xgb_model.predict(X_test)

# Test-set confusion matrix, drawn as an annotated heatmap
cm_test_xgb = confusion_matrix(y_test, y_pred_test)
ax = sns.heatmap(cm_test_xgb, annot=True, cmap='Blues', fmt='g', linewidth=1.5)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix - Test Set')
plt.show()

# Report the test-set size and how many of its shots were goals
print(f'The test dataset contains {len(y_test)} shots, with {y_test.sum()["is_goal"]} of them being goals.')
The test dataset contains 7669 shots, with 914 of them being goals.

Feature importance

# One importance chart per metric, in order:
# (importance type, x-axis label, number of top features shown)
for importance_type, axis_label, top_n in (('gain', 'Gain', 20), ('weight', 'Weight', 30)):
    xgboost.plot_importance(
        best_xgb_model,
        importance_type=importance_type,
        xlabel=axis_label,
        max_num_features=top_n,
    )
    plt.show()

Summary

# Predicted goal probability for every shot: column 1 of predict_proba is the
# probability of the positive class. The regression-style metrics below are computed
# on these probabilities; on thresholded 0/1 predictions MAE collapses to the error
# rate and RMSE to its square root, which says nothing about probability quality.
xg_train = best_xgb_model.predict_proba(X_train)[:, 1]
xg_test = best_xgb_model.predict_proba(X_test)[:, 1]

# Calculating MAE, RMSE and R2 for training and test sets.
# RMSE is taken as the square root of the MSE because the 'squared=False' argument
# of mean_squared_error is deprecated and removed in recent scikit-learn releases.
mae_train = mean_absolute_error(y_train, xg_train)
rmse_train = mean_squared_error(y_train, xg_train) ** 0.5
r2_train = r2_score(y_train, xg_train)

mae_test = mean_absolute_error(y_test, xg_test)
rmse_test = mean_squared_error(y_test, xg_test) ** 0.5
r2_test = r2_score(y_test, xg_test)
# One-row results table, indexed by model name; training time is converted
# from seconds to minutes
summary_df = pd.DataFrame({
    'Model Name': ['XG Boost'],
    'Training MAE': mae_train,
    'Training RMSE': rmse_train,
    'Training R2': r2_train,
    'Testing MAE': mae_test,
    'Testing RMSE': rmse_test,
    'Testing R2': r2_test,
    'Training Time (mins)': xgb_training_time / 60,
}).set_index('Model Name')

# Render the table with five-decimal precision and the cell styling below;
# the styler is the final expression so the notebook displays it
cell_css = {'font-weight': 'bold', 'border': '2.0px solid grey', 'color': 'white'}
summary_styler = summary_df.style.format(precision=5)
summary_styler.set_properties(**cell_css)
  Training MAE Training RMSE Training R2 Testing MAE Testing RMSE Testing R2 Training Time (mins)
Model Name              
XG Boost 0.09934 0.31518 0.03828 0.10366 0.32197 0.01251 31.93709

Keeping the xgboost model

# Write the tuned model to 'xgboost.joblib' with joblib
dump(value=best_xgb_model, filename='xgboost.joblib')

# Read the stored model back from the same file
model = load(filename='xgboost.joblib')