fantastyczne_gole/notebooks/xgboost_dla_xG.ipynb

Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
import xgboost
import time
from joblib import dump, load
import os

Load the data

df = pd.read_csv('final_data_new.csv')
df.columns
Index(['minute', 'position_name', 'shot_body_part_name', 'shot_technique_name',
       'shot_type_name', 'shot_first_time', 'shot_one_on_one',
       'shot_aerial_won', 'shot_open_goal', 'shot_follows_dribble',
       'shot_redirect', 'x1', 'y1', 'number_of_players_opponents',
       'number_of_players_teammates', 'is_goal', 'angle', 'distance',
       'x_player_opponent_Goalkeeper', 'x_player_opponent_8',
       'x_player_opponent_1', 'x_player_opponent_2', 'x_player_opponent_3',
       'x_player_teammate_1', 'x_player_opponent_4', 'x_player_opponent_5',
       'x_player_opponent_6', 'x_player_teammate_2', 'x_player_opponent_9',
       'x_player_opponent_10', 'x_player_opponent_11', 'x_player_teammate_3',
       'x_player_teammate_4', 'x_player_teammate_5', 'x_player_teammate_6',
       'x_player_teammate_7', 'x_player_teammate_8', 'x_player_teammate_9',
       'x_player_teammate_10', 'y_player_opponent_Goalkeeper',
       'y_player_opponent_8', 'y_player_opponent_1', 'y_player_opponent_2',
       'y_player_opponent_3', 'y_player_teammate_1', 'y_player_opponent_4',
       'y_player_opponent_5', 'y_player_opponent_6', 'y_player_teammate_2',
       'y_player_opponent_9', 'y_player_opponent_10', 'y_player_opponent_11',
       'y_player_teammate_3', 'y_player_teammate_4', 'y_player_teammate_5',
       'y_player_teammate_6', 'y_player_teammate_7', 'y_player_teammate_8',
       'y_player_teammate_9', 'y_player_teammate_10', 'x_player_opponent_7',
       'y_player_opponent_7', 'x_player_teammate_Goalkeeper',
       'y_player_teammate_Goalkeeper'],
      dtype='object')
df.head()
minute position_name shot_body_part_name shot_technique_name shot_type_name shot_first_time shot_one_on_one shot_aerial_won shot_open_goal shot_follows_dribble ... y_player_teammate_5 y_player_teammate_6 y_player_teammate_7 y_player_teammate_8 y_player_teammate_9 y_player_teammate_10 x_player_opponent_7 y_player_opponent_7 x_player_teammate_Goalkeeper y_player_teammate_Goalkeeper
0 2 Left Center Forward Right Foot Half Volley Open Play True False False False False ... NaN NaN NaN NaN NaN NaN 26.6 53.1 NaN NaN
1 5 Left Back Left Foot Volley Open Play True False False False False ... 20.6 32.8 NaN NaN NaN NaN 23.8 31.2 NaN NaN
2 15 Left Center Forward Left Foot Normal Open Play False False False False False ... 29.0 NaN NaN NaN NaN NaN 29.6 55.3 NaN NaN
3 16 Center Forward Head Normal Open Play False False True False False ... NaN NaN NaN NaN NaN NaN 26.7 60.4 NaN NaN
4 18 Right Center Forward Right Foot Normal Open Play False False False False False ... 27.9 31.4 33.4 NaN NaN NaN 16.9 40.1 NaN NaN

5 rows × 64 columns

Data preparation

# df[['minute', 
#     'number_of_players_opponents', 
#     'number_of_players_teammates']] = df[['minute', 
#                                           'number_of_players_opponents', 
#                                           'number_of_players_teammates']].astype(float)
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

df[['position_name', 
    'shot_technique_name', 
    'shot_type_name', 
    'shot_body_part_name']] = enc.fit_transform(df[['position_name', 
                                  'shot_technique_name', 
                                  'shot_type_name',
                                  'shot_body_part_name']])
df[['minute', 
    'position_name', 
    'shot_technique_name', 
    'shot_type_name', 
    'shot_body_part_name']] = df[['minute', 
        'position_name', 
    'shot_technique_name', 
    'shot_type_name', 
    'shot_body_part_name']].astype(int)
df
minute position_name shot_body_part_name shot_technique_name shot_type_name shot_first_time shot_one_on_one shot_aerial_won shot_open_goal shot_follows_dribble ... y_player_teammate_5 y_player_teammate_6 y_player_teammate_7 y_player_teammate_8 y_player_teammate_9 y_player_teammate_10 x_player_opponent_7 y_player_opponent_7 x_player_teammate_Goalkeeper y_player_teammate_Goalkeeper
0 2 9 3 2 3 True False False False False ... NaN NaN NaN NaN NaN NaN 26.6 53.1 NaN NaN
1 5 7 1 6 3 True False False False False ... 20.6 32.8 NaN NaN NaN NaN 23.8 31.2 NaN NaN
2 15 9 1 4 3 False False False False False ... 29.0 NaN NaN NaN NaN NaN 29.6 55.3 NaN NaN
3 16 3 0 4 3 False False True False False ... NaN NaN NaN NaN NaN NaN 26.7 60.4 NaN NaN
4 18 18 3 4 3 False False False False False ... 27.9 31.4 33.4 NaN NaN NaN 16.9 40.1 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
82816 79 0 3 2 3 True False False False False ... 30.9 NaN NaN NaN NaN NaN 30.8 40.3 NaN NaN
82817 80 20 3 4 3 False False False False False ... 60.2 NaN NaN NaN NaN NaN 31.9 47.7 NaN NaN
82818 82 0 3 4 3 True False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
82819 84 21 3 4 3 False False False False False ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
82820 88 8 1 2 3 False False False False False ... NaN NaN NaN NaN NaN NaN 20.0 44.5 NaN NaN

82821 rows × 64 columns

dump(enc,'labelEncoder.joblib')
['labelEncoder.joblib']
enc2 = load('labelEncoder.joblib')
# df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']] = enc2.inverse_transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name',
#     'shot_body_part_name']])

# df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']] = enc2.transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name', 
#     'shot_body_part_name']])
# enc.inverse_transform(df[['position_name', 
#     'shot_technique_name', 
#     'shot_type_name',
#     'shot_body_part_name']])
# ############### NEW ################
# from sklearn.preprocessing import LabelEncoder

# le_posiotion_name = LabelEncoder()
# le_shot_technique_name = LabelEncoder()
# le_shot_type_name = LabelEncoder()
# le_shot_body_part_name = LabelEncoder()

# df['position_name'] = le_posiotion_name.fit_transform(df['position_name'])
# df['shot_technique_name'] = le_shot_technique_name.fit_transform(df['shot_technique_name'])
# df['shot_type_name'] = le_shot_type_name.fit_transform(df['shot_type_name'])
# df['shot_body_part_name'] = le_shot_body_part_name.fit_transform(df['shot_body_part_name'])
# Change the type of categorical features to 'category' 
df[['position_name', 
    'shot_technique_name', 
    'shot_type_name', 
    'number_of_players_opponents', 
    'number_of_players_teammates', 
    'shot_body_part_name']] = df[['position_name', 
                                  'shot_technique_name', 
                                  'shot_type_name', 
                                  'number_of_players_opponents', 
                                  'number_of_players_teammates', 
                                  'shot_body_part_name']].astype('category')
# Splitting the dataset into features (X) and the target variable (y)
y = pd.DataFrame(df['is_goal'])
X = df.drop(['is_goal'], axis=1)

# Splitting the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create cross-validation 
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
count_class_0, count_class_1 = y_train.value_counts()

# Display the count of non-goal shots in the training set
print('Non-goal shots in the training set:', count_class_0)

# Display the count of goals scored in the training set
print('Goals scored in the training set:', count_class_1)
Non-goal shots in the training set: 58970
Goals scored in the training set: 7286
# Class imbalance in the training data (non-goal shots per goal)
scale_pos_weight = count_class_0 / count_class_1
print(f'Class imbalance in training data: {scale_pos_weight:.3f}')
Class imbalance in training data: 8.094
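The ratio above (non-goal shots per goal, about 8.09) is the value XGBoost expects in its scale_pos_weight parameter. As a minimal sketch of how it can be used on its own, outside the grid search below, it can be passed straight to the classifier so that goal examples are up-weighted during training:

# Sketch only: weight the positive (goal) class by the imbalance ratio computed above
weighted_xgb = xgboost.XGBClassifier(enable_categorical=True,
                                     tree_method='hist',
                                     objective='binary:logistic',
                                     scale_pos_weight=scale_pos_weight)
# weighted_xgb.fit(X_train, y_train)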

Training the XGBoost model

# Define the xgboost model
xgb_model = xgboost.XGBClassifier(enable_categorical=True, tree_method='hist', objective='binary:logistic')
# Defining the hyper-parameter grid for XGBoost
param_grid_xgb = {'learning_rate': [0.01, 0.001, 0.0001],
                  'max_depth': [3, 5, 7, 8, 9],
                  'n_estimators': [100, 150, 200, 250, 300],
                  'scale_pos_weight': [1, scale_pos_weight]}
# Starting the timer
start_time = time.time()

# Perform grid search with cross-validation
grid_xg = GridSearchCV(xgb_model, param_grid=param_grid_xgb, cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)

# Run the grid search; the best model is then refit on the entire training set
grid_xg.fit(X_train, y_train)

# Retrieve the best estimator found by the grid search
best_xgb_model = grid_xg.best_estimator_

# Stopping the timer
stop_time = time.time()

# Training Time
xgb_training_time = stop_time - start_time
# Print the best parameters and training time
print("Best parameters: ", grid_xg.best_params_)
print (f"Model Training Time: {xgb_training_time:.3f} seconds")
Best parameters:  {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'scale_pos_weight': 1}
Model Training Time: 1393.345 seconds
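To see how the individual hyper-parameter combinations fared, the fitted grid search can also be inspected through its cv_results_ attribute; a small sketch, assuming the grid_xg object from above:

# Collect the cross-validation results and list the best-ranked parameter combinations
cv_results = pd.DataFrame(grid_xg.cv_results_).sort_values('rank_test_score')
print(cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head())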

Model evaluation

Training set

# Evaluate the model on training set
y_pred_train = best_xgb_model.predict(X_train)

# Confusion Matrix for Training Data
cm_train_xg = confusion_matrix(y_train, y_pred_train)
ax = sns.heatmap(cm_train_xg, annot=True, cmap='BuPu', fmt='g', linewidths=1.5)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Train Set')
plt.show()

Test set

# Evaluate the model on test set
y_pred_test = best_xgb_model.predict(X_test)

# Confusion Matrix for Test Data
cm_test_xgb = confusion_matrix(y_test, y_pred_test)
ax = sns.heatmap(cm_test_xgb, annot=True, cmap='Blues', fmt='g', linewidths=1.5)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Test Set')
plt.show()
# Number of goals in test set
print(f'The test dataset contains {len(y_test)} shots, with {y_test.sum()["is_goal"]} of them being goals.')
The test dataset contains 16565 shots, with 1891 of them being goals.
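The confusion matrices can be summarised with per-class precision, recall and F1; a short sketch using scikit-learn's classification_report (an addition to the original evaluation, not part of it):

from sklearn.metrics import classification_report

# Precision, recall and F1 for both classes on the test set
print(classification_report(y_test, y_pred_test, target_names=['no goal', 'goal']))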

Feature importance

# Plot feature importance with Gain
xgboost.plot_importance(best_xgb_model, importance_type='gain', xlabel='Gain', max_num_features=20)
plt.show()
# Plot feature importance with Weight
xgboost.plot_importance(best_xgb_model, importance_type='weight', xlabel='Weight', max_num_features=30)
plt.show()
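The same importances can be read off programmatically from the underlying booster, for example to list the top features by gain (a sketch using the standard xgboost Booster API):

# Gain-based importance as a {feature_name: score} dictionary
gain_importance = best_xgb_model.get_booster().get_score(importance_type='gain')
for feature, score in sorted(gain_importance.items(), key=lambda item: item[1], reverse=True)[:10]:
    print(f'{feature}: {score:.2f}')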

Summary

# Calculating MAE, RMSE and R2 for training and test sets 
mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
r2_train = r2_score(y_train, y_pred_train)

mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
r2_test = r2_score(y_test, y_pred_test)
# Create a dataframe of summary results
summary_df = pd.DataFrame({'Model Name':['XG Boost'],
                          'Training MAE': mae_train, 
                          'Training RMSE': rmse_train,
                          'Training R2':r2_train,
                          'Testing MAE': mae_test, 
                          'Testing RMSE': rmse_test,
                          'Testing R2':r2_test,
                          'Training Time (mins)': xgb_training_time/60})
summary_df.set_index('Model Name', inplace=True)

# Displaying summary of results
summary_df.style.format(precision=5).set_properties(**{'font-weight': 'bold',
            'border': '2.0px solid grey', 'color': 'white'})
  Training MAE Training RMSE Training R2 Testing MAE Testing RMSE Testing R2 Training Time (mins)
Model Name              
XG Boost 0.09646 0.31058 0.01446 0.09919 0.31494 0.01918 23.22242
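Since the model is ultimately used for xG probabilities rather than hard 0/1 predictions, probability-based metrics such as log loss and the Brier score are also worth reporting; a sketch on the test set, not part of the original summary:

from sklearn.metrics import log_loss, brier_score_loss

# Predicted probability of a goal for each shot in the test set
y_proba_test = best_xgb_model.predict_proba(X_test)[:, 1]

print(f"Test log loss:    {log_loss(y_test['is_goal'], y_proba_test):.5f}")
print(f"Test Brier score: {brier_score_loss(y_test['is_goal'], y_proba_test):.5f}")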

Saving and loading the XGBoost model

# Save the model
best_xgb_model.save_model('xgboost.json')

best_xgb_model.get_params()
{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': True,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.01,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 5,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 300,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': 1,
 'subsample': None,
 'tree_method': 'hist',
 'validate_parameters': None,
 'verbosity': None}
# Save the model
# dump(best_xgb_model, 'xgboost.joblib') 

# # Load the model
# model = load('xgboost.joblib')
best_xgb_model.predict_proba(X)
array([[0.9021414 , 0.09785861],
       [0.9434396 , 0.05656038],
       [0.9602713 , 0.0397287 ],
       ...,
       [0.8207403 , 0.17925973],
       [0.88015527, 0.11984473],
       [0.9733864 , 0.02661358]], dtype=float32)
new_xgb_model = xgboost.XGBClassifier()
new_xgb_model.load_model('xgboost.json')
new_xgb_model.predict_proba(X)
array([[0.9021414 , 0.09785861],
       [0.9434396 , 0.05656038],
       [0.9602713 , 0.0397287 ],
       ...,
       [0.8207403 , 0.17925973],
       [0.88015527, 0.11984473],
       [0.9733864 , 0.02661358]], dtype=float32)
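As a final illustration, these probabilities would typically be attached back to the data as an xG column (a sketch, assuming the feature frame X is still row-aligned with df):

# Probability of the positive class (goal) is the xG value for each shot
df['xG'] = new_xgb_model.predict_proba(X)[:, 1]
df[['minute', 'distance', 'angle', 'is_goal', 'xG']].head()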