sportowe_wizualizacja/xGStatsbomb.ipynb
2020-05-13 17:47:39 +05:30

391 KiB
Raw Permalink Blame History

Open In Colab

%%time
!git clone https://github.com/statsbomb/open-data.git
Cloning into 'open-data'...
remote: Enumerating objects: 1133, done.
remote: Counting objects: 100% (1133/1133), done.
remote: Compressing objects: 100% (632/632), done.
remote: Total 9855 (delta 925), reused 690 (delta 482), pack-reused 8722
Receiving objects: 100% (9855/9855), 996.16 MiB | 25.43 MiB/s, done.
Resolving deltas: 100% (8672/8672), done.
Checking out files: 100% (1648/1648), done.
CPU times: user 478 ms, sys: 94.4 ms, total: 572 ms
Wall time: 2min 17s
#import all modules
import json
import os
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Arc, Rectangle, ConnectionPatch
from matplotlib.offsetbox import  OffsetImage
from matplotlib.patches import Ellipse
from functools import reduce
import math
%%time
# Competitions whose matches are scanned for shots.
comp = ['FIFA World Cup','La Liga']
path_match = "/content/open-data/data/events/" #location for play by play events

# One shots-only frame is collected per match and everything is concatenated
# once at the end.  Growing a frame with DataFrame.append copied the whole
# accumulator on every match (and the method no longer exists in pandas 2.x).
shot_frames = []
for root, dirs, files in os.walk('/content/open-data/data/matches/'):
    for file in files:
        with open(os.path.join(root, file), encoding='utf-8') as data_file:
            data = json.load(data_file)
        # sep="_" flattens nested keys, e.g. competition.competition_name
        # becomes the column competition_competition_name.
        df = pd.json_normalize(data, sep = "_")
        if 'competition_competition_name' not in df.columns:
            # Nothing to select from (e.g. an empty match list).
            continue
        wanted_ids = df.loc[df['competition_competition_name'].isin(comp), 'match_id']
        for match_no in wanted_ids:
            # The event file of a match is named after its match_id.
            with open(path_match + str(match_no) + '.json', encoding="utf8") as event_file:
                df_match = json.load(event_file)
            df_match2 = pd.json_normalize(df_match, sep="_")
            shot_frames.append(df_match2[df_match2['type_name'] == "Shot"])

if shot_frames:
    main_df = pd.concat(shot_frames, ignore_index=True, sort=False)
else:
    # No matching match files were found: keep main_df defined but empty.
    main_df = pd.DataFrame(data=None)
print('Done')
Done
CPU times: user 6min 28s, sys: 1.27 s, total: 6min 29s
Wall time: 6min 30s
main_df.head()
id index period timestamp minute second possession duration type_id type_name possession_team_id possession_team_name play_pattern_id play_pattern_name team_id team_name tactics_formation tactics_lineup related_events location player_id player_name position_id position_name pass_recipient_id pass_recipient_name pass_length pass_angle pass_height_id pass_height_name pass_end_location pass_type_id pass_type_name pass_body_part_id pass_body_part_name carry_end_location under_pressure pass_outcome_id pass_outcome_name ball_receipt_outcome_id ... pass_deflected block_deflection substitution_outcome_id substitution_outcome_name substitution_replacement_id substitution_replacement_name ball_recovery_recovery_failure dribble_overrun 50_50_outcome_id 50_50_outcome_name shot_aerial_won shot_open_goal bad_behaviour_card_id bad_behaviour_card_name pass_no_touch block_offensive foul_committed_offensive shot_saved_off_target goalkeeper_shot_saved_off_target miscontrol_aerial_won goalkeeper_punched_out clearance_other ball_recovery_offensive shot_deflected dribble_no_touch shot_redirect block_save_block injury_stoppage_in_chain half_start_late_video_start player_off_permanent goalkeeper_lost_out goalkeeper_saved_to_post shot_follows_dribble shot_saved_to_post goalkeeper_shot_saved_to_post pass_backheel goalkeeper_lost_in_play goalkeeper_success_out goalkeeper_success_in_play half_end_early_video_end
0 fb785612-71d3-44df-aae4-da6e005756de 117 1 00:02:06.532 2 6 6 1.015179 16 Shot 222 Villarreal 1 Regular Play 222 Villarreal NaN NaN [1febb4f7-0e2c-43f7-96fe-51fbffaaf664] [107.5, 27.2] 11386.0 Santiago Cazorla González 16.0 Left Midfield NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2dfa84fe-3579-4705-8d27-44b8917907e1 499 1 00:09:49.110 9 49 17 1.302674 16 Shot 222 Villarreal 4 From Throw In 222 Villarreal NaN NaN [1af68944-ff3d-49a1-92a0-fe6198e73e78] [88.3, 50.2] 25921.0 Rubén Gracia Calmache 12.0 Right Midfield NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 c9d92f30-2159-4a5a-a5bf-9d1163e4b33f 587 1 00:12:58.407 12 58 24 0.521000 16 Shot 222 Villarreal 5 Other 222 Villarreal NaN NaN [61239f9b-052f-42ab-8a73-3c3c3841d419] [108.0, 40.0] 20135.0 Marcos Antonio Senna da Silva 15.0 Left Center Midfield NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 26b5d67b-5fce-4a5a-8b31-b879adbb61d3 1113 1 00:24:01.266 24 1 42 0.647279 16 Shot 217 Barcelona 7 From Goal Kick 217 Barcelona NaN NaN [1d8506ef-3a55-45c7-aba9-e31204db051e, 354b60e... [107.3, 36.1] 4913.0 Bojan Krkíc Pérez 23.0 Center Forward NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN True NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 20528610-c092-482c-8238-6b9679328680 1343 1 00:28:38.374 28 38 49 1.023005 16 Shot 217 Barcelona 1 Regular Play 217 Barcelona NaN NaN [06385378-493b-4bd8-8731-f983c4ac28d8] [105.3, 29.4] 5503.0 Lionel Andrés Messi Cuccittini 17.0 Right Wing NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 147 columns

"""Distance of shot location to centre of goal"""
def distFormula(coordinate):
    """Return the mean of the shot's distances to (120, 36) and (120, 44).

    ``coordinate.location`` supplies the shot's x at index 0 and y at index 1.
    Note this is the average of the two distances, not the distance to the
    midpoint (120, 40).
    """
    shot_x = coordinate.location[0]
    shot_y = coordinate.location[1]
    post_distances = [
        math.sqrt(((shot_x - 120)**2) + ((shot_y - post_y)**2))
        for post_y in (36, 44)
    ]
    return ((post_distances[0] + post_distances[1])/2)
""" near x y (nx,ny) (120,44)and far x y (fx,fy) (120,36)""" 
# Goalpost coordinates used by shot_angle: near post (nx, ny), far post (fx, fy).
nx = 120
ny = 44
fx = 120
fy = 36

# Distance between the two posts (44 - 36).
goalpostLength = 8


def shot_angle(points):
    """Return the angle, in degrees, between the lines from the shot to each post.

    The angle is obtained with the law of cosines from the two shot-to-post
    distances and ``goalpostLength``.

    Args:
        points: object whose ``location`` holds the shot's x at index 0 and
            y at index 1.

    Returns:
        The angle in degrees, or ``0.0`` when the shot location coincides
        with one of the posts (the angle is undefined there and the formula
        would divide by zero).
    """
    shot_x = points.location[0]
    shot_y = points.location[1]
    len1 = math.sqrt(((shot_x - nx)**2) + ((shot_y - ny)**2))
    len2 = math.sqrt(((shot_x - fx)**2) + ((shot_y - fy)**2))
    if len1 == 0 or len2 == 0:
        return 0.0
    ang = (len1**2 + len2**2 - goalpostLength**2)/(2 * len1 * len2)
    # Rounding error can push the cosine just outside [-1, 1], which acos rejects.
    if ang > 1:
        ang = 1
    elif ang < -1:
        ang = -1
    return math.degrees(math.acos(ang))
"""If shot was taken under Pressure?"""
def under_pressure(coordinate):
    """Return 1 when the row's ``under_pressure`` value equals True, else 0.

    Anything else (including a missing value stored as NaN) gives 0.
    """
    return 1 if coordinate['under_pressure'] == True else 0
"""The Shot type Id"""
def shot_type(coordinate):
    """Map the row's ``shot_type_id`` to a small integer code.

    61 -> 1, 62 -> 2, 87 -> 3, 88 -> 4; any other value (including a
    missing one) -> 5.
    """
    type_id = coordinate['shot_type_id']
    for known_id, code in ((61, 1), (62, 2), (87, 3), (88, 4)):
        if type_id == known_id:
            return code
    return 5
"""The Shot Body part"""
def shot_body_part(coordinate):
    """Map the row's ``shot_body_part_id`` to a small integer code.

    37 -> 1, 38 -> 2, 70 -> 3; any other value (including a missing
    one) -> 4.
    """
    part_id = coordinate['shot_body_part_id']
    for known_id, code in ((37, 1), (38, 2), (70, 3)):
        if part_id == known_id:
            return code
    return 4
"""The Shot Technique Id"""
def shot_technique(coordinate):
    """Map the row's ``shot_technique_id`` to a small integer code.

    Ids 89 through 94 map to 1 through 6 in that order; any other value
    (including a missing one) maps to 7.
    """
    technique_id = coordinate['shot_technique_id']
    for code, known_id in enumerate((89, 90, 91, 92, 93, 94), start=1):
        if technique_id == known_id:
            return code
    return 7
"""If shot was taken first time?"""
def shot_first_time(coordinate):
    """Return 1 when the row's ``shot_first_time`` value equals True, else 0.

    Anything else (including a missing value stored as NaN) gives 0.
    """
    return 1 if coordinate['shot_first_time'] == True else 0
""" To Find if a point is inside the triangle
https://www.geeksforgeeks.org/check-whether-a-given-point-lies-inside-a-triangle-or-not/"""
def Triarea(a,b,c):
    """Return the unsigned area of the triangle with corners a, b and c.

    Each corner is indexable as (x, y).  The area is
    |x1*(y2-y3) + x2*(y3-y1) + x3*(y1-y2)| / 2.
    """
    x1, y1 = a[0], a[1]
    x2, y2 = b[0], b[1]
    x3, y3 = c[0], c[1]
    doubled_signed_area = x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2)
    return abs(doubled_signed_area / 2.0)

def isInside(a,b,c,p):
    """Return 1 if point p lies in triangle abc (edges included), else 0.

    p is treated as inside when the areas of the three triangles it forms
    with the corners (abp, pbc, apc) add up to the area of abc.  Both sides
    are rounded to 2 decimals before comparing, to absorb float error.
    """
    whole_area = Triarea(a, b, c)
    split_area = Triarea(a, b, p) + Triarea(p, b, c) + Triarea(a, p, c)
    return 1 if round(whole_area, 2) == round(split_area, 2) else 0
"""If shot was taken first time?"""
def shot_one_on_one(coordinate):
    """Return 1 when the row's ``shot_one_on_one`` value equals True, else 0.

    Anything else (including a missing value stored as NaN) gives 0.
    """
    return 1 if coordinate['shot_one_on_one'] == True else 0
def infronofShot(frame):
    """Count freeze-frame players inside the shot-to-goalposts triangle.

    The triangle runs from the shot location to (120, 36) and (120, 44).

    Args:
        frame: a shot row providing ``location`` (x at index 0, y at index 1)
            and ``shot_freeze_frame`` (a sequence of entries that each have a
            ``location``), or a missing value for the freeze frame.

    Returns:
        The number of freeze-frame entries for which ``isInside`` returns 1;
        0 when the freeze frame is missing (None or a float NaN) or empty.
    """
    players = frame['shot_freeze_frame']
    # A missing freeze frame shows up as NaN.  isinstance also covers
    # numpy.float64, which a comparison on the type's string form missed.
    if players is None or isinstance(players, float):
        return 0
    if not len(players):
        return 0

    shooter = (frame['location'][0], frame['location'][1])
    # isInside returns 1 or 0, so summing its results is the count.
    return sum(
        isInside(shooter, (120, 36), (120, 44),
                 (player['location'][0], player['location'][1]))
        for player in players
    )
# Build one model-feature column per helper; each helper receives a full shot row.
feature_builders = (
    ('Distance', distFormula),
    ('Angle', shot_angle),
    ('UnderPressure', under_pressure),
    ('ShotType', shot_type),
    ('ShotBodyPart', shot_body_part),
    ('ShotTechnique', shot_technique),
    ('ShotFirstTime', shot_first_time),
    ('ShotOneonOne', shot_one_on_one),
    ('InFrontofGoal', infronofShot),
)
for column_name, builder in feature_builders:
    main_df[column_name] = main_df.apply(builder, axis=1)

# Target label: rows whose shot_outcome_id is 97 are the goals.
goals_lst = main_df[main_df['shot_outcome_id'] == 97].index.tolist()
main_df['isGoal'] = main_df.index.isin(goals_lst)
import statsbombpitch as sb
sb.sb_pitch("#195905","#faf0e6","horizontal","full")

# Draw one shot (row position ilocv), the triangle from it to the two posts,
# and every player position recorded in that shot's freeze frame.
ilocv = 3477
shot_row = main_df.iloc[ilocv]
shot_x = shot_row['location'][0]
shot_y = shot_row['location'][1]
plt.scatter(shot_x, shot_y, color="#ee3e32", edgecolors="none", zorder=10, alpha=1, s=40)

tri = np.array([[shot_x, shot_y], [120, 36], [120, 44]])
t1 = plt.Polygon(tri, color='blue', zorder=8)
plt.gca().add_patch(t1)

# `loc` is kept as a module-level name: a later cell reuses it.
loc = pd.DataFrame(shot_row['shot_freeze_frame'])
for _, player in loc.iterrows():
    player_spot = player['location']
    plt.scatter(player_spot[0], player_spot[1], color="#ee3e32", edgecolors="black", zorder=10, alpha=1, s=20)
plt.show()
# Shows rows 50-99 of the shots sorted by predicted xG.
# NOTE(review): sortxg is only assigned further down (after the xG
# prediction), so this cell was presumably executed out of order.
sortxg[50:100]
location Distance Angle UnderPressure ShotType ShotBodyPart ShotTechnique ShotFirstTime InFrontofGoal ShotOneonOne xG
10082 [115.2, 43.6] 6.902760 62.487997 0 3 4 3 1 1 0 0.593055
11940 [118.0, 45.0] 5.727806 50.906141 0 3 4 7 1 0 0 0.588634
5946 [115.1, 37.6] 6.607503 70.644874 0 3 4 7 1 2 0 0.588151
4032 [114.3, 42.8] 7.348970 61.917732 0 3 4 3 1 1 0 0.582283
10411 [116.3, 40.1] 5.449276 94.443109 0 3 1 5 0 2 0 0.578671
10059 [117.9, 44.7] 5.581727 57.994617 0 3 4 3 1 1 0 0.577080
5721 [115.0, 40.0] 6.403124 77.319617 0 3 1 5 0 0 0 0.573219
10216 [114.8, 38.0] 6.755564 70.123128 0 3 1 5 0 1 1 0.563681
4014 [112.3, 38.2] 8.824071 52.934164 0 3 4 3 1 0 0 0.561885
8422 [113.1, 43.4] 8.521922 51.972274 0 3 4 5 1 0 0 0.557730
6056 [114.0, 37.0] 7.651153 58.861028 0 3 4 6 1 1 0 0.552528
1551 [114.4, 39.4] 6.899202 70.664392 0 3 2 3 1 1 0 0.549695
250 [113.1, 42.1] 8.183296 56.874096 0 3 4 5 0 0 0 0.548621
3272 [113.6, 40.1] 7.547661 64.001725 0 3 4 5 0 1 0 0.542347
9799 [113.2, 42.0] 8.078323 57.813206 0 3 4 4 1 1 0 0.538975
10820 [113.6, 41.8] 7.702349 61.154851 0 3 2 5 1 0 0 0.533676
5077 [114.1, 43.6] 7.767437 56.055770 0 3 4 7 1 1 0 0.528097
12541 [113.0, 44.0] 8.815073 48.814075 0 3 4 7 0 1 1 0.527048
11843 [113.0, 38.0] 8.249827 56.546691 0 3 4 7 1 1 0 0.524932
3227 [115.6, 41.8] 6.099730 79.380345 0 3 1 5 0 1 0 0.522069
1710 [113.3, 39.1] 7.841507 61.008967 0 3 4 5 0 1 0 0.514230
1796 [112.4, 39.7] 8.592467 55.459494 0 3 4 5 1 1 0 0.512211
12751 [115.0, 42.0] 6.597707 71.995838 0 3 2 5 0 1 0 0.510614
12635 [118.0, 36.0] 5.123106 75.963757 0 3 1 5 0 1 0 0.509163
5094 [115.6, 43.6] 6.599972 65.125846 0 3 2 5 1 1 0 0.509059
12870 [109.0, 41.0] 11.742400 39.699073 0 4 4 5 0 0 0 0.509021
12363 [109.0, 41.0] 11.742400 39.699073 0 4 4 5 0 0 0 0.509021
12622 [109.0, 41.0] 11.742400 39.699073 0 4 4 5 0 0 0 0.509021
11356 [109.0, 41.0] 11.742400 39.699073 0 4 4 5 0 0 0 0.509021
11390 [109.0, 41.0] 11.742400 39.699073 0 4 4 5 0 0 0 0.509021
12918 [109.0, 41.0] 11.742400 39.699073 0 4 4 5 0 0 0 0.509021
11495 [109.0, 41.0] 11.742400 39.699073 0 4 4 5 0 0 0 0.509021
9319 [114.0, 42.8] 7.593725 59.886267 0 3 4 5 0 1 0 0.508912
5479 [114.3, 42.5] 7.269647 63.495292 0 3 1 5 0 1 1 0.503553
9820 [116.5, 37.5] 5.595149 84.897835 0 3 1 2 0 2 0 0.502442
5385 [112.6, 42.2] 8.634894 53.628856 0 3 4 5 1 1 0 0.496973
9100 [115.8, 44.3] 6.756425 59.073874 0 3 4 5 1 2 0 0.495486
1360 [113.7, 36.1] 8.202624 52.338128 0 3 4 5 1 1 0 0.493024
3664 [116.4, 43.2] 5.868831 75.963757 1 3 2 5 1 1 0 0.491836
8527 [115.0, 43.3] 6.948463 63.561138 0 3 2 5 1 1 0 0.491444
10394 [112.4, 38.0] 8.770864 53.033726 0 3 4 5 1 1 0 0.490224
3910 [112.6, 39.5] 8.423397 56.617007 0 3 2 5 1 0 0 0.486898
4346 [108.3, 40.3] 12.368128 37.728461 0 4 4 5 0 0 0 0.484093
11026 [108.2, 40.1] 12.459894 37.449331 0 4 4 5 0 0 0 0.480520
3793 [112.3, 44.5] 9.592653 44.111835 0 3 4 5 0 1 1 0.479089
6938 [112.2, 42.0] 8.946530 51.949987 0 3 4 5 1 1 0 0.478999
7928 [113.3, 38.2] 7.956833 59.059829 0 3 4 5 1 2 0 0.477739
554 [114.8, 36.7] 7.104802 62.203440 0 3 2 7 1 1 0 0.477342
1305 [108.1, 40.0] 12.554282 37.158541 0 4 4 5 0 0 0 0.476817
3704 [108.1, 40.1] 12.554640 37.156345 0 4 4 5 0 0 0 0.476794
# Manual check of the in-triangle count: how many positions in `loc` fall
# inside the triangle between the hard-coded point (100.7, 25.6) and the posts.
countgoal = 0
for _, player in loc.iterrows():
    spot = player['location']
    # isInside returns 1 or 0, so it can be added directly.
    countgoal += isInside((100.7,25.6),(120,36),(120,44),(spot[0],spot[1]))
countgoal
9
%%time
# Same assignment as in the feature-building cell; repeated here under %%time
# to measure how long the row-wise infronofShot pass takes.
main_df['InFrontofGoal'] = main_df.apply(infronofShot,axis = 1)
CPU times: user 16.9 s, sys: 39.6 ms, total: 17 s
Wall time: 16.9 s
main_df[['location','Distance','Angle','UnderPressure','ShotType','ShotBodyPart','ShotTechnique','ShotFirstTime','ShotOneonOne','InFrontofGoal','isGoal']]
location Distance Angle UnderPressure ShotType ShotBodyPart ShotTechnique ShotFirstTime ShotOneonOne InFrontofGoal isGoal
0 [110.5, 36.2] 10.896986 40.593846 1 3 4 5 1 0 1 False
1 [114.2, 48.0] 10.186866 29.611685 0 3 4 7 1 0 2 False
2 [95.3, 48.3] 26.332306 16.596593 0 2 2 5 0 0 4 False
3 [103.3, 61.6] 27.413807 10.378789 0 3 2 5 0 0 1 False
4 [94.9, 55.2] 29.543437 13.366677 0 2 2 5 0 0 4 False
... ... ... ... ... ... ... ... ... ... ... ...
12952 [111.0, 27.0] 15.981653 17.102729 0 3 2 5 0 0 3 False
12953 [114.0, 33.0] 9.619084 34.824489 0 3 2 5 0 0 1 True
12954 [107.0, 32.0] 15.646638 25.606661 0 3 2 5 0 0 1 False
12955 [97.0, 22.0] 29.376742 12.398277 0 3 4 5 0 0 1 False
12956 [109.0, 52.0] 16.508979 19.464104 0 3 4 5 0 0 1 False

12957 rows × 11 columns

#xG Model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn import svm
from sklearn import linear_model
# Columns that feed the models, plus 'location' (kept only so predictions can
# be plotted later; it is never passed to a model) and the 'isGoal' label.
xgModel = main_df[['location','Distance','Angle','UnderPressure','ShotType','ShotBodyPart','ShotTechnique','ShotFirstTime','ShotOneonOne','InFrontofGoal','isGoal']]

# The target is passed as a 1-D Series (not a one-column frame), so the
# estimators no longer emit DataConversionWarning on fit.  random_state pins
# the shuffle so the split, and therefore every result below, is reproducible.
X_train,X_test,y_train,y_test = train_test_split(
    xgModel[['location','Distance','Angle','UnderPressure','ShotType','ShotBodyPart','ShotTechnique','ShotFirstTime','InFrontofGoal','ShotOneonOne']],
    xgModel['isGoal'],
    test_size = 0.2,
    shuffle = True,
    random_state = 0,
)

Logistic Regression Model

# Fit the logistic-regression xG model on the nine engineered features.
# np.ravel hands scikit-learn the 1-D target it expects, whether y_train is a
# one-column frame or a Series (avoids DataConversionWarning).
clf = LogisticRegression(random_state=0,max_iter = 5000).fit(
    X_train[['Distance','Angle','UnderPressure','ShotType','ShotBodyPart','ShotTechnique','ShotFirstTime','ShotOneonOne','InFrontofGoal']],
    np.ravel(y_train),
)
/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py:760: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
#model weights
clf.coef_[0]
array([-0.05881247,  0.03193658, -0.45802499,  0.59881635,  0.21925344,
       -0.00195693,  0.21331492,  0.50199851, -0.29032136])

SGD Model

# Score the held-out shots with the logistic-regression model.  The columns
# must be the same nine (same order) that clf was fitted on, so
# 'InFrontofGoal' is included; [:, 1] keeps the second predict_proba column.
xG = clf.predict_proba(X_test[['Distance','Angle','UnderPressure','ShotType','ShotBodyPart','ShotTechnique','ShotFirstTime','ShotOneonOne','InFrontofGoal']])[:,1]
#SGD
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
# 'log' is kept for the version this notebook was run with.
sgdclf = linear_model.SGDClassifier(loss='log', alpha = 0.17)
# np.ravel gives the 1-D target scikit-learn expects (no DataConversionWarning).
sgdclf.fit(X_train[['Distance','Angle','UnderPressure','ShotType','ShotBodyPart','ShotTechnique','ShotFirstTime','ShotOneonOne','InFrontofGoal']], np.ravel(y_train))
/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py:760: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
SGDClassifier(alpha=0.17, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

XGBoost Model

# Gradient-boosted xG model on the same nine features as the other models.
xgb_model = xgb.XGBClassifier(objective='binary:logistic', max_depth=4, n_estimators=100)
# np.ravel gives the 1-D target the label encoder expects, whether y_train is
# a one-column frame or a Series (avoids DataConversionWarning).
xgb_model.fit(X_train[['Distance','Angle','UnderPressure','ShotType','ShotBodyPart','ShotTechnique','ShotFirstTime','ShotOneonOne','InFrontofGoal']], np.ravel(y_train))
/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/_label.py:235: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/_label.py:268: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

Predict Shot Probability

# To score with another model, swap xgb_model for sgdclf (SGD) or clf
# (logistic regression) below.
model_inputs = X_test[['Distance','Angle','UnderPressure','ShotType','ShotBodyPart','ShotTechnique','ShotFirstTime','ShotOneonOne','InFrontofGoal']]
class_probabilities = xgb_model.predict_proba(model_inputs)
# Keep the second column of predict_proba as the shot's xG.
xG = class_probabilities[:, 1]
X_test['xG'] = xG
# Highest-xG shots first.
sortxg = X_test.sort_values(by='xG', ascending=False)
sortxg
location Distance Angle UnderPressure ShotType ShotBodyPart ShotTechnique ShotFirstTime InFrontofGoal ShotOneonOne xG
646 [118.8, 37.0] 4.332081 120.077993 0 3 4 5 1 0 0 0.979459
10495 [119.3, 41.4] 4.068882 157.545469 0 3 4 5 1 0 0 0.979459
2681 [119.1, 39.6] 4.100949 154.403626 0 3 4 5 1 0 0 0.979459
5237 [118.7, 39.4] 4.210111 143.294745 0 3 4 3 1 0 0 0.969388
7452 [117.7, 37.8] 4.766741 107.693813 0 3 2 5 1 0 0 0.967390
... ... ... ... ... ... ... ... ... ... ... ...
505 [58.1, 28.1] 63.155775 7.132821 0 3 4 5 0 1 0 0.004881
11160 [61.8, 39.8] 58.337636 7.863251 0 3 4 5 0 1 0 0.004881
3568 [63.0, 30.5] 57.920804 7.813054 0 3 4 5 0 1 0 0.004881
5084 [87.5, 67.5] 42.683234 8.235003 0 3 4 3 0 1 0 0.004615
6085 [58.9, 43.2] 61.313999 7.470866 0 3 4 5 0 2 0 0.004594

2592 rows × 11 columns

import statsbombpitch as sb
sb.sb_pitch("#195905","#faf0e6","horizontal","full")
plt.gca().invert_yaxis()

# One scatter call per xG band instead of one per shot.  Every band always
# yields a legend handle, even when no shot falls into it, so plt.legend can
# no longer hit an unbound name.
xg_values = sortxg['xG']
xg_bands = (
    (xg_values >= 0.75,
     dict(color="#ee3e32", zorder=10, alpha=1, s=40)),
    ((xg_values < 0.75) & (xg_values >= 0.5),
     dict(color="#f68838", zorder=8, alpha=0.75, s=30)),
    ((xg_values < 0.5) & (xg_values >= 0.25),
     dict(color="#fbb021", zorder=6, alpha=0.5, s=20)),
    # Everything else, i.e. whatever the three tests above reject.
    (~(xg_values >= 0.25),
     dict(color="#1b8a5a", zorder=4, alpha=0.25, s=10)),
)
band_handles = []
for in_band, marker_style in xg_bands:
    band_locations = sortxg.loc[in_band, 'location']
    xs = [spot[0] for spot in band_locations]
    ys = [spot[1] for spot in band_locations]
    band_handles.append(plt.scatter(xs, ys, edgecolors="none", **marker_style))
g, o, a, b = band_handles

plt.axis('off')
plt.legend((g,o,a,b),('>=0.75','>=0.5','>=0.25','<0.25'),scatterpoints=1,loc=2,title = 'xG Value',fontsize='small', fancybox=True)
plt.savefig('xgXGBmodelFreezeFrame.png')
plt.show()
# NOTE(review): other cells import this module as `statsbombpitch`; on a
# case-sensitive file system only one spelling can be right -- confirm.
import StatsbombPitch as sb
sb.sb_pitch("#195905","#faf0e6","vertical","half")

# One scatter call per xG band instead of one per shot.  Every band always
# yields a legend handle, even when no shot falls into it, so plt.legend can
# no longer hit an unbound name.
xg_values = sortxg['xG']
xg_bands = (
    (xg_values >= 0.75,
     dict(color="#ee3e32", zorder=10, alpha=1, s=40)),
    ((xg_values < 0.75) & (xg_values >= 0.5),
     dict(color="#f68838", zorder=8, alpha=0.75, s=30)),
    ((xg_values < 0.5) & (xg_values >= 0.25),
     dict(color="#fbb021", zorder=6, alpha=0.5, s=20)),
    # Everything else, i.e. whatever the three tests above reject.
    (~(xg_values >= 0.25),
     dict(color="#1b8a5a", zorder=4, alpha=0.25, s=10)),
)
band_handles = []
for in_band, marker_style in xg_bands:
    band_locations = sortxg.loc[in_band, 'location']
    # Vertical pitch: the location's y goes on the horizontal axis.
    across = [spot[1] for spot in band_locations]
    along = [spot[0] for spot in band_locations]
    band_handles.append(plt.scatter(across, along, edgecolors="none", **marker_style))
g, o, a, b = band_handles

plt.axis('off')
# framealpha is an opacity and must lie in [0, 1]; 1 is a fully opaque frame.
plt.legend((g,o,a,b),('>=0.75','>=0.5','>=0.25','<0.25'),scatterpoints=1,loc=3,title = 'xG Value',fontsize='small', fancybox=True,edgecolor = 'black',framealpha = 1)
plt.show()
# NOTE(review): other cells import this module as `statsbombpitch`; on a
# case-sensitive file system only one spelling can be right -- confirm.
import StatsbombPitch as sb
sb.sb_pitch("#195905","#faf0e6","vertical","half")

# Same half-pitch plot, with the lowest xG band drawn in black.  One scatter
# call per band; every band always yields a legend handle, even when empty,
# so plt.legend can no longer hit an unbound name.
xg_values = sortxg['xG']
xg_bands = (
    (xg_values >= 0.75,
     dict(color="#ee3e32", zorder=10, alpha=1, s=40)),
    ((xg_values < 0.75) & (xg_values >= 0.5),
     dict(color="#f68838", zorder=8, alpha=0.75, s=30)),
    ((xg_values < 0.5) & (xg_values >= 0.25),
     dict(color="#fbb021", zorder=6, alpha=0.5, s=20)),
    # Everything else, i.e. whatever the three tests above reject.
    (~(xg_values >= 0.25),
     dict(color="black", zorder=4, alpha=0.25, s=10)),
)
band_handles = []
for in_band, marker_style in xg_bands:
    band_locations = sortxg.loc[in_band, 'location']
    # Vertical pitch: the location's y goes on the horizontal axis.
    across = [spot[1] for spot in band_locations]
    along = [spot[0] for spot in band_locations]
    band_handles.append(plt.scatter(across, along, edgecolors="none", **marker_style))
g, o, a, b = band_handles

plt.axis('off')
# framealpha is an opacity and must lie in [0, 1]; 1 is a fully opaque frame.
plt.legend((g,o,a,b),('>=0.75','>=0.5','>=0.25','<0.25'),scatterpoints=1,loc=3,title = 'xG Value',fontsize='small', fancybox=True,edgecolor = 'black',framealpha = 1)
plt.show()
# Predict xG for one hand-built shot.  The input has to be a single ROW whose
# column names and order match the features xgb_model was fitted on; a bare
# list wrapped in pd.DataFrame becomes one unnamed column and is rejected
# with "feature_names mismatch".
sample_shot = pd.DataFrame([{
    'Distance': 12.55,
    'Angle': 37.156,
    'UnderPressure': 0,
    'ShotType': 4,
    'ShotBodyPart': 2,
    'ShotTechnique': 2,
    'ShotFirstTime': 0,
    'ShotOneonOne': 0,
    # NOTE(review): the original 8-value sample had no InFrontofGoal entry;
    # 0 (nobody between shot and goal) is assumed here -- adjust as needed.
    'InFrontofGoal': 0,
}])
xgb_model.predict_proba(sample_shot)[:,1]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-43-9dff4c73a403> in <module>()
----> 1 xgb_model.predict_proba(pd.DataFrame([12.55,37.156,0,4,2,2,0,0]))[:,1]

/usr/local/lib/python3.6/dist-packages/xgboost/sklearn.py in predict_proba(self, data, ntree_limit, validate_features)
    832         class_probs = self.get_booster().predict(test_dmatrix,
    833                                                  ntree_limit=ntree_limit,
--> 834                                                  validate_features=validate_features)
    835         if self.objective == "multi:softprob":
    836             return class_probs

/usr/local/lib/python3.6/dist-packages/xgboost/core.py in predict(self, data, output_margin, ntree_limit, pred_leaf, pred_contribs, approx_contribs, pred_interactions, validate_features)
   1282 
   1283         if validate_features:
-> 1284             self._validate_features(data)
   1285 
   1286         length = c_bst_ulong()

/usr/local/lib/python3.6/dist-packages/xgboost/core.py in _validate_features(self, data)
   1688 
   1689                 raise ValueError(msg.format(self.feature_names,
-> 1690                                             data.feature_names))
   1691 
   1692     def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True):

ValueError: feature_names mismatch: ['Distance', 'Angle', 'UnderPressure', 'ShotType', 'ShotBodyPart', 'ShotTechnique', 'ShotFirstTime', 'ShotOneonOne'] ['0']
expected ShotOneonOne, ShotBodyPart, ShotTechnique, ShotType, ShotFirstTime, Distance, UnderPressure, Angle in input data
training data did not have the following fields: 0