ads/kibice_mlb/kibice.ipynb
2022-10-18 17:35:27 +02:00

392 KiB
Raw Blame History

Analiza czynników wpływających na liczbę kibiców na meczach baseballu w lidze MLB

import pandas as pd

# Load the cleaned 2016 game log; the relative path is resolved against the
# notebook's working directory.
csv_path = "baseball_reference_2016_clean.csv"
data = pd.read_csv(csv_path)

# Leave the frame as the last expression so the notebook renders it.
data
Unnamed: 0 attendance away_team away_team_errors away_team_hits away_team_runs date field_type game_type home_team ... temperature wind_speed wind_direction sky total_runs game_hours_dec season home_team_win home_team_loss home_team_outcome
0 0 40030.0 New York Mets 1 7 3 2016-04-03 on grass Night Game Kansas City Royals ... 74.0 14.0 from Right to Left Sunny 7 3.216667 regular season 1 0 Win
1 1 21621.0 Philadelphia Phillies 0 5 2 2016-04-06 on grass Night Game Cincinnati Reds ... 55.0 24.0 from Right to Left Overcast 5 2.383333 regular season 1 0 Win
2 2 12622.0 Minnesota Twins 0 5 2 2016-04-06 on grass Night Game Baltimore Orioles ... 48.0 7.0 out to Leftfield Unknown 6 3.183333 regular season 1 0 Win
3 3 18531.0 Washington Nationals 0 8 3 2016-04-06 on grass Night Game Atlanta Braves ... 65.0 10.0 from Right to Left Cloudy 4 2.883333 regular season 0 1 Loss
4 4 18572.0 Colorado Rockies 1 8 4 2016-04-06 on grass Day Game Arizona Diamondbacks ... 77.0 0.0 in unknown direction In Dome 7 2.650000 regular season 0 1 Loss
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2458 2458 31042.0 Toronto Blue Jays 2 7 5 2016-04-03 on turf Day Game Tampa Bay Rays ... 72.0 0.0 in unknown direction In Dome 8 2.850000 regular season 0 1 Loss
2459 2459 39500.0 St. Louis Cardinals 0 5 1 2016-04-03 on grass Day Game Pittsburgh Pirates ... 39.0 14.0 out to Leftfield Unknown 5 3.033333 regular season 1 0 Win
2460 2460 20098.0 San Francisco Giants 0 6 3 2016-04-06 on grass Day Game Milwaukee Brewers ... 66.0 0.0 in unknown direction In Dome 7 3.316667 regular season 1 0 Win
2461 2461 17883.0 Detroit Tigers 0 13 7 2016-04-06 on grass Day Game Miami Marlins ... 71.0 0.0 in unknown direction In Dome 10 3.366667 regular season 0 1 Loss
2462 2462 10298.0 Boston Red Sox 1 10 6 2016-04-06 on grass Night Game Cleveland Indians ... 60.0 7.0 out to Leftfield Unknown 13 3.483333 regular season 1 0 Win

2463 rows × 26 columns

data.columns
Index(['Unnamed: 0', 'attendance', 'away_team', 'away_team_errors',
       'away_team_hits', 'away_team_runs', 'date', 'field_type', 'game_type',
       'home_team', 'home_team_errors', 'home_team_hits', 'home_team_runs',
       'start_time', 'venue', 'day_of_week', 'temperature', 'wind_speed',
       'wind_direction', 'sky', 'total_runs', 'game_hours_dec', 'season',
       'home_team_win', 'home_team_loss', 'home_team_outcome'],
      dtype='object')

Pogoda

image

# List the distinct values of the 'sky' column to see which categories exist.
data['sky'].unique()
array(['Sunny', 'Overcast', 'Unknown', 'Cloudy', 'In Dome', 'Drizzle',
       'Rain', 'Night'], dtype=object)
# One sub-frame per charted sky condition. Rows whose 'sky' value is 'Unknown'
# do not get a subset of their own.
sunny, overcast, cloudy, in_dome, drizzle, rain, night = (
    data[data['sky'] == label]
    for label in ('Sunny', 'Overcast', 'Cloudy', 'In Dome', 'Drizzle', 'Rain', 'Night')
)

Średnia liczba kibiców w zależności od pogody

import matplotlib.pyplot as plt
  
# Bar chart of the mean attendance for each sky-condition subset.
# The mapping keeps every bar label next to the frame it summarises.
weather_subsets = {
    'sunny': sunny,
    'overcast': overcast,
    'cloudy': cloudy,
    'in dome': in_dome,
    'drizzle': drizzle,
    'rain': rain,
    'night': night,
}

tick_label = list(weather_subsets)
height = [games['attendance'].mean() for games in weather_subsets.values()]
left = list(range(1, len(height) + 1))  # bar positions 1..7

plt.bar(left, height, tick_label=tick_label,
        width=0.8, color=['blue', 'green', 'red'])

plt.xlabel('Weather')
plt.ylabel('Attendance')
plt.title('Attendance - Weather')

plt.show()

Mediana

import matplotlib.pyplot as plt
  
# Bar chart of the median attendance for each sky-condition subset.
# The mapping keeps every bar label next to the frame it summarises.
weather_subsets = {
    'sunny': sunny,
    'overcast': overcast,
    'cloudy': cloudy,
    'in dome': in_dome,
    'drizzle': drizzle,
    'rain': rain,
    'night': night,
}

tick_label = list(weather_subsets)
height = [games['attendance'].median() for games in weather_subsets.values()]
left = list(range(1, len(height) + 1))  # bar positions 1..7

plt.bar(left, height, tick_label=tick_label,
        width=0.8, color=['blue', 'green', 'red'])

plt.xlabel('Weather')
plt.ylabel('Attendance')
plt.title('Attendance - Weather')

plt.show()

W nocy kibiców jest prawdopodobnie najwięcej, gdyż większa grupa odbiorców z całego świata ma dostęp do meczów online.
Pod kopułą widzów może być najmniej, gdyż takie stadiony mają mniejsze trybuny.

Dzień tygodnia

image2

# List the distinct values of the 'day_of_week' column.
data['day_of_week'].unique()
array(['Sunday', 'Wednesday', 'Tuesday', 'Monday', 'Thursday', 'Saturday',
       'Friday'], dtype=object)
# One sub-frame per weekday, selected by the 'day_of_week' column.
monday, tuesday, wednesday, thursday, friday, saturday, sunday = (
    data[data['day_of_week'] == day_name]
    for day_name in ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
)

Średnia liczba kibiców w danym dniu tygodnia

import matplotlib.pyplot as plt
  
# Bar chart of the mean attendance for each weekday subset, Monday to Sunday.
weekday_subsets = {
    'monday': monday,
    'tuesday': tuesday,
    'wednesday': wednesday,
    'thursday': thursday,
    'friday': friday,
    'saturday': saturday,
    'sunday': sunday,
}

tick_label = list(weekday_subsets)
height = [games['attendance'].mean() for games in weekday_subsets.values()]
left = list(range(1, len(height) + 1))  # bar positions 1..7

plt.bar(left, height, tick_label=tick_label,
        width=0.8, color=['blue', 'green', 'red'])

plt.xlabel('Day')
plt.ylabel('Attendance')
plt.title('Attendance - Day')

plt.show()

Mediana

import matplotlib.pyplot as plt
  
# Bar chart of the median attendance for each weekday subset, Monday to Sunday.
weekday_subsets = {
    'monday': monday,
    'tuesday': tuesday,
    'wednesday': wednesday,
    'thursday': thursday,
    'friday': friday,
    'saturday': saturday,
    'sunday': sunday,
}

tick_label = list(weekday_subsets)
height = [games['attendance'].median() for games in weekday_subsets.values()]
left = list(range(1, len(height) + 1))  # bar positions 1..7

plt.bar(left, height, tick_label=tick_label,
        width=0.8, color=['blue', 'green', 'red'])

plt.xlabel('Day')
plt.ylabel('Attendance')
plt.title('Attendance - Day')

plt.show()

Najwięcej kibiców jest w weekendy.

Zwycięstwo / porażka gospodarzy

image3

# List the distinct values of the 'home_team_outcome' column.
data['home_team_outcome'].unique()
array(['Win', 'Loss'], dtype=object)
# Split the games by the home team's result.
win, loss = (
    data[data['home_team_outcome'] == result]
    for result in ('Win', 'Loss')
)

Średnia liczba kibiców przy wygranej/przegranej gospodarzy

# Bar chart of the mean attendance for home wins versus home losses.
outcome_subsets = [('win', win), ('loss', loss)]

tick_label = [name for name, _ in outcome_subsets]
height = [games['attendance'].mean() for _, games in outcome_subsets]
left = list(range(1, len(height) + 1))  # bar positions 1..2

plt.bar(left, height, tick_label=tick_label,
        width=0.8, color=['blue', 'red'])

plt.xlabel('Win')
plt.ylabel('Attendance')
plt.title('Attendance - Win')

plt.show()

Mediana

# Bar chart of the median attendance for home wins versus home losses.
outcome_subsets = [('win', win), ('loss', loss)]

tick_label = [name for name, _ in outcome_subsets]
height = [games['attendance'].median() for _, games in outcome_subsets]
left = list(range(1, len(height) + 1))  # bar positions 1..2

plt.bar(left, height, tick_label=tick_label,
        width=0.8, color=['blue', 'red'])

plt.xlabel('Win')
plt.ylabel('Attendance')
plt.title('Attendance - Win')

plt.show()

Wynik meczu nie ma wpływu na frekwencję: raczej nie jest tak, że przegraną widać przed końcem i przez to kibice wychodzą. A nawet jeśli tak się dzieje, to działa to w miarę równomiernie w obie strony.

Zwycięstwa w kolejnych meczach

image4

# List the distinct values of the 'away_team' column.
data['away_team'].unique()
array(['New York Mets', 'Philadelphia Phillies', 'Minnesota Twins',
       'Washington Nationals', 'Colorado Rockies', 'Seattle Mariners',
       'Toronto Blue Jays', 'Los Angeles Dodgers', 'St. Louis Cardinals',
       'Chicago White Sox', 'Houston Astros', 'San Francisco Giants',
       'Detroit Tigers', 'Texas Rangers', 'San Diego Padres',
       'Los Angeles Angels of Anaheim', 'Miami Marlins',
       'Kansas City Royals', 'Pittsburgh Pirates', 'Cincinnati Reds',
       'Atlanta Braves', 'New York Yankees', 'Chicago Cubs',
       'Arizona Diamondbacks', 'Milwaukee Brewers', 'Baltimore Orioles',
       'Cleveland Indians', 'Oakland Athletics', 'Boston Red Sox',
       'Tampa Bay Rays'], dtype=object)
# Games in which the New York Mets were the visiting team, put in date order
# so that neighbouring bars really are consecutive games. The CSV rows are not
# sorted by date; the ISO "YYYY-MM-DD" strings sort chronologically, and the
# stable sort keeps the file order for games played on the same day.
mets = data[data['away_team'] == 'New York Mets'].sort_values('date', kind='mergesort')

# 'home_team_outcome' describes the HOME side, so a home 'Win' is a Mets loss.
# Computed once instead of rebuilding the list for every bar.
mets_lost = (mets['home_team_outcome'] == 'Win').tolist()

left = list(range(len(mets)))
height = mets['attendance'].tolist()
tick_label = ['l' if lost else 'w' for lost in mets_lost]

plt.figure(figsize=(24, 3))  # width: 24, height: 3
plt.bar(left, height, tick_label=tick_label,
        width=0.5, color=['red' if lost else 'blue' for lost in mets_lost])

plt.xlabel('Win (w) or Lose (l)')
plt.ylabel('Attendance')
plt.title('Attendance - Win/Lose')

# Render explicitly, like the other chart cells, instead of echoing the
# Text object returned by plt.title.
plt.show()
Text(0.5, 1.0, 'Attendance - Win/Lose')
# Games in which the Philadelphia Phillies were the visiting team, put in date
# order so that neighbouring bars really are consecutive games. The CSV rows
# are not sorted by date; the ISO "YYYY-MM-DD" strings sort chronologically,
# and the stable sort keeps the file order for games played on the same day.
philadelphia = data[data['away_team'] == 'Philadelphia Phillies'].sort_values('date', kind='mergesort')

# 'home_team_outcome' describes the HOME side, so a home 'Win' is a Phillies
# loss. Computed once instead of rebuilding the list for every bar.
philadelphia_lost = (philadelphia['home_team_outcome'] == 'Win').tolist()

left = list(range(len(philadelphia)))
height = philadelphia['attendance'].tolist()
tick_label = ['l' if lost else 'w' for lost in philadelphia_lost]

plt.figure(figsize=(24, 3))  # width: 24, height: 3
plt.bar(left, height, tick_label=tick_label,
        width=0.5, color=['red' if lost else 'blue' for lost in philadelphia_lost])

plt.xlabel('Win (w) or Lose (l)')
plt.ylabel('Attendance')
plt.title('Attendance - Win/Lose')

# Render explicitly, like the other chart cells, instead of echoing the
# Text object returned by plt.title.
plt.show()
Text(0.5, 1.0, 'Attendance - Win/Lose')

Czasami widać, że po wygranym meczu na następny przychodzi więcej kibiców, ale nie zawsze – nie jest to raczej reguła.