392 KiB
392 KiB
Analiza czynników wpływających na liczbę kibiców na meczach baseballowej ligi MLB
import pandas as pd
# Load the game log from the CSV file into a DataFrame and display it.
csv_path = "baseball_reference_2016_clean.csv"
data = pd.read_csv(csv_path)
data
Unnamed: 0 | attendance | away_team | away_team_errors | away_team_hits | away_team_runs | date | field_type | game_type | home_team | ... | temperature | wind_speed | wind_direction | sky | total_runs | game_hours_dec | season | home_team_win | home_team_loss | home_team_outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 40030.0 | New York Mets | 1 | 7 | 3 | 2016-04-03 | on grass | Night Game | Kansas City Royals | ... | 74.0 | 14.0 | from Right to Left | Sunny | 7 | 3.216667 | regular season | 1 | 0 | Win |
1 | 1 | 21621.0 | Philadelphia Phillies | 0 | 5 | 2 | 2016-04-06 | on grass | Night Game | Cincinnati Reds | ... | 55.0 | 24.0 | from Right to Left | Overcast | 5 | 2.383333 | regular season | 1 | 0 | Win |
2 | 2 | 12622.0 | Minnesota Twins | 0 | 5 | 2 | 2016-04-06 | on grass | Night Game | Baltimore Orioles | ... | 48.0 | 7.0 | out to Leftfield | Unknown | 6 | 3.183333 | regular season | 1 | 0 | Win |
3 | 3 | 18531.0 | Washington Nationals | 0 | 8 | 3 | 2016-04-06 | on grass | Night Game | Atlanta Braves | ... | 65.0 | 10.0 | from Right to Left | Cloudy | 4 | 2.883333 | regular season | 0 | 1 | Loss |
4 | 4 | 18572.0 | Colorado Rockies | 1 | 8 | 4 | 2016-04-06 | on grass | Day Game | Arizona Diamondbacks | ... | 77.0 | 0.0 | in unknown direction | In Dome | 7 | 2.650000 | regular season | 0 | 1 | Loss |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2458 | 2458 | 31042.0 | Toronto Blue Jays | 2 | 7 | 5 | 2016-04-03 | on turf | Day Game | Tampa Bay Rays | ... | 72.0 | 0.0 | in unknown direction | In Dome | 8 | 2.850000 | regular season | 0 | 1 | Loss |
2459 | 2459 | 39500.0 | St. Louis Cardinals | 0 | 5 | 1 | 2016-04-03 | on grass | Day Game | Pittsburgh Pirates | ... | 39.0 | 14.0 | out to Leftfield | Unknown | 5 | 3.033333 | regular season | 1 | 0 | Win |
2460 | 2460 | 20098.0 | San Francisco Giants | 0 | 6 | 3 | 2016-04-06 | on grass | Day Game | Milwaukee Brewers | ... | 66.0 | 0.0 | in unknown direction | In Dome | 7 | 3.316667 | regular season | 1 | 0 | Win |
2461 | 2461 | 17883.0 | Detroit Tigers | 0 | 13 | 7 | 2016-04-06 | on grass | Day Game | Miami Marlins | ... | 71.0 | 0.0 | in unknown direction | In Dome | 10 | 3.366667 | regular season | 0 | 1 | Loss |
2462 | 2462 | 10298.0 | Boston Red Sox | 1 | 10 | 6 | 2016-04-06 | on grass | Night Game | Cleveland Indians | ... | 60.0 | 7.0 | out to Leftfield | Unknown | 13 | 3.483333 | regular season | 1 | 0 | Win |
2463 rows × 26 columns
# Show the column labels of the loaded DataFrame.
data.columns
Index(['Unnamed: 0', 'attendance', 'away_team', 'away_team_errors', 'away_team_hits', 'away_team_runs', 'date', 'field_type', 'game_type', 'home_team', 'home_team_errors', 'home_team_hits', 'home_team_runs', 'start_time', 'venue', 'day_of_week', 'temperature', 'wind_speed', 'wind_direction', 'sky', 'total_runs', 'game_hours_dec', 'season', 'home_team_win', 'home_team_loss', 'home_team_outcome'], dtype='object')
# Show the distinct values that occur in the 'sky' column.
data['sky'].unique()
array(['Sunny', 'Overcast', 'Unknown', 'Cloudy', 'In Dome', 'Drizzle', 'Rain', 'Night'], dtype=object)
# One sub-frame per sky condition; rows whose 'sky' value is 'Unknown'
# are not assigned to any of these subsets.
sunny = data.loc[data['sky'].eq('Sunny')]
overcast = data.loc[data['sky'].eq('Overcast')]
cloudy = data.loc[data['sky'].eq('Cloudy')]
in_dome = data.loc[data['sky'].eq('In Dome')]
drizzle = data.loc[data['sky'].eq('Drizzle')]
rain = data.loc[data['sky'].eq('Rain')]
night = data.loc[data['sky'].eq('Night')]
Średnia liczba kibiców w zależności od pogody
import matplotlib.pyplot as plt
# Bar chart: mean attendance for each sky-condition subset.
weather_frames = [sunny, overcast, cloudy, in_dome, drizzle, rain, night]
tick_label = ['sunny', 'overcast', 'cloudy', 'in dome', 'drizzle', 'rain', 'night']
left = list(range(1, 8))  # x positions 1..7, one per subset
height = [frame['attendance'].mean() for frame in weather_frames]

# Only three colours are listed for the seven bars.
plt.bar(
    left,
    height,
    tick_label=tick_label,
    width=0.8,
    color=['blue', 'green', 'red'],
)
plt.xlabel('Weather')
plt.ylabel('Attendance')
plt.title('Attendance - Weather')
plt.show()
Mediana
import matplotlib.pyplot as plt
# Bar chart: median attendance for each sky-condition subset.
# The dict keeps insertion order, so labels and bar heights stay aligned.
median_by_weather = {
    'sunny': sunny['attendance'].median(),
    'overcast': overcast['attendance'].median(),
    'cloudy': cloudy['attendance'].median(),
    'in dome': in_dome['attendance'].median(),
    'drizzle': drizzle['attendance'].median(),
    'rain': rain['attendance'].median(),
    'night': night['attendance'].median(),
}
tick_label = list(median_by_weather.keys())
height = list(median_by_weather.values())
left = list(range(1, 8))  # x positions 1..7

bar_colors = ['blue', 'green', 'red']
plt.bar(left, height, tick_label=tick_label, width=0.8, color=bar_colors)
plt.xlabel('Weather')
plt.ylabel('Attendance')
plt.title('Attendance - Weather')
plt.show()
W nocy kibiców jest prawdopodobnie najwięcej, gdyż większa grupa odbiorców ma dostęp do meczów online z całego świata.
Pod kopułą może być najmniej widzów, gdyż takie stadiony mają mniejsze trybuny.
# Show the distinct values that occur in the 'day_of_week' column.
data['day_of_week'].unique()
array(['Sunday', 'Wednesday', 'Tuesday', 'Monday', 'Thursday', 'Saturday', 'Friday'], dtype=object)
# One sub-frame per day of the week, unpacked in Monday..Sunday order.
monday, tuesday, wednesday, thursday, friday, saturday, sunday = (
    data[data['day_of_week'] == day_name]
    for day_name in (
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    )
)
Średnia liczba kibiców w danym dniu tygodnia
import matplotlib.pyplot as plt
# Bar chart: mean attendance for each day-of-week subset (Monday..Sunday).
weekday_frames = (monday, tuesday, wednesday, thursday, friday, saturday, sunday)
tick_label = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
left = list(range(1, 8))  # x positions 1..7, one per day
height = [games['attendance'].mean() for games in weekday_frames]

plt.bar(
    left,
    height,
    tick_label=tick_label,
    width=0.8,
    color=['blue', 'green', 'red'],
)
plt.xlabel('Day')
plt.ylabel('Attendance')
plt.title('Attendance - Day')
plt.show()
Mediana
import matplotlib.pyplot as plt
# Bar chart: median attendance for each day-of-week subset (Monday..Sunday).
tick_label = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
left = []
height = []
# Walk labels and subsets in lockstep; x positions start at 1.
for position, (_, games) in enumerate(
    zip(tick_label, (monday, tuesday, wednesday, thursday, friday, saturday, sunday)),
    start=1,
):
    left.append(position)
    height.append(games['attendance'].median())

bar_colors = ['blue', 'green', 'red']
plt.bar(left, height, tick_label=tick_label, width=0.8, color=bar_colors)
plt.xlabel('Day')
plt.ylabel('Attendance')
plt.title('Attendance - Day')
plt.show()
Najwięcej kibiców jest w weekendy.
# Show the distinct values that occur in the 'home_team_outcome' column.
data['home_team_outcome'].unique()
array(['Win', 'Loss'], dtype=object)
# Split the games by the home team's result.
home_outcome = data['home_team_outcome']
win = data.loc[home_outcome == 'Win']
loss = data.loc[home_outcome == 'Loss']
Średnia liczba kibiców przy wygranej/przegranej gospodarzy
# Bar chart: mean attendance at home wins versus home losses.
tick_label = ['win', 'loss']
left = list(range(1, 3))  # x positions 1 and 2
height = [games['attendance'].mean() for games in (win, loss)]

plt.bar(
    left,
    height,
    tick_label=tick_label,
    width=0.8,
    color=['blue', 'red'],
)
plt.xlabel('Win')
plt.ylabel('Attendance')
plt.title('Attendance - Win')
plt.show()
Mediana
# Bar chart: median attendance at home wins versus home losses.
median_at_home_win = win['attendance'].median()
median_at_home_loss = loss['attendance'].median()

left = [1, 2]
height = [median_at_home_win, median_at_home_loss]
tick_label = ['win', 'loss']

bar_colors = ['blue', 'red']
plt.bar(left, height, tick_label=tick_label, width=0.8, color=bar_colors)
plt.xlabel('Win')
plt.ylabel('Attendance')
plt.title('Attendance - Win')
plt.show()
Wynik gospodarzy nie ma wpływu na frekwencję; raczej nie jest tak, że przegraną widać przed końcem meczu i przez to kibice wychodzą. A nawet jeśli tak się dzieje, to działa to w miarę równomiernie w obie strony.
# Show the distinct values that occur in the 'away_team' column.
data['away_team'].unique()
array(['New York Mets', 'Philadelphia Phillies', 'Minnesota Twins', 'Washington Nationals', 'Colorado Rockies', 'Seattle Mariners', 'Toronto Blue Jays', 'Los Angeles Dodgers', 'St. Louis Cardinals', 'Chicago White Sox', 'Houston Astros', 'San Francisco Giants', 'Detroit Tigers', 'Texas Rangers', 'San Diego Padres', 'Los Angeles Angels of Anaheim', 'Miami Marlins', 'Kansas City Royals', 'Pittsburgh Pirates', 'Cincinnati Reds', 'Atlanta Braves', 'New York Yankees', 'Chicago Cubs', 'Arizona Diamondbacks', 'Milwaukee Brewers', 'Baltimore Orioles', 'Cleveland Indians', 'Oakland Athletics', 'Boston Red Sox', 'Tampa Bay Rays'], dtype=object)
# Per-game attendance for games in which the New York Mets were the away team.
#
# The rows of `data` are not in calendar order (the preview shows the same
# date at both the start and the end of the frame), so the games are sorted by
# the 'date' column; the YYYY-MM-DD strings seen in the preview sort
# chronologically. A stable sort keeps the file order for games sharing a date.
# Without this, neighbouring bars would not be consecutive games.
mets = data[data['away_team'] == 'New York Mets'].sort_values('date', kind='stable')

# True where the home team won, i.e. where the visiting Mets lost.
# Built once instead of re-creating the whole list for every bar.
mets_lost = (mets['home_team_outcome'] == 'Win').tolist()

left = list(range(len(mets)))
height = mets['attendance'].tolist()
# Labels and colours are from the Mets' point of view:
# 'l' / red = loss, 'w' / blue = win.
tick_label = ['l' if lost else 'w' for lost in mets_lost]

plt.figure(figsize=(24, 3))  # width: 24, height: 3
plt.bar(left, height, tick_label=tick_label,
        width=0.5, color=['red' if lost else 'blue' for lost in mets_lost])
plt.xlabel('Win (w) or Lose (l)')
plt.ylabel('Attendance')
plt.title('Attendance - Win/Lose')
Text(0.5, 1.0, 'Attendance - Win/Lose')
# Per-game attendance for games in which the Philadelphia Phillies were the
# away team.
#
# The rows of `data` are not in calendar order (the preview shows the same
# date at both the start and the end of the frame), so the games are sorted by
# the 'date' column; the YYYY-MM-DD strings seen in the preview sort
# chronologically. A stable sort keeps the file order for games sharing a date.
# Without this, neighbouring bars would not be consecutive games.
philadelphia = data[data['away_team'] == 'Philadelphia Phillies'].sort_values('date', kind='stable')

# True where the home team won, i.e. where the visiting Phillies lost.
# Built once instead of re-creating the whole list for every bar.
phillies_lost = (philadelphia['home_team_outcome'] == 'Win').tolist()

left = list(range(len(philadelphia)))
height = philadelphia['attendance'].tolist()
# Labels and colours are from the Phillies' point of view:
# 'l' / red = loss, 'w' / blue = win.
tick_label = ['l' if lost else 'w' for lost in phillies_lost]

plt.figure(figsize=(24, 3))  # width: 24, height: 3
plt.bar(left, height, tick_label=tick_label,
        width=0.5, color=['red' if lost else 'blue' for lost in phillies_lost])
plt.xlabel('Win (w) or Lose (l)')
plt.ylabel('Attendance')
plt.title('Attendance - Win/Lose')
Text(0.5, 1.0, 'Attendance - Win/Lose')
Czasami można zauważyć, że po wygranym meczu na następny przychodzi więcej kibiców, ale nie zawsze – raczej nie jest to reguła.