100 KiB
100 KiB
import pandas as pd
import numpy as np
scores_data = pd.read_csv('dane/scores.csv')
scores_data
Wk | Day | Date | Time | Home | Score | Away | Attendance | Venue | Referee | Match Report | Notes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | Fri | 2020-08-21 | 18:00 | Zagłębie Lubin | 2–1 | Lech Poznań | 3968.0 | Stadion Zagłębia Lubin | Bartosz Frankowski | Match Report | NaN |
1 | 1.0 | Sat | 2020-08-22 | 15:00 | Cracovia | 2–1 | Pogoń Szczecin | 4053.0 | Stadion Cracovii | Paweł Raczkowski | Match Report | NaN |
2 | 1.0 | Sat | 2020-08-22 | 17:30 | Śląsk Wrocław | 2–0 | Piast Gliwice | 5259.0 | Stadion Miejski | Wojciech Myć | Match Report | NaN |
3 | 1.0 | Sat | 2020-08-22 | 20:00 | RKS Raków | 1–2 | Legia Warsaw | 1985.0 | Stadion GKS-u | Jarosław Przybył | Match Report | NaN |
4 | 1.0 | Sun | 2020-08-23 | 12:30 | Wisła Płock | 1–1 | Stal Mielec | 1318.0 | Stadion im. Kazimierza Górskiego | Sebastian Jarzębak | Match Report | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
275 | 30.0 | Sun | 2021-05-16 | 17:30 | Pogoń Szczecin | 1–3 | RKS Raków | NaN | Stadion Miejski im. Floriana Krygiera | Sebastian Krasny | Match Report | NaN |
276 | 30.0 | Sun | 2021-05-16 | 17:30 | Piast Gliwice | 2–3 | Wisła Kraków | NaN | Stadion Miejski | Mariusz Zlotek | Match Report | NaN |
277 | 30.0 | Sun | 2021-05-16 | 17:30 | Cracovia | 0–1 | Warta Poznań | 3670.0 | Stadion Cracovii | Pawel Malec | Match Report | NaN |
278 | 30.0 | Sun | 2021-05-16 | 17:30 | Śląsk Wrocław | 1–1 | Stal Mielec | NaN | Stadion Miejski | Tomasz Kwiatkowski | Match Report | NaN |
279 | 30.0 | Sun | 2021-05-16 | 17:30 | Wisła Płock | 4–0 | Zagłębie Lubin | NaN | Stadion im. Kazimierza Górskiego | Paweł Raczkowski | Match Report | NaN |
280 rows × 12 columns
scores_data = scores_data[['Day', 'Date', 'Score', 'Attendance', 'Home', 'Away']]
scores_data = scores_data.dropna()
len(scores_data)
45
previous_won = []
win_history = {}
for idx, row in scores_data.iterrows():
score = row["Score"].split("–")
won = score[0] > score[1]
previous_won.append(win_history.get(row["Home"], False))
win_history[row["Home"]] = won
win_history[row["Away"]] = not won
previous_won
[False, False, False, False, False, False, False, False, True, True, True, True, True, False, False, False, True, True, True, False, True, True, True, True, True, True, False, False, True, False, True, True, True, True, True, True, True, False, True, False, False, True, False, True, False]
# PIA Piast Gliwice Simple gold crown.svg 1
# LEG Legia Warszawa 2
# LGD Lechia Gdańsk Simple gold cup.svg 3
# CRA Cracovia 4
# JAG Jagielscores_datalonia Białystok 5
# ZLU Zagłębie Lubin 6
# POG Pogoń Szczecin 7
# LPO Lech Poznań 8
# WKR Wisła Kraków 9
# KOR Korona Kielce 10
# GZA Górnik Zabrze 11
# ARK Arka Gdynia 12
# ŚLĄ Śląsk Wrocław 13
# WPŁ Wisła Płock 14
win_table_20 = [
"Piast Gliwice",
"Legia Warszawa",
"Lechia Gdańsk",
"Cracovia",
"Jagiellonia Białystok",
"Zagłębie Lubin",
"Pogoń Szczecin",
"Lech Poznań",
"Wisła Kraków",
"Korona Kielce",
"Górnik Zabrze",
"Arka Gdynia",
"Śląsk Wrocław",
"Wisła Płock",
]
win_table_4 = [
"Piast Gliwice",
"Legia Warszawa",
"Lechia Gdańsk",
"Cracovia"
]
scores_data['Date'][0]
'2020-08-21'
!pip3 install wolframalpha
Defaulting to user installation because normal site-packages is not writeable Collecting wolframalpha Downloading wolframalpha-5.0.0-py3-none-any.whl (7.5 kB) Collecting jaraco.context Downloading jaraco.context-4.1.2-py3-none-any.whl (4.7 kB) Collecting more-itertools Downloading more_itertools-8.14.0-py3-none-any.whl (52 kB) Collecting xmltodict Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB) Installing collected packages: xmltodict, more-itertools, jaraco.context, wolframalpha Successfully installed jaraco.context-4.1.2 more-itertools-8.14.0 wolframalpha-5.0.0 xmltodict-0.13.0
WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: Ignoring invalid distribution -ip (c:\users\mrpol\appdata\roaming\python\python39\site-packages) WARNING: You are using pip version 21.1.1; however, version 22.2.2 is available. You should consider upgrading via the 'c:\Program Files\Python39\python.exe -m pip install --upgrade pip' command.
import wolframalpha
import time
import re
def check_weather(date: str, city: str = 'Warsaw') -> int:
api_id = '5KAEPX-EXX246XAW7'
question = 'Weather ' + date + ' ' + city
client = wolframalpha.Client(api_id)
# time.sleep(1)
res = client.query(question)
a = str(res)
result = re.findall(r'average: \d+ °C', a)[0]
temp = re.search(r'\d+', result).group()
return int(temp)
check_weather('2020-08-22')
23
scores_data['Day'].unique()
days = {
'Mon': 1,
'Tue': 2,
'Wed': 3,
'Thu': 4,
'Fri': 5,
'Sat': 6,
'Sun': 7,
'nan': 0,
}
days_num = []
for d in scores_data['Day']:
days_num.append(days[d])
is_home_top = []
is_home_top_4 = []
for d in scores_data['Home']:
is_home_top.append(int(d in win_table_20))
is_home_top_4.append(int(d in win_table_4))
is_away_top = []
is_away_top_4 = []
for d in scores_data['Away']:
is_away_top.append(int(d in win_table_20))
is_away_top_4.append(int(d in win_table_4))
# weather = []
# for d in scores_data['Date'][-6:]:
# temp = check_weather(d)
# weather.append(temp)
attendedce = [x for x in scores_data['Attendance']]
#weather
weather = ['23', '23', '23', '23', '20', '20', '20', '19', '17', '17', '18', '18', '18', '17', '17', '17', '14', '14', '16', '16', '16', '17', '17', '18', '12', '12', '12', '12', '12', '13', '13', '15', '19', '19', '19', '19', '19', '11', '14', '14', '17', '17', '18', '14', '14']
len(weather), len(scores_data['Date'])
(45, 45)
final_data = np.array([days_num, is_home_top, is_away_top, weather, attendedce])
final_data
array([['5', '6', '6', '6', '7', '7', '7', '1', '5', '5', '6', '6', '6', '7', '7', '7', '5', '5', '6', '6', '6', '7', '7', '1', '5', '5', '6', '6', '6', '7', '7', '1', '5', '5', '6', '6', '6', '1', '5', '5', '6', '6', '7', '7', '7'], ['1', '1', '1', '0', '1', '0', '1', '0', '1', '0', '1', '1', '0', '1', '0', '1', '0', '1', '1', '0', '1', '1', '1', '0', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '0', '0', '0', '1', '0', '1', '1', '1', '1'], ['1', '1', '1', '0', '0', '1', '0', '1', '0', '1', '0', '1', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '1', '0', '0', '0', '1', '0', '0', '1', '1', '1', '1', '0', '1', '1', '1', '0'], ['23', '23', '23', '23', '20', '20', '20', '19', '17', '17', '18', '18', '18', '17', '17', '17', '14', '14', '16', '16', '16', '17', '17', '18', '12', '12', '12', '12', '12', '13', '13', '15', '19', '19', '19', '19', '19', '11', '14', '14', '17', '17', '18', '14', '14'], ['3968.0', '4053.0', '5259.0', '1985.0', '1318.0', '2381.0', '7368.0', '5070.0', '2486.0', '3420.0', '5424.0', '5044.0', '10000.0', '2578.0', '4308.0', '8226.0', '5065.0', '1500.0', '5439.0', '1872.0', '11535.0', '1877.0', '10270.0', '1675.0', '4397.0', '7007.0', '4507.0', '1821.0', '11000.0', '2533.0', '17546.0', '2649.0', '1411.0', '12492.0', '5127.0', '4877.0', '5881.0', '2714.0', '1631.0', '6438.0', '2987.0', '2882.0', '3215.0', '4890.0', '3670.0']], dtype='<U32')
import matplotlib.pyplot as plt
weather = [float(t) for t in weather]
plt.plot(weather, attendedce, 'bo')
[<matplotlib.lines.Line2D at 0x200ef599d00>]
r = np.corrcoef(np.array(weather), np.array(attendedce))
r
array([[ 1. , -0.11323007], [-0.11323007, 1. ]])
import matplotlib.pyplot as plt
plt.plot(days_num, attendedce, 'bo')
[<matplotlib.lines.Line2D at 0x200ef7433a0>]
days_num = [float(x) for x in days_num]
r = np.corrcoef(np.array(days_num), np.array(attendedce))
r
array([[1. , 0.1964238], [0.1964238, 1. ]])
import matplotlib.pyplot as plt
plt.plot(is_home_top, attendedce, 'bo')
[<matplotlib.lines.Line2D at 0x7feb0ee77a00>]
import matplotlib.pyplot as plt
plt.plot(is_home_top_4, attendedce, 'bo')
[<matplotlib.lines.Line2D at 0x200f07a6ac0>]
plt.plot(is_away_top, attendedce, 'bo')
[<matplotlib.lines.Line2D at 0x7feb0ee5edf0>]
plt.plot(is_away_top_4, attendedce, 'bo')
[<matplotlib.lines.Line2D at 0x200f07dcc10>]
is_home_and_away_top = []
for i, j in zip(is_home_top, is_away_top):
is_home_and_away_top.append(i and j)
plt.plot(is_home_and_away_top, attendedce, 'bo')
[<matplotlib.lines.Line2D at 0x200f14b06a0>]
temp = [float(x) for x in is_home_and_away_top]
r = np.corrcoef(np.array(temp), np.array(attendedce))
r
array([[1. , 0.1384707], [0.1384707, 1. ]])
plt.plot(previous_won, attendedce, 'bo')
[<matplotlib.lines.Line2D at 0x200f58aaa60>]