TeamRating/scrap.py
jakub.kowalczyk bb11bc87cc Changes
2020-05-22 10:30:55 +02:00

107 lines
4.0 KiB
Python

import requests
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
class scrap:
    """Scrape league tables and club values from transfermarkt.pl pages.

    Each public method downloads one page, parses it with BeautifulSoup's
    ``html.parser`` and returns a plain ``dict`` keyed by team name.  Every
    method accepts an optional ``url`` so other seasons or matchday ranges
    can be scraped; when omitted, the URL that was originally hard-coded
    for that method is used.
    """

    # Reference URLs (competition PL1):
    #   Season 18/19, matchday 10:
    #     https://www.transfermarkt.pl/ekstraklasa/formtabelle/wettbewerb/PL1?saison_id=2018&min=1&max=10
    #   Matchday 15:
    #     https://www.transfermarkt.pl/ekstraklasa/formtabelle/wettbewerb/PL1?saison_id=2018&min=1&max=15
    #   Matchday 20:
    #     https://www.transfermarkt.pl/ekstraklasa/formtabelle/wettbewerb/PL1?saison_id=2018&min=1&max=20
    #   Club values:
    #     https://www.transfermarkt.pl/pko-ekstraklasa/marktwerteverein/wettbewerb/PL1/plus/?stichtag=2018-10-01
    #   Previous season:
    #     https://www.transfermarkt.pl/pko-ekstraklasa/tabelle/wettbewerb/PL1?saison_id=2017
    #
    # NOTE(review): the default form-table and previous-season URLs below use
    # saison_id=2017 and saison_id=2016, i.e. one season earlier than the
    # reference URLs above -- confirm which season is intended.
    TEAM_VALUE_URL = 'https://www.transfermarkt.pl/pko-ekstraklasa/marktwerteverein/wettbewerb/PL1/plus/?stichtag=2018-10-01'
    FORM_TABLE_URL = 'https://www.transfermarkt.pl/ekstraklasa/formtabelle/wettbewerb/PL1?saison_id=2017&min=1&max=10'
    PREVIOUS_TABLE_URL = 'https://www.transfermarkt.pl/pko-ekstraklasa/tabelle/wettbewerb/PL1?saison_id=2016'

    # A browser-like User-Agent is sent with every request.
    REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}
    # Seconds to wait for the server; without a timeout requests may block
    # indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30
    # Only every VALUES_PER_TEAM-th collected value is assigned to a team
    # (presumably the page lists two value columns per club -- confirm
    # against the live page).
    VALUES_PER_TEAM = 2

    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup tree.

        Raises:
            requests.RequestException: on connection problems, timeouts or
                a non-2xx HTTP status.
        """
        response = requests.get(
            url,
            headers=self.REQUEST_HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _require(node, description):
        """Return ``node``, or raise ``ValueError`` if the lookup gave None."""
        if node is None:
            raise ValueError(
                'unexpected page layout: {} not found'.format(description)
            )
        return node

    @staticmethod
    def _parse_market_value(text):
        """Return the leading decimal number of ``text`` as a float.

        A decimal comma is converted to a point first.  Whatever follows
        the number (e.g. a unit suffix) is not interpreted.  Returns
        ``None`` when the text does not start with ``digits.digits``.
        """
        match = re.match(r'[0-9]+[.][0-9]+', text.strip().replace(',', '.'))
        if match is None:
            return None
        return float(match.group())

    @staticmethod
    def _pair_teams_with_values(teams, values, stride=2):
        """Map each distinct team to ``values[0]``, ``values[stride]``, ...

        Duplicate team names are collapsed, keeping first-seen order.

        Raises:
            ValueError: if ``values`` is too short for the number of teams.
        """
        unique_teams = list(OrderedDict.fromkeys(teams))
        if unique_teams:
            needed = (len(unique_teams) - 1) * stride + 1
            if len(values) < needed:
                raise ValueError(
                    'expected at least {} values for {} teams, got {}'.format(
                        needed, len(unique_teams), len(values)
                    )
                )
        return {
            team: values[index * stride]
            for index, team in enumerate(unique_teams)
        }

    @staticmethod
    def _parse_form_row(row_text):
        """Split the text of one table row into ``(team_name, numbers)``.

        Non-breaking spaces are removed and the text is split on newlines.
        Every field starting with an (optionally negative) integer adds
        that integer as a float to ``numbers`` -- so ``'15:8'`` yields
        ``15.0``.  The last non-blank field containing no digit at all is
        taken as the team name (right-stripped); ``None`` if there is none.
        """
        team_name = None
        numbers = []
        for field in row_text.replace(u'\xa0', '').split('\n'):
            number = re.match(r'-?[0-9]+', field)
            if number:
                numbers.append(float(number.group()))
            label = re.match(r'\D+$', field)
            # Blank fields must not overwrite a name that was already found.
            if label and label.group().strip():
                team_name = label.group().rstrip()
        return team_name, numbers

    @classmethod
    def _rank_form_rows(cls, row_texts):
        """Build ``{team_name: [rank, number, ...]}`` from row texts.

        Rows lacking a team name or any number are skipped and do not
        consume a rank.  The first number of every kept row is replaced by
        a running rank (an ``int`` starting at 1).
        """
        table = {}
        rank = 1
        for row_text in row_texts:
            team_name, numbers = cls._parse_form_row(row_text)
            if team_name is None or not numbers:
                continue
            numbers[0] = rank
            rank += 1
            table[team_name] = numbers
        return table

    @staticmethod
    def _rank_names(cell_texts):
        """Return ``{name: place}`` with places counted from 1 in order.

        Newlines are removed from each text and trailing whitespace is
        stripped before it is used as the key.
        """
        return {
            text.replace(u'\n', '').rstrip(): place
            for place, text in enumerate(cell_texts, start=1)
        }

    def scrapTeamValue(self, url=None):
        """Return ``{team_name: value}`` scraped from the club-value page.

        Team names come from the ``title`` attribute of links inside the
        ``td.rechts`` cells of ``table.items``; values come from those
        cells whose text contains a euro sign.

        Args:
            url: page to scrape; defaults to ``TEAM_VALUE_URL``.

        Raises:
            requests.RequestException: if the download fails.
            ValueError: if the page layout is not the expected one or a
                euro amount cannot be parsed.
        """
        soup = self._fetch_soup(url or self.TEAM_VALUE_URL)
        body = self._require(soup.body, '<body>')
        table = self._require(
            body.find('table', {'class': 'items'}), 'table.items'
        )
        table_body = self._require(table.find('tbody'), 'tbody of table.items')

        teams = []
        values = []
        for cell in table_body.find_all('td', {'class': 'rechts'}):
            for link in cell.find_all('a'):
                if link.has_attr('title'):
                    teams.append(link['title'].rstrip())
            if re.match(r'.+€', cell.text):
                value = self._parse_market_value(cell.text)
                if value is None:
                    raise ValueError(
                        'cannot parse a value from {!r}'.format(cell.text)
                    )
                values.append(value)
        return self._pair_teams_with_values(
            teams, values, self.VALUES_PER_TEAM
        )

    def scrapTeamsTable(self, url=None):
        """Return ``{team_name: [rank, number, ...]}`` from the form table.

        Every ``tr`` of every ``div.responsive-table`` inside the
        ``div`` with class ``large-8 columns`` is parsed; ranks run
        continuously across all of those tables.

        Args:
            url: page to scrape; defaults to ``FORM_TABLE_URL``.

        Raises:
            requests.RequestException: if the download fails.
            ValueError: if the page layout is not the expected one.
        """
        soup = self._fetch_soup(url or self.FORM_TABLE_URL)
        body = self._require(soup.body, '<body>')
        column = self._require(
            body.find('div', {'class': 'large-8 columns'}),
            'div with class "large-8 columns"',
        )

        row_texts = []
        for wrapper in column.find_all('div', {'class': 'responsive-table'}):
            table_body = self._require(
                wrapper.find('tbody'), 'tbody inside div.responsive-table'
            )
            for row in table_body.find_all('tr'):
                row_texts.append(row.text)
        return self._rank_form_rows(row_texts)

    def scrapPreviousPlace(self, url=None):
        """Return ``{team_name: place}`` from a league-table page.

        Places are assigned from 1 in the order in which the
        ``td`` cells with class ``no-border-links hauptlink`` appear inside
        the page's ``div.responsive-table`` elements.

        Args:
            url: page to scrape; defaults to ``PREVIOUS_TABLE_URL``.

        Raises:
            requests.RequestException: if the download fails.
            ValueError: if the page has no ``<body>``.
        """
        soup = self._fetch_soup(url or self.PREVIOUS_TABLE_URL)
        body = self._require(soup.body, '<body>')

        cell_texts = []
        for wrapper in body.find_all('div', {'class': 'responsive-table'}):
            cells = wrapper.find_all(
                'td', {'class': 'no-border-links hauptlink'}
            )
            for cell in cells:
                cell_texts.append(cell.text)
        return self._rank_names(cell_texts)