# Transfermarkt Ekstraklasa scraper
import requests
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
from collections import OrderedDict
|
|
|
|
class scrap:
    """Scraper for Polish Ekstraklasa statistics on transfermarkt.pl.

    Reference URLs (kept from the original notes):
      * Season 18/19, round 10:
        https://www.transfermarkt.pl/ekstraklasa/formtabelle/wettbewerb/PL1?saison_id=2018&min=1&max=10
      * Rounds 15 / 20: same URL with max=15 / max=20.
      * Club market values:
        https://www.transfermarkt.pl/pko-ekstraklasa/marktwerteverein/wettbewerb/PL1/plus/?stichtag=2018-10-01
      * Previous-season final table:
        https://www.transfermarkt.pl/pko-ekstraklasa/tabelle/wettbewerb/PL1?saison_id=2017
    """

    # transfermarkt.pl rejects the default python-requests User-Agent.
    _HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def _fetchBody(self, url):
        """Download *url* and return the parsed <body> of the HTML document.

        A timeout is set so a stalled connection cannot hang the scraper
        forever (the original code had none).
        """
        req = requests.get(url, headers=self._HEADERS, timeout=30)
        return BeautifulSoup(req.text, 'html.parser').body

    def scrapTeamValue(self):
        """Return {team name: market value (float)} from the market-value page.

        Each team row carries two euro-amount cells (value at the reference
        date and current value), so every second parsed number is paired
        with one team.
        """
        url = ('https://www.transfermarkt.pl/pko-ekstraklasa/marktwerteverein/'
               'wettbewerb/PL1/plus/?stichtag=2018-10-01')
        body = self._fetchBody(url)
        cells = (body.find('table', {'class': 'items'})
                     .find('tbody')
                     .find_all('td', {'class': 'rechts'}))
        teams = []
        teamValues = []
        for cell in cells:
            # Team names come from the title attribute of anchors in the cell.
            for link in cell.find_all('a'):
                if link.has_attr('title'):
                    teams.append(link['title'].rstrip())
            # Only cells containing a euro amount carry a numeric value.
            if re.match(r'.+€', cell.text):
                num = re.match(r'[0-9]+[.][0-9]+', cell.text.replace(',', '.'))
                # Guard: a euro cell without a decimal number would have
                # crashed the original on num.group(); skip it instead.
                if num:
                    teamValues.append(float(num.group()))
        # Each team appears once per value cell; deduplicate preserving order.
        teams = list(OrderedDict.fromkeys(teams))
        # Two value cells per team -> take every other parsed number.
        return {team: value for team, value in zip(teams, teamValues[::2])}

    def scrapTeamsTable(self):
        """Return {team name: [rank, stat, stat, ...]} from the form tables.

        Every row of each responsive table is reduced to the numbers it
        contains; the first number is replaced by a running rank counter
        spanning all tables.  When a team occurs in more than one table the
        later row wins (plain dict overwrite, as in the original).
        """
        url = ('https://www.transfermarkt.pl/ekstraklasa/formtabelle/'
               'wettbewerb/PL1?saison_id=2017&min=1&max=10')
        body = self._fetchBody(url)
        container = body.find('div', {'class': 'large-8 columns'})
        rank = 1
        names = []
        stats = []
        for tbl in container.find_all('div', {'class': 'responsive-table'}):
            for row in tbl.find('tbody').find_all('tr'):
                text = row.text.replace(u'\xa0', '')
                numbers = []
                teamName = None
                for token in text.split('\n'):
                    numMatch = re.match(r'-?[0-9]+', token)
                    if numMatch:
                        numbers.append(float(numMatch.group()))
                    # A token containing no digits at all is the team name.
                    nameMatch = re.match(r'\D+$', token)
                    if nameMatch:
                        teamName = nameMatch.group().rstrip()
                # The first parsed number is the table's own rank column;
                # overwrite it with a counter running across all tables.
                numbers[0] = rank
                rank += 1
                names.append(teamName)
                stats.append(numbers)
        return dict(zip(names, stats))

    def scrapPreviousPlace(self):
        """Return {team name: final place} of the previous season's table.

        Row order on the page equals league position, starting at 1.
        """
        url = ('https://www.transfermarkt.pl/pko-ekstraklasa/tabelle/'
               'wettbewerb/PL1?saison_id=2016')
        body = self._fetchBody(url)
        teams = []
        for tbl in body.find_all('div', {'class': 'responsive-table'}):
            for cell in tbl.find_all('td',
                                     {'class': 'no-border-links hauptlink'}):
                teams.append(cell.text.replace(u'\n', '').rstrip())
        return {team: place for place, team in enumerate(teams, start=1)}
|