# Transfermarkt Ekstraklasa scraper
import requests
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
from collections import OrderedDict
|
|
|
|
class scrap:
    """Scraper for Polish Ekstraklasa statistics on transfermarkt.pl.

    Reference URLs (kept from the original notes):
      * Season 18/19, round 10:
        https://www.transfermarkt.pl/ekstraklasa/formtabelle/wettbewerb/PL1?saison_id=2018&min=1&max=10
      * Rounds 15 / 20: same URL with max=15 / max=20.
      * Club market values:
        https://www.transfermarkt.pl/pko-ekstraklasa/marktwerteverein/wettbewerb/PL1/plus/?stichtag=2018-10-01
      * Previous-season final table:
        https://www.transfermarkt.pl/pko-ekstraklasa/tabelle/wettbewerb/PL1?saison_id=2017
    """

    # transfermarkt.pl rejects the default python-requests User-Agent.
    _HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def _fetchBody(self, url):
        """Download *url* and return the parsed <body> of the HTML document.

        A timeout is set so a stalled connection cannot hang the scraper
        forever (the original code had none).
        """
        req = requests.get(url, headers=self._HEADERS, timeout=30)
        return BeautifulSoup(req.text, 'html.parser').body

    def scrapTeamValue(self):
        """Return {team name: market value (float)} from the market-value page.

        Each team row carries two euro-amount cells (value at the reference
        date and current value), so every second parsed number is paired
        with one team.
        """
        url = ('https://www.transfermarkt.pl/pko-ekstraklasa/marktwerteverein/'
               'wettbewerb/PL1/plus/?stichtag=2018-10-01')
        body = self._fetchBody(url)
        cells = (body.find('table', {'class': 'items'})
                     .find('tbody')
                     .find_all('td', {'class': 'rechts'}))
        teams = []
        teamValues = []
        for cell in cells:
            # Team names come from the title attribute of anchors in the cell.
            for link in cell.find_all('a'):
                if link.has_attr('title'):
                    teams.append(link['title'].rstrip())
            # Only cells containing a euro amount carry a numeric value.
            if re.match(r'.+€', cell.text):
                num = re.match(r'[0-9]+[.][0-9]+', cell.text.replace(',', '.'))
                # Guard: a euro cell without a decimal number would have
                # crashed the original on num.group(); skip it instead.
                if num:
                    teamValues.append(float(num.group()))
        # Each team appears once per value cell; deduplicate preserving order.
        teams = list(OrderedDict.fromkeys(teams))
        # Two value cells per team -> take every other parsed number.
        return {team: value for team, value in zip(teams, teamValues[::2])}

    def scrapTeamsTable(self):
        """Return {team name: [rank, stat, stat, ...]} from the form tables.

        Every row of each responsive table is reduced to the numbers it
        contains; the first number is replaced by a running rank counter
        spanning all tables.  When a team occurs in more than one table the
        later row wins (plain dict overwrite, as in the original).
        """
        url = ('https://www.transfermarkt.pl/ekstraklasa/formtabelle/'
               'wettbewerb/PL1?saison_id=2017&min=1&max=10')
        body = self._fetchBody(url)
        container = body.find('div', {'class': 'large-8 columns'})
        rank = 1
        names = []
        stats = []
        for tbl in container.find_all('div', {'class': 'responsive-table'}):
            for row in tbl.find('tbody').find_all('tr'):
                text = row.text.replace(u'\xa0', '')
                numbers = []
                teamName = None
                for token in text.split('\n'):
                    numMatch = re.match(r'-?[0-9]+', token)
                    if numMatch:
                        numbers.append(float(numMatch.group()))
                    # A token containing no digits at all is the team name.
                    nameMatch = re.match(r'\D+$', token)
                    if nameMatch:
                        teamName = nameMatch.group().rstrip()
                # The first parsed number is the table's own rank column;
                # overwrite it with a counter running across all tables.
                numbers[0] = rank
                rank += 1
                names.append(teamName)
                stats.append(numbers)
        return dict(zip(names, stats))

    def scrapPreviousPlace(self):
        """Return {team name: final place} of the previous season's table.

        Row order on the page equals league position, starting at 1.
        """
        url = ('https://www.transfermarkt.pl/pko-ekstraklasa/tabelle/'
               'wettbewerb/PL1?saison_id=2016')
        body = self._fetchBody(url)
        teams = []
        for tbl in body.find_all('div', {'class': 'responsive-table'}):
            for cell in tbl.find_all('td',
                                     {'class': 'no-border-links hauptlink'}):
                teams.append(cell.text.replace(u'\n', '').rstrip())
        return {team: place for place, team in enumerate(teams, start=1)}
|