Taekwondo-Scraping/taekwondo.py

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import re
import csv
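
# Scrapes the PZTKD press-clipping archive (https://www2.pztkd.lublin.pl/archpras.html)
# into data.csv. Requires Chrome plus the selenium, webdriver-manager and
# beautifulsoup4 packages. `data` collects one DF record per clipping.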
data = []


class DF:
    """One press clipping: link, title, date and newspaper, saved on construction."""

    def __init__(self, href, title, date, newspaper):
        self.href = href
        self.title = title
        self.date = date
        self.newspaper = newspaper
        self.add_to_file()
        self.print_data()

    def add_to_file(self):
        # Append one row to data.csv; the header row is written in main().
        with open('data.csv', 'a', newline='', encoding='UTF-8') as file:
            writer = csv.writer(file)
            writer.writerow([self.date, self.title, self.newspaper, self.href])

    def print_data(self):
        print('<' + self.date + ', ' + self.title + ', ' +
              self.newspaper + ', ' + self.href + '>')
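
# parse_date normalises Polish dates such as '13 kwietnia 2021' to '2021-04-13'.
# Day ranges like '10-12 kwietnia 2021' keep only the first day; missing or
# unparseable dates come back as 'Invalid date...' strings.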
def parse_date(date: str):
    def parse_months(month):
        # Polish month names (nominative and genitive) mapped to two-digit numbers;
        # unknown names fall back to the 'MM' placeholder.
        months = {
            'styczeń': '01', 'stycznia': '01',
            'luty': '02', 'lutego': '02',
            'marzec': '03', 'marca': '03',
            'kwiecień': '04', 'kwietnia': '04',
            'maj': '05', 'maja': '05',
            'czerwiec': '06', 'czerwca': '06',
            'lipiec': '07', 'lipca': '07',
            'sierpień': '08', 'sierpnia': '08',
            'wrzesień': '09', 'września': '09',
            'październik': '10', 'października': '10',
            'listopad': '11', 'listopada': '11',
            'grudzień': '12', 'grudnia': '12',
        }
        return months.get(month.strip(), 'MM')
    if date:
        p_date = re.search(
            r'^(([0-9]*)-*[0-9]*)\ *([a-zA-ZAaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż]*)\ *([0-9]{4})$', date.strip())
        if p_date:
            if p_date.group(4):
                date = p_date.group(4)
            if p_date.group(3):
                date = date + '-' + parse_months(p_date.group(3))
            if p_date.group(2):
                if len(p_date.group(2)) == 1:
                    date = date + '-0' + p_date.group(2)
                else:
                    date = date + '-' + p_date.group(2)
        else:
            date = 'Invalid date: ' + date
    else:
        date = 'Invalid date'
    return date
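
# parse_data pulls the clipping link, title, newspaper name and date out of one
# <a data-lightbox="roadtrip"> element: the title from the third 'rounded18 glow'
# div, the newspaper from the quoted text in the first <span>, and the date from
# the text after that span's <br/>.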
def parse_data(df):
    href = 'https://www2.pztkd.lublin.pl/' + df.attrs['href']
    title = df('div', {'class': "rounded18 glow"})[2].text
    res = df('span')[0]
    newspaper = re.search(r'>"(.*)"', str(res))
    date = re.search(r"<br\/>(.*)<", str(res))
    if date:
        date = date.group(1)
    else:
        date = None
    if newspaper:
        newspaper = newspaper.group(1)
    else:
        newspaper = 'Invalid newspaper'
    return DF(href, title.strip(), parse_date(date), newspaper)
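
# main() writes the CSV header, loads the archive page in Chrome (Selenium 3-style
# constructor; Selenium 4 would take a Service object instead), waits for it to
# render, then parses every clipping link it finds.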
def main():
    with open('data.csv', 'w', newline='', encoding='UTF-8') as file:
        writer = csv.writer(file)
        # Polish column names: Date, Title, Newspaper, Link.
        writer.writerow(['Data', 'Tytuł', 'Gazeta', 'Link'])
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get("https://www2.pztkd.lublin.pl/archpras.html#z1996")
    time.sleep(5)  # give the archive page time to render
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    links = soup('a', {'data-lightbox': "roadtrip"})
    for link in links:
        data.append(parse_data(link))
    # "Przetworzono ... wycinków gazet." = "Processed ... newspaper clippings."
    print('Przetworzono ' + str(len(data)) + ' wycinków gazet.')
    driver.quit()


if __name__ == "__main__":
    main()