126 lines
3.6 KiB
Python
126 lines
3.6 KiB
Python
|
from selenium import webdriver
|
||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||
|
from bs4 import BeautifulSoup
|
||
|
import time
|
||
|
import re
|
||
|
import csv
|
||
|
|
||
|
# Module-level accumulator: main() appends the object returned by
# parse_data() for every matching link found on the page.
data = []
|
||
|
|
||
|
|
||
|
class DF:
    """One press-clipping record: link, title, date and newspaper name.

    Constructing an instance immediately appends the record to
    ``data.csv`` and echoes it on standard output.
    """

    def __init__(self, href, title, date, newspaper):
        """Store the four fields, then persist and print the record."""
        self.href = href
        self.title = title
        self.date = date
        self.newspaper = newspaper
        self.add_to_file()
        self.print_data()

    def add_to_file(self):
        """Append the record to ``data.csv`` as: date, title, newspaper, href."""
        row = [self.date, self.title, self.newspaper, self.href]
        with open('data.csv', 'a', newline='', encoding='UTF-8') as handle:
            csv.writer(handle).writerow(row)

    def print_data(self):
        """Print the record as ``<date, title, newspaper, href>``."""
        fields = [self.date, self.title, self.newspaper, self.href]
        # str.join, like the '+' concatenation it replaces, rejects
        # non-string fields with a TypeError.
        print('<' + ', '.join(fields) + '>')
|
||
|
|
||
|
|
||
|
# Polish month names (nominative and genitive forms) mapped to their
# two-digit month number.
_MONTH_NUMBERS = {
    'styczeń': '01', 'stycznia': '01',
    'luty': '02', 'lutego': '02',
    'marzec': '03', 'marca': '03',
    'kwiecień': '04', 'kwietnia': '04',
    'maj': '05', 'maja': '05',
    'czerwiec': '06', 'czerwca': '06',
    'lipiec': '07', 'lipca': '07',
    'sierpień': '08', 'sierpnia': '08',
    'wrzesień': '09', 'września': '09',
    'październik': '10', 'października': '10',
    'listopad': '11', 'listopada': '11',
    'grudzień': '12', 'grudnia': '12',
}

# Groups: (1) leading digits with optional "-digits" range, (2) the first
# run of digits (the day), (3) month word, (4) four-digit year.
_DATE_PATTERN = re.compile(
    r'^(([0-9]*)-*[0-9]*)\ *([a-zA-ZAaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż]*)\ *([0-9]{4})$')


def parse_date(date: str):
    """Convert a Polish textual date into ``YYYY[-MM[-DD]]`` form.

    Accepted input is an optional day (or day range such as ``12-13``, of
    which only the first number is kept), an optional month name and a
    mandatory four-digit year, e.g. ``"12 maja 1996"`` -> ``"1996-05-12"``.

    Args:
        date: Raw date text. ``None`` and the empty string are tolerated.

    Returns:
        The normalised date string. A month word that is not a known
        Polish month name is rendered as the placeholder ``MM``. Returns
        ``'Invalid date'`` for empty input and ``'Invalid date: <input>'``
        when the text does not match the expected layout.
    """
    if not date:
        return 'Invalid date'

    match = _DATE_PATTERN.search(date.strip())
    if not match:
        return 'Invalid date: ' + date

    day, month, year = match.group(2), match.group(3), match.group(4)
    parts = [year]
    if month:
        # Normalise before lookup: the pattern accepts upper-case letters,
        # so "Maja" must resolve to the same month as "maja".
        parts.append(_MONTH_NUMBERS.get(month.strip().lower(), 'MM'))
    if day:
        # Left-pad single-digit days; longer digit runs are kept as-is.
        parts.append(day.zfill(2))
    return '-'.join(parts)
|
||
|
|
||
|
|
||
|
def parse_data(df):
    """Build a DF record from one lightbox link element.

    Args:
        df: The link element; it is read through ``df.attrs['href']`` and
            by calling it to find nested ``div`` and ``span`` tags.

    Returns:
        The DF instance created from the extracted link, title, date and
        newspaper name.
    """
    link = 'https://www2.pztkd.lublin.pl/' + df.attrs['href']
    # The title is taken from the third matching div inside the link.
    raw_title = df('div', {'class': "rounded18 glow"})[2].text

    # Newspaper name and date are both pulled out of the first span's markup.
    span_markup = str(df('span')[0])
    newspaper_match = re.search(r'>"(.*)"', span_markup)
    date_match = re.search(r"<br\/>(.*)<", span_markup)

    raw_date = date_match.group(1) if date_match else None
    newspaper = (
        newspaper_match.group(1) if newspaper_match else 'Invalid newspaper'
    )

    return DF(link, raw_title.strip(), parse_date(raw_date), newspaper)
|
||
|
|
||
|
|
||
|
def main():
    """Scrape the press archive page and record every clipping found.

    Rewrites ``data.csv`` with a header row, loads the archive page in
    Chrome, parses each ``<a data-lightbox="roadtrip">`` element with
    parse_data() (which appends a row to the CSV), collects the results in
    the module-level ``data`` list and prints how many were processed.
    """
    # Start from a fresh file containing only the header row.
    with open('data.csv', 'w', newline='', encoding='UTF-8') as file:
        csv.writer(file).writerow(['Data', 'Tytuł', 'Gazeta', 'Link'])

    # NOTE(review): newer Selenium releases expect the driver path to be
    # wrapped in a Service object - confirm against the installed version.
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get("https://www2.pztkd.lublin.pl/archpras.html#z1996")

        # Fixed pause before reading the page source (presumably to let the
        # page finish rendering - verify whether an explicit wait fits better).
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        for link in soup('a', {'data-lightbox': "roadtrip"}):
            data.append(parse_data(link))

        print('Przetworzono ' + str(len(data)) + ' wycinków gazet.')
    finally:
        # Always shut the browser down, even when loading or parsing fails,
        # so no browser/driver process is left running.
        driver.quit()
|
||
|
|
||
|
|
||
|
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
|