from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import re
import csv

# Records collected during the current run (appended to by main()).
data = []


class DF:
    """One scraped press clipping; persists and echoes itself on construction.

    NOTE(review): writing to data.csv and printing inside __init__ is a
    side effect kept for backward compatibility with existing callers.
    """

    def __init__(self, href, title, date, newspaper):
        self.href = href
        self.title = title
        self.date = date
        self.newspaper = newspaper
        self.add_to_file()
        self.print_data()

    def add_to_file(self):
        """Append this record as one row to data.csv (UTF-8, excel dialect)."""
        with open('data.csv', 'a', newline='', encoding='UTF-8') as file:
            csv.writer(file).writerow(
                [self.date, self.title, self.newspaper, self.href])

    def print_data(self):
        """Echo the record to stdout as <date, title, newspaper, href>."""
        print(f'<{self.date}, {self.title}, {self.newspaper}, {self.href}>')


# Polish month names (nominative and genitive forms) -> zero-padded number.
_MONTHS = {
    'styczeń': '01', 'stycznia': '01',
    'luty': '02', 'lutego': '02',
    'marzec': '03', 'marca': '03',
    'kwiecień': '04', 'kwietnia': '04',
    'maj': '05', 'maja': '05',
    'czerwiec': '06', 'czerwca': '06',
    'lipiec': '07', 'lipca': '07',
    'sierpień': '08', 'sierpnia': '08',
    'wrzesień': '09', 'września': '09',
    'październik': '10', 'października': '10',
    'listopad': '11', 'listopada': '11',
    'grudzień': '12', 'grudnia': '12',
}

# "[day[-day]] [month-name] year", e.g. "12 maja 1996" or "1996".
# Group 2 = leading day digits, group 3 = month word, group 4 = year.
_DATE_RE = re.compile(
    r'^(([0-9]*)-*[0-9]*)\ *'
    r'([a-zA-ZAaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż]*)'
    r'\ *([0-9]{4})$')


def parse_date(date: str):
    """Convert a Polish textual date (e.g. '12 maja 1996') to 'YYYY-MM-DD'.

    Missing components are simply omitted ('1996', '1996-05').  An
    unrecognized month word yields the literal 'MM' placeholder, matching
    the original behaviour.  Returns 'Invalid date' for empty input and
    'Invalid date: <text>' when the pattern does not match.
    """
    if not date:
        return 'Invalid date'
    m = _DATE_RE.search(date.strip())
    if not m:
        return 'Invalid date: ' + date
    parts = [m.group(4)]  # year is mandatory in the pattern
    if m.group(3):
        # BUGFIX: the original called month.strip() and discarded the
        # result, so a padded month name never matched any branch.
        parts.append(_MONTHS.get(m.group(3).strip(), 'MM'))
    if m.group(2):
        parts.append(m.group(2).zfill(2))  # zero-pad single-digit days
    return '-'.join(parts)


def parse_data(df):
    """Extract href/title/date/newspaper from one <a> tag and return a DF.

    `df` is a BeautifulSoup Tag for an archive link; its first <span>
    holds "<date>"<newspaper>" text that is picked apart with regexes.
    """
    href = 'https://www2.pztkd.lublin.pl/' + df.attrs['href']
    # BUGFIX: the class literal had been line-wrapped into
    # "rounded18 \nglow" in the source; the intended class is
    # "rounded18 glow" — the wrapped form can never match.
    title = df('div', {'class': "rounded18 glow"})[2].text
    res = df('span')[0]
    newspaper_m = re.search(r'>"(.*)"', str(res))
    date_m = re.search(r"(.*)<", str(res))
    date = date_m.group(1) if date_m else None
    newspaper = newspaper_m.group(1) if newspaper_m else 'Invalid newspaper'
    return DF(href, title.strip(), parse_date(date), newspaper)


def main():
    """Scrape the PZTKD press-clipping archive into data.csv."""
    # Start the CSV fresh with a header row.
    with open('data.csv', 'w', newline='', encoding='UTF-8') as file:
        csv.writer(file).writerow(['Data', 'Tytuł', 'Gazeta', 'Link'])

    # FIX: Selenium 4 removed the positional executable-path argument;
    # the driver path must be wrapped in a Service object.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get("https://www2.pztkd.lublin.pl/archpras.html#z1996")
        time.sleep(5)  # crude render wait; TODO(review): prefer WebDriverWait
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for link in soup('a', {'data-lightbox': "roadtrip"}):
            data.append(parse_data(link))
        print(f'Przetworzono {len(data)} wycinków gazet.')
    finally:
        # FIX: the original leaked the browser process on any exception.
        driver.quit()


if __name__ == "__main__":
    main()