import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# No trailing slash: the scraped hrefs already start with "/wiki/...".
MAIN_URL = "https://pl.wikisource.org"
# Proofread-page categories: "Skorygowana" (proofread, yellow) and
# "Uwierzytelniona" (validated, green).
URL_YELLOW = "https://pl.wikisource.org/wiki/Kategoria:Skorygowana"
URL_GREEN = "https://pl.wikisource.org/wiki/Kategoria:Uwierzytelniona"


def get_page_data(page_element):
    """Fetch a single Strona: page and return its title, URL, scan image URL and text."""
    time.sleep(0.5)  # be polite to the server
    doc = requests.get(MAIN_URL + page_element["href"])
    doc_soup = BeautifulSoup(doc.content, "lxml", from_encoding="utf-8")
    page_div = doc_soup.find("div", {"class": "pagetext"})
    # Pages containing <math> markup are skipped: their text is rendered as images.
    text = "math image" if page_div.find("math") else page_div.text
    image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element["src"]
    return {
        "title": page_element["title"],
        "href": MAIN_URL + page_element["href"],
        "image_url": image_url,
        "text": text,
    }


def scrape_category(start_url, out_path):
    """Walk a category listing page by page and dump the scraped rows to a TSV file."""
    r = requests.get(start_url)
    soup = BeautifulSoup(r.text, "lxml")
    page_number = 0
    result_list = []
    # The #mw-pages header reads e.g. "Poniżej wyświetlono 200 spośród
    # wszystkich 1 234 567 stron ...": drop the first three digits ("200")
    # and join the rest to get the total page count of the category.
    header = re.sub("\xa0", "", soup.find("div", {"id": "mw-pages"}).find("p").text)
    total_count = "".join(re.findall(r"\d", header)[3:])
    try:
        while True:
            # Collect all proofread-page links on the current listing page.
            links = soup.find_all("a", {"href": re.compile(r"/wiki/Strona:.*")})
            for link in tqdm(links):
                result_list.append(get_page_data(link))
            page_number += 1
            print("Page number:", page_number)
            # Category listings show 200 entries per page.
            print("Number of elements:", 200 * page_number, "/", total_count)

            # Follow the "następna strona" ("next page") link, if present.
            next_link = soup.find(
                "a", {"href": re.compile(r"/w/index\.php.*")}, string="następna strona"
            )
            if next_link is None:
                break
            time.sleep(5)
            r = requests.get(MAIN_URL + next_link["href"])
            if r.status_code != 200:
                print(r.status_code, r.reason)
                time.sleep(10)  # back off once, then retry
                r = requests.get(MAIN_URL + next_link["href"])
                if r.status_code != 200:
                    break
            soup = BeautifulSoup(r.text, "lxml")
    except Exception as e:
        # Keep whatever was collected so far even if the crawl dies midway.
        print(e)
    pd.DataFrame(result_list).to_csv(out_path, sep="\t")


def main():
    # The original wrote the same (green) results to both files and never
    # used URL_YELLOW; crawl each category separately instead.
    scrape_category(URL_GREEN, "./green.tsv")
    scrape_category(URL_YELLOW, "./yellow.tsv")


if __name__ == "__main__":
    main()