"""Scrape the pages listed in one pl.wikisource.org category into a TSV file.

The script walks the paginated listing of a ``Kategoria:`` page, downloads
every linked ``Strona:`` page, and stores its title, URL, image URL and text
in ``./<type>.tsv`` (tab-separated).
"""
import argparse
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

MAIN_URL = "https://pl.wikisource.org/"

# Maps the ``--type`` colour key to the category name used in the URL.
CATEGORY_NAMES = {
    "green": "Uwierzytelniona",
    "yellow": "Skorygowana",
    "red": "Przepisana",
}

# Pacing between requests, in seconds.
PAGE_DELAY_S = 0.5      # before every individual page download
LISTING_DELAY_S = 5     # before every follow-up listing download
RETRY_DELAY_S = 30      # before the single retry of a failed listing request
# Without a timeout ``requests`` may block forever on a stalled connection.
REQUEST_TIMEOUT_S = 30

NEXT_LISTING_HREF_RE = re.compile(r"\/w\/index.php.*")
PAGE_HREF_RE = re.compile(r"\/wiki\/Strona:.*")
# An integer whose digit groups may be separated by one whitespace character
# (``\s`` also matches the no-break space U+00A0 in ``str`` patterns).
NUMBER_RE = re.compile(r"\d+(?:\s\d{3})*(?!\d)")


def _parse_total_pages(summary_text):
    """Return the last integer found in *summary_text*, or ``None``.

    The listing summary is assumed to mention the total number of pages as
    its last number (the original code relied on the same ordering by
    dropping a leading three-digit "shown" count) -- TODO confirm against
    every wording the site can produce.  ``None`` means no number was found,
    so the total is unknown.
    """
    numbers = NUMBER_RE.findall(summary_text)
    if not numbers:
        return None
    return int(re.sub(r"\s", "", numbers[-1]))


def _fetch_listing(url):
    """GET *url*, retrying once after a pause if the status is not 200.

    Returns the response, or ``None`` when the retry also fails.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    if response.status_code != 200:
        print(response.__dict__)
        time.sleep(RETRY_DELAY_S)
        # Retry the URL that actually failed.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        if response.status_code != 200:
            return None
    return response


def _next_listing_href(soup):
    """Return the href of the "następna strona" link in *soup*, or ``None``."""
    link = soup.find(
        "a", {"href": NEXT_LISTING_HREF_RE}, string="następna strona"
    )
    if link is None:
        return None
    return link.get("href")


def get_page_data(page_element):
    """Download one linked page and extract its fields.

    Args:
        page_element: an ``<a>`` tag providing ``href`` and ``title``.

    Returns:
        A dict with the keys ``title``, ``href``, ``image_url`` and ``text``.

    Raises:
        AttributeError: if the page lacks the ``pagetext`` or
            ``prp-page-image`` div (``find`` returns ``None``).
    """
    time.sleep(PAGE_DELAY_S)
    # urljoin avoids the double slash produced by plain concatenation of
    # MAIN_URL (ends with "/") and the href (starts with "/").
    page_url = urljoin(MAIN_URL, page_element["href"])
    doc = requests.get(page_url, timeout=REQUEST_TIMEOUT_S)
    # ``doc.text`` is already decoded, so no ``from_encoding`` is passed:
    # BeautifulSoup ignores it for str input and only emits a warning.
    doc_soup = BeautifulSoup(doc.text, "lxml")
    text = doc_soup.find("div", {"class": "pagetext"}).next_element.text
    image_url = doc_soup.find(
        "div", {"class": "prp-page-image"}
    ).next_element["src"]
    return {
        "title": page_element["title"],
        "href": page_url,
        "image_url": image_url,
        "text": text,
    }


def main(args: argparse.Namespace) -> None:
    """Scrape every page of the selected category and write ``./<type>.tsv``.

    Args:
        args: parsed arguments; ``args.type`` must be a key of
            ``CATEGORY_NAMES``.

    Raises:
        RuntimeError: if the first category listing cannot be loaded.
        AttributeError: if that listing has no ``mw-pages`` summary paragraph.

    Any error raised while scraping is printed and ends the run; the rows
    collected so far are still written to the TSV file.
    """
    category_url = urljoin(
        MAIN_URL, f"/wiki/Kategoria:{CATEGORY_NAMES[args.type]}"
    )
    response = _fetch_listing(category_url)
    if response is None:
        raise RuntimeError(f"Could not load category listing: {category_url}")
    soup = BeautifulSoup(response.text, "lxml")
    summary = soup.find("div", {"id": "mw-pages"}).find("p").text
    total_pages = _parse_total_pages(summary)

    rows = []
    page_number = 1
    try:
        # total=None makes tqdm show a plain counter when the total is unknown.
        with tqdm(total=total_pages) as pbar:
            while True:
                pbar.set_description(f"Page number: {page_number}")
                # Process the current listing first, so the first (and
                # possibly only) listing page is never skipped.
                for link in soup.find_all("a", {"href": PAGE_HREF_RE}):
                    rows.append(get_page_data(link))
                    pbar.update(1)

                if total_pages is not None and len(rows) >= total_pages:
                    break
                next_href = _next_listing_href(soup)
                if not next_href:
                    break  # last listing page reached

                time.sleep(LISTING_DELAY_S)
                response = _fetch_listing(urljoin(MAIN_URL, next_href))
                if response is None:
                    break
                soup = BeautifulSoup(response.text, "lxml")
                page_number += 1
    except Exception as e:
        # Best effort: report the problem and keep what was scraped so far.
        print(e)
    finally:
        # Written exactly once; ``finally`` also preserves partial results
        # when the run is interrupted (e.g. Ctrl-C).
        pd.DataFrame(rows).to_csv(f"./{args.type}.tsv", sep="\t")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--type",
        type=str,
        default="green",
        choices=list(CATEGORY_NAMES),
    )
    # parse_known_args tolerates unrelated extra arguments; they are ignored.
    args, _ = parser.parse_known_args()
    main(args)