"""Scrape page text and scan-image URLs from pl.wikisource.org category listings.

The script walks a ``Kategoria:`` listing page by page, follows every
``/wiki/Strona:...`` link found on each listing page, and stores the page
title, URL, image URL and text of each in a TSV file.
"""
import argparse
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

MAIN_URL = "https://pl.wikisource.org/"
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the run


def _absolute_url(href):
    """Resolve a site-relative *href* against ``MAIN_URL``.

    ``urljoin`` avoids the doubled slash that plain concatenation produces,
    because ``MAIN_URL`` ends with ``/`` and the hrefs start with ``/``.
    """
    return urljoin(MAIN_URL, href)


def _get_with_retry(url, **kwargs):
    """GET *url*; on a non-200 answer wait 30 s and try exactly once more.

    Returns the last response received; the caller checks ``status_code``.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT, **kwargs)
    if response.status_code != 200:
        print(
            f"Request to {response.url} returned status "
            f"{response.status_code}; retrying in 30 s"
        )
        time.sleep(30)
        response = requests.get(url, timeout=REQUEST_TIMEOUT, **kwargs)
    return response


def _parse_total_pages(soup):
    """Read the total number of category members from the listing summary.

    All digits of the first ``<p>`` inside ``div#mw-pages`` are collected and
    the first three are dropped before the remainder is parsed as an integer.

    NOTE(review): this presumably relies on the summary starting with a
    three-digit per-page count (e.g. "200") followed by the total -- confirm
    against the live page. Raises ``ValueError`` when fewer than four digits
    are present and ``AttributeError`` when the summary element is missing.
    """
    summary = soup.find("div", {"id": "mw-pages"}).find("p").text
    digits = re.findall(r"\d", summary)
    return int("".join(digits[3:]))


def _next_page_href(soup):
    """Return the href of the "następna strona" link, or ``None`` if absent."""
    link = soup.find(
        "a",
        {"href": re.compile(r"\/w\/index.php.*")},
        string="następna strona",
    )
    if link is None:
        return None
    return link.get("href")


def get_page_data(page_element):
    """Download one linked page and extract its text and image URL.

    Args:
        page_element: a parsed ``<a>`` tag providing ``href`` and ``title``.

    Returns:
        dict with the keys ``title``, ``href`` (absolute URL), ``image_url``
        and ``text``.

    Raises:
        AttributeError / TypeError: when the page lacks the
            ``div.pagetext`` or ``div.prp-page-image`` element.
        requests.RequestException: on network failure or timeout.
    """
    time.sleep(0.5)  # pause between per-page requests
    page_url = _absolute_url(page_element["href"])
    doc = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # doc.text is already a decoded str, so no from_encoding is passed:
    # BeautifulSoup only honours that argument for bytes input.
    doc_soup = BeautifulSoup(doc.text, "lxml")
    text = doc_soup.find("div", {"class": "pagetext"}).next_element
    image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element["src"]
    return {
        "title": page_element["title"],
        "href": page_url,
        "image_url": image_url,
        "text": text.text,
    }


def save_data(file_name, data):
    """Write *data* (a list of row dicts) to ``./<file_name>.tsv``.

    *file_name* must be given without directory prefix and without extension;
    both are added here.
    """
    df = pd.DataFrame(data)
    df.to_csv(f"./{file_name}.tsv", sep="\t")


def main(args):
    """Scrape one category and save the collected rows as a TSV file.

    Args:
        args: parsed command-line namespace with ``type``,
            ``output_file_name``, ``start_file_name`` and
            ``start_page_number``.

    Whatever has been collected is written to
    ``./<output_file_name>-<type>.tsv`` even when scraping stops early because
    of an error or a keyboard interrupt.
    """
    category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
    category_title = f"Kategoria:{category_dict[args.type]}"

    if args.start_file_name:
        # Resume from a given member; params= lets requests URL-encode it.
        r = _get_with_retry(
            _absolute_url("/w/index.php"),
            params={"title": category_title, "pagefrom": args.start_file_name},
        )
    else:
        r = _get_with_retry(_absolute_url(f"/wiki/{category_title}"))
    r.raise_for_status()  # nothing sensible can be parsed from an error page
    soup = BeautifulSoup(r.text, "lxml")

    page_number = 1 if not args.start_page_number else args.start_page_number
    data_number = 0 if not args.start_page_number else args.start_page_number * 200
    result_list = []
    max_len = _parse_total_pages(soup)

    # The listing fetched above is processed directly only when starting at
    # page 1; otherwise the loop first advances to the following listing.
    # NOTE(review): this mirrors the original "page_number != 1" condition, so
    # a resumed run skips the listing addressed by pagefrom -- confirm intent.
    fetch_next = page_number != 1
    try:
        with tqdm(total=max_len, initial=data_number) as pbar:
            while data_number < max_len:
                pbar.set_description(f"Page number: {page_number}")
                time.sleep(5)  # pause between listing pages
                if fetch_next:
                    next_href = _next_page_href(soup)
                    if not next_href:
                        break  # last listing page reached
                    r = _get_with_retry(_absolute_url(next_href))
                    if r.status_code != 200:
                        break
                    soup = BeautifulSoup(r.text, "lxml")
                fetch_next = True
                links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
                page_number += 1
                for link in links:
                    result_list.append(get_page_data(link))
                    data_number += 1
                    pbar.update(1)
    except Exception as e:
        # Best effort: report the failure and keep what was gathered so far.
        print(e)
    finally:
        # save_data adds "./" and ".tsv" itself, so pass only the bare stem.
        save_data(f"{args.output_file_name}-{args.type}", result_list)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
    parser.add_argument("--output_file_name", type=str, required=True)
    parser.add_argument("--start_file_name", type=str, required=False)
    parser.add_argument("--start_page_number", type=int, required=False)
    args, left_argv = parser.parse_known_args()
    main(args)