2023-01-03 10:53:34 +01:00
|
|
|
import pandas as pd
|
|
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
import re
|
|
|
|
from tqdm import tqdm
|
|
|
|
import time
|
2023-01-03 14:50:37 +01:00
|
|
|
import argparse
|
2023-01-03 10:53:34 +01:00
|
|
|
|
2023-01-03 16:11:50 +01:00
|
|
|
# Base URL of the Polish Wikisource site. It already ends with a slash, and the
# page/category paths used elsewhere in this file are appended to it.
MAIN_URL = "https://pl.wikisource.org/"
|
|
|
|
|
2023-01-07 11:34:27 +01:00
|
|
|
def get_page_data(page_element):
    """Download a single Wikisource page and extract its text and scan image URL.

    Args:
        page_element: Link element that can be indexed by attribute name and
            provides ``'href'`` (a site-relative path) and ``'title'``.

    Returns:
        dict: ``title`` (taken from the link), ``href`` (absolute page URL),
        ``image_url`` (``src`` of the element that directly follows the
        ``prp-page-image`` div) and ``text`` (text of the element that directly
        follows the ``pagetext`` div).

    Raises:
        requests.HTTPError: If the page request returns an error status.
        AttributeError, TypeError, KeyError: If the expected ``div`` containers
            are missing or do not have the expected shape.
    """
    from urllib.parse import urljoin

    # Throttle so consecutive page downloads do not hammer the server.
    time.sleep(0.5)

    # urljoin avoids the "//" produced by gluing MAIN_URL (which ends with "/")
    # to an href that starts with "/".
    page_url = urljoin(MAIN_URL, page_element['href'])

    # A timeout keeps one stalled connection from hanging the whole run.
    doc = requests.get(page_url, timeout=60)
    # Fail with a clear HTTP error instead of an opaque parsing error later on.
    doc.raise_for_status()

    # doc.text is already decoded, so no from_encoding hint is passed
    # (BeautifulSoup ignores it for str input and emits a warning).
    doc_soup = BeautifulSoup(doc.text, 'lxml')

    text = doc_soup.find("div", {"class": "pagetext"}).next_element
    image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']

    return {
        "title": page_element['title'],
        "href": page_url,
        "image_url": image_url,
        "text": text.text,
    }
|
|
|
|
|
|
|
|
def save_data(file_name, data):
    """Write ``data`` as a tab-separated file in the current working directory.

    Args:
        file_name: Output name relative to the current directory. The ``.tsv``
            extension is appended only when it is not already present, so both
            ``"out"`` and ``"out.tsv"`` end up in ``./out.tsv``.
        data: Anything ``pandas.DataFrame`` accepts (e.g. a list of dicts).
    """
    suffix = ".tsv"
    # Do not produce "name.tsv.tsv" when the caller already supplied the extension.
    target_name = file_name if file_name.endswith(suffix) else f"{file_name}{suffix}"

    df = pd.DataFrame(data)
    df.to_csv(f"./{target_name}", sep="\t")
|
|
|
|
|
|
|
|
def main(args):
    """Scrape all pages listed in one Wikisource proofreading category into a TSV file.

    The category listing is walked page by page through its "następna strona"
    (next page) link; every ``/wiki/Strona:`` link found on a listing page is
    downloaded with ``get_page_data``. Whatever was collected is written with
    ``save_data`` exactly once, also when the run is cut short by an error or
    by Ctrl+C.

    Args:
        args: Parsed command-line namespace providing ``type`` (``"green"``,
            ``"yellow"`` or ``"red"``), ``output_file_name``,
            ``start_file_name`` (optional ``pagefrom`` value to resume the
            listing from) and ``start_page_number`` (optional listing-page
            number used for the progress label and the running item count).
    """
    from urllib.parse import urljoin

    category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
    category_title = f"Kategoria:{category_dict[args.type]}"
    request_timeout = 60  # seconds; keeps a stalled connection from hanging the run

    if args.start_file_name:
        # Resume the listing from a given entry; passing the values through
        # ``params`` lets requests URL-encode the title and the page name.
        r = requests.get(
            urljoin(MAIN_URL, "/w/index.php"),
            params={"title": category_title, "pagefrom": args.start_file_name},
            timeout=request_timeout,
        )
    else:
        r = requests.get(urljoin(MAIN_URL, f"/wiki/{category_title}"), timeout=request_timeout)
    # Fail early with a clear HTTP error rather than a parsing error below.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'lxml')

    page_number = args.start_page_number if args.start_page_number else 1
    # The running count assumes 200 entries per listing page already skipped.
    data_number = args.start_page_number * 200 if args.start_page_number else 0
    result_list = []

    # The first paragraph of the listing holds the counters. All digits are
    # concatenated (non-breaking spaces removed first) and the first three are
    # dropped - presumably the per-page count "200" that precedes the category
    # total; TODO confirm against the live page text.
    counter_text = soup.find("div", {"id": "mw-pages"}).find("p").text.replace("\xa0", "")
    max_len = int("".join(re.findall(r"\d", counter_text))[3:])

    # Compiled once, outside the loop.
    next_link_pattern = re.compile(r"\/w\/index.php.*")
    page_link_pattern = re.compile(r"\/wiki\/Strona:.*")

    try:
        with tqdm(total=max_len) as pbar:
            while data_number < max_len:
                pbar.set_description(f"Page number: {page_number}")

                # Process the listing page that is currently loaded. This also
                # covers the very first page when resuming with
                # start_page_number and a category with only one listing page.
                for link in soup.find_all("a", {"href": page_link_pattern}):
                    result_list.append(get_page_data(link))
                    data_number += 1
                    pbar.update(1)
                page_number += 1

                # The last listing page has no "next page" link: stop cleanly
                # instead of calling .get() on None.
                next_anchor = soup.find("a", {"href": next_link_pattern}, string="następna strona")
                next_page = next_anchor.get('href', None) if next_anchor is not None else None
                if not next_page:
                    break

                # Pause between listing pages to stay polite to the server.
                time.sleep(5)
                next_url = urljoin(MAIN_URL, next_page)
                r = requests.get(next_url, timeout=request_timeout)
                if r.status_code != 200:
                    # One retry after a longer pause; give up if it fails again.
                    print(r.__dict__)
                    time.sleep(30)
                    r = requests.get(next_url, timeout=request_timeout)
                    if r.status_code != 200:
                        break
                soup = BeautifulSoup(r.text, 'lxml')
    except Exception as e:
        # Best effort: report the problem and keep what was scraped so far.
        print(e)
    finally:
        # save_data adds the "./" prefix and the ".tsv" extension itself, so
        # only the bare name is passed here.
        save_data(f"{args.output_file_name}-{args.type}", result_list)
|
2023-01-03 10:53:34 +01:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and start the scrape.
    parser = argparse.ArgumentParser(
        description="Scrape the pages of a Polish Wikisource proofreading category into a TSV file."
    )
    # ``required=True`` made the declared default unreachable; the option is
    # now optional so the default actually applies. Passing --type explicitly
    # works exactly as before.
    parser.add_argument(
        "--type",
        type=str,
        default='green',
        choices=["green", "yellow", "red"],
        help="category colour to scrape (default: green)",
    )
    parser.add_argument(
        "--output_file_name",
        type=str,
        required=True,
        help="base name of the output file; the type and .tsv extension are appended",
    )
    parser.add_argument(
        "--start_file_name",
        type=str,
        required=False,
        help="page name to resume the category listing from",
    )
    parser.add_argument(
        "--start_page_number",
        type=int,
        required=False,
        help="listing page number to resume counting from",
    )

    # parse_known_args tolerates unrecognised arguments; the leftovers are not used.
    args, _ = parser.parse_known_args()
    main(args)
|