crawler update with continuation, files removed

This commit is contained in:
Michał Kozłowski 2023-01-07 11:34:27 +01:00
parent 683bc9e6fc
commit 0a39807fec
3 changed files with 23 additions and 2638788 deletions

View File

@ -8,23 +8,29 @@ import argparse
MAIN_URL = "https://pl.wikisource.org/" MAIN_URL = "https://pl.wikisource.org/"
def get_page_data(page_element):
    """Download one page and extract its transcribed text and scan image URL.

    Args:
        page_element: Link element supporting ``page_element['href']`` and
            ``page_element['title']``; ``href`` is appended to ``MAIN_URL``.

    Returns:
        dict: ``title``, ``href`` (full page URL), ``image_url`` and ``text``.

    Raises:
        requests.RequestException: if the request fails or times out.
        ValueError: if the page lacks the ``pagetext`` or ``prp-page-image``
            container this function reads from.
    """
    # Pause before every request so the crawl does not hammer the server.
    time.sleep(0.5)
    page_url = MAIN_URL + page_element['href']
    # A timeout keeps a single stalled connection from hanging the whole crawl.
    doc = requests.get(page_url, timeout=30)
    # doc.text is already a decoded str; BeautifulSoup ignores ``from_encoding``
    # for str input (and emits a warning), so it is not passed here.
    doc_soup = BeautifulSoup(doc.text, 'lxml')

    text_container = doc_soup.find("div", {"class": "pagetext"})
    image_container = doc_soup.find("div", {"class": "prp-page-image"})
    if text_container is None or image_container is None:
        # Fail with the offending URL instead of an opaque NoneType error.
        raise ValueError(f"Unexpected page layout, cannot extract data: {page_url}")

    # The first node inside each container holds the payload.
    text = text_container.next_element
    image_url = image_container.next_element['src']
    return {
        "title": page_element['title'],
        "href": page_url,
        "image_url": image_url,
        "text": text.text,
    }
def save_data(file_name, data):
    """Write ``data`` to a tab-separated file.

    Args:
        file_name: Target path. The ``.tsv`` extension is appended only when
            it is missing, so a caller that already passes ``name.tsv`` does
            not end up with ``name.tsv.tsv``.
        data: Anything ``pandas.DataFrame`` accepts (e.g. a list of dicts).
    """
    target = file_name if file_name.endswith(".tsv") else f"{file_name}.tsv"
    pd.DataFrame(data).to_csv(target, sep="\t")
def main(args):
category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
if args.start_page:
CATEGORY_URL = f"{MAIN_URL}/w/index.php?title=Kategoria:{category_dict[args.type]}&pagefrom={args.start_file_name}"
else:
CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[args.type]}"
r = requests.get(CATEGORY_URL) r = requests.get(CATEGORY_URL)
soup = BeautifulSoup(r.text, 'lxml') soup = BeautifulSoup(r.text, 'lxml')
page_number = 1 page_number = 1 if not args.start_page_number else args.start_page_number
data_number = 0 data_number = 0 if not args.start_page_number else args.start_page_number * 200
result_list = [] result_list = []
max_len = int("".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:])) max_len = int("".join(re.findall("\d", re.sub("\xa0",'', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))
@ -54,14 +60,16 @@ def main(args):
pbar.update(1) pbar.update(1)
except Exception as e: except Exception as e:
print(e) print(e)
df = pd.DataFrame(result_list) save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
df.to_csv(f"./{args.type}.tsv", sep="\t")
save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
df = pd.DataFrame(result_list)
df.to_csv(f"./{args.type}.tsv", sep="\t")
if __name__ == "__main__":
    # Command-line entry point: parse the crawl options and hand them to main().
    parser = argparse.ArgumentParser()
    # --type is required, so no default is declared: argparse would never use it.
    parser.add_argument(
        "--type",
        type=str,
        choices=["green", "yellow", "red"],
        required=True,
        help="category of pages to crawl",
    )
    parser.add_argument(
        "--output_file_name",
        type=str,
        required=True,
        help="prefix of the output file name",
    )
    parser.add_argument(
        "--start_file_name",
        type=str,
        required=False,
        help="page name to continue the category listing from",
    )
    parser.add_argument(
        "--start_page_number",
        type=int,
        required=False,
        help="listing page number to continue counting from",
    )
    # Unrecognised arguments are tolerated and discarded, as before.
    args, _ = parser.parse_known_args()
    main(args)

1385641
green.tsv

File diff suppressed because it is too large Load Diff

1253132
yellow.tsv

File diff suppressed because one or more lines are too long