From da0e4f32634f3ef97d3d11a1d8801d48a859d3fe Mon Sep 17 00:00:00 2001
From: zzombely
Date: Sun, 12 Mar 2023 15:57:35 +0000
Subject: [PATCH] crawler classes and image downloader

---
 .gitignore       |   4 +-
 crawler.py       |  25 ++++++-----
 crawler_class.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++
 image_class.py   |  91 +++++++++++++++++++++++++++++++++++++
 mail_test.py     |   8 ++++
 5 files changed, 231 insertions(+), 12 deletions(-)
 create mode 100644 crawler_class.py
 create mode 100644 image_class.py
 create mode 100644 mail_test.py

diff --git a/.gitignore b/.gitignore
index a57b9bc..fb5f559 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 env
 *.out
 images*
-*.tsv
\ No newline at end of file
+*.tsv
+env_wikicrawler
+temp_images
\ No newline at end of file
diff --git a/crawler.py b/crawler.py
index 7cfbdad..ab0192c 100644
--- a/crawler.py
+++ b/crawler.py
@@ -13,12 +13,13 @@ def get_page_data(page_element):
     doc = requests.get(MAIN_URL + page_element['href'])
     doc_soup = BeautifulSoup(doc.text, 'lxml')
     text = doc_soup.find("div", {"class": "pagetext"}).next_element
-    image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
+    image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
     return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text.text}
 
-def save_data(file_name, data):
-    df = pd.DataFrame(data)
-    df.to_csv(f"./{file_name}.tsv", sep="\t")
+def save_data(file_name, data, args):
+    if not args.testing:
+        df = pd.DataFrame(data)
+        df.to_csv(f"./{file_name}.tsv", sep="\t")
 
 def main(args):
     category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
@@ -51,7 +52,7 @@
 
         if r.status_code != 200:
             print(r.__dict__)
-            time.sleep(30)
+            time.sleep(60)
             r = requests.get(MAIN_URL + next_page)
             if r.status_code != 200:
                 break
@@ -63,19 +64,21 @@
                     data_number += 1
                     pbar.update(1)
         except Exception as e:
-            print(e)
-            save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
+            print("Error:", e)
+            save_data(f"./{args.output_file_name}-{args.wiki_type}", result_list, args)
     except KeyboardInterrupt:
-        save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
+        save_data(f"./{args.output_file_name}-{args.wiki_type}", result_list, args)
 
-    save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
+    save_data(f"./{args.output_file_name}-{args.wiki_type}", result_list, args)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
+    parser.add_argument("--wiki_type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
     parser.add_argument("--output_file_name", type=str, required=True)
     parser.add_argument("--start_file_name", type=str, required=False)
     parser.add_argument("--start_page_number", type=int, required=False)
+    parser.add_argument("--testing", action="store_true", required=False)
     args, left_argv = parser.parse_known_args()
-    main(args)
+    print(f"Arguments: {vars(args)}")
+    main(args)
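The crawler.py change above swaps `.next_element['src']` for `.find("img")['src']` when reading the scan URL from the `prp-page-image` wrapper. A minimal sketch of why that is more robust; the HTML fragment below is an assumed, simplified proofread-page snippet, not taken from the patch:

```python
# Illustrative sketch: .find("img") vs .next_element on the image wrapper div.
from bs4 import BeautifulSoup

html = """
<div class="prp-page-image">
    <img src="//upload.wikimedia.org/some/page.jpg" alt="scan">
</div>
"""

soup = BeautifulSoup(html, "lxml")
container = soup.find("div", {"class": "prp-page-image"})

# .next_element returns the first node after the opening tag; with any
# whitespace present that is a text node, so ['src'] would raise TypeError.
print(repr(container.next_element))

# .find("img") skips text nodes and returns the actual <img> tag.
print(container.find("img")["src"])
```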
diff --git a/crawler_class.py b/crawler_class.py
new file mode 100644
index 0000000..c4cf0a0
--- /dev/null
+++ b/crawler_class.py
@@ -0,0 +1,115 @@
+from bs4 import BeautifulSoup
+import requests
+import time
+import os
+from tqdm import tqdm
+import csv
+from collections import deque
+import argparse
+import re
+import pandas as pd
+
+MAIN_URL = "https://pl.wikisource.org/"
+
+
+class WikiCrawler:
+    def __init__(self, wiki_type: str, output_file_name: str):
+        self.wiki_type = wiki_type
+        self.output_file_name = output_file_name
+        self.page_number = 1
+        self.index = 1
+        self.load_last_checkpoint()
+
+    def load_last_checkpoint(self):
+        self.start_file_name = None
+        if os.path.exists(self.output_file_name):
+            df = pd.read_csv(self.output_file_name, encoding='utf-8', sep='\t')
+            last_line = df.tail(1).iloc[0]
+            self.start_file_name = last_line[0]
+            self.page_number = int(last_line[-1])
+            self.index = int(last_line[-2])
+            del df
+            print(f"Starting from index: {self.index}, page: {self.page_number}")
+
+    def _init_crawl(self):
+        category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
+        CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[self.wiki_type]}"
+        # if crawling should resume from a previous checkpoint
+        if self.start_file_name:
+            CATEGORY_URL = f"{MAIN_URL}/w/index.php?title=Kategoria:{category_dict[self.wiki_type]}&pagefrom={self.start_file_name}"
+        request = requests.get(CATEGORY_URL)
+        assert request.status_code == 200, f"Status different on main request, status: {request.status_code}"
+
+        soup = BeautifulSoup(request.text, 'lxml')
+        self.max_len = int("".join(re.findall(r"\d", re.sub("\xa0", '', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))
+        self.pbar = tqdm(total=self.max_len)
+
+        if self.start_file_name:
+            self.pbar.update(self.index)
+            self.pbar.refresh()
+
+        return soup, request
+
+    def save_page_data(self, page_element):
+        time.sleep(0.3)
+        doc_request = requests.get(MAIN_URL + page_element['href'])
+        assert doc_request.status_code == 200, f"Wrong status on requesting doc link: {MAIN_URL + page_element['href']}"
+        doc_soup = BeautifulSoup(doc_request.text, 'lxml')
+        text = doc_soup.find("div", {"class": "pagetext"}).next_element
+        image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
+
+        with open(self.output_file_name, 'a', newline='') as output_csv:
+            row_dict = {
+                "title": page_element['title'],
+                "href": MAIN_URL + page_element['href'],
+                "image_url": image_url,
+                "text": text.text,
+                "index": self.index,
+                "page_number": self.page_number
+            }
+            fields = list(row_dict.keys())
+            writer = csv.DictWriter(output_csv, fieldnames=fields, delimiter='\t')
+            writer.writerow(row_dict)
+
+    def crawl(self):
+        soup, r = self._init_crawl()
+        first_search = True
+        while self.index < self.max_len:
+            time.sleep(0.3)
+            self.pbar.set_description(f"Page number: {self.page_number}")
+            next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona")
+            if next_page:
+                next_page = next_page.get('href', None)
+            if next_page and not first_search:
+                r = requests.get(MAIN_URL + next_page)
+            elif not next_page:
+                print(soup)
+                print("\n\n\n", soup.text)
+                print("End of pages, or next page not found")
+                break
+
+            # handle wrong request
+            if r.status_code != 200:
+                print('Retry of request, request data: ', r.__dict__)
+                time.sleep(60)
+                r = requests.get(MAIN_URL + next_page)
+                assert r.status_code == 200, f"Retry failed, request status: {r.status_code}, full_info: {r.__dict__}"
+
+            soup = BeautifulSoup(r.text, 'lxml')
+            links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
+            for link in links:
+                self.save_page_data(link)
+                self.index += 1
+                self.pbar.update(1)
+            self.page_number += 1
+            first_search = False
+        print("Finished")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--wiki_type", type=str, choices=["green", "yellow", "red"], required=True)
+    parser.add_argument("--output_file_name", type=str, required=True)
+    args, left_argv = parser.parse_known_args()
+
+    crawler = WikiCrawler(**vars(args))
+    crawler.crawl()
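One thing worth noting about `WikiCrawler`: `save_page_data` appends rows with `csv.DictWriter` but never writes a header row, while `load_last_checkpoint` reads the same file back with `pd.read_csv(..., sep='\t')`, which treats the first data row as a header and then relies on positional indices (`last_line[0]`, `[-1]`, `[-2]`). Below is a small sketch of one way to make the checkpoint file self-describing. This is a suggestion, not part of the patch; the helper names are hypothetical:

```python
# Sketch (not in the patch): write the header once, then read columns by name.
import csv
import os
import pandas as pd

FIELDS = ["title", "href", "image_url", "text", "index", "page_number"]

def append_row(output_file_name: str, row_dict: dict) -> None:
    # Write the header only when the file is created, so pandas can later
    # resolve columns by name instead of by position.
    new_file = not os.path.exists(output_file_name)
    with open(output_file_name, "a", newline="", encoding="utf-8") as output_csv:
        writer = csv.DictWriter(output_csv, fieldnames=FIELDS, delimiter="\t")
        if new_file:
            writer.writeheader()
        writer.writerow(row_dict)

def read_checkpoint(output_file_name: str):
    df = pd.read_csv(output_file_name, sep="\t", encoding="utf-8")
    last = df.iloc[-1]
    # Named access is robust to column reordering.
    return last["title"], int(last["index"]), int(last["page_number"])
```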
diff --git a/image_class.py b/image_class.py
new file mode 100644
index 0000000..b585244
--- /dev/null
+++ b/image_class.py
@@ -0,0 +1,91 @@
+import os
+import argparse
+import pandas as pd
+import requests
+from PIL import Image
+from tqdm import tqdm
+import pickle
+import time
+from pprint import pprint
+import json
+from datasets import load_dataset
+from huggingface_hub import login
+import shutil
+
+headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (no.rp.mk.info@gmail.com) requests/2.28.1'}
+
+class WikiImage:
+
+    def __init__(self, input_file_path: str, dataset_name: str, output_folder: str = 'temp_images', split_number: int = 1):
+        self.input_file_path = input_file_path
+        self.split_number = split_number
+        self.max_dataset_len = 10000
+        self.output_folder = output_folder
+        self.dataset_name = dataset_name
+        print("Loading input file")
+        self.dataframe = pd.read_csv(self.input_file_path, sep='\t')[(self.split_number - 1) * self.max_dataset_len:]
+        if os.path.exists(self.output_folder):
+            print("Removing old temp folder")
+            if os.path.exists('/home/zombely/.cache/huggingface/datasets'):
+                shutil.rmtree('/home/zombely/.cache/huggingface/datasets')
+            shutil.rmtree(self.output_folder)
+        os.mkdir(self.output_folder)
+        self.pbar = tqdm(self.dataframe.iterrows(), total=len(self.dataframe), desc=f"Split: {self.split_number}")
+
+        login(os.environ.get("HUG_TOKEN"), True)
+
+    def image_save(self, row):
+        time.sleep(0.3)
+        image_request = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
+        if image_request.status_code in [500, 404]:
+            print(f"Image {row[1]['title']} is not reachable")
+            return
+        if image_request.status_code != 200:
+            time.sleep(80)
+            image_request = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
+            assert image_request.status_code == 200, f"Response status is different, status_code: {image_request.status_code}, full info: {image_request.__dict__}"
+
+        image = Image.open(image_request.raw)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        title = row[1]['title'].replace("Strona:", "").replace("/", "-")
+        image.save(f"{self.output_folder}/{title}.png")
+
+        with open(f"{self.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
+            # f.write(str({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)})+"\n")
+            json.dump({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)}, f, ensure_ascii=False)
+            f.write("\n")
+
+    def push_dataset(self, split_name: str):
+        print(f"Pushing split: {split_name}")
+        dataset = load_dataset(self.output_folder)
+        dataset[split_name] = dataset.pop('train')
+        dataset.push_to_hub(f'Zombely/{self.dataset_name}')
+        shutil.rmtree(self.output_folder)
+        shutil.rmtree('/home/zombely/.cache/huggingface/datasets')
+        os.mkdir(self.output_folder)
+        del dataset
+        print("Upload finished")
+
+    def crawl(self):
+        print("Start download")
+        for index, row in enumerate(self.pbar):
+            self.image_save(row)
+            if (index + 1) % self.max_dataset_len == 0:
+                self.push_dataset(f'train_{self.split_number}')
+                self.split_number += 1
+                self.pbar.set_description(f'Split: {self.split_number}')
+
+        self.push_dataset('validation')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_file_path", type=str, required=True)
+    parser.add_argument("--dataset_name", type=str, required=True)
+    parser.add_argument("--output_folder", type=str, required=False, default='temp_images')
+    parser.add_argument("--split_number", type=int, required=False, default=1)
+    args, left_argv = parser.parse_known_args()
+    crawler = WikiImage(**vars(args))
+    crawler.crawl()
+
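For context on `push_dataset`: calling `load_dataset` on a local folder that contains images plus a `metadata.jsonl` file uses the datasets "imagefolder" convention, where each JSON line pairs a `file_name` with extra columns (here a stringified `ground_truth`). A minimal sketch of one metadata line and an explicit loader call; the file name and text below are made-up examples, and the `gt_parse` key is kept exactly as spelled in the patch:

```python
# Sketch (illustrative): the metadata.jsonl line format that load_dataset()
# picks up when pointed at a folder of images.
import json

example_line = {
    "file_name": "Jakis-tytul-001.png",  # hypothetical image file in the same folder
    "ground_truth": json.dumps(
        {"gt_parse": {"text_sequance": "Przykładowy tekst strony"}},  # key as used in image_class.py
        ensure_ascii=False,
    ),
}
print(json.dumps(example_line, ensure_ascii=False))

# Loading the folder explicitly as an imagefolder dataset (commented out so
# the sketch runs without the datasets package or network access):
# from datasets import load_dataset
# ds = load_dataset("imagefolder", data_dir="temp_images")
# ds["train"][0] -> {"image": <PIL.Image>, "ground_truth": "..."}
```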
diff --git a/mail_test.py b/mail_test.py
new file mode 100644
index 0000000..f10391b
--- /dev/null
+++ b/mail_test.py
@@ -0,0 +1,8 @@
+import smtplib
+
+def main():
+    smtp = smtplib.SMTP("0.0.0.0", 25, 'mail')
+    smtp.sendmail('info@zbhome.com', ['michalkozlowski936@gmail.com'], "Hello from zbhome")
+
+if __name__ == "__main__":
+    main()
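mail_test.py sends a bare string through a local SMTP relay; many servers accept that, but without `From`/`To`/`Subject` headers the message tends to arrive with empty fields. Below is a sketch of a slightly more complete version using the standard-library `email.message.EmailMessage`. The host, port, and addresses are copied from the script above; wiring it up as a crawl-finished notification is only a suggestion, not something the patch does:

```python
# Sketch (assumes the same local relay as mail_test.py) of sending a
# notification with proper headers once a crawl or upload finishes.
import smtplib
from email.message import EmailMessage

def send_notification(subject: str, body: str) -> None:
    msg = EmailMessage()
    msg["From"] = "info@zbhome.com"
    msg["To"] = "michalkozlowski936@gmail.com"
    msg["Subject"] = subject
    msg.set_content(body)

    # Same relay settings as mail_test.py: local server, port 25, local_hostname 'mail'.
    with smtplib.SMTP("0.0.0.0", 25, "mail") as smtp:
        smtp.send_message(msg)

if __name__ == "__main__":
    send_notification("Wiki crawl finished", "Hello from zbhome")
```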