crawler classes and image downloader

zzombely 2023-03-12 15:57:35 +00:00
parent 680c3d000c
commit da0e4f3263
5 changed files with 231 additions and 12 deletions

.gitignore

@@ -1,4 +1,6 @@
 env
 *.out
 images*
 *.tsv
+env_wikicrawler
+temp_images


@@ -13,12 +13,13 @@ def get_page_data(page_element):
     doc = requests.get(MAIN_URL + page_element['href'])
     doc_soup = BeautifulSoup(doc.text, 'lxml')
     text = doc_soup.find("div", {"class": "pagetext"}).next_element
-    image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
+    image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
     return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text.text}

-def save_data(file_name, data):
-    df = pd.DataFrame(data)
-    df.to_csv(f"./{file_name}.tsv", sep="\t")
+def save_data(file_name, data, args):
+    if not args.testing:
+        df = pd.DataFrame(data)
+        df.to_csv(f"./{file_name}.tsv", sep="\t")

 def main(args):
     category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
@@ -51,7 +52,7 @@ def main(args):
             if r.status_code != 200:
                 print(r.__dict__)
-                time.sleep(30)
+                time.sleep(60)
                 r = requests.get(MAIN_URL + next_page)
                 if r.status_code != 200:
                     break
@@ -63,19 +64,21 @@ def main(args):
             data_number += 1
             pbar.update(1)
     except Exception as e:
-        print(e)
-        save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
+        print("Error:", e)
+        save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
     except KeyboardInterrupt:
-        save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
+        save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
-    save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
+    save_data(f"./{args.output_file_name}-{args.type}", result_list, args)

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
+    parser.add_argument("--wiki_type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
     parser.add_argument("--output_file_name", type=str, required=True)
     parser.add_argument("--start_file_name", type=str, required=False)
     parser.add_argument("--start_page_number", type=int, required=False)
+    parser.add_argument("--testing", type=bool, required=False, default=False)
     args, left_argv = parser.parse_known_args()
-    main(args)
+    print(type(vars(args)))
+    # main(args)
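
Two things worth noting about this hunk: the save_data calls still format args.type even though the flag is now named --wiki_type, and argparse's type=bool converts any non-empty string to True, so --testing False would still enable testing mode. A minimal sketch of the more conventional action="store_true" form (illustrative only, not part of the commit):

import argparse

parser = argparse.ArgumentParser()
# store_true makes the flag a real on/off switch: absent -> False, present -> True
parser.add_argument("--testing", action="store_true")

print(parser.parse_args([]).testing)             # False
print(parser.parse_args(["--testing"]).testing)  # True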

crawler_class.py (new file)

@@ -0,0 +1,115 @@
from bs4 import BeautifulSoup
import requests
import time
import os
from tqdm import tqdm
import csv
from collections import deque
import argparse
import re
import pandas as pd

MAIN_URL = "https://pl.wikisource.org/"


class WikiCrawler:
    def __init__(self, wiki_type: str, output_file_name: str):
        self.wiki_type = wiki_type
        self.output_file_name = output_file_name
        self.page_number = 1
        self.index = 1
        self.load_last_checkpoint()

    def load_last_checkpoint(self):
        # Resume from the last row of an existing output TSV, if one is present.
        self.start_file_name = None
        if os.path.exists(self.output_file_name):
            df = pd.read_csv(self.output_file_name, encoding='utf-8', sep='\t')
            last_line = df.tail(1).iloc[0]
            self.start_file_name = last_line[0]
            self.page_number = int(last_line[-1])
            self.index = int(last_line[-2])
            del df
        print(f"Starting from index: {self.index}, page: {self.page_number}")

    def _init_crawl(self):
        category_dict = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}
        CATEGORY_URL = f"{MAIN_URL}/wiki/Kategoria:{category_dict[self.wiki_type]}"
        # if crawling should resume from a previous checkpoint
        if self.start_file_name:
            CATEGORY_URL = f"{MAIN_URL}/w/index.php?title=Kategoria:{category_dict[self.wiki_type]}&pagefrom={self.start_file_name}"
        request = requests.get(CATEGORY_URL)
        assert request.status_code == 200, f"Status different on main request, status: {request.status_code}"
        soup = BeautifulSoup(request.text, 'lxml')
        # Total number of pages in the category, parsed from the "mw-pages" summary text.
        self.max_len = int("".join(re.findall(r"\d", re.sub("\xa0", '', soup.find("div", {"id": "mw-pages"}).find("p").text))[3:]))
        self.pbar = tqdm(total=self.max_len)
        if self.start_file_name:
            self.pbar.update(self.index)
            self.pbar.refresh()
        return soup, request

    def save_page_data(self, page_element):
        time.sleep(0.3)
        doc_request = requests.get(MAIN_URL + page_element['href'])
        assert doc_request.status_code == 200, f"Wrong status on requesting doc link: {MAIN_URL + page_element['href']}"
        doc_soup = BeautifulSoup(doc_request.text, 'lxml')
        text = doc_soup.find("div", {"class": "pagetext"}).next_element
        image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
        with open(self.output_file_name, 'a', newline='') as output_csv:
            row_dict = {
                "title": page_element['title'],
                "href": MAIN_URL + page_element['href'],
                "image_url": image_url,
                "text": text.text,
                "index": self.index,
                "page_number": self.page_number
            }
            fields = list(row_dict.keys())
            writer = csv.DictWriter(output_csv, fieldnames=fields, delimiter='\t')
            writer.writerow(row_dict)

    def crawl(self):
        soup, r = self._init_crawl()
        first_search = True
        while self.index < self.max_len:
            time.sleep(0.3)
            self.pbar.set_description(f"Page number: {self.page_number}")
            next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona")
            if next_page:
                next_page = next_page.get('href', None)
            if next_page and not first_search:
                r = requests.get(MAIN_URL + next_page)
            elif not next_page:
                print(soup)
                print("\n\n\n", soup.text)
                print("End of pages, or next page not found")
                break
            # handle a failed request: wait a minute and retry once
            if r.status_code != 200:
                print('Retry of request, request data: ', r.__dict__)
                time.sleep(60)
                r = requests.get(MAIN_URL + next_page)
                assert r.status_code == 200, f"Retry failed, request status: {r.status_code}, full_info: {r.__dict__}"
            soup = BeautifulSoup(r.text, 'lxml')
            links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
            for link in links:
                self.save_page_data(link)
                self.index += 1
                self.pbar.update(1)
            self.page_number += 1
            first_search = False
        print("Finished")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--wiki_type", type=str, choices=["green", "yellow", "red"], required=True)
    parser.add_argument("--output_file_name", type=str, required=True)
    args, left_argv = parser.parse_known_args()

    crawler = WikiCrawler(**vars(args))
    crawler.crawl()
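
For reference, a minimal usage sketch of the class above; the module name follows the file name, and green_pages.tsv is a placeholder output path. If that TSV already exists, load_last_checkpoint resumes from its last row:

from crawler_class import WikiCrawler

# Crawl the "Uwierzytelniona" (green) category and append rows to the TSV.
crawler = WikiCrawler(wiki_type="green", output_file_name="green_pages.tsv")
crawler.crawl()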

image_class.py (new file)

@@ -0,0 +1,91 @@
import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
import json
from datasets import load_dataset
from huggingface_hub import login
import shutil

headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (no.rp.mk.info@gmail.com) requests/2.28.1'}


class WikiImage:
    def __init__(self, input_file_path: str, dataset_name: str, output_folder: str = 'temp_images', split_number: int = 1):
        self.input_file_path = input_file_path
        self.split_number = split_number
        self.max_dataset_len = 10000
        self.output_folder = output_folder
        self.dataset_name = dataset_name
        print("Loading input file")
        # Skip rows that belong to splits which were already uploaded.
        self.dataframe = pd.read_csv(self.input_file_path, sep='\t')[(self.split_number - 1) * self.max_dataset_len:]
        if os.path.exists(self.output_folder):
            print("Removing old dir")
            if os.path.exists('/home/zombely/.cache/huggingface/datasets'):
                shutil.rmtree('/home/zombely/.cache/huggingface/datasets')
            shutil.rmtree(self.output_folder)
        os.mkdir(self.output_folder)
        self.pbar = tqdm(self.dataframe.iterrows(), total=len(self.dataframe), desc=f"Split: {self.split_number}")
        login(os.environ.get("HUG_TOKEN"), True)

    def image_save(self, row):
        time.sleep(0.3)
        image_request = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
        if image_request.status_code in [500, 404]:
            print(f"Image {row[1]['title']} is not reachable")
            return
        if image_request.status_code != 200:
            # Back off and retry once before giving up.
            time.sleep(80)
            image_request = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
            assert image_request.status_code == 200, f"Response status is different, status_code: {image_request.status_code}, full info: {image_request.__dict__}"
        image = Image.open(image_request.raw)
        if image.mode != "RGB":
            image = image.convert("RGB")
        title = row[1]['title'].replace("Strona:", "").replace("/", "-")
        image.save(f"{self.output_folder}/{title}.png")
        with open(f"{self.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
            # f.write(str({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)})+"\n")
            json.dump({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text'].replace('"', "'")}}, ensure_ascii=False)}, f, ensure_ascii=False)
            f.write("\n")

    def push_dataset(self, split_name: str):
        print(f"Pushing split: {split_name}")
        dataset = load_dataset(self.output_folder)
        dataset[split_name] = dataset.pop('train')
        dataset.push_to_hub(f'Zombely/{self.dataset_name}')
        shutil.rmtree(self.output_folder)
        shutil.rmtree('/home/zombely/.cache/huggingface/datasets')
        os.mkdir(self.output_folder)
        del dataset
        print("Upload finished")

    def crawl(self):
        print("Start download")
        for index, row in enumerate(self.pbar):
            self.image_save(row)
            # Every max_dataset_len images, push a numbered train split and start a new one.
            if (index + 1) % self.max_dataset_len == 0:
                self.push_dataset(f'train_{self.split_number}')
                self.split_number += 1
                self.pbar.set_description(f'Split: {self.split_number}')
        self.push_dataset('validation')


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file_path", type=str, required=True)
    parser.add_argument("--dataset_name", type=str, required=True)
    parser.add_argument("--output_folder", type=str, required=False, default='temp_images')
    parser.add_argument("--split_number", type=int, required=False, default=1)
    args, left_argv = parser.parse_known_args()

    crawler = WikiImage(**vars(args))
    crawler.crawl()
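
Likewise, a minimal sketch of driving the downloader; the input TSV and dataset name are placeholders, and a Hugging Face token must be available in HUG_TOKEN because __init__ calls login():

from image_class import WikiImage

# Expects HUG_TOKEN to be set in the environment before construction,
# and the input TSV to have the columns written by WikiCrawler.
downloader = WikiImage(
    input_file_path="green_pages.tsv",
    dataset_name="wikisource-ocr-green",  # pushed as Zombely/<dataset_name>
    output_folder="temp_images",
    split_number=1,
)
downloader.crawl()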

mail_test.py (new file)

@@ -0,0 +1,8 @@
import smtplib


def main():
    # The third positional argument is local_hostname, used in the EHLO/HELO greeting;
    # sendmail's third argument is the raw message (no From/To/Subject headers here).
    smtp = smtplib.SMTP("0.0.0.0", 25, 'mail')
    smtp.sendmail('info@zbhome.com', ['michalkozlowski936@gmail.com'], "Hello from zbhome")


if __name__ == "__main__":
    main()
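
Because sendmail transmits its third argument verbatim, the test message above carries no headers. A small sketch of the same test with a properly formed message via the standard library's email.message (host and addresses kept as in the file):

import smtplib
from email.message import EmailMessage

msg = EmailMessage()
msg["From"] = "info@zbhome.com"
msg["To"] = "michalkozlowski936@gmail.com"
msg["Subject"] = "Hello from zbhome"
msg.set_content("Hello from zbhome")

# Connect to the same local relay and send the structured message.
with smtplib.SMTP("0.0.0.0", 25, "mail") as smtp:
    smtp.send_message(msg)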