crawler classes and image downloader
This commit is contained in:
parent
680c3d000c
commit
da0e4f3263
2
.gitignore
vendored
2
.gitignore
vendored
@ -2,3 +2,5 @@ env
|
||||
*.out
|
||||
images*
|
||||
*.tsv
|
||||
env_wikicrawler
|
||||
temp_images
|
21
crawler.py
21
crawler.py
@ -13,10 +13,11 @@ def get_page_data(page_element):
|
||||
doc = requests.get(MAIN_URL + page_element['href'])
|
||||
doc_soup = BeautifulSoup(doc.text, 'lxml')
|
||||
text = doc_soup.find("div", {"class": "pagetext"}).next_element
|
||||
image_url = doc_soup.find("div", {"class": "prp-page-image"}).next_element['src']
|
||||
image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']
|
||||
return {"title": page_element['title'], "href": MAIN_URL + page_element['href'], "image_url": image_url, "text": text.text}
|
||||
|
||||
def save_data(file_name, data):
|
||||
def save_data(file_name, data, args):
|
||||
if not args.testing:
|
||||
df = pd.DataFrame(data)
|
||||
df.to_csv(f"./{file_name}.tsv", sep="\t")
|
||||
|
||||
@ -51,7 +52,7 @@ def main(args):
|
||||
|
||||
if r.status_code != 200:
|
||||
print(r.__dict__)
|
||||
time.sleep(30)
|
||||
time.sleep(60)
|
||||
r = requests.get(MAIN_URL + next_page)
|
||||
if r.status_code != 200:
|
||||
break
|
||||
@ -63,19 +64,21 @@ def main(args):
|
||||
data_number += 1
|
||||
pbar.update(1)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
|
||||
print("Error:", e)
|
||||
save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
|
||||
except KeyboardInterrupt:
|
||||
save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
|
||||
save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
|
||||
|
||||
save_data(f"./{args.output_file_name}-{args.type}.tsv", result_list)
|
||||
save_data(f"./{args.output_file_name}-{args.type}", result_list, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
|
||||
parser.add_argument("--wiki_type", type=str, default='green', choices=["green", "yellow", "red"], required=True)
|
||||
parser.add_argument("--output_file_name", type=str, required=True)
|
||||
parser.add_argument("--start_file_name", type=str, required=False)
|
||||
parser.add_argument("--start_page_number", type=int, required=False)
|
||||
parser.add_argument("--testing", type=bool, required=False, default=False)
|
||||
args, left_argv = parser.parse_known_args()
|
||||
main(args)
|
||||
print(type(vars(args)))
|
||||
# main(args)
|
||||
|
115
crawler_class.py
Normal file
115
crawler_class.py
Normal file
@ -0,0 +1,115 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import time
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import csv
|
||||
from collections import deque
|
||||
import argparse
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
MAIN_URL = "https://pl.wikisource.org/"
|
||||
|
||||
|
||||
class WikiCrawler:
    """Crawl one pl.wikisource category listing and append every page to a TSV file.

    For each ``/wiki/Strona:`` link found on the category listing the page is
    downloaded and one tab-separated row is appended to ``output_file_name``
    (no header row), with the columns
    ``title, href, image_url, text, index, page_number``.

    Rows are written as soon as a page is downloaded, so an interrupted run
    can be resumed: when ``output_file_name`` already exists, the crawl
    restarts from the title stored in its last row.
    """

    # Command-line colour -> category name used in the wiki URL.
    CATEGORY_NAMES = {"green": "Uwierzytelniona", "yellow": "Skorygowana", "red": "Przepisana"}

    def __init__(self, wiki_type: str, output_file_name: str):
        """Store the settings and pick up an existing checkpoint, if any.

        Args:
            wiki_type: Key of ``CATEGORY_NAMES`` selecting the category to crawl.
            output_file_name: Path of the TSV file rows are appended to.
        """
        self.wiki_type = wiki_type
        self.output_file_name = output_file_name
        self.page_number = 1
        # 1-based number of the NEXT row to be written.
        self.index = 1
        self.load_last_checkpoint()

    @staticmethod
    def _absolute_url(path):
        """Join ``MAIN_URL`` and a site-relative path with exactly one slash."""
        # MAIN_URL ends with "/" and hrefs start with "/", so plain
        # concatenation would produce "...org//wiki/...".
        return f"{MAIN_URL.rstrip('/')}/{path.lstrip('/')}"

    def load_last_checkpoint(self):
        """Restore ``start_file_name``, ``index`` and ``page_number`` from the output file.

        Leaves the fresh-start values untouched when the file is missing or
        holds no rows.  After a successful load ``index`` points at the row
        that follows the last stored one.
        """
        self.start_file_name = None
        if not os.path.exists(self.output_file_name):
            return

        try:
            # The file is written without a header row, so the first line
            # must be read as data, not as column names.
            df = pd.read_csv(self.output_file_name, encoding='utf-8', sep='\t', header=None)
        except pd.errors.EmptyDataError:
            return
        if df.empty:
            return

        last_row = df.iloc[-1]
        self.start_file_name = last_row.iloc[0]
        self.page_number = int(last_row.iloc[-1])
        self.index = int(last_row.iloc[-2]) + 1
        del df
        print(f"Starting from index: {self.index}, page: {self.page_number}")

    def _init_crawl(self):
        """Fetch the first listing page and set up ``max_len`` and the progress bar.

        Returns:
            A ``(soup, response)`` tuple for the first listing page.

        Raises:
            RuntimeError: If the listing request does not return status 200.
        """
        category = f"Kategoria:{self.CATEGORY_NAMES[self.wiki_type]}"
        if self.start_file_name:
            # Resume: let requests escape the title so characters such as
            # "&" or "#" cannot truncate the query string.
            request = requests.get(
                self._absolute_url("/w/index.php"),
                params={"title": category, "pagefrom": self.start_file_name},
            )
        else:
            request = requests.get(self._absolute_url(f"/wiki/{category}"))
        if request.status_code != 200:
            raise RuntimeError(f"Status diffrent on main request, status: {request.status_code}")

        soup = BeautifulSoup(request.text, 'lxml')
        summary = soup.find("div", {"id": "mw-pages"}).find("p").text.replace("\xa0", "")
        # NOTE(review): the first three digits are dropped — presumably the
        # per-page item count that precedes the category total; confirm
        # against the live page text.
        self.max_len = int("".join(re.findall(r"\d", summary)[3:]))
        self.pbar = tqdm(total=self.max_len)

        if self.start_file_name:
            # Rows 1 .. index-1 are already stored in the output file.
            self.pbar.update(self.index - 1)
            self.pbar.refresh()

        return soup, request

    def save_page_data(self, page_element):
        """Download one page and append its row to the output file.

        Args:
            page_element: Link tag of the page; its ``href`` and ``title``
                attributes are read.

        Raises:
            RuntimeError: If the page request does not return status 200.
        """
        time.sleep(0.3)
        page_url = self._absolute_url(page_element['href'])
        doc_request = requests.get(page_url)
        if doc_request.status_code != 200:
            raise RuntimeError(f"Wrong status on requesting doc link: {page_url}")

        doc_soup = BeautifulSoup(doc_request.text, 'lxml')
        text = doc_soup.find("div", {"class": "pagetext"}).next_element
        image_url = doc_soup.find("div", {"class": "prp-page-image"}).find("img")['src']

        row_dict = {
            "title": page_element['title'],
            "href": page_url,
            "image_url": image_url,
            "text": text.text,
            "index": self.index,
            "page_number": self.page_number,
        }
        # Explicit UTF-8 so the file can be read back by load_last_checkpoint
        # regardless of the platform default encoding.
        with open(self.output_file_name, 'a', newline='', encoding='utf-8') as output_csv:
            writer = csv.DictWriter(output_csv, fieldnames=list(row_dict), delimiter='\t')
            writer.writerow(row_dict)

    def crawl(self):
        """Walk the category listing page by page and store every linked page.

        Raises:
            RuntimeError: If a listing page still fails after one retry.
        """
        soup, r = self._init_crawl()
        first_search = True
        # index is the next row to write, so the crawl is complete only once
        # it has moved past max_len.
        while self.index <= self.max_len:
            time.sleep(0.3)
            self.pbar.set_description(f"Page number: {self.page_number}")

            # The first listing page was already fetched by _init_crawl; only
            # later iterations follow the "next page" link.
            if not first_search:
                next_page = soup.find("a", {"href": re.compile(r"\/w\/index.php.*")}, string="następna strona")
                next_href = next_page.get('href', None) if next_page else None
                if not next_href:
                    print(soup)
                    print("\n\n\n", soup.text)
                    print("End of pages, or next page not found")
                    break

                next_url = self._absolute_url(next_href)
                r = requests.get(next_url)
                # handle wrong request: wait and retry once
                if r.status_code != 200:
                    print('Retry of request, request data: ', r.__dict__)
                    time.sleep(60)
                    r = requests.get(next_url)
                    if r.status_code != 200:
                        raise RuntimeError(
                            f"Retry failed, request status: {r.status_code}, full_info: {r.__dict__}"
                        )
                soup = BeautifulSoup(r.text, 'lxml')

            links = soup.find_all("a", {"href": re.compile(r"\/wiki\/Strona:.*")})
            for link in links:
                # A resumed listing starts at the last stored title; skip it
                # so the output file does not get a duplicate row.
                if first_search and self.start_file_name and link.get('title') == self.start_file_name:
                    continue
                self.save_page_data(link)
                self.index += 1
                self.pbar.update(1)

            self.page_number += 1
            first_search = False
        print("Finished")
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: read the crawl settings from the command line,
    # build the crawler from them and run it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wiki_type",
        type=str,
        choices=["green", "yellow", "red"],
        required=True,
    )
    parser.add_argument(
        "--output_file_name",
        type=str,
        required=True,
    )
    # parse_known_args puts unrecognised arguments into left_argv instead of
    # aborting on them.
    args, left_argv = parser.parse_known_args()

    crawler = WikiCrawler(
        wiki_type=args.wiki_type,
        output_file_name=args.output_file_name,
    )
    crawler.crawl()
|
91
image_class.py
Normal file
91
image_class.py
Normal file
@ -0,0 +1,91 @@
|
||||
import os
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import requests
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
import pickle
|
||||
import time
|
||||
from pprint import pprint
|
||||
import json
|
||||
from datasets import load_dataset
|
||||
from huggingface_hub import login
|
||||
import shutil
|
||||
|
||||
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (no.rp.mk.info@gmail.com) requests/2.28.1'}
|
||||
|
||||
class WikiImage:
    """Download the page images listed in a TSV file and push them to the Hugging Face Hub.

    Images are stored as PNG files in ``output_folder`` together with a
    ``metadata.jsonl`` file holding the page text.  After every
    ``max_dataset_len`` rows the folder is pushed as split
    ``train_<split_number>`` and emptied; whatever remains at the end is
    pushed as split ``validation``.
    """

    # Column order of the header-less rows appended by WikiCrawler.
    CRAWLER_COLUMNS = ["title", "href", "image_url", "text", "index", "page_number"]
    # Columns this class reads from every row.
    REQUIRED_COLUMNS = frozenset({"title", "image_url", "text"})

    def __init__(self, input_file_path: str, dataset_name: str, output_folder: str = 'temp_images',
                 split_number: int = 1,
                 hf_cache_dir: str = '/home/zombely/.cache/huggingface/datasets'):
        """Load the input rows, reset the working folder and log in to the Hub.

        Args:
            input_file_path: Tab-separated file with at least the columns
                ``title``, ``image_url`` and ``text``.
            dataset_name: Name of the dataset pushed under the ``Zombely/`` namespace.
            output_folder: Working folder for images and metadata; it is
                deleted and recreated.
            split_number: 1-based split to start from; rows belonging to
                earlier splits are skipped.
            hf_cache_dir: Dataset cache folder removed around every upload.
                Missing folders are ignored.
        """
        self.input_file_path = input_file_path
        self.split_number = split_number
        self.max_dataset_len = 10000
        self.output_folder = output_folder
        self.dataset_name = dataset_name
        self.hf_cache_dir = hf_cache_dir
        print("Loading input file")
        first_row = (self.split_number - 1) * self.max_dataset_len
        self.dataframe = self._read_rows(self.input_file_path)[first_row:]
        if os.path.exists(self.output_folder):
            print("Removing old dear")
            self._clear_hf_cache()
            shutil.rmtree(self.output_folder)
        os.mkdir(self.output_folder)
        self.pbar = tqdm(self.dataframe.iterrows(), total=len(self.dataframe), desc=f"Split: {self.split_number}")

        # Keyword form: newer huggingface_hub releases accept this flag only
        # by keyword.
        login(os.environ.get("HUG_TOKEN"), add_to_git_credential=True)

    @classmethod
    def _read_rows(cls, input_file_path):
        """Read the input TSV, with or without a header row.

        A file whose first line names the required columns is returned as
        read.  A file with exactly the six crawler columns and no usable
        header is re-read with ``CRAWLER_COLUMNS`` as column names.  Any other
        layout is returned unchanged.
        """
        frame = pd.read_csv(input_file_path, sep='\t')
        if cls.REQUIRED_COLUMNS.issubset(frame.columns):
            return frame
        if len(frame.columns) == len(cls.CRAWLER_COLUMNS):
            return pd.read_csv(input_file_path, sep='\t', header=None, names=cls.CRAWLER_COLUMNS)
        return frame

    def _clear_hf_cache(self):
        """Delete the dataset cache folder when it exists."""
        if os.path.exists(self.hf_cache_dir):
            shutil.rmtree(self.hf_cache_dir)

    def image_save(self, row):
        """Download one image, save it as PNG and append its metadata line.

        Args:
            row: ``(index, Series)`` pair as produced by ``DataFrame.iterrows``.

        Raises:
            RuntimeError: If the image still cannot be fetched after one retry.
        """
        time.sleep(0.3)
        record = row[1]
        image_url = f"https:{record['image_url']}"
        image_request = requests.get(image_url, stream=True, headers=headers)
        if image_request.status_code in [500, 404]:
            image_request.close()
            print(f"Image {record['title']} is not reacheable")
            return
        if image_request.status_code != 200:
            # Any other failure: release the connection, wait and retry once.
            image_request.close()
            time.sleep(80)
            image_request = requests.get(image_url, stream=True, headers=headers)
            if image_request.status_code != 200:
                message = (
                    f"Response status is diffrent, status_code: {image_request.status_code}, "
                    f"full info: {image_request.__dict__}"
                )
                image_request.close()
                raise RuntimeError(message)

        title = record['title'].replace("Strona:", "").replace("/", "-")
        try:
            image = Image.open(image_request.raw)
            if image.mode != "RGB":
                image = image.convert("RGB")
            image.save(f"{self.output_folder}/{title}.png")
        finally:
            # A streamed response keeps its connection until it is closed.
            image_request.close()

        # pandas reads an empty text cell as NaN; store it as an empty string.
        text = record['text']
        text = "" if pd.isna(text) else str(text)
        ground_truth = json.dumps({"gt_parse": {"text_sequance": text.replace('"', "'")}}, ensure_ascii=False)
        with open(f"{self.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
            json.dump({"file_name": f"{title}.png", "ground_truth": ground_truth}, f, ensure_ascii=False)
            f.write("\n")

    def push_dataset(self, split_name: str):
        """Push the working folder as split ``split_name`` and empty it.

        Does nothing when the folder holds no ``metadata.jsonl``, i.e. no
        image has been saved since the previous push.
        """
        if not os.path.exists(os.path.join(self.output_folder, "metadata.jsonl")):
            print(f"Nothing to push for split: {split_name}")
            return

        print(f"Pushing split: {split_name}")
        dataset = load_dataset(self.output_folder)
        dataset[split_name] = dataset.pop('train')
        dataset.push_to_hub(f'Zombely/{self.dataset_name}')
        shutil.rmtree(self.output_folder)
        self._clear_hf_cache()
        os.mkdir(self.output_folder)
        del dataset
        print("Upload finished")

    def crawl(self):
        """Download every row's image, pushing a split every ``max_dataset_len`` rows."""
        print("Start download")
        for index, row in enumerate(self.pbar):
            self.image_save(row)
            if (index + 1) % self.max_dataset_len == 0:
                self.push_dataset(f'train_{self.split_number}')
                self.split_number += 1
                self.pbar.set_description(f'Split: {self.split_number}')

        # Whatever is left after the last full split goes to validation.
        self.push_dataset('validation')
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: collect the downloader settings from the command
    # line, build the downloader from them and start it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file_path",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--output_folder",
        type=str,
        required=False,
        default='temp_images',
    )
    parser.add_argument(
        "--split_number",
        type=int,
        required=False,
        default=1,
    )
    # parse_known_args puts unrecognised arguments into left_argv instead of
    # aborting on them.
    args, left_argv = parser.parse_known_args()
    crawler = WikiImage(
        input_file_path=args.input_file_path,
        dataset_name=args.dataset_name,
        output_folder=args.output_folder,
        split_number=args.split_number,
    )
    crawler.crawl()
|
||||
|
8
mail_test.py
Normal file
8
mail_test.py
Normal file
@ -0,0 +1,8 @@
|
||||
import smtplib
|
||||
|
||||
def main():
    """Send one fixed test message through the SMTP server at 0.0.0.0:25.

    The connection is opened with ``'mail'`` as the local hostname and is
    closed again when the message has been handed over or sending failed.
    """
    # The context manager issues QUIT and closes the socket on exit; the
    # connection was previously left open.
    with smtplib.SMTP("0.0.0.0", 25, 'mail') as smtp:
        smtp.sendmail('info@zbhome.com', ['michalkozlowski936@gmail.com'], "Hello from zbhome")


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user