wikisource-crawler/image_download.py
Michał Kozłowski 680c3d000c fixing
2023-01-10 22:57:22 +01:00

79 lines
3.2 KiB
Python

import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
import json
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'}
def save_state(index, offset):
with open("./state.pickle", "wb") as state_file:
pickle.dump({"row_index": index+offset}, state_file, protocol=pickle.HIGHEST_PROTOCOL)
print("Saving state, index: ", index+offset)
def main(args):
df = pd.read_csv(args.file_path, sep="\t")
offset = 0
if not os.path.exists(args.output_folder):
print(f"Creating missing folder: {args.output_folder}")
os.mkdir(args.output_folder)
if args.from_checkpoint and os.path.exists("./state.pickle"):
with open("state.pickle", "rb") as state:
state_dict = pickle.load(state)
offset = state_dict["row_index"]
print("Starting from checkpoint, index: ", offset)
df = df[offset:]
pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB")
for n, row in enumerate(pbar):
try:
time.sleep(0.2)
r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
if r.status_code != 200:
time.sleep(80)
r = requests.get(f"https:{row[1]['image_url']}", stream=True, headers=headers)
if r.status_code != 200:
pprint(r.__dict__)
save_state(n, offset)
return
image = Image.open(r.raw)
if image.mode != "RGB":
image = image.convert("RGB")
title = row[1]['title'].replace("Strona:", "").replace("/", "-")
image.save(f"{args.output_folder}/{title}.png")
with open(f"{args.output_folder}/metadata.jsonl", mode='a', encoding='utf-8') as f:
f.write(str({"file_name": f"{title}.png", "ground_truth": json.dumps({"gt_parse": {"text_sequance": row[1]['text']}}, ensure_ascii=False)}) + "\n")
dir_size = round(sum(os.path.getsize(f"./{args.output_folder}/{file}") for file in os.listdir(f"./{args.output_folder}")) * 0.000001, 2)
pbar.set_description(f"{dir_size}/{args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'} MB")
if args.max_folder_size_mb and dir_size > args.max_folder_size_mb:
print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
save_state(n, offset)
return
except (Exception, KeyboardInterrupt) as e:
print(f"Error: {str(e)} \n")
print(f"Row: {row}")
save_state(n, offset)
return
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--file_path", type=str, required=True)
parser.add_argument("--output_folder", type=str, default="./images")
parser.add_argument("--max_folder_size_mb", type=float, required=False)
parser.add_argument("--from_checkpoint", type=bool, required=False, default=False)
args, left_argv = parser.parse_known_args()
main(args)