79 lines
3.2 KiB
Python
79 lines
3.2 KiB
Python
import os
|
|
import argparse
|
|
import pandas as pd
|
|
import requests
|
|
from PIL import Image
|
|
from tqdm import tqdm
|
|
import pickle
|
|
import time
|
|
from pprint import pprint
|
|
import json
|
|
|
|
# HTTP headers passed to every requests.get() call in main(). The User-Agent
# string identifies this script (bot name, contact address, client library)
# to the server the images are downloaded from.
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'}
|
|
|
|
def save_state(index, offset, state_path="./state.pickle"):
    """Persist the absolute row index the next run should resume from.

    The checkpoint is a pickled dict ``{"row_index": index + offset}``.

    Args:
        index: Position of the row within the current run (0-based).
        offset: Number of rows skipped at the start of the current run
            (the ``row_index`` loaded from a previous checkpoint, or 0).
        state_path: Where to write the checkpoint. Defaults to the path the
            script has always used, ``./state.pickle``.
    """
    row_index = index + offset
    tmp_path = f"{state_path}.tmp"

    # Write to a sibling temp file first and swap it in with os.replace():
    # this function is called from the KeyboardInterrupt handler, and a second
    # interrupt in the middle of a direct write would leave a truncated,
    # unreadable checkpoint behind.
    with open(tmp_path, "wb") as state_file:
        pickle.dump({"row_index": row_index}, state_file, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(tmp_path, state_path)

    print("Saving state, index: ", row_index)
|
|
|
|
def _folder_size_mb(folder):
    """Return the summed size of the entries directly inside *folder*, in MB (10**6 bytes)."""
    total_bytes = sum(
        os.path.getsize(os.path.join(folder, entry)) for entry in os.listdir(folder)
    )
    return round(total_bytes * 0.000001, 2)


def _download_image(url, retry_delay_s=80, timeout_s=60):
    """Download *url* and return it as an RGB PIL image.

    A non-200 answer is retried once after ``retry_delay_s`` seconds. If the
    retry also fails, the response is dumped with ``pprint`` and ``None`` is
    returned so the caller can checkpoint and stop.
    """
    # A timeout turns a stalled connection into an exception instead of
    # blocking the whole run forever.
    response = requests.get(url, stream=True, headers=headers, timeout=timeout_s)
    if response.status_code != 200:
        response.close()
        time.sleep(retry_delay_s)
        response = requests.get(url, stream=True, headers=headers, timeout=timeout_s)

    with response:  # always hand the connection back to the pool
        if response.status_code != 200:
            pprint(response.__dict__)
            return None
        # The raw urllib3 stream does not undo Content-Encoding (gzip/deflate)
        # unless asked to.
        response.raw.decode_content = True
        image = Image.open(response.raw)
        image.load()  # decode fully while the response is still open

    return image if image.mode == "RGB" else image.convert("RGB")


def main(args):
    """Download the images listed in a TSV file and build an OCR dataset folder.

    For every row the image at ``https:`` + ``image_url`` is fetched, converted
    to RGB, and saved as ``<title>.png`` in the output folder; a matching line
    is appended to ``metadata.jsonl`` in the same folder. Progress is
    checkpointed through ``save_state`` whenever the run stops early (download
    failure, size limit reached, any exception, or Ctrl-C).

    Args:
        args: Parsed command-line namespace with the attributes
            ``file_path`` (tab-separated input file with ``image_url``,
            ``title`` and ``text`` columns), ``output_folder``,
            ``max_folder_size_mb`` (falsy means no limit) and
            ``from_checkpoint`` (resume from ``./state.pickle`` if it exists).
    """
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0

    if not os.path.exists(args.output_folder):
        print(f"Creating missing folder: {args.output_folder}")
        # makedirs also handles nested paths, which os.mkdir rejects.
        os.makedirs(args.output_folder, exist_ok=True)

    if args.from_checkpoint and os.path.exists("./state.pickle"):
        # NOTE: pickle.load executes arbitrary code from the file; only ever
        # point this at a checkpoint written by this script's save_state().
        with open("state.pickle", "rb") as state:
            state_dict = pickle.load(state)
        offset = state_dict["row_index"]
        print("Starting from checkpoint, index: ", offset)
        df = df.iloc[offset:]

    limit_label = args.max_folder_size_mb if args.max_folder_size_mb else 'No limit given'
    metadata_path = os.path.join(args.output_folder, "metadata.jsonl")
    pbar = tqdm(df.iterrows(), total=len(df), desc=f"0/{limit_label} MB")

    for n, (_, row) in enumerate(pbar):
        try:
            time.sleep(0.2)  # throttle requests to the image server

            image = _download_image(f"https:{row['image_url']}")
            if image is None:
                # Row n was not downloaded: resume from it next time.
                save_state(n, offset)
                return

            title = row['title'].replace("Strona:", "").replace("/", "-")
            image.save(os.path.join(args.output_folder, f"{title}.png"))

            # "ground_truth" is deliberately a JSON string nested inside the
            # record. The record itself must be serialized with json.dumps:
            # str(dict) emits Python repr (single quotes), which is not valid
            # JSON and breaks any JSON Lines reader.
            # NOTE(review): the key "text_sequance" looks like a typo of
            # "text_sequence"; kept as is because consumers may depend on it.
            record = {
                "file_name": f"{title}.png",
                "ground_truth": json.dumps(
                    {"gt_parse": {"text_sequance": row['text']}}, ensure_ascii=False
                ),
            }
            with open(metadata_path, mode='a', encoding='utf-8') as f:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

            # os.path.join keeps this working for absolute output folders;
            # prefixing "./" to an absolute path pointed at a wrong location.
            dir_size = _folder_size_mb(args.output_folder)
            pbar.set_description(f"{dir_size}/{limit_label} MB")

            if args.max_folder_size_mb and dir_size > args.max_folder_size_mb:
                print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
                # Row n is already on disk and in metadata.jsonl, so resume
                # from the NEXT row to avoid a duplicate metadata line.
                save_state(n + 1, offset)
                return

        except (Exception, KeyboardInterrupt) as e:
            # Best-effort run: report, checkpoint the failing row, and stop.
            print(f"Error: {str(e)} \n")
            print(f"Row: {row}")
            save_state(n, offset)
            return
|
|
|
|
|
|
|
|
def _str_to_bool(value):
    """Convert a command-line token such as "True", "false", "1" or "no" to a bool.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in {"true", "t", "yes", "y", "1"}:
        return True
    if token in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_args(argv=None):
    """Parse the script's command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``file_path``, ``output_folder``, ``max_folder_size_mb``
        and ``from_checkpoint``. Unrecognised arguments are ignored.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument("--max_folder_size_mb", type=float, required=False)
    # type=bool is a trap: bool("False") is True because any non-empty string
    # is truthy. Parse the text explicitly, and also accept the bare flag
    # (``--from_checkpoint``) as True.
    parser.add_argument(
        "--from_checkpoint",
        type=_str_to_bool,
        nargs="?",
        const=True,
        default=False,
        required=False,
    )
    args, _unknown = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":
    main(parse_args())