# wikisource-crawler/image_download.py
# Michał Kozłowski 0f3a6eb9da qol update
# 2023-01-07 14:39:53 +01:00
#
# 69 lines
# 2.4 KiB
# Python

import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
# HTTP headers sent with every image request made by main(); the User-Agent
# names the bot and gives a contact address.
headers = {
    "User-Agent": "ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1",
}
def save_state(index, offset):
    """Write the resume position to ``./state.pickle`` and report it.

    Args:
        index: Position of the current row within the (possibly sliced)
            frame being iterated.
        offset: Number of rows that were skipped before iteration started.

    The stored value is ``index + offset``, saved under the ``"row_index"``
    key of a pickled dict.
    """
    resume_at = index + offset
    checkpoint = {"row_index": resume_at}
    with open("./state.pickle", "wb") as state_file:
        pickle.dump(checkpoint, state_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("Saving state, index: ", resume_at)
# Upper bound (seconds) for connecting to / reading from the image host, so a
# stalled connection raises instead of hanging the crawl forever.
_REQUEST_TIMEOUT_SECONDS = 30


def _folder_size_mb(folder):
    """Return the combined size of the entries directly inside *folder*, in
    megabytes (10**6 bytes), rounded to two decimals."""
    total_bytes = sum(
        os.path.getsize(os.path.join(folder, name)) for name in os.listdir(folder)
    )
    return round(total_bytes * 0.000001, 2)


def main(args):
    """Download the images listed in a TSV file and store them as PNG files.

    Args:
        args: Parsed command-line namespace providing
            ``file_path`` (tab-separated file with ``image_url`` and ``title``
            columns), ``output_folder`` (destination directory, created if
            missing), ``max_folder_size_mb`` (stop once the output folder
            exceeds this size) and ``from_checkpoint`` (resume from the row
            index stored in ``state.pickle``).

    The loop stops, after writing a checkpoint with ``save_state``, when a
    request returns a non-200 status, when the output folder exceeds the size
    limit, when any exception is raised while handling a row, or when the user
    interrupts with Ctrl+C.
    """
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0
    if not os.path.exists(args.output_folder):
        print(f"Creating missing folder: {args.output_folder}")
        # makedirs also handles nested output paths; exist_ok guards a race
        # with another process creating the folder.
        os.makedirs(args.output_folder, exist_ok=True)
    if args.from_checkpoint:
        # NOTE: pickle must only be loaded from trusted files; this one is
        # expected to be the checkpoint written by save_state().
        with open("state.pickle", "rb") as state:
            state_dict = pickle.load(state)
        offset = state_dict["row_index"]
        print("Starting from checkpoint, index: ", offset)
    # Positional slice: skip the rows handled before the checkpoint.
    df = df[offset:]
    for n, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
        try:
            # Throttle requests to stay polite towards the image host.
            time.sleep(0.2)
            # image_url is prefixed with the scheme here; presumably the file
            # stores protocol-relative URLs ("//host/path") - verify upstream.
            with requests.get(
                f"https:{row['image_url']}",
                stream=True,
                headers=headers,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            ) as r:
                if r.status_code != 200:
                    pprint(r.__dict__)
                    save_state(n, offset)
                    break
                # Decode and save while the streamed response is still open;
                # PIL reads the image data lazily from r.raw.
                image = Image.open(r.raw)
                if image.mode != "RGB":
                    image = image.convert("RGB")
                # "/" would be treated as a path separator in the file name.
                title = row['title'].replace("Strona:", "").replace("/", "-")
                image.save(os.path.join(args.output_folder, f"{title}.png"))
            # Measure the folder that is actually being written to.
            if _folder_size_mb(args.output_folder) > args.max_folder_size_mb:
                print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
                # The current row was saved successfully, so resume after it.
                save_state(n + 1, offset)
                break
        except Exception as e:
            # Any failure for this row: report it and checkpoint at the same
            # row so a resumed run retries it.
            print(e)
            save_state(n, offset)
            break
        except KeyboardInterrupt:
            save_state(n, offset)
            break
def str_to_bool(value):
    """Convert a command-line string into a boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    ``"False"``) as ``True``; this parser interprets the text instead.

    Args:
        value: Text such as ``"true"``, ``"False"``, ``"1"``, ``"no"``; an
            actual ``bool`` is returned unchanged.

    Returns:
        The parsed boolean. An empty string is ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"1", "true", "t", "yes", "y"}:
        return True
    if normalized in {"", "0", "false", "f", "no", "n"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":
    # Command-line entry point: parse the options and run the downloader.
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument("--max_folder_size_mb", default=5000.0, type=float, required=False)
    # Accepts "--from_checkpoint True/False" as before, and also a bare
    # "--from_checkpoint" (nargs="?" with const=True) as a plain flag.
    parser.add_argument(
        "--from_checkpoint",
        type=str_to_bool,
        nargs="?",
        const=True,
        required=False,
        default=False,
    )
    # Unknown extra arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()
    main(args)