wikisource-crawler/image_download.py
Michał Kozłowski 9ee285cf95 fix
2023-01-07 14:41:26 +01:00

69 lines
2.4 KiB
Python

import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
# HTTP headers sent with every image request made by main(); the User-Agent
# names this bot, a contact address and the HTTP client library.
# NOTE(review): presumably required by the image host's bot policy - confirm.
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'}
def save_state(index, offset, state_path="./state.pickle"):
    """Write a resume checkpoint holding the absolute row index.

    The checkpoint is a pickled dict of the form ``{"row_index": index + offset}``.

    Args:
        index: Position of the row within the current run (0-based).
        offset: Number of rows that were skipped at the start of the current
            run; added to ``index`` to obtain the absolute row index.
        state_path: File the checkpoint is written to. Defaults to
            ``"./state.pickle"`` so existing two-argument calls keep writing
            to the same location as before.
    """
    row_index = index + offset
    with open(state_path, "wb") as state_file:
        pickle.dump(
            {"row_index": row_index},
            state_file,
            protocol=pickle.HIGHEST_PROTOCOL,
        )
    print("Saving state, index: ", row_index)
# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the crawl forever.
_REQUEST_TIMEOUT_S = 30


def _folder_size_bytes(folder):
    """Return the summed size in bytes of the direct entries of ``folder``."""
    return sum(
        os.path.getsize(os.path.join(folder, name)) for name in os.listdir(folder)
    )


def main(args):
    """Download the images listed in a TSV file into the output folder.

    Rows are read from ``args.file_path`` (tab-separated, with ``image_url``
    and ``title`` columns). Each image is fetched from ``https:`` +
    ``image_url``, converted to RGB if needed and stored as
    ``<output_folder>/<title>.png``, where the title has ``"Strona:"`` removed
    and ``"/"`` replaced by ``"-"``.

    The loop stops, after writing a checkpoint with ``save_state``, when:
      * a response has a status code other than 200,
      * the output folder grows beyond ``args.max_folder_size_mb``,
      * any exception is raised while handling a row, or
      * the user interrupts the run (Ctrl+C).

    Args:
        args: Parsed command-line namespace providing ``file_path``,
            ``output_folder``, ``max_folder_size_mb`` and ``from_checkpoint``.
    """
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0

    if not os.path.exists(args.output_folder):
        print(f"Creating missing folder: {args.output_folder}")
        # makedirs also handles nested output paths, unlike os.mkdir.
        os.makedirs(args.output_folder, exist_ok=True)

    if args.from_checkpoint and os.path.exists("./state.pickle"):
        # NOTE: pickle.load can execute arbitrary code; only ever point this
        # at a checkpoint file written by this script.
        with open("./state.pickle", "rb") as state:
            state_dict = pickle.load(state)
        offset = state_dict["row_index"]
        print("Starting from checkpoint, index: ", offset)

    # Skip the rows that were already processed in a previous run.
    df = df[offset:]

    # Measure the folder once, then keep a running total instead of
    # re-listing every file after each download.
    folder_bytes = _folder_size_bytes(args.output_folder)

    for n, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
        try:
            # Pause between requests to avoid hammering the server.
            time.sleep(0.2)
            with requests.get(
                f"https:{row['image_url']}",
                stream=True,
                headers=headers,
                timeout=_REQUEST_TIMEOUT_S,
            ) as r:
                if r.status_code != 200:
                    pprint(r.__dict__)
                    save_state(n, offset)
                    break

                # The raw stream is not content-decoded by default; enable it
                # so a compressed response body still parses as an image.
                r.raw.decode_content = True
                image = Image.open(r.raw)
                if image.mode != "RGB":
                    image = image.convert("RGB")

                title = row['title'].replace("Strona:", "").replace("/", "-")
                target = os.path.join(args.output_folder, f"{title}.png")
                # An existing file with the same title is overwritten, so only
                # the size difference may be added to the running total.
                previous_size = os.path.getsize(target) if os.path.exists(target) else 0
                # Saving must happen while the response is still open because
                # the image data is read lazily from the stream.
                image.save(target)

            folder_bytes += os.path.getsize(target) - previous_size

            if round(folder_bytes * 0.000001, 2) > args.max_folder_size_mb:
                print(f"Limit size of: {args.max_folder_size_mb}, exceeded")
                # Row n is already on disk: resume from the next row.
                save_state(n + 1, offset)
                break
        except Exception as e:
            # Row n was not completed: checkpoint it so it is retried.
            print(e)
            save_state(n, offset)
            break
        except KeyboardInterrupt:
            save_state(n, offset)
            break
def _parse_bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` in argparse treats every non-empty string as ``True`` (so
    ``--from_checkpoint False`` would enable the option); this parser maps the
    usual spellings explicitly instead.

    Args:
        value: The raw command-line token (or an actual ``bool``).

    Returns:
        ``True`` or ``False``. The empty string maps to ``False``.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean
            spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if normalized in {"0", "false", "f", "no", "n", "off", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument("--max_folder_size_mb", default=5000.0, type=float, required=False)
    # Accepts both the bare flag (`--from_checkpoint`) and an explicit value
    # (`--from_checkpoint True` / `--from_checkpoint False`).
    parser.add_argument(
        "--from_checkpoint",
        type=_parse_bool,
        nargs="?",
        const=True,
        default=False,
        required=False,
    )
    # Unknown arguments are tolerated and ignored, as before.
    args, left_argv = parser.parse_known_args()
    main(args)