wikisource-crawler/image_download.py
Michał Kozłowski bf6014ba98 header update
2023-01-07 14:32:45 +01:00

65 lines
2.2 KiB
Python

import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
# Sent with every image request: identifies the crawler (bot name, contact
# e-mail, HTTP library) as required by the Wikimedia User-Agent policy.
headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1'}
def save_state(index, offset, path="./state.pickle"):
    """Persist the absolute row index so a later run can resume.

    Writes ``{"row_index": index + offset}`` to *path*; ``main()`` reads it
    back when started with ``--from_checkpoint`` and skips that many rows.

    Args:
        index: position within the current (possibly already sliced) frame.
        offset: rows skipped at start-up when this run itself resumed.
        path: pickle file to write; default matches what main() reads back.
    """
    with open(path, "wb") as state_file:
        pickle.dump({"row_index": index + offset}, state_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
def main(args):
    """Download page-scan images listed in a TSV manifest.

    Reads rows (``image_url``, ``title``) from ``args.file_path``, fetches
    each scan from Wikisource, converts it to RGB, and saves it as a PNG in
    ``args.output_folder``. A checkpoint is written and the loop stops on the
    first HTTP error, on hitting the folder-size limit, on any exception, or
    on Ctrl-C — so the run can be resumed with ``--from_checkpoint``.
    """
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0
    if not os.path.exists(args.output_folder):
        os.mkdir(args.output_folder)
    if args.from_checkpoint:
        with open("state.pickle", "rb") as state:
            state_dict = pickle.load(state)
            offset = state_dict["row_index"]
            df = df[offset:]
    for n, row in enumerate(tqdm(df.iterrows(), total=len(df))):
        try:
            time.sleep(0.2)  # throttle: stay polite to the Wikisource servers
            # Manifest stores protocol-relative URLs ("//upload...."), hence
            # the "https:" prefix.
            r = requests.get(f"https:{row[1]['image_url']}", stream=True,
                             headers=headers)
            if r.status_code != 200:
                pprint(r.__dict__)  # dump full response for debugging
                save_state(n, offset)
                break
            image = Image.open(r.raw)
            if image.mode != "RGB":
                image = image.convert("RGB")
            # Strip the "Strona:" (Polish "Page:") prefix; "/" would be a
            # path separator, so it becomes "-".
            title = row[1]['title'].replace("Strona:", "").replace("/", "-")
            image.save(f"{args.output_folder}/{title}.png")
            # BUG FIX: measure the configured output folder, not a
            # hard-coded "./images".
            # NOTE(review): bytes * 0.000001 is megabytes, yet it is compared
            # against --max_folder_size_gb; the mismatch is kept as-is to
            # preserve the existing stop threshold — confirm intended units.
            folder_size = sum(
                os.path.getsize(os.path.join(args.output_folder, file))
                for file in os.listdir(args.output_folder)
            )
            if round(folder_size * 0.000001, 2) > args.max_folder_size_gb:
                save_state(n, offset)
                break
        except KeyboardInterrupt:
            # Not a subclass of Exception, so it needs its own handler:
            # checkpoint before exiting on Ctrl-C.
            save_state(n, offset)
            break
        except Exception as e:
            # Best-effort: log, checkpoint, and stop rather than crash
            # mid-crawl with no resume point.
            print(e)
            save_state(n, offset)
            break
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # TSV manifest with at least 'image_url' and 'title' columns.
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument("--max_folder_size_gb", default=5000.0, type=float,
                        required=False)
    # BUG FIX: argparse's type=bool treats ANY non-empty string (even
    # "False") as True; store_true gives a real flag defaulting to False.
    parser.add_argument("--from_checkpoint", action="store_true",
                        required=False)
    # parse_known_args tolerates unrecognized extras (collected in left_argv),
    # so a legacy "--from_checkpoint True" invocation still starts up.
    args, left_argv = parser.parse_known_args()
    main(args)