wikisource-crawler/image_download.py
2023-01-07 14:19:01 +01:00

66 lines
2.3 KiB
Python

import os
import argparse
import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm
import pickle
import time
from pprint import pprint
# Shared HTTP session: one Session reuses TCP connections across requests
# and carries a browser-like User-Agent on every call (some wiki servers
# reject the default "python-requests" UA).
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 OPR/93.0.0.0'})
def save_state(index, offset, path="./state.pickle"):
    """Persist the crawl position so a later run can resume.

    Writes a pickle containing the absolute row index (``index + offset``)
    of the row being processed when the run stopped; ``main`` reads it back
    when ``--from_checkpoint`` is set.

    Args:
        index: Position within the current (possibly already-offset) frame.
        offset: Rows skipped at startup when resuming from a checkpoint.
        path: Destination pickle file (default matches what ``main`` reads).
    """
    with open(path, "wb") as state_file:
        pickle.dump({"row_index": index + offset}, state_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
def main(args):
    """Download every image listed in a TSV file of Wikisource pages.

    Expects the TSV (``args.file_path``) to have ``image_url`` (protocol-less
    ``//host/...`` URL) and ``title`` columns. Images are converted to RGB and
    saved as PNG into ``args.output_folder``. On any error, non-200 response,
    size-limit hit, or Ctrl-C, the current position is pickled via
    ``save_state`` so the run can resume with ``--from_checkpoint``.
    """
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0
    # makedirs + exist_ok also handles nested paths and pre-existing dirs,
    # unlike the bare os.mkdir it replaces.
    os.makedirs(args.output_folder, exist_ok=True)
    if args.from_checkpoint:
        with open("state.pickle", "rb") as state:
            state_dict = pickle.load(state)
        offset = state_dict["row_index"]
        df = df[offset:]
    for n, row in enumerate(tqdm(df.iterrows(), total=len(df))):
        try:
            time.sleep(0.2)  # polite per-request rate limit
            # BUG FIX: use the shared session (requests.get bypassed it,
            # so the custom User-Agent header was never actually sent).
            r = session.get(f"https:{row[1]['image_url']}", stream=True)
            if r.status_code != 200:
                pprint(r.__dict__)
                save_state(n, offset)
                break
            image = Image.open(r.raw)
            if image.mode != "RGB":
                image = image.convert("RGB")
            # Strip the "Strona:" page prefix and make the title filesystem-safe.
            title = row[1]['title'].replace("Strona:", "").replace("/", "-")
            image.save(f"{args.output_folder}/{title}.png")
            # BUG FIX: measure args.output_folder instead of a hard-coded
            # "./images" (crashed / measured the wrong dir for custom folders).
            # NOTE(review): bytes * 1e-6 is megabytes, yet the flag is named
            # max_folder_size_gb — kept as-is to preserve existing behavior
            # (default 5000.0 then acts as ~5 GB); confirm intended units.
            folder_size = round(
                sum(os.path.getsize(os.path.join(args.output_folder, f))
                    for f in os.listdir(args.output_folder)) * 0.000001, 2)
            if folder_size > args.max_folder_size_gb:
                save_state(n, offset)
                break
        except Exception as e:
            print(e)
            save_state(n, offset)
            break
        except KeyboardInterrupt:
            # Not caught by Exception above: KeyboardInterrupt derives from
            # BaseException, so this clause is the one that handles Ctrl-C.
            save_state(n, offset)
            break
def str2bool(value):
    """Parse common textual booleans; anything else is False.

    Needed because argparse's ``type=bool`` treats ANY non-empty string —
    including "False" — as True.
    """
    return str(value).strip().lower() in ("1", "true", "yes", "y", "t")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument("--max_folder_size_gb", default=5000.0, type=float, required=False)
    # BUG FIX: was type=bool, under which "--from_checkpoint False" enabled
    # checkpointing. str2bool parses the text explicitly and still accepts
    # the old "--from_checkpoint True" spelling.
    parser.add_argument("--from_checkpoint", type=str2bool, required=False, default=False)
    args, left_argv = parser.parse_known_args()
    main(args)