"""Download the images listed in a tab-separated file and save them as PNG.

The input file is read with pandas and must provide an ``image_url`` column
(protocol-relative URL, ``https:`` is prepended) and a ``title`` column (used
to build the output file name). Progress can be checkpointed to
``state.pickle`` and resumed with ``--from_checkpoint``.
"""
import argparse
import os
import pickle
import time
from pprint import pprint

import pandas as pd
import requests
from PIL import Image
from tqdm import tqdm

# Checkpoint file, relative to the current working directory.
STATE_FILE = "./state.pickle"
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT_S = 30
# Pause between consecutive requests, in seconds.
REQUEST_DELAY_S = 0.2
# Factor used by the folder-size check (bytes -> megabytes).
BYTES_TO_MB = 0.000001

session = requests.Session()
session.headers.update({'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl)'})


def save_state(index, offset):
    """Write the absolute row index to resume from into the checkpoint file.

    Args:
        index: Position within the (already offset) frame being iterated.
        offset: Number of rows that were skipped at the start of this run.
    """
    with open(STATE_FILE, "wb") as state_file:
        pickle.dump(
            {"row_index": index + offset},
            state_file,
            protocol=pickle.HIGHEST_PROTOCOL,
        )


def _folder_size_mb(folder):
    """Return the summed size of the direct entries of *folder*, in MB."""
    total_bytes = sum(
        os.path.getsize(os.path.join(folder, name)) for name in os.listdir(folder)
    )
    return round(total_bytes * BYTES_TO_MB, 2)


def _str2bool(value):
    """Parse a command-line string into a bool.

    ``type=bool`` treats every non-empty string (including ``"False"``) as
    True, so the flag value is parsed explicitly instead.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if text in {"", "0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def main(args):
    """Download every image listed in ``args.file_path`` into ``args.output_folder``.

    The loop stops, after writing a checkpoint, on the first non-200 response,
    on any exception, on Ctrl+C, or once the output folder exceeds the
    configured size limit.

    Args:
        args: Parsed arguments providing ``file_path``, ``output_folder``,
            ``max_folder_size_gb`` and ``from_checkpoint``.
    """
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0
    # makedirs also handles nested output paths and an already existing folder.
    os.makedirs(args.output_folder, exist_ok=True)
    if args.from_checkpoint:
        # NOTE: pickle executes arbitrary code on load; only ever point this at
        # a state file produced by save_state().
        with open(STATE_FILE, "rb") as state:
            state_dict = pickle.load(state)
        offset = state_dict["row_index"]
    df = df[offset:]
    for n, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
        try:
            time.sleep(REQUEST_DELAY_S)
            # Use the shared session so the configured User-Agent is actually
            # sent; the context manager releases the streamed connection.
            with session.get(
                f"https:{row['image_url']}",
                stream=True,
                timeout=REQUEST_TIMEOUT_S,
            ) as r:
                if r.status_code != 200:
                    pprint(r.__dict__)
                    save_state(n, offset)
                    break
                image = Image.open(r.raw)
                if image.mode != "RGB":
                    image = image.convert("RGB")
                title = row['title'].replace("Strona:", "").replace("/", "-")
                image.save(f"{args.output_folder}/{title}.png")
            # NOTE(review): despite the ``_gb`` name, the limit is compared
            # against the folder size in megabytes. Kept as-is so existing
            # invocations behave the same -- confirm the intended unit.
            if _folder_size_mb(args.output_folder) > args.max_folder_size_gb:
                # This row was saved successfully, so resume from the next one.
                save_state(n + 1, offset)
                break
        except Exception as e:
            print(e)
            save_state(n, offset)
            break
        except KeyboardInterrupt:
            save_state(n, offset)
            break


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument(
        "--max_folder_size_gb",
        default=5000.0,
        type=float,
        required=False,
        help="stop once the output folder exceeds this size "
             "(currently compared in megabytes)",
    )
    parser.add_argument(
        "--from_checkpoint",
        type=_str2bool,
        nargs="?",
        const=True,
        required=False,
        default=False,
        help="resume from the row index stored in state.pickle",
    )
    args, _ = parser.parse_known_args()
    main(args)