diff --git a/image_download.py b/image_download.py index f630b39..2d5f0b4 100644 --- a/image_download.py +++ b/image_download.py @@ -13,18 +13,21 @@ headers = {'User-Agent': 'ImageDownloadOcrBot/1.0 (micha9@op.pl) requests/2.28.1 def save_state(index, offset): with open("./state.pickle", "wb") as state_file: pickle.dump({"row_index": index+offset}, state_file, protocol=pickle.HIGHEST_PROTOCOL) + print("Saving state, index: ", index+offset) def main(args): df = pd.read_csv(args.file_path, sep="\t") offset = 0 if not os.path.exists(args.output_folder): + print(f"Creating missing folder: {args.output_folder}") os.mkdir(args.output_folder) if args.from_checkpoint: with open("state.pickle", "rb") as state: state_dict = pickle.load(state) offset = state_dict["row_index"] + print("Starting from checkpoint, index: ", offset) df = df[offset:] for n, row in enumerate(tqdm(df.iterrows(), total=len(df))): @@ -41,7 +44,8 @@ def main(args): title = row[1]['title'].replace("Strona:", "").replace("/", "-") image.save(f"{args.output_folder}/{title}.png") - if round(sum(os.path.getsize(f"./images/{file}") for file in os.listdir("./images")) * 0.000001, 2) > args.max_folder_size_gb: + if round(sum(os.path.getsize(f"./images/{file}") for file in os.listdir("./images")) * 0.000001, 2) > args.max_folder_size_mb: + print(f"Limit size of: {args.max_folder_size_mb}, exceeded") save_state(n, offset) break except Exception as e: @@ -59,7 +63,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--file_path", type=str, required=True) parser.add_argument("--output_folder", type=str, default="./images") - parser.add_argument("--max_folder_size_gb", default=5000.0, type=float, required=False) + parser.add_argument("--max_folder_size_mb", default=5000.0, type=float, required=False) parser.add_argument("--from_checkpoint", type=bool, required=False, default=False) args, left_argv = parser.parse_known_args() main(args) \ No newline at end of file