image download script
This commit is contained in:
parent
526eb7d5e4
commit
f55125f447
1042
image_download.ipynb
Normal file
1042
image_download.ipynb
Normal file
File diff suppressed because one or more lines are too long
66
image_download.py
Normal file
66
image_download.py
Normal file
@ -0,0 +1,66 @@
|
||||
import os
|
||||
import argparse
|
||||
import pandas as pd
|
||||
import requests
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
import pickle
|
||||
import time
|
||||
from pprint import pprint
|
||||
|
||||
# Shared HTTP session so every request reuses connections and carries the same
# browser-like User-Agent header.
session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/107.0.0.0 Safari/537.36 OPR/93.0.0.0"
)
|
||||
|
||||
def save_state(index, offset, state_path="./state.pickle"):
    """Persist the absolute row position so a later run can resume from it.

    Args:
        index: Zero-based position of the current row within this run.
        offset: Number of rows that were skipped at the start of this run
            (the ``row_index`` restored from a previous checkpoint, or 0).
        state_path: Where to write the checkpoint. Defaults to
            ``./state.pickle``, the location used before this parameter
            existed.

    The stored dictionary has a single key, ``"row_index"``, holding
    ``index + offset``.
    """
    tmp_path = f"{state_path}.tmp"
    with open(tmp_path, "wb") as state_file:
        pickle.dump(
            {"row_index": index + offset},
            state_file,
            protocol=pickle.HIGHEST_PROTOCOL,
        )
    # Swap the finished file into place in one step: this function is called
    # from interrupt/error handlers, and a second interruption during a direct
    # write would leave a truncated, unreadable checkpoint behind.
    os.replace(tmp_path, state_path)
|
||||
|
||||
def _folder_size_mb(folder):
    """Return the summed size of the entries directly inside *folder*, in MB."""
    total_bytes = sum(
        os.path.getsize(os.path.join(folder, name)) for name in os.listdir(folder)
    )
    return round(total_bytes * 0.000001, 2)


def main(args):
    """Download every image listed in a TSV file and store it as a PNG.

    Args:
        args: Parsed command-line namespace providing
            ``file_path`` (tab-separated file with ``image_url`` and ``title``
            columns), ``output_folder`` (destination directory),
            ``max_folder_size_gb`` (size limit for the destination directory)
            and ``from_checkpoint`` (resume from ``state.pickle`` when truthy).

    The loop stops, after saving a checkpoint via ``save_state``, when a
    request returns a non-200 status, when any exception is raised while
    processing a row, when the user interrupts the run, or when the output
    folder grows past the configured limit.

    Raises:
        FileNotFoundError: If ``from_checkpoint`` is set but ``state.pickle``
            does not exist.
    """
    df = pd.read_csv(args.file_path, sep="\t")
    offset = 0

    # makedirs also copes with nested output paths and an existing directory.
    os.makedirs(args.output_folder, exist_ok=True)

    if args.from_checkpoint:
        # NOTE(security): pickle executes arbitrary code on load; only resume
        # from a state.pickle that this script wrote itself.
        with open("state.pickle", "rb") as state:
            state_dict = pickle.load(state)
        offset = state_dict["row_index"]
        df = df[offset:]

    # Without a timeout a stalled server would block the run forever.
    request_timeout_s = 30

    for n, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
        try:
            # Small pause between requests to avoid hammering the server.
            time.sleep(0.2)

            # Use the shared session so the configured User-Agent is actually
            # sent; the context manager releases the streamed connection.
            with session.get(
                f"https:{row['image_url']}",
                stream=True,
                timeout=request_timeout_s,
            ) as r:
                if r.status_code != 200:
                    pprint(r.__dict__)
                    save_state(n, offset)
                    break

                image = Image.open(r.raw)
                if image.mode != "RGB":
                    image = image.convert("RGB")

                title = row["title"].replace("Strona:", "").replace("/", "-")
                # Saving happens while the response is still open because
                # Image.open reads pixel data lazily from r.raw.
                image.save(os.path.join(args.output_folder, f"{title}.png"))

            # NOTE(review): the arithmetic yields megabytes although the option
            # is named max_folder_size_gb; kept as-is so existing invocations
            # keep their meaning (default 5000.0 is roughly 5 GB) - confirm
            # the intended unit.
            if _folder_size_mb(args.output_folder) > args.max_folder_size_gb:
                save_state(n, offset)
                break
        except Exception as e:
            print(e)
            save_state(n, offset)
            break
        except KeyboardInterrupt:
            save_state(n, offset)
            break
|
||||
|
||||
|
||||
|
||||
def parse_bool_flag(value):
    """Turn a command-line token into a real boolean.

    ``type=bool`` treats every non-empty string (including ``"False"``) as
    ``True``; this parser understands the usual spellings instead.

    Raises:
        argparse.ArgumentTypeError: If *value* is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in {"1", "true", "t", "yes", "y"}:
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if token in {"", "0", "false", "f", "no", "n"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def build_arg_parser():
    """Create the command-line parser for the image download script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, required=True)
    parser.add_argument("--output_folder", type=str, default="./images")
    parser.add_argument("--max_folder_size_gb", default=5000.0, type=float, required=False)
    # Accepts both a bare ``--from_checkpoint`` and an explicit value such as
    # ``--from_checkpoint False``.
    parser.add_argument(
        "--from_checkpoint",
        type=parse_bool_flag,
        nargs="?",
        const=True,
        default=False,
        required=False,
    )
    return parser


if __name__ == "__main__":
    # parse_known_args tolerates extra arguments that do not belong to this
    # script; they are ignored.
    args, _unknown_args = build_arg_parser().parse_known_args()
    main(args)
|
Loading…
Reference in New Issue
Block a user