"""Build an image-classification dataset: scrape Google Images via Selenium,
persist the results, and downscale images in place.

Consolidated from a mangled patch that added two files
(imageClasification/imageResize.py and imageClasification/imageScraper.py).
"""
import hashlib
import io
import os
import time

import requests
from PIL import Image
from selenium import webdriver

# Path to the ChromeDriver binary used by search_and_download().
DRIVER_PATH = './chromedriver.exe'


def resize_images(folder='.', max_size=(256, 256), quality=60):
    """Downscale every jpeg/png/jpg file in *folder* in place.

    Image.thumbnail() preserves aspect ratio and never upscales; files are
    re-saved over themselves with the given JPEG/PNG quality setting.
    (Originally a flat script in imageResize.py that ran on import; wrapped
    in a function so importing this module has no side effects.)
    """
    images = [f for f in os.listdir(folder) if f.endswith(('jpeg', 'png', 'jpg'))]
    for name in images:
        path = os.path.join(folder, name)
        # Context manager closes the file handle (the original leaked it).
        with Image.open(path) as img:
            img.thumbnail(max_size)
            img.save(path, optimize=True, quality=quality)


def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    """Return a set of full-size image URLs for *query* from Google Images.

    Scrolls the results page, clicks each thumbnail to reveal the full-size
    image, and collects its ``src`` until *max_links_to_fetch* URLs are found
    or no more results load.

    NOTE(review): ``find_elements_by_css_selector`` was removed in Selenium 4;
    this code assumes Selenium 3.x. The CSS class names (``img.Q4LuWd``,
    ``img.n3VNCb``, ``.mye4qd``) are Google-internal and break whenever Google
    changes its markup — verify before relying on this.
    """
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # Build the Google Images query URL.
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # Load the results page.
    wd.get(search_url.format(q=query))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # Gather all thumbnail elements currently on the page.
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # Click every thumbnail so the full-size image appears.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Extract full-size image URLs revealed by the click.
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                break

        if len(image_urls) >= max_links_to_fetch:
            print(f"Found: {len(image_urls)} image links, done!")
            break

        print("Found:", len(image_urls), "image links, looking for more ...")
        # BUG FIX: the original had a `continue` here, which made the
        # load-more click and the results_start update unreachable — the loop
        # re-scanned the same thumbnails forever instead of loading more.
        load_more_button = wd.find_element_by_css_selector(".mye4qd")
        if load_more_button:
            wd.execute_script("document.querySelector('.mye4qd').click();")

        # Move the result start point past the thumbnails already processed.
        results_start = number_results

    return image_urls


def persist_image(folder_path: str, url: str):
    """Download *url* and save it as a JPEG named by a hash of its content.

    The filename is the first 10 hex chars of the SHA-1 of the raw bytes, so
    duplicate downloads overwrite themselves instead of accumulating.
    Errors are reported to stdout; the function never raises.
    """
    try:
        # timeout added: the original could hang forever on a dead URL.
        image_content = requests.get(url, timeout=30).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # BUG FIX: the original fell through here and then raised NameError
        # on the unbound `image_content` in the next try block.
        return

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")


def search_and_download(search_term: str, driver_path: str, target_path='./images', number_images=200):
    """Scrape *number_images* image URLs for *search_term* and download them.

    Images land in ``target_path/<search_term with spaces as underscores>``.
    The Chrome WebDriver is closed by the context manager even on error.
    """
    target_folder = os.path.join(target_path, '_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() / os.makedirs() pair.
    os.makedirs(target_folder, exist_ok=True)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd,
                               sleep_between_interactions=0.2)

        for elem in res:
            persist_image(target_folder, elem)


if __name__ == "__main__":
    # Guarded so importing this module does not launch a browser
    # (the original ran the scrape at import time).
    search_term = 'cowss'

    search_and_download(
        search_term=search_term,
        driver_path=DRIVER_PATH
    )