from __future__ import unicode_literals from __future__ import print_function from __future__ import absolute_import from future import standard_library standard_library.install_aliases() from builtins import str from builtins import range from builtins import object from unidecode import unidecode from .utils import get_browser_with_url, write_html_to_file, measure_time from bs4 import BeautifulSoup import urllib.parse import sys import requests import shutil import os import threading import queue IMAGE_FORMATS = ["bmp", "gif", "jpg", "png", "psd", "pspimage", "thm", "tif", "yuv", "ai", "drw", "eps", "ps", "svg", "tiff", "jpeg", "jif", "jfif", "jp2", "jpx", "j2k", "j2c", "fpx", "pcd", "png", "pdf"] # AUXILIARY CLASSES class ImageType(object): NONE = None FACE = "face" PHOTO = "photo" CLIPART = "clipart" LINE_DRAWING = "lineart" class SizeCategory(object): NONE = None ICON = "i" LARGE = "l" MEDIUM = "m" SMALL = "s" LARGER_THAN = "lt" EXACTLY = "ex" class LargerThan(object): NONE = None QSVGA = "qsvga" # 400 x 300 VGA = "vga" # 640 x 480 SVGA = "svga" # 800 x 600 XGA = "xga" # 1024 x 768 MP_2 = "2mp" # 2 MP (1600 x 1200) MP_4 = "4mp" # 4 MP (2272 x 1704) MP_6 = "6mp" # 6 MP (2816 x 2112) MP_8 = "8mp" # 8 MP (3264 x 2448) MP_10 = "10mp" # 10 MP (3648 x 2736) MP_12 = "12mp" # 12 MP (4096 x 3072) MP_15 = "15mp" # 15 MP (4480 x 3360) MP_20 = "20mp" # 20 MP (5120 x 3840) MP_40 = "40mp" # 40 MP (7216 x 5412) MP_70 = "70mp" # 70 MP (9600 x 7200) class ColorType(object): NONE = None COLOR = "color" BLACK_WHITE = "gray" SPECIFIC = "specific" class License(object): NONE = None REUSE = "fc" REUSE_WITH_MOD = "fmc" REUSE_NON_COMMERCIAL = "f" REUSE_WITH_MOD_NON_COMMERCIAL = "fm" class ImageOptions(object): """Allows passing options to filter a google images search.""" def __init__(self): self.image_type = None self.size_category = None self.larger_than = None self.exact_width = None self.exact_height = None self.color_type = None self.color = None self.license = None def __repr__(self): return unidecode(self.__dict__) def get_tbs(self): tbs = None if self.image_type: # clipart tbs = self._add_to_tbs(tbs, "itp", self.image_type) if self.size_category and not (self.larger_than or (self.exact_width and self.exact_height)): # i = icon, l = large, m = medium, lt = larger than, ex = exact tbs = self._add_to_tbs(tbs, "isz", self.size_category) if self.larger_than: # qsvga,4mp tbs = self._add_to_tbs(tbs, "isz", SizeCategory.LARGER_THAN) tbs = self._add_to_tbs(tbs, "islt", self.larger_than) if self.exact_width and self.exact_height: tbs = self._add_to_tbs(tbs, "isz", SizeCategory.EXACTLY) tbs = self._add_to_tbs(tbs, "iszw", self.exact_width) tbs = self._add_to_tbs(tbs, "iszh", self.exact_height) if self.color_type and not self.color: # color = color, gray = black and white, specific = user defined tbs = self._add_to_tbs(tbs, "ic", self.color_type) if self.color: tbs = self._add_to_tbs(tbs, "ic", ColorType.SPECIFIC) tbs = self._add_to_tbs(tbs, "isc", self.color) if self.license: tbs = self._add_to_tbs(tbs, "sur", self.license) return tbs def _add_to_tbs(self, tbs, name, value): if tbs: return "%s,%s:%s" % (tbs, name, value) else: return "&tbs=%s:%s" % (name, value) class ImageResult(object): """Represents a google image search result.""" ROOT_FILENAME = "img" DEFAULT_FORMAT = "jpg" def __init__(self): self.name = None self.file_name = None self.link = None self.thumb = None self.thumb_width = None self.thumb_height = None self.width = None self.height = None self.filesize = None self.format = None self.domain = None self.page = None self.index = None self.site = None def __eq__(self, other): return self.link == other.link def __hash__(self): return id(self.link) def __repr__(self): string = "ImageResult(index={i}, page={p}, domain={d}, link={l})".format( i=str(self.index), p=str(self.page), d=unidecode(self.domain) if self.domain else None, l=unidecode(self.link) if self.link else None ) return string def download(self, path="images"): """Download an image to a given path.""" self._create_path(path) # print path try: response = requests.get(self.link, stream=True) # request a protected image (adding a referer to the request) # referer = self.domain # image = self.link # req = urllib2.Request(image) # req.add_header('Referer', referer) # here is the trick # response = urllib2.urlopen(req) if "image" in response.headers['content-type']: path_filename = self._get_path_filename(path) with open(path_filename, 'wb') as output_file: shutil.copyfileobj(response.raw, output_file) # output_file.write(response.content) else: print("\r\rskiped! cached image") del response except Exception as inst: print(self.link, "has failed:") print(inst) def _get_path_filename(self, path): """Build the filename to download. Checks that filename is not already in path. Otherwise looks for another name. >>> ir = ImageResult() >>> ir._get_path_filename("test") 'test\\\img3.jpg' >>> ir.name = "pirulo" >>> ir.format = "jpg" >>> ir._get_path_filename("test") 'test\\\pirulo.jpg' """ path_filename = None # preserve the original name if self.file_name: original_filename = self.file_name path_filename = os.path.join(path, original_filename) # create a default name if there is no original name if not path_filename or os.path.isfile(path_filename): # take the format of the file, or use default if self.format: file_format = self.format else: file_format = self.DEFAULT_FORMAT # create root of file, until reaching a non existent one i = 1 default_filename = self.ROOT_FILENAME + str(i) + "." + file_format path_filename = os.path.join(path, default_filename) while os.path.isfile(path_filename): i += 1 default_filename = self.ROOT_FILENAME + str(i) + "." + \ file_format path_filename = os.path.join(path, default_filename) return path_filename def _create_path(self, path): """Create a path, if it doesn't exists.""" if not os.path.isdir(path): os.mkdir(path) # PRIVATE def _parse_image_format(image_link): """Parse an image format from a download link. Args: image_link: link to download an image. >>> link = "http://blogs.elpais.com/.a/6a00d8341bfb1653ef01a73dbb4a78970d-pi" >>> Google._parse_image_format(link) >>> link = "http://minionslovebananas.com/images/gallery/preview/Chiquita-DM2-minion-banana-3.jpg%3Fw%3D300%26h%3D429" >>> Google._parse_image_format(link) """ parsed_format = image_link[image_link.rfind(".") + 1:] # OLD: identify formats even with weird final characters if parsed_format not in IMAGE_FORMATS: for image_format in IMAGE_FORMATS: if image_format in parsed_format: parsed_format = image_format break if parsed_format not in IMAGE_FORMATS: parsed_format = None return parsed_format def _get_images_req_url(query, image_options=None, page=0, per_page=20): query = query.strip().replace(":", "%3A").replace( "+", "%2B").replace("&", "%26").replace(" ", "+") url = "https://www.google.com.ar/search?q={}".format(query) + \ "&es_sm=122&source=lnms" + \ "&tbm=isch&sa=X&ei=DDdUVL-fE4SpNq-ngPgK&ved=0CAgQ_AUoAQ" + \ "&biw=1024&bih=719&dpr=1.25" if image_options: tbs = image_options.get_tbs() if tbs: url = url + tbs return url def _find_divs_with_images(soup): try: div_container = soup.find("div", {"id": "rg_s"}) divs = div_container.find_all("div", {"class": "rg_di"}) except: divs = None return divs def _get_file_name(link): temp_name = link.rsplit('/', 1)[-1] image_format = _parse_image_format(link) if image_format and temp_name.rsplit(".", 1)[-1] != image_format: file_name = temp_name.rsplit(".", 1)[0] + "." + image_format else: file_name = temp_name return file_name def _get_name(): pass def _get_filesize(): pass def _get_image_data(res, a): """Parse image data and write it to an ImageResult object. Args: res: An ImageResult object. a: An "a" html tag. """ google_middle_link = a["href"] url_parsed = urllib.parse.urlparse(google_middle_link) qry_parsed = urllib.parse.parse_qs(url_parsed.query) res.name = _get_name() res.link = qry_parsed["imgurl"][0] res.file_name = _get_file_name(res.link) res.format = _parse_image_format(res.link) res.width = qry_parsed["w"][0] res.height = qry_parsed["h"][0] res.site = qry_parsed["imgrefurl"][0] res.domain = urllib.parse.urlparse(res.site).netloc res.filesize = _get_filesize() def _get_thumb_data(res, img): """Parse thumb data and write it to an ImageResult object. Args: res: An ImageResult object. a: An "a" html tag. """ try: res.thumb = img[0]["src"] except: res.thumb = img[0]["data-src"] try: img_style = img[0]["style"].split(";") img_style_dict = {i.split(":")[0]: i.split(":")[-1] for i in img_style} res.thumb_width = img_style_dict["width"] res.thumb_height = img_style_dict["height"] except: exc_type, exc_value, exc_traceback = sys.exc_info() print(exc_type, exc_value, "index=", res.index) # PUBLIC def search_old(query, image_options=None, pages=1): results = [] for i in range(pages): url = get_image_search_url(query, image_options, i) html = get_html(url) if html: if Google.DEBUG_MODE: write_html_to_file( html, "images_{0}_{1}.html".format(query.replace(" ", "_"), i)) j = 0 soup = BeautifulSoup(html) match = re.search("dyn.setResults\((.+)\);", html) if match: init = str(match.group(1), errors="ignore") tokens = init.split('],[') for token in tokens: res = ImageResult() res.page = i res.index = j toks = token.split(",") # should be 32 or 33, but seems to change, so just make sure no exceptions # will be thrown by the indexing if (len(toks) > 22): for t in range(len(toks)): toks[t] = toks[t].replace('\\x3cb\\x3e', '').replace( '\\x3c/b\\x3e', '').replace('\\x3d', '=').replace('\\x26', '&') match = re.search( "imgurl=(?P[^&]+)&imgrefurl", toks[0]) if match: res.link = match.group("link") res.name = toks[6].replace('"', '') res.thumb = toks[21].replace('"', '') res.format = toks[10].replace('"', '') res.domain = toks[11].replace('"', '') match = re.search( "(?P[0-9]+) × (?P[0-9]+) - (?P[^ ]+)", toks[9].replace('"', '')) if match: res.width = match.group("width") res.height = match.group("height") res.filesize = match.group("size") results.append(res) j = j + 1 return results def search(query, image_options=None, num_images=50): """Search images in google. Search images in google filtering by image type, size category, resolution, exact width, exact height, color type or color. A simple search can be performed without passing options. To filter the search, an ImageOptions must be built with the different filter categories and passed. Args: query: string to search in google images image_options: an ImageOptions object to filter the search num_images: number of images to be scraped Returns: A list of ImageResult objects """ results = set() curr_num_img = 1 page = 0 browser = get_browser_with_url("about:home") while curr_num_img <= num_images: page += 1 url = _get_images_req_url(query, image_options, page) # html = get_html_from_dynamic_site(url) browser.get(url) html = browser.page_source if html: soup = BeautifulSoup(html) # iterate over the divs containing images in one page divs = _find_divs_with_images(soup) # empty search result page case if not divs: break for div in divs: res = ImageResult() # store indexing paramethers res.page = page res.index = curr_num_img # get url of image and its secondary data a = div.find("a") if a: _get_image_data(res, a) # get url of thumb and its size paramethers img = a.find_all("img") if img: _get_thumb_data(res, img) # increment image counter only if a new image was added prev_num_results = len(results) results.add(res) curr_num_results = len(results) if curr_num_results > prev_num_results: curr_num_img += 1 # break the loop when limit of images is reached if curr_num_img >= num_images: break browser.quit() return list(results) def _download_image(image_result, path): if image_result.format: if path: image_result.download(path) else: image_result.download() @measure_time def download(image_results, path=None): """Download a list of images. Args: images_list: a list of ImageResult instances path: path to store downloaded images. """ total_images = len(image_results) i = 1 for image_result in image_results: progress = "".join(["Downloading image ", str(i), " (", str(total_images), ")"]) print(progress) sys.stdout.flush() _download_image(image_result, path) i += 1 class ThreadUrl(threading.Thread): """Threaded Url Grab""" def __init__(self, queue, path, total): threading.Thread.__init__(self) self.queue = queue self.path = path self.total = total def run(self): while True: # grabs host from queue image_result = self.queue.get() counter = self.total - self.queue.qsize() progress = "".join(["Downloading image ", str(counter), " (", str(self.total), ")"]) print(progress) sys.stdout.flush() _download_image(image_result, self.path) # signals to queue job is done self.queue.task_done() @measure_time def fast_download(image_results, path=None, threads=10): # print path queue = queue.Queue() total = len(image_results) for image_result in image_results: queue.put(image_result) # spawn a pool of threads, and pass them queue instance for i in range(threads): t = ThreadUrl(queue, path, total) t.setDaemon(True) t.start() # wait on the queue until everything has been processed queue.join()