from __future__ import unicode_literals
from __future__ import print_function
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
from builtins import object
from unidecode import unidecode

from .utils import get_browser_with_url, write_html_to_file, measure_time
from bs4 import BeautifulSoup
import urllib.parse
import sys
import re  # used by the legacy search_old() parser below
import requests
import shutil
import os
import threading
import queue

IMAGE_FORMATS = ["bmp", "gif", "jpg", "png", "psd", "pspimage", "thm",
                 "tif", "yuv", "ai", "drw", "eps", "ps", "svg", "tiff",
                 "jpeg", "jif", "jfif", "jp2", "jpx", "j2k", "j2c", "fpx",
                 "pcd", "pdf"]

# AUXILIARY CLASSES
class ImageType(object):
    NONE = None
    FACE = "face"
    PHOTO = "photo"
    CLIPART = "clipart"
    LINE_DRAWING = "lineart"


class SizeCategory(object):
    NONE = None
    ICON = "i"
    LARGE = "l"
    MEDIUM = "m"
    SMALL = "s"
    LARGER_THAN = "lt"
    EXACTLY = "ex"


class LargerThan(object):
    NONE = None
    QSVGA = "qsvga"  # 400 x 300
    VGA = "vga"      # 640 x 480
    SVGA = "svga"    # 800 x 600
    XGA = "xga"      # 1024 x 768
    MP_2 = "2mp"     # 2 MP (1600 x 1200)
    MP_4 = "4mp"     # 4 MP (2272 x 1704)
    MP_6 = "6mp"     # 6 MP (2816 x 2112)
    MP_8 = "8mp"     # 8 MP (3264 x 2448)
    MP_10 = "10mp"   # 10 MP (3648 x 2736)
    MP_12 = "12mp"   # 12 MP (4096 x 3072)
    MP_15 = "15mp"   # 15 MP (4480 x 3360)
    MP_20 = "20mp"   # 20 MP (5120 x 3840)
    MP_40 = "40mp"   # 40 MP (7216 x 5412)
    MP_70 = "70mp"   # 70 MP (9600 x 7200)


class ColorType(object):
    NONE = None
    COLOR = "color"
    BLACK_WHITE = "gray"
    SPECIFIC = "specific"


class License(object):
    NONE = None
    REUSE = "fc"
    REUSE_WITH_MOD = "fmc"
    REUSE_NON_COMMERCIAL = "f"
    REUSE_WITH_MOD_NON_COMMERCIAL = "fm"

class ImageOptions(object):

    """Allows passing options to filter a Google Images search."""

    def __init__(self):
        self.image_type = None
        self.size_category = None
        self.larger_than = None
        self.exact_width = None
        self.exact_height = None
        self.color_type = None
        self.color = None
        self.license = None

    def __repr__(self):
        # unidecode expects text, so the options dict is converted first
        return unidecode(str(self.__dict__))

    def get_tbs(self):
        tbs = None
        if self.image_type:
            # face, photo, clipart, lineart
            tbs = self._add_to_tbs(tbs, "itp", self.image_type)
        if self.size_category and not (self.larger_than or (self.exact_width and self.exact_height)):
            # i = icon, l = large, m = medium, lt = larger than, ex = exact
            tbs = self._add_to_tbs(tbs, "isz", self.size_category)
        if self.larger_than:
            # qsvga, 4mp, ...
            tbs = self._add_to_tbs(tbs, "isz", SizeCategory.LARGER_THAN)
            tbs = self._add_to_tbs(tbs, "islt", self.larger_than)
        if self.exact_width and self.exact_height:
            tbs = self._add_to_tbs(tbs, "isz", SizeCategory.EXACTLY)
            tbs = self._add_to_tbs(tbs, "iszw", self.exact_width)
            tbs = self._add_to_tbs(tbs, "iszh", self.exact_height)
        if self.color_type and not self.color:
            # color = color, gray = black and white, specific = user defined
            tbs = self._add_to_tbs(tbs, "ic", self.color_type)
        if self.color:
            tbs = self._add_to_tbs(tbs, "ic", ColorType.SPECIFIC)
            tbs = self._add_to_tbs(tbs, "isc", self.color)
        if self.license:
            tbs = self._add_to_tbs(tbs, "sur", self.license)
        return tbs

    def _add_to_tbs(self, tbs, name, value):
        if tbs:
            return "%s,%s:%s" % (tbs, name, value)
        else:
            return "&tbs=%s:%s" % (name, value)
class ImageResult(object):

    """Represents a Google Images search result."""

    ROOT_FILENAME = "img"
    DEFAULT_FORMAT = "jpg"

    def __init__(self):
        self.name = None
        self.file_name = None
        self.link = None
        self.thumb = None
        self.thumb_width = None
        self.thumb_height = None
        self.width = None
        self.height = None
        self.filesize = None
        self.format = None
        self.domain = None
        self.page = None
        self.index = None
        self.site = None

    def __eq__(self, other):
        return self.link == other.link

    def __hash__(self):
        # hash the link's value, not its id(): two results with equal links
        # must hash equally for the set deduplication in search() to work
        return hash(self.link)

    def __repr__(self):
        string = "ImageResult(index={i}, page={p}, domain={d}, link={l})".format(
            i=str(self.index),
            p=str(self.page),
            d=unidecode(self.domain) if self.domain else None,
            l=unidecode(self.link) if self.link else None
        )
        return string

    def download(self, path="images"):
        """Download an image to a given path."""

        self._create_path(path)

        try:
            response = requests.get(self.link, stream=True)
            # to request a protected image, add a referer header, e.g.:
            #   headers = {"Referer": self.domain}
            #   response = requests.get(self.link, stream=True,
            #                           headers=headers)

            if "image" in response.headers['content-type']:
                path_filename = self._get_path_filename(path)
                with open(path_filename, 'wb') as output_file:
                    shutil.copyfileobj(response.raw, output_file)
            else:
                print("\r\rskipped! response content-type is not an image")

            del response

        except Exception as inst:
            print(self.link, "has failed:")
            print(inst)

    def _get_path_filename(self, path):
        r"""Build the filename to download.

        Checks that the filename is not already in the path; otherwise it
        looks for another name. The examples below assume Windows path
        separators and that test\img1.jpg and test\img2.jpg already exist.

        >>> ir = ImageResult()
        >>> ir._get_path_filename("test")
        'test\\img3.jpg'
        >>> ir.file_name = "pirulo.jpg"
        >>> ir.format = "jpg"
        >>> ir._get_path_filename("test")
        'test\\pirulo.jpg'
        """

        path_filename = None

        # preserve the original name
        if self.file_name:
            original_filename = self.file_name
            path_filename = os.path.join(path, original_filename)

        # create a default name if there is no original name
        if not path_filename or os.path.isfile(path_filename):

            # take the format of the file, or use the default
            if self.format:
                file_format = self.format
            else:
                file_format = self.DEFAULT_FORMAT

            # build a default name, incrementing until a non-existent one is found
            i = 1
            default_filename = self.ROOT_FILENAME + str(i) + "." + file_format
            path_filename = os.path.join(path, default_filename)
            while os.path.isfile(path_filename):
                i += 1
                default_filename = self.ROOT_FILENAME + str(i) + "." + \
                    file_format
                path_filename = os.path.join(path, default_filename)

        return path_filename

    def _create_path(self, path):
        """Create a path, if it doesn't exist."""

        if not os.path.isdir(path):
            os.mkdir(path)
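
# Sketch (illustrative, not from the original module): downloading a single
# result by hand. The URL below is hypothetical.
#
#     res = ImageResult()
#     res.link = "http://example.com/photo.jpg"
#     res.format = "jpg"
#     res.download("images")  # no file_name set, so it saves as images/img1.jpg
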
# PRIVATE
def _parse_image_format(image_link):
    """Parse an image format from a download link.

    Returns None when no known format can be recognized in the link.

    Args:
        image_link: link to download an image.

    >>> link = "http://blogs.elpais.com/.a/6a00d8341bfb1653ef01a73dbb4a78970d-pi"
    >>> _parse_image_format(link)

    >>> link = "http://minionslovebananas.com/images/gallery/preview/Chiquita-DM2-minion-banana-3.jpg%3Fw%3D300%26h%3D429"
    >>> _parse_image_format(link)
    'jpg'
    """
    parsed_format = image_link[image_link.rfind(".") + 1:]

    # identify formats even when the extension has trailing characters
    if parsed_format not in IMAGE_FORMATS:
        for image_format in IMAGE_FORMATS:
            if image_format in parsed_format:
                parsed_format = image_format
                break

    if parsed_format not in IMAGE_FORMATS:
        parsed_format = None

    return parsed_format

def _get_images_req_url(query, image_options=None, page=0,
                        per_page=20):
    query = query.strip().replace(":", "%3A").replace(
        "+", "%2B").replace("&", "%26").replace(" ", "+")

    url = "https://www.google.com.ar/search?q={}".format(query) + \
          "&es_sm=122&source=lnms" + \
          "&tbm=isch&sa=X&ei=DDdUVL-fE4SpNq-ngPgK&ved=0CAgQ_AUoAQ" + \
          "&biw=1024&bih=719&dpr=1.25"

    if image_options:
        tbs = image_options.get_tbs()
        if tbs:
            url = url + tbs

    return url
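
# Note (an equivalence, not part of the original module): the manual escaping
# above produces the same result as urllib.parse.quote_plus for these
# characters.
#
#     urllib.parse.quote_plus("black & white cats")  # -> 'black+%26+white+cats'
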
def _find_divs_with_images(soup):

    try:
        div_container = soup.find("div", {"id": "rg_s"})
        divs = div_container.find_all("div", {"class": "rg_di"})
    except AttributeError:
        # find() returned None: the results container is not in the page
        divs = None
    return divs

def _get_file_name(link):

    temp_name = link.rsplit('/', 1)[-1]
    image_format = _parse_image_format(link)

    # replace a noisy url suffix with the parsed format, when they differ
    if image_format and temp_name.rsplit(".", 1)[-1] != image_format:
        file_name = temp_name.rsplit(".", 1)[0] + "." + image_format
    else:
        file_name = temp_name

    return file_name
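
# Example (illustrative):
#
#     _get_file_name("http://example.com/a/cat.jpg%3Fw%3D300")
#     # -> 'cat.jpg' (the noisy suffix is replaced by the parsed format)
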
def _get_name():
    # placeholder: name extraction is not implemented, returns None
    pass


def _get_filesize():
    # placeholder: file size extraction is not implemented, returns None
    pass

def _get_image_data(res, a):
    """Parse image data and write it to an ImageResult object.

    Args:
        res: An ImageResult object.
        a: An "a" html tag.
    """
    # the result's "a" tag points to a google intermediate url whose query
    # string carries the image url, the source page and the dimensions
    google_middle_link = a["href"]
    url_parsed = urllib.parse.urlparse(google_middle_link)
    qry_parsed = urllib.parse.parse_qs(url_parsed.query)
    res.name = _get_name()
    res.link = qry_parsed["imgurl"][0]
    res.file_name = _get_file_name(res.link)
    res.format = _parse_image_format(res.link)
    res.width = qry_parsed["w"][0]
    res.height = qry_parsed["h"][0]
    res.site = qry_parsed["imgrefurl"][0]
    res.domain = urllib.parse.urlparse(res.site).netloc
    res.filesize = _get_filesize()
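
# Sketch of the parsing above (url shape inferred from the keys used):
#
#     parsed = urllib.parse.urlparse(
#         "/imgres?imgurl=http://x.com/a.jpg&imgrefurl=http://x.com/page"
#         "&w=640&h=480")
#     urllib.parse.parse_qs(parsed.query)["imgurl"][0]  # -> 'http://x.com/a.jpg'
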
def _get_thumb_data(res, img):
    """Parse thumb data and write it to an ImageResult object.

    Args:
        res: An ImageResult object.
        img: A list of "img" html tags.
    """
    try:
        res.thumb = img[0]["src"]
    except KeyError:
        # lazily loaded thumbs carry the url in "data-src" instead of "src"
        res.thumb = img[0]["data-src"]

    try:
        img_style = img[0]["style"].split(";")
        img_style_dict = {i.split(":")[0]: i.split(":")[-1] for i in img_style}
        res.thumb_width = img_style_dict["width"]
        res.thumb_height = img_style_dict["height"]
    except KeyError:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print(exc_type, exc_value, "index=", res.index)

# PUBLIC
def search_old(query, image_options=None, pages=1):
    # Legacy implementation, kept for reference: it depends on helpers
    # (get_image_search_url, get_html, Google.DEBUG_MODE) from the old module
    # that are not imported here.
    results = []
    for i in range(pages):
        url = get_image_search_url(query, image_options, i)
        html = get_html(url)
        if html:
            if Google.DEBUG_MODE:
                write_html_to_file(
                    html, "images_{0}_{1}.html".format(query.replace(" ", "_"), i))
            j = 0
            soup = BeautifulSoup(html, "html.parser")
            match = re.search(r"dyn.setResults\((.+)\);</script>", html)
            if match:
                init = match.group(1)
                tokens = init.split('],[')
                for token in tokens:
                    res = ImageResult()
                    res.page = i
                    res.index = j
                    toks = token.split(",")

                    # should be 32 or 33, but seems to change, so just make
                    # sure no exceptions will be thrown by the indexing
                    if len(toks) > 22:
                        for t in range(len(toks)):
                            toks[t] = toks[t].replace('\\x3cb\\x3e', '').replace(
                                '\\x3c/b\\x3e', '').replace('\\x3d', '=').replace('\\x26', '&')
                        match = re.search(
                            "imgurl=(?P<link>[^&]+)&imgrefurl", toks[0])
                        if match:
                            res.link = match.group("link")
                        res.name = toks[6].replace('"', '')
                        res.thumb = toks[21].replace('"', '')
                        res.format = toks[10].replace('"', '')
                        res.domain = toks[11].replace('"', '')
                        match = re.search(
                            "(?P<width>[0-9]+) × (?P<height>[0-9]+) - (?P<size>[^ ]+)", toks[9].replace('"', ''))
                        if match:
                            res.width = match.group("width")
                            res.height = match.group("height")
                            res.filesize = match.group("size")
                        results.append(res)
                        j += 1
    return results

def search(query, image_options=None, num_images=50):
    """Search images in Google.

    Search images in Google, filtering by image type, size category,
    resolution, exact width, exact height, color type or color. A simple
    search can be performed without passing options. To filter the search,
    build an ImageOptions with the desired filter categories and pass it in.

    Args:
        query: string to search in google images
        image_options: an ImageOptions object to filter the search
        num_images: number of images to be scraped

    Returns:
        A list of ImageResult objects
    """

    results = set()
    curr_num_img = 1
    page = 0
    browser = get_browser_with_url("about:home")
    while curr_num_img <= num_images:

        page += 1
        url = _get_images_req_url(query, image_options, page)
        browser.get(url)
        html = browser.page_source

        if html:
            soup = BeautifulSoup(html, "html.parser")

            # iterate over the divs containing images in one page
            divs = _find_divs_with_images(soup)

            # empty search result page case
            if not divs:
                break

            for div in divs:

                res = ImageResult()

                # store indexing parameters
                res.page = page
                res.index = curr_num_img

                # get url of image and its secondary data
                a = div.find("a")
                if a:
                    _get_image_data(res, a)

                    # get url of thumb and its size parameters
                    img = a.find_all("img")
                    if img:
                        _get_thumb_data(res, img)

                # increment image counter only if a new image was added
                prev_num_results = len(results)
                results.add(res)
                curr_num_results = len(results)

                if curr_num_results > prev_num_results:
                    curr_num_img += 1

                # break the loop when the limit of images is reached
                if curr_num_img >= num_images:
                    break

    browser.quit()

    return list(results)
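
# Usage sketch (illustrative; requires a working selenium browser behind
# get_browser_with_url):
#
#     options = ImageOptions()
#     options.image_type = ImageType.PHOTO
#     results = search("landscape", image_options=options, num_images=10)
#     download(results, path="landscape_images")
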
def _download_image(image_result, path):

    # only download results whose image format could be recognized
    if image_result.format:
        if path:
            image_result.download(path)
        else:
            image_result.download()

@measure_time
def download(image_results, path=None):
    """Download a list of images.

    Args:
        image_results: a list of ImageResult instances
        path: path to store downloaded images.
    """

    total_images = len(image_results)
    i = 1
    for image_result in image_results:

        progress = "".join(["Downloading image ", str(i),
                            " (", str(total_images), ")"])
        print(progress)
        sys.stdout.flush()

        _download_image(image_result, path)

        i += 1

class ThreadUrl(threading.Thread):

    """Threaded url grab worker used by fast_download()."""

    def __init__(self, queue, path, total):
        threading.Thread.__init__(self)
        self.queue = queue
        self.path = path
        self.total = total

    def run(self):
        while True:
            # grab the next image result from the queue
            image_result = self.queue.get()

            counter = self.total - self.queue.qsize()
            progress = "".join(["Downloading image ", str(counter),
                                " (", str(self.total), ")"])
            print(progress)
            sys.stdout.flush()
            _download_image(image_result, self.path)

            # signal to the queue that the job is done
            self.queue.task_done()

@measure_time
def fast_download(image_results, path=None, threads=10):
    # a local name other than "queue" avoids shadowing the imported module,
    # which would raise UnboundLocalError on queue.Queue()
    download_queue = queue.Queue()
    total = len(image_results)

    for image_result in image_results:
        download_queue.put(image_result)

    # spawn a pool of threads, and pass them the queue instance
    for i in range(threads):
        t = ThreadUrl(download_queue, path, total)
        t.daemon = True
        t.start()

    # wait on the queue until everything has been processed
    download_queue.join()
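
# Usage sketch (illustrative): parallel download of previously scraped
# results.
#
#     results = search("landscape", num_images=20)
#     fast_download(results, path="landscape_images", threads=5)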