from __future__ import unicode_literals
from __future__ import print_function
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases()
from builtins import str
from builtins import range
from builtins import object
from unidecode import unidecode

from .utils import get_browser_with_url, write_html_to_file, measure_time
from bs4 import BeautifulSoup
import urllib.parse
import sys
import re  # used by the legacy search_old() parser below
import requests
import shutil
import os
import threading
import queue

IMAGE_FORMATS = ["bmp", "gif", "jpg", "png", "psd", "pspimage", "thm",
                 "tif", "yuv", "ai", "drw", "eps", "ps", "svg", "tiff",
                 "jpeg", "jif", "jfif", "jp2", "jpx", "j2k", "j2c", "fpx",
                 "pcd", "pdf"]

# AUXILIARY CLASSES
class ImageType(object):
    NONE = None
    FACE = "face"
    PHOTO = "photo"
    CLIPART = "clipart"
    LINE_DRAWING = "lineart"


class SizeCategory(object):
    NONE = None
    ICON = "i"
    LARGE = "l"
    MEDIUM = "m"
    SMALL = "s"
    LARGER_THAN = "lt"
    EXACTLY = "ex"


class LargerThan(object):
    NONE = None
    QSVGA = "qsvga"  # 400 x 300
    VGA = "vga"      # 640 x 480
    SVGA = "svga"    # 800 x 600
    XGA = "xga"      # 1024 x 768
    MP_2 = "2mp"     # 2 MP (1600 x 1200)
    MP_4 = "4mp"     # 4 MP (2272 x 1704)
    MP_6 = "6mp"     # 6 MP (2816 x 2112)
    MP_8 = "8mp"     # 8 MP (3264 x 2448)
    MP_10 = "10mp"   # 10 MP (3648 x 2736)
    MP_12 = "12mp"   # 12 MP (4096 x 3072)
    MP_15 = "15mp"   # 15 MP (4480 x 3360)
    MP_20 = "20mp"   # 20 MP (5120 x 3840)
    MP_40 = "40mp"   # 40 MP (7216 x 5412)
    MP_70 = "70mp"   # 70 MP (9600 x 7200)


class ColorType(object):
    NONE = None
    COLOR = "color"
    BLACK_WHITE = "gray"
    SPECIFIC = "specific"


class License(object):
    NONE = None
    REUSE = "fc"
    REUSE_WITH_MOD = "fmc"
    REUSE_NON_COMMERCIAL = "f"
    REUSE_WITH_MOD_NON_COMMERCIAL = "fm"

class ImageOptions(object):

    """Allows passing options to filter a Google Images search."""

    def __init__(self):
        self.image_type = None
        self.size_category = None
        self.larger_than = None
        self.exact_width = None
        self.exact_height = None
        self.color_type = None
        self.color = None
        self.license = None

    def __repr__(self):
        # unidecode expects text, so the options dict is converted first
        return unidecode(str(self.__dict__))

    def get_tbs(self):
        tbs = None
        if self.image_type:
            # face, photo, clipart, lineart
            tbs = self._add_to_tbs(tbs, "itp", self.image_type)
        if self.size_category and not (self.larger_than or (self.exact_width and self.exact_height)):
            # i = icon, l = large, m = medium, lt = larger than, ex = exact
            tbs = self._add_to_tbs(tbs, "isz", self.size_category)
        if self.larger_than:
            # qsvga, 4mp, ...
            tbs = self._add_to_tbs(tbs, "isz", SizeCategory.LARGER_THAN)
            tbs = self._add_to_tbs(tbs, "islt", self.larger_than)
        if self.exact_width and self.exact_height:
            tbs = self._add_to_tbs(tbs, "isz", SizeCategory.EXACTLY)
            tbs = self._add_to_tbs(tbs, "iszw", self.exact_width)
            tbs = self._add_to_tbs(tbs, "iszh", self.exact_height)
        if self.color_type and not self.color:
            # color = color, gray = black and white, specific = user defined
            tbs = self._add_to_tbs(tbs, "ic", self.color_type)
        if self.color:
            tbs = self._add_to_tbs(tbs, "ic", ColorType.SPECIFIC)
            tbs = self._add_to_tbs(tbs, "isc", self.color)
        if self.license:
            tbs = self._add_to_tbs(tbs, "sur", self.license)
        return tbs

    def _add_to_tbs(self, tbs, name, value):
        if tbs:
            return "%s,%s:%s" % (tbs, name, value)
        else:
            return "&tbs=%s:%s" % (name, value)
class ImageResult(object):

    """Represents a Google Images search result."""

    ROOT_FILENAME = "img"
    DEFAULT_FORMAT = "jpg"

    def __init__(self):
        self.name = None
        self.file_name = None
        self.link = None
        self.thumb = None
        self.thumb_width = None
        self.thumb_height = None
        self.width = None
        self.height = None
        self.filesize = None
        self.format = None
        self.domain = None
        self.page = None
        self.index = None
        self.site = None

    def __eq__(self, other):
        return self.link == other.link

    def __hash__(self):
        # hash the link's value, not its id(): two results with equal links
        # must hash equally for the set deduplication in search() to work
        return hash(self.link)

    def __repr__(self):
        string = "ImageResult(index={i}, page={p}, domain={d}, link={l})".format(
            i=str(self.index),
            p=str(self.page),
            d=unidecode(self.domain) if self.domain else None,
            l=unidecode(self.link) if self.link else None
        )
        return string

    def download(self, path="images"):
        """Download an image to a given path."""

        self._create_path(path)

        try:
            response = requests.get(self.link, stream=True)
            # to request a protected image, add a referer header, e.g.:
            #   headers = {"Referer": self.domain}
            #   response = requests.get(self.link, stream=True,
            #                           headers=headers)

            if "image" in response.headers['content-type']:
                path_filename = self._get_path_filename(path)
                with open(path_filename, 'wb') as output_file:
                    shutil.copyfileobj(response.raw, output_file)
            else:
                print("\r\rskipped! response content-type is not an image")

            del response

        except Exception as inst:
            print(self.link, "has failed:")
            print(inst)

    def _get_path_filename(self, path):
        r"""Build the filename to download.

        Checks that the filename is not already in the path; otherwise it
        looks for another name. The examples below assume Windows path
        separators and that test\img1.jpg and test\img2.jpg already exist.

        >>> ir = ImageResult()
        >>> ir._get_path_filename("test")
        'test\\img3.jpg'
        >>> ir.file_name = "pirulo.jpg"
        >>> ir.format = "jpg"
        >>> ir._get_path_filename("test")
        'test\\pirulo.jpg'
        """

        path_filename = None

        # preserve the original name
        if self.file_name:
            original_filename = self.file_name
            path_filename = os.path.join(path, original_filename)

        # create a default name if there is no original name
        if not path_filename or os.path.isfile(path_filename):

            # take the format of the file, or use the default
            if self.format:
                file_format = self.format
            else:
                file_format = self.DEFAULT_FORMAT

            # build a default name, incrementing until a non-existent one is found
            i = 1
            default_filename = self.ROOT_FILENAME + str(i) + "." + file_format
            path_filename = os.path.join(path, default_filename)
            while os.path.isfile(path_filename):
                i += 1
                default_filename = self.ROOT_FILENAME + str(i) + "." + \
                    file_format
                path_filename = os.path.join(path, default_filename)

        return path_filename

    def _create_path(self, path):
        """Create a path, if it doesn't exist."""

        if not os.path.isdir(path):
            os.mkdir(path)
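
# Sketch (illustrative, not from the original module): downloading a single
# result by hand. The URL below is hypothetical.
#
#     res = ImageResult()
#     res.link = "http://example.com/photo.jpg"
#     res.format = "jpg"
#     res.download("images")  # no file_name set, so it saves as images/img1.jpg
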
# PRIVATE
def _parse_image_format(image_link):
    """Parse an image format from a download link.

    Returns None when no known format can be recognized in the link.

    Args:
        image_link: link to download an image.

    >>> link = "http://blogs.elpais.com/.a/6a00d8341bfb1653ef01a73dbb4a78970d-pi"
    >>> _parse_image_format(link)

    >>> link = "http://minionslovebananas.com/images/gallery/preview/Chiquita-DM2-minion-banana-3.jpg%3Fw%3D300%26h%3D429"
    >>> _parse_image_format(link)
    'jpg'
    """
    parsed_format = image_link[image_link.rfind(".") + 1:]

    # identify formats even when the extension has trailing characters
    if parsed_format not in IMAGE_FORMATS:
        for image_format in IMAGE_FORMATS:
            if image_format in parsed_format:
                parsed_format = image_format
                break

    if parsed_format not in IMAGE_FORMATS:
        parsed_format = None

    return parsed_format

def _get_images_req_url(query, image_options=None, page=0,
                        per_page=20):
    query = query.strip().replace(":", "%3A").replace(
        "+", "%2B").replace("&", "%26").replace(" ", "+")

    url = "https://www.google.com.ar/search?q={}".format(query) + \
          "&es_sm=122&source=lnms" + \
          "&tbm=isch&sa=X&ei=DDdUVL-fE4SpNq-ngPgK&ved=0CAgQ_AUoAQ" + \
          "&biw=1024&bih=719&dpr=1.25"

    if image_options:
        tbs = image_options.get_tbs()
        if tbs:
            url = url + tbs

    return url
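
# Note (an equivalence, not part of the original module): the manual escaping
# above produces the same result as urllib.parse.quote_plus for these
# characters.
#
#     urllib.parse.quote_plus("black & white cats")  # -> 'black+%26+white+cats'
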
def _find_divs_with_images(soup):

    try:
        div_container = soup.find("div", {"id": "rg_s"})
        divs = div_container.find_all("div", {"class": "rg_di"})
    except AttributeError:
        # find() returned None: the results container is not in the page
        divs = None
    return divs

def _get_file_name(link):

    temp_name = link.rsplit('/', 1)[-1]
    image_format = _parse_image_format(link)

    # replace a noisy url suffix with the parsed format, when they differ
    if image_format and temp_name.rsplit(".", 1)[-1] != image_format:
        file_name = temp_name.rsplit(".", 1)[0] + "." + image_format
    else:
        file_name = temp_name

    return file_name
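
# Example (illustrative):
#
#     _get_file_name("http://example.com/a/cat.jpg%3Fw%3D300")
#     # -> 'cat.jpg' (the noisy suffix is replaced by the parsed format)
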
def _get_name():
    # placeholder: name extraction is not implemented, returns None
    pass


def _get_filesize():
    # placeholder: file size extraction is not implemented, returns None
    pass

def _get_image_data(res, a):
    """Parse image data and write it to an ImageResult object.

    Args:
        res: An ImageResult object.
        a: An "a" html tag.
    """
    # the result's "a" tag points to a google intermediate url whose query
    # string carries the image url, the source page and the dimensions
    google_middle_link = a["href"]
    url_parsed = urllib.parse.urlparse(google_middle_link)
    qry_parsed = urllib.parse.parse_qs(url_parsed.query)
    res.name = _get_name()
    res.link = qry_parsed["imgurl"][0]
    res.file_name = _get_file_name(res.link)
    res.format = _parse_image_format(res.link)
    res.width = qry_parsed["w"][0]
    res.height = qry_parsed["h"][0]
    res.site = qry_parsed["imgrefurl"][0]
    res.domain = urllib.parse.urlparse(res.site).netloc
    res.filesize = _get_filesize()
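
# Sketch of the parsing above (url shape inferred from the keys used):
#
#     parsed = urllib.parse.urlparse(
#         "/imgres?imgurl=http://x.com/a.jpg&imgrefurl=http://x.com/page"
#         "&w=640&h=480")
#     urllib.parse.parse_qs(parsed.query)["imgurl"][0]  # -> 'http://x.com/a.jpg'
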
def _get_thumb_data(res, img):
    """Parse thumb data and write it to an ImageResult object.

    Args:
        res: An ImageResult object.
        img: A list of "img" html tags.
    """
    try:
        res.thumb = img[0]["src"]
    except KeyError:
        # lazily loaded thumbs carry the url in "data-src" instead of "src"
        res.thumb = img[0]["data-src"]

    try:
        img_style = img[0]["style"].split(";")
        img_style_dict = {i.split(":")[0]: i.split(":")[-1] for i in img_style}
        res.thumb_width = img_style_dict["width"]
        res.thumb_height = img_style_dict["height"]
    except KeyError:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print(exc_type, exc_value, "index=", res.index)

# PUBLIC
def search_old(query, image_options=None, pages=1):
    # Legacy implementation, kept for reference: it depends on helpers
    # (get_image_search_url, get_html, Google.DEBUG_MODE) from the old module
    # that are not imported here.
    results = []
    for i in range(pages):
        url = get_image_search_url(query, image_options, i)
        html = get_html(url)
        if html:
            if Google.DEBUG_MODE:
                write_html_to_file(
                    html, "images_{0}_{1}.html".format(query.replace(" ", "_"), i))
            j = 0
            soup = BeautifulSoup(html, "html.parser")
            match = re.search(r"dyn.setResults\((.+)\);</script>", html)
            if match:
                init = match.group(1)
                tokens = init.split('],[')
                for token in tokens:
                    res = ImageResult()
                    res.page = i
                    res.index = j
                    toks = token.split(",")

                    # should be 32 or 33, but seems to change, so just make
                    # sure no exceptions will be thrown by the indexing
                    if len(toks) > 22:
                        for t in range(len(toks)):
                            toks[t] = toks[t].replace('\\x3cb\\x3e', '').replace(
                                '\\x3c/b\\x3e', '').replace('\\x3d', '=').replace('\\x26', '&')
                        match = re.search(
                            "imgurl=(?P<link>[^&]+)&imgrefurl", toks[0])
                        if match:
                            res.link = match.group("link")
                        res.name = toks[6].replace('"', '')
                        res.thumb = toks[21].replace('"', '')
                        res.format = toks[10].replace('"', '')
                        res.domain = toks[11].replace('"', '')
                        match = re.search(
                            "(?P<width>[0-9]+) × (?P<height>[0-9]+) - (?P<size>[^ ]+)", toks[9].replace('"', ''))
                        if match:
                            res.width = match.group("width")
                            res.height = match.group("height")
                            res.filesize = match.group("size")
                        results.append(res)
                        j += 1
    return results

def search(query, image_options=None, num_images=50):
    """Search images in Google.

    Search images in Google, filtering by image type, size category,
    resolution, exact width, exact height, color type or color. A simple
    search can be performed without passing options. To filter the search,
    build an ImageOptions with the desired filter categories and pass it in.

    Args:
        query: string to search in google images
        image_options: an ImageOptions object to filter the search
        num_images: number of images to be scraped

    Returns:
        A list of ImageResult objects
    """

    results = set()
    curr_num_img = 1
    page = 0
    browser = get_browser_with_url("about:home")
    while curr_num_img <= num_images:

        page += 1
        url = _get_images_req_url(query, image_options, page)
        browser.get(url)
        html = browser.page_source

        if html:
            soup = BeautifulSoup(html, "html.parser")

            # iterate over the divs containing images in one page
            divs = _find_divs_with_images(soup)

            # empty search result page case
            if not divs:
                break

            for div in divs:

                res = ImageResult()

                # store indexing parameters
                res.page = page
                res.index = curr_num_img

                # get url of image and its secondary data
                a = div.find("a")
                if a:
                    _get_image_data(res, a)

                    # get url of thumb and its size parameters
                    img = a.find_all("img")
                    if img:
                        _get_thumb_data(res, img)

                # increment image counter only if a new image was added
                prev_num_results = len(results)
                results.add(res)
                curr_num_results = len(results)

                if curr_num_results > prev_num_results:
                    curr_num_img += 1

                # break the loop when the limit of images is reached
                if curr_num_img >= num_images:
                    break

    browser.quit()

    return list(results)
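
# Usage sketch (illustrative; requires a working selenium browser behind
# get_browser_with_url):
#
#     options = ImageOptions()
#     options.image_type = ImageType.PHOTO
#     results = search("landscape", image_options=options, num_images=10)
#     download(results, path="landscape_images")
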
def _download_image(image_result, path):

    # only download results whose image format could be recognized
    if image_result.format:
        if path:
            image_result.download(path)
        else:
            image_result.download()

@measure_time
def download(image_results, path=None):
    """Download a list of images.

    Args:
        image_results: a list of ImageResult instances
        path: path to store downloaded images.
    """

    total_images = len(image_results)
    i = 1
    for image_result in image_results:

        progress = "".join(["Downloading image ", str(i),
                            " (", str(total_images), ")"])
        print(progress)
        sys.stdout.flush()

        _download_image(image_result, path)

        i += 1

class ThreadUrl(threading.Thread):

    """Threaded url grab worker used by fast_download()."""

    def __init__(self, queue, path, total):
        threading.Thread.__init__(self)
        self.queue = queue
        self.path = path
        self.total = total

    def run(self):
        while True:
            # grab the next image result from the queue
            image_result = self.queue.get()

            counter = self.total - self.queue.qsize()
            progress = "".join(["Downloading image ", str(counter),
                                " (", str(self.total), ")"])
            print(progress)
            sys.stdout.flush()
            _download_image(image_result, self.path)

            # signal to the queue that the job is done
            self.queue.task_done()

@measure_time
def fast_download(image_results, path=None, threads=10):
    # a local name other than "queue" avoids shadowing the imported module,
    # which would raise UnboundLocalError on queue.Queue()
    download_queue = queue.Queue()
    total = len(image_results)

    for image_result in image_results:
        download_queue.put(image_result)

    # spawn a pool of threads, and pass them the queue instance
    for i in range(threads):
        t = ThreadUrl(download_queue, path, total)
        t.daemon = True
        t.start()

    # wait on the queue until everything has been processed
    download_queue.join()
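
# Usage sketch (illustrative): parallel download of previously scraped
# results.
#
#     results = search("landscape", num_images=20)
#     fast_download(results, path="landscape_images", threads=5)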