From e9c4dcd7438639cfb469c8902c882f05a2723315 Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Sun, 15 Apr 2018 12:17:35 +0200
Subject: [PATCH] Tune download settings. Enable dummy cache with 14 days of
 expiration. Fix generating spider commands. Append redirected domain to
 allowed domains. Configure loggers. Add more meta info to *processed.txt.
 Enhance the view_raw_data.py jsonlines viewer.

---
 Makefile                                    |  9 ++-
 parishwebsites/generate_spider_commands.sh  |  3 +-
 parishwebsites/parishwebsites/settings.py   | 24 +++---
 .../spiders/parishes_website_spider.py      | 80 +++++++++++++++++--
 parishwebsites/view_raw_data.py             |  4 +-
 5 files changed, 98 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index 66f3b37..b00c303 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,15 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 40
+JOBS := 100
 
-.PHONY: all update data clean clean-data
+.PHONY: all update data clean clean-data clean-cache
 
 all: data
 
 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	rm -f parishwebsites/*processed.txt
-	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt
 
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
@@ -28,3 +28,6 @@ clean:
 
 clean-data:
 	rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
+
+clean-cache:
+	rm -rf parishwebsites/.scrapy/httpcache
diff --git a/parishwebsites/generate_spider_commands.sh b/parishwebsites/generate_spider_commands.sh
index 104b464..bdce256 100755
--- a/parishwebsites/generate_spider_commands.sh
+++ b/parishwebsites/generate_spider_commands.sh
@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 
 while IFS='$\n' read -r url; do
-    echo "scrapy crawl parishes -a url=\"$url\" -t jsonlines -o data/`echo "$url" | sed -Ee 's@/|:@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    filename="`echo "$url" | sed -Ee 's@/|:|\?|\!|\*|\(|\)|=|'"'"'|\+|;|,|\@|#|\[|\]|\$|&@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    echo "scrapy crawl parishes -a url=\"$url\" -a filename=\"$filename\" -t jsonlines -o \"data/$filename\" 2> \"logs/$filename\" "
 done
diff --git a/parishwebsites/parishwebsites/settings.py b/parishwebsites/parishwebsites/settings.py
index aa73e30..3d340b8 100644
--- a/parishwebsites/parishwebsites/settings.py
+++ b/parishwebsites/parishwebsites/settings.py
@@ -14,19 +14,19 @@ BOT_NAME = 'parishwebsites'
 SPIDER_MODULES = ['parishwebsites.spiders']
 NEWSPIDER_MODULE = 'parishwebsites.spiders'
 FEED_EXPORT_ENCODING = 'utf-8'
-LOG_LEVEL = 'INFO'
+LOG_LEVEL = 'DEBUG'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
 #DEFAULT_REQUEST_HEADERS = {
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0
 # The download delay setting will honor only one of:
 DOWNLOAD_TIMEOUT = 1000
 CONCURRENT_REQUESTS_PER_DOMAIN = 4
@@ -80,21 +80,21 @@ AUTOTHROTTLE_START_DELAY = 5
 AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 4
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 5
 
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 1209600
+HTTPCACHE_DIR = 'httpcache'
+HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
+HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
 
 DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
-RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
-RETRY_TIMES = 7
diff --git a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
index dd3c494..38d990e 100644
--- a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
+++ b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
@@ -7,14 +7,67 @@ import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
 from binaryornot.helpers import is_binary_string
+import logging
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+import time
+
+def requests_retry_session(
+    retries=3,
+    backoff_factor=1,
+    status_forcelist=(500, 502, 503, 504, 408, 400, 404, 429, 401),
+    session=None,
+):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
+
+def get_redirected_url(url):
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    t0 = time.time()
+    final_url = None
+    try:
+        final_url = requests_retry_session().get(
+            url,
+            timeout=30
+        ).url
+    except Exception as e:
+        logger.debug('Getting redirect url failed: {}'.format(e))
+    else:
+        logger.debug(f'Redirect url: {final_url}')
+    finally:
+        t1 = time.time()
+        logger.debug('Getting redirect url took: {} seconds'.format(t1 - t0))
+    return final_url
 
 def _get_allowed_domains(urls):
-    domains = []
-    for url in urls:
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    def extract_domain(url):
         ext = tldextract.extract(url)
         domain = '.'.join(ext).lstrip('.').rstrip('.')
         domain = re.sub('www.', '', domain)
-        domains.append(domain)
+        return domain
+    domains = set()
+    for url in urls:
+        domain = extract_domain(url)
+        domains.add(domain)
+        redirected_domain = extract_domain(get_redirected_url(url))
+        if redirected_domain:
+            domains.add(redirected_domain)
+    domains = list(domains)
+    logger.debug('Allowed domains: {}'.format(domains))
     return domains
 
 def get_deny_domains():
@@ -22,6 +75,20 @@ def get_deny_domains():
     blacklisted_domains = [line.rstrip('\n') for line in f]
     return blacklisted_domains
 
+def configure_loggers():
+    logger = logging.getLogger('chardet.charsetprober')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.core.scraper')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('binaryornot.helpers')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.spidermiddlewares.depth')
+    logger.setLevel(logging.INFO)
+
+
 class ParishesSpider(CrawlSpider):
     name = "parishes"
     rules = (Rule(
@@ -30,8 +97,10 @@ class ParishesSpider(CrawlSpider):
         follow=True), )
 
     def __init__(self, *args, **kwargs):
+        configure_loggers()
         super(ParishesSpider, self).__init__(*args, **kwargs)
         self.start_urls = [kwargs.get('url')]
+        self.filename = kwargs.get('filename')
         self.allowed_domains = _get_allowed_domains(self.start_urls)
 
     def parse_start_url(self, response):
@@ -71,9 +140,10 @@ class ParishesSpider(CrawlSpider):
             yield rule.process_request(r)
 
     def closed(self, reason):
+        fileinfo = '{}\t{}'.format(self.start_urls[0], self.filename)
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
         else:
             with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
diff --git a/parishwebsites/view_raw_data.py b/parishwebsites/view_raw_data.py
index e0e31e8..86a3259 100755
--- a/parishwebsites/view_raw_data.py
+++ b/parishwebsites/view_raw_data.py
@@ -18,11 +18,13 @@ def main():
     text_maker.ignore_images = True
     writer = jsonlines.Writer(sys.stdout)
     # text_maker.wrap_links = False
-    # text_maker.strong_mark = ''
+    text_maker.strong_mark = ''
     with jsonlines.open(sys.argv[1]) as reader:
         for parish in reader:
             parish = convert_html_to_text(parish, text_maker)
+            parish_content = parish.pop('content')
             pprint.pprint(parish)
+            print(parish_content)
 
 if __name__ == '__main__':
     main()