Tune download settings. Enable dummy cache with 14 days of expiration.
Fix generating spider commands. Append the redirected domain to allowed domains. Configure loggers. Add more meta info to *processed.txt. Enhance the view-raw-data Python jsonlines viewer.
This commit is contained in:
parent a5cb3a090f
commit e9c4dcd743

Makefile: 9 changed lines
@@ -1,15 +1,15 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 40
+JOBS := 100

-.PHONY: all update data clean clean-data
+.PHONY: all update data clean clean-data clean-cache

 all: data

 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	rm -f parishwebsites/*processed.txt
-	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt

 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
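The spider-commands target pipes the generated scrapy commands through parishwebsites/remove_blacklisted.py with the domain blacklist as its first argument. That script is not part of this diff; a minimal sketch of a filter with that interface (stdin: one generated command per line, argv[1]: blacklist file, stdout: commands whose URL is not blacklisted) could look like this:

#!/usr/bin/env python3
"""Sketch only: drop generated spider commands whose URL mentions a blacklisted domain."""
import sys

def main():
    blacklist_path = sys.argv[1]  # e.g. parishwebsites/domain-blacklist.txt
    with open(blacklist_path, encoding='utf-8') as f:
        blacklisted = [line.strip() for line in f if line.strip()]
    for command in sys.stdin:  # one "scrapy crawl parishes ..." line per URL
        if not any(domain in command for domain in blacklisted):
            sys.stdout.write(command)

if __name__ == '__main__':
    main()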
@@ -28,3 +28,6 @@ clean:

 clean-data:
 	rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
+
+clean-cache:
+	rm -rf parishwebsites/.scrapy/httpcache
@@ -1,5 +1,6 @@
 #!/usr/bin/env bash

 while IFS='$\n' read -r url; do
-    echo "scrapy crawl parishes -a url=\"$url\" -t jsonlines -o data/`echo "$url" | sed -Ee 's@/|:@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    filename="`echo "$url" | sed -Ee 's@/|:|\?|\!|\*|\(|\)|=|'"'"'|\+|;|,|\@|#|\[|\]|\$|&@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    echo "scrapy crawl parishes -a url=\"$url\" -a filename=\"$filename\" -t jsonlines -o \"data/$filename\" 2> \"logs/$filename\" "
 done
@@ -14,19 +14,19 @@ BOT_NAME = 'parishwebsites'
 SPIDER_MODULES = ['parishwebsites.spiders']
 NEWSPIDER_MODULE = 'parishwebsites.spiders'
 FEED_EXPORT_ENCODING = 'utf-8'
-LOG_LEVEL = 'INFO'
+LOG_LEVEL = 'DEBUG'

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+TELNETCONSOLE_ENABLED = False

 # Override the default request headers:
 #DEFAULT_REQUEST_HEADERS = {
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0
 # The download delay setting will honor only one of:
 DOWNLOAD_TIMEOUT = 1000
 CONCURRENT_REQUESTS_PER_DOMAIN = 4
@@ -80,21 +80,21 @@ AUTOTHROTTLE_START_DELAY = 5
 AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 4
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True

+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 5
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 1209600
+HTTPCACHE_DIR = 'httpcache'
+HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
+HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
 DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

-RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
-RETRY_TIMES = 7
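For reference, the new HTTPCACHE_EXPIRATION_SECS value works out to 14 days; a trivial sanity check, using nothing beyond the settings shown above:

HTTPCACHE_EXPIRATION_SECS = 1209600
assert HTTPCACHE_EXPIRATION_SECS == 14 * 24 * 60 * 60  # two weeks, in seconds

With Scrapy's default dummy cache policy, cached responses older than this are re-downloaded instead of being served from the DBM cache.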
@@ -7,14 +7,67 @@ import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
 from binaryornot.helpers import is_binary_string
+import logging
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+import time
+
+
+def requests_retry_session(
+        retries=3,
+        backoff_factor=1,
+        status_forcelist=(500, 502, 503, 504, 408, 400, 404, 429, 401),
+        session=None,
+):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
+
+def get_redirected_url(url):
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    t0 = time.time()
+    final_url = None
+    try:
+        final_url = requests_retry_session().get(
+            url,
+            timeout=30
+        ).url
+    except Exception as e:
+        logger.debug('Getting redirect url failed: {}'.format(e))
+    else:
+        logger.debug(f'Redirect url: {final_url}')
+    finally:
+        t1 = time.time()
+        logger.debug('Getting redirect url took: {} seconds'.format(t1 - t0))
+    return final_url
+

 def _get_allowed_domains(urls):
-    domains = []
-    for url in urls:
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+
+    def extract_domain(url):
         ext = tldextract.extract(url)
         domain = '.'.join(ext).lstrip('.').rstrip('.')
         domain = re.sub('www.', '', domain)
-        domains.append(domain)
+        return domain
+
+    domains = set()
+    for url in urls:
+        domain = extract_domain(url)
+        domains.add(domain)
+        redirected_domain = extract_domain(get_redirected_url(url))
+        if redirected_domain:
+            domains.add(redirected_domain)
+    domains = list(domains)
+    logger.debug('Allowed domains: {}'.format(domains))
     return domains


 def get_deny_domains():
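As context for the change above: if a start URL redirects to a different domain, Scrapy's offsite filtering would drop every request made after the redirect unless that domain is also allowed, which is why the redirected domain is added. The helpers are plain requests code and can be exercised on their own; the URL below is only a placeholder, and this assumes the spider module above is importable:

# Placeholder URL; requests_retry_session and get_redirected_url are defined in the spider module above.
session = requests_retry_session(retries=2, backoff_factor=0.5)
print(session.get('http://example.com', timeout=10).url)  # final URL after any redirects
print(get_redirected_url('http://example.com'))           # same lookup, with debug logging and None on failure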
@@ -22,6 +75,20 @@ def get_deny_domains():
         blacklisted_domains = [line.rstrip('\n') for line in f]
     return blacklisted_domains

+def configure_loggers():
+    logger = logging.getLogger('chardet.charsetprober')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.core.scraper')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('binaryornot.helpers')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.spidermiddlewares.depth')
+    logger.setLevel(logging.INFO)
+
+
 class ParishesSpider(CrawlSpider):
     name = "parishes"
     rules = (Rule(
@@ -30,8 +97,10 @@ class ParishesSpider(CrawlSpider):
         follow=True), )

     def __init__(self, *args, **kwargs):
+        configure_loggers()
         super(ParishesSpider, self).__init__(*args, **kwargs)
         self.start_urls = [kwargs.get('url')]
+        self.filename = kwargs.get('filename')
         self.allowed_domains = _get_allowed_domains(self.start_urls)

     def parse_start_url(self, response):
@@ -71,9 +140,10 @@ class ParishesSpider(CrawlSpider):
             yield rule.process_request(r)

     def closed(self, reason):
+        fileinfo = '{}\t{}'.format(self.start_urls[0], self.filename)
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
         else:
             with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
@@ -18,11 +18,13 @@ def main():
     text_maker.ignore_images = True
     writer = jsonlines.Writer(sys.stdout)
     # text_maker.wrap_links = False
-    # text_maker.strong_mark = ''
+    text_maker.strong_mark = ''
     with jsonlines.open(sys.argv[1]) as reader:
         for parish in reader:
             parish = convert_html_to_text(parish, text_maker)
+            parish_content = parish.pop('content')
             pprint.pprint(parish)
+            print(parish_content)

 if __name__ == '__main__':
     main()