Fix checking if response is a binary string.

Modify Makefile - increase to 40 parallel crawls. Add 4XX HTTP codes to retry list. Remove processed.final.txt. Probably fix remove_blacklisted.py.
2018-04-13 21:45:20 +02:00 · 2018-04-13 21:45:20 +02:00 · 0bba61bbcd
commit 0bba61bbcd
parent 21ba56a8fa
5 changed files with 35 additions and 5418 deletions
--- a/3
+++ b/3
@ -1,13 +1,14 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 6
+JOBS := 40

 .PHONY: all update data clean clean-data

 all: data

 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
+	rm -f parishwebsites/*processed.txt
 	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt

 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
--- a/parishwebsites/parishwebsites/settings.py
+++ b/parishwebsites/parishwebsites/settings.py
@ -22,17 +22,6 @@ LOG_LEVEL = 'INFO'
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True

-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-CONCURRENT_REQUESTS_PER_DOMAIN = 6
-# CONCURRENT_REQUESTS_PER_IP = 8
-
 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False

@ -69,16 +58,29 @@ CONCURRENT_REQUESTS_PER_DOMAIN = 6
 #    'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
 #}

+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+DOWNLOAD_TIMEOUT = 1000
+CONCURRENT_REQUESTS_PER_DOMAIN = 4
+# CONCURRENT_REQUESTS_PER_IP = 8
+
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
 AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 2
+AUTOTHROTTLE_TARGET_CONCURRENCY = 4
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True

@ -93,3 +95,6 @@ DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
+
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 7
--- a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
+++ b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
@ -31,8 +31,7 @@ class ParishesSpider(CrawlSpider):

    def __init__(self, *args, **kwargs):
        super(ParishesSpider, self).__init__(*args, **kwargs)
-        self.original_url = kwargs.get('url')
-        self.start_urls = [requests.get(self.original_url).url]
+        self.start_urls = [kwargs.get('url')]
        self.allowed_domains = _get_allowed_domains(self.start_urls)

    def parse_start_url(self, response):
@ -41,17 +40,18 @@ class ParishesSpider(CrawlSpider):
        previous_url = response.meta[
            'previous_url'] if 'previous_url' in response.meta else ''

-        if not is_binary_string(response.text.encode('utf-8')[:2048]):
+        if not is_binary_string(response.body[:2048]):
            yield {
                "url": response.url,
                "depth": response.meta['depth'],
                "button_text": link_text,
                "previous_url": previous_url,
-                "original_start_url": self.original_url,
                "start_url": self.start_urls[0],
                "domain": self.allowed_domains[0],
                "content": response.text
            }
+        else:
+            self.logger.info('Content at {} is not text.'.format(response.url))

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
@ -73,7 +73,7 @@ class ParishesSpider(CrawlSpider):
    def closed(self, reason):
        if reason == 'finished':
            with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.original_url, file=f)
+                print(self.start_urls[0], file=f)
        else:
            with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.original_url, file=f)
+                print(self.start_urls[0], file=f)
--- a/parishwebsites/processed.final.txt
+++ b/parishwebsites/processed.final.txt
--- a/parishwebsites/remove_blacklisted.py
+++ b/parishwebsites/remove_blacklisted.py
@ -1,12 +1,18 @@
 #!/usr/bin/env python3
 import sys

+def is_blacklisted(line, blacklisted_domains):
+    for domain in blacklisted_domains:
+        if domain in line:
+            return True
+    return False
+
+
 with open(sys.argv[1]) as f:
    blacklisted_domains = [line.rstrip('\n') for line in f]

 for line in sys.stdin:
-    for domain in blacklisted_domains:
-        if domain not in line:
-            print(line, end='')
+    if not is_blacklisted(line, blacklisted_domains):
+        print(line, end='')