From e9c4dcd7438639cfb469c8902c882f05a2723315 Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Sun, 15 Apr 2018 12:17:35 +0200
Subject: [PATCH] Tune download settings. Enable dummy cache with 14 days of
 expiration. Fix generating spider commands. Append redirected domain to
 allowed domains. Configure loggers. Add more meta info to *processed.txt.
 Enhance the view_raw_data.py jsonlines viewer.

---
 Makefile                                    |  9 ++-
 parishwebsites/generate_spider_commands.sh  |  3 +-
 parishwebsites/parishwebsites/settings.py   | 24 +++---
 .../spiders/parishes_website_spider.py      | 80 +++++++++++++++++--
 parishwebsites/view_raw_data.py             |  4 +-
 5 files changed, 98 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index 66f3b37..b00c303 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,15 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 40
+JOBS := 100
 
-.PHONY: all update data clean clean-data
+.PHONY: all update data clean clean-data clean-cache
 
 all: data
 
 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	rm -f parishwebsites/*processed.txt
-	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt
 
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
@@ -28,3 +28,6 @@ clean:
 
 clean-data:
 	rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
+
+clean-cache:
+	rm -rf parishwebsites/.scrapy/httpcache
diff --git a/parishwebsites/generate_spider_commands.sh b/parishwebsites/generate_spider_commands.sh
index 104b464..bdce256 100755
--- a/parishwebsites/generate_spider_commands.sh
+++ b/parishwebsites/generate_spider_commands.sh
@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 
 while IFS='$\n' read -r url; do
-    echo "scrapy crawl parishes -a url=\"$url\" -t jsonlines -o data/`echo "$url" | sed -Ee 's@/|:@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    filename="`echo "$url" | sed -Ee 's@/|:|\?|\!|\*|\(|\)|=|'"'"'|\+|;|,|\@|#|\[|\]|\$|&@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    echo "scrapy crawl parishes -a url=\"$url\" -a filename=\"$filename\" -t jsonlines -o \"data/$filename\" 2> \"logs/$filename\" "
 done
diff --git a/parishwebsites/parishwebsites/settings.py b/parishwebsites/parishwebsites/settings.py
index aa73e30..3d340b8 100644
--- a/parishwebsites/parishwebsites/settings.py
+++ b/parishwebsites/parishwebsites/settings.py
@@ -14,19 +14,19 @@ BOT_NAME = 'parishwebsites'
 SPIDER_MODULES = ['parishwebsites.spiders']
 NEWSPIDER_MODULE = 'parishwebsites.spiders'
 FEED_EXPORT_ENCODING = 'utf-8'
-LOG_LEVEL = 'INFO'
+LOG_LEVEL = 'DEBUG'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
 #DEFAULT_REQUEST_HEADERS = {
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0
 # The download delay setting will honor only one of:
 DOWNLOAD_TIMEOUT = 1000
 CONCURRENT_REQUESTS_PER_DOMAIN = 4
@@ -80,21 +80,21 @@ AUTOTHROTTLE_START_DELAY = 5
 AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 4
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 5
 
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 1209600
+HTTPCACHE_DIR = 'httpcache'
+HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
+HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
 
 DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
-RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
-RETRY_TIMES = 7
diff --git a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
index dd3c494..38d990e 100644
--- a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
+++ b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
@@ -7,14 +7,67 @@ import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
 from binaryornot.helpers import is_binary_string
+import logging
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+import time
+
+def requests_retry_session(
+    retries=3,
+    backoff_factor=1,
+    status_forcelist=(500, 502, 503, 504, 408, 400, 404, 429, 401),
+    session=None,
+):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
+
+def get_redirected_url(url):
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    t0 = time.time()
+    final_url = None
+    try:
+        final_url = requests_retry_session().get(
+            url,
+            timeout=30
+        ).url
+    except Exception as e:
+        logger.debug('Getting redirect url failed: {}'.format(e))
+    else:
+        logger.debug(f'Redirect url: {final_url}')
+    finally:
+        t1 = time.time()
+        logger.debug('Getting redirect url took: {} seconds'.format(t1 - t0))
+    return final_url
 
 def _get_allowed_domains(urls):
-    domains = []
-    for url in urls:
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    def extract_domain(url):
         ext = tldextract.extract(url)
         domain = '.'.join(ext).lstrip('.').rstrip('.')
         domain = re.sub('www.', '', domain)
-        domains.append(domain)
+        return domain
+    domains = set()
+    for url in urls:
+        domain = extract_domain(url)
+        domains.add(domain)
+        redirected_domain = extract_domain(get_redirected_url(url))
+        if redirected_domain:
+            domains.add(redirected_domain)
+    domains = list(domains)
+    logger.debug('Allowed domains: {}'.format(domains))
     return domains
 
 def get_deny_domains():
@@ -22,6 +75,20 @@ def get_deny_domains():
     blacklisted_domains = [line.rstrip('\n') for line in f]
     return blacklisted_domains
 
+def configure_loggers():
+    logger = logging.getLogger('chardet.charsetprober')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.core.scraper')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('binaryornot.helpers')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.spidermiddlewares.depth')
+    logger.setLevel(logging.INFO)
+
+
 class ParishesSpider(CrawlSpider):
     name = "parishes"
     rules = (Rule(
@@ -30,8 +97,10 @@ class ParishesSpider(CrawlSpider):
         follow=True), )
 
     def __init__(self, *args, **kwargs):
+        configure_loggers()
         super(ParishesSpider, self).__init__(*args, **kwargs)
         self.start_urls = [kwargs.get('url')]
+        self.filename = kwargs.get('filename')
         self.allowed_domains = _get_allowed_domains(self.start_urls)
 
     def parse_start_url(self, response):
@@ -71,9 +140,10 @@ class ParishesSpider(CrawlSpider):
             yield rule.process_request(r)
 
     def closed(self, reason):
+        fileinfo = '{}\t{}'.format(self.start_urls[0], self.filename)
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
         else:
             with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
diff --git a/parishwebsites/view_raw_data.py b/parishwebsites/view_raw_data.py
index e0e31e8..86a3259 100755
--- a/parishwebsites/view_raw_data.py
+++ b/parishwebsites/view_raw_data.py
@@ -18,11 +18,13 @@ def main():
     text_maker.ignore_images = True
     writer = jsonlines.Writer(sys.stdout)
     # text_maker.wrap_links = False
-    # text_maker.strong_mark = ''
+    text_maker.strong_mark = ''
     with jsonlines.open(sys.argv[1]) as reader:
         for parish in reader:
             parish = convert_html_to_text(parish, text_maker)
+            parish_content = parish.pop('content')
             pprint.pprint(parish)
+            print(parish_content)
 
 if __name__ == '__main__':
     main()