Fix the check for whether a response is a binary string.

Modify Makefile - increase to 40 parallel crawls.
Add 4XX HTTP codes to the retry list.
Remove processed.final.txt
Probably fix remove_blacklisted.py
Dawid Jurkiewicz 2018-04-13 21:45:20 +02:00
parent 21ba56a8fa
commit 0bba61bbcd
5 changed files with 35 additions and 5418 deletions

@@ -1,13 +1,14 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 6
+JOBS := 40
 .PHONY: all update data clean clean-data
 all: data
 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
+	rm -f parishwebsites/*processed.txt
 	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
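The data target now wipes the spider's processed/not-processed bookkeeping files before handing spider-commands.txt to GNU parallel, and the job count rises from 6 to 40, so up to 40 crawls run at once. Each line of spider-commands.txt is an independent shell command; a rough Python equivalent of the parallel invocation (illustrative only, the project itself uses GNU parallel):

import subprocess
from concurrent.futures import ThreadPoolExecutor

JOBS = 40  # mirrors JOBS := 40 in the Makefile

def run_crawl(command):
    # Each line of spider-commands.txt is a complete shell command for one crawl.
    return subprocess.run(command, shell=True).returncode

with open('spider-commands.txt') as f:
    commands = [line.strip() for line in f if line.strip()]

with ThreadPoolExecutor(max_workers=JOBS) as pool:
    exit_codes = list(pool.map(run_crawl, commands))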

@@ -22,17 +22,6 @@ LOG_LEVEL = 'INFO'
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
-# Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-CONCURRENT_REQUESTS_PER_DOMAIN = 6
-# CONCURRENT_REQUESTS_PER_IP = 8
 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False
@@ -69,16 +58,29 @@ CONCURRENT_REQUESTS_PER_DOMAIN = 6
 # 'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
 #}
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+DOWNLOAD_TIMEOUT = 1000
+CONCURRENT_REQUESTS_PER_DOMAIN = 4
+# CONCURRENT_REQUESTS_PER_IP = 8
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
 AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 2
+AUTOTHROTTLE_TARGET_CONCURRENCY = 4
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True
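For context on the new values: DOWNLOAD_TIMEOUT = 1000 gives a slow site roughly 16 minutes before a download is abandoned, and the AutoThrottle extension adapts the per-site delay between DOWNLOAD_DELAY and AUTOTHROTTLE_MAX_DELAY. The sketch below paraphrases the update rule from the Scrapy documentation (it is not the project's code) to show how the three settings interact:

def next_download_delay(current_delay, latency, response_ok,
                        target_concurrency=4.0,  # AUTOTHROTTLE_TARGET_CONCURRENCY
                        min_delay=3.0,           # DOWNLOAD_DELAY
                        max_delay=500.0):        # AUTOTHROTTLE_MAX_DELAY
    # Target delay aims for `target_concurrency` parallel requests per site.
    target_delay = latency / target_concurrency
    # Next delay is the average of the previous delay and the target delay.
    new_delay = (current_delay + target_delay) / 2.0
    # Latencies of non-200 responses are not allowed to decrease the delay.
    if not response_ok:
        new_delay = max(new_delay, current_delay)
    # Clamp between DOWNLOAD_DELAY and AUTOTHROTTLE_MAX_DELAY.
    return max(min_delay, min(new_delay, max_delay))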
@@ -93,3 +95,6 @@ DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 7
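These two settings drive Scrapy's built-in RetryMiddleware: a failed download is re-scheduled while its status code is listed in RETRY_HTTP_CODES and it has been retried fewer than RETRY_TIMES times (retries are counted on top of the first attempt). A simplified sketch of that decision, not the middleware's actual source:

RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
RETRY_TIMES = 7

def should_retry(status, retries_so_far,
                 retry_codes=RETRY_HTTP_CODES, max_retries=RETRY_TIMES):
    # Retry listed 4XX/5XX codes until the retry budget is exhausted.
    return status in retry_codes and retries_so_far < max_retries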

@@ -31,8 +31,7 @@ class ParishesSpider(CrawlSpider):
     def __init__(self, *args, **kwargs):
         super(ParishesSpider, self).__init__(*args, **kwargs)
-        self.original_url = kwargs.get('url')
-        self.start_urls = [requests.get(self.original_url).url]
+        self.start_urls = [kwargs.get('url')]
         self.allowed_domains = _get_allowed_domains(self.start_urls)

     def parse_start_url(self, response):
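With the requests.get() redirect resolution removed, both start_urls and allowed_domains are now derived from the URL exactly as passed on the command line, so a start URL that redirects to another domain is no longer whitelisted automatically. The helper is defined elsewhere in the spider module; a hypothetical sketch of what it presumably does:

from urllib.parse import urlparse

def _get_allowed_domains(start_urls):
    # Hypothetical: limit the crawl to the host part of each start URL.
    return [urlparse(url).netloc for url in start_urls]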
@@ -41,17 +40,18 @@ class ParishesSpider(CrawlSpider):
         previous_url = response.meta[
             'previous_url'] if 'previous_url' in response.meta else ''
-        if not is_binary_string(response.text.encode('utf-8')[:2048]):
+        if not is_binary_string(response.body[:2048]):
             yield {
                 "url": response.url,
                 "depth": response.meta['depth'],
                 "button_text": link_text,
                 "previous_url": previous_url,
-                "original_start_url": self.original_url,
                 "start_url": self.start_urls[0],
                 "domain": self.allowed_domains[0],
                 "content": response.text
             }
+        else:
+            self.logger.info('Content at {} is not text.'.format(response.url))

     def _requests_to_follow(self, response):
         if not isinstance(response, HtmlResponse):
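Checking response.body[:2048] inspects the first raw bytes of the payload, whereas the old response.text.encode('utf-8')[:2048] first decoded the body with the detected encoding (which can fail or mangle non-text content) and then re-encoded it, so the bytes examined were not even the payload's first 2048 bytes. The is_binary_string helper itself lives elsewhere in the repository; an assumed implementation, based on the common text-character heuristic (the real helper may differ):

# Bytes commonly found in text: BEL/BS/TAB/LF/FF/CR/ESC plus the printable range, minus DEL.
_TEXT_CHARS = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})

def is_binary_string(data: bytes) -> bool:
    # Treat the chunk as binary if it contains any byte outside the text range.
    return bool(data.translate(None, _TEXT_CHARS))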
@@ -73,7 +73,7 @@ class ParishesSpider(CrawlSpider):
     def closed(self, reason):
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.original_url, file=f)
+                print(self.start_urls[0], file=f)
         else:
             with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.original_url, file=f)
+                print(self.start_urls[0], file=f)

File diff suppressed because it is too large

@@ -1,12 +1,18 @@
 #!/usr/bin/env python3
 import sys
+
+def is_blacklisted(line, blacklisted_domains):
+    for domain in blacklisted_domains:
+        if domain in line:
+            return True
+    return False
+
 with open(sys.argv[1]) as f:
     blacklisted_domains = [line.rstrip('\n') for line in f]

 for line in sys.stdin:
-    for domain in blacklisted_domains:
-        if domain not in line:
-            print(line, end='')
+    if not is_blacklisted(line, blacklisted_domains):
+        print(line, end='')
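The previous version printed a line once for every blacklisted domain that did not occur in it, so blacklisted URLs could still slip through and clean URLs were emitted multiple times; the new is_blacklisted() check prints each clean line exactly once. A small self-contained comparison of the two behaviours (the domains are made up for illustration):

blacklisted = ['example-a.pl', 'example-b.pl']  # made-up blacklist entries

def old_filter(lines):
    out = []
    for line in lines:
        for domain in blacklisted:
            if domain not in line:
                out.append(line)  # emitted once per non-matching domain
    return out

def new_filter(lines):
    return [line for line in lines
            if not any(domain in line for domain in blacklisted)]

lines = ['http://example-a.pl/parish', 'http://example-c.pl/parish']
print(old_filter(lines))  # the blacklisted URL leaks through and the clean one is doubled
print(new_filter(lines))  # only the clean URL, printed once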