Fix checking if response is a binary string.
Modify Makefile: increase to 40 parallel crawls. Add 4XX HTTP codes to the retry list. Remove processed.final.txt. Probably fix remove_blacklisted.py.
parent 21ba56a8fa
commit 0bba61bbcd

Makefile (3 changed lines)
@@ -1,13 +1,14 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 6
+JOBS := 40

 .PHONY: all update data clean clean-data

 all: data

 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
+	rm -f parishwebsites/*processed.txt
 	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt

 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
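
The Makefile change raises the GNU parallel job count from 6 to 40 and wipes any leftover *processed.txt bookkeeping files before a crawl starts. As a rough illustration only (not project code), the parallel invocation behaves roughly like the Python sketch below; the assumption that spider-commands.txt holds one shell command per line is mine, inferred from the file being piped straight into parallel.

#!/usr/bin/env python3
# Illustrative sketch: approximates `parallel --jobs 40 < spider-commands.txt`.
# Assumes spider-commands.txt contains one shell command per line (an assumption,
# not taken from the repository).
import subprocess
from concurrent.futures import ThreadPoolExecutor

def run(cmd: str) -> int:
    # Each crawl command runs in its own shell, much like GNU parallel does.
    return subprocess.run(cmd, shell=True).returncode

with open('spider-commands.txt') as f:
    commands = [line.strip() for line in f if line.strip()]

# At most 40 crawls in flight at once, mirroring JOBS := 40.
with ThreadPoolExecutor(max_workers=40) as pool:
    results = list(pool.map(run, commands))

print(f'{sum(code == 0 for code in results)}/{len(results)} crawls exited cleanly')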
@@ -22,17 +22,6 @@ LOG_LEVEL = 'INFO'
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True

-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-CONCURRENT_REQUESTS_PER_DOMAIN = 6
-# CONCURRENT_REQUESTS_PER_IP = 8
-
 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False

@@ -69,16 +58,29 @@ CONCURRENT_REQUESTS_PER_DOMAIN = 6
 # 'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
 #}

+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+DOWNLOAD_TIMEOUT = 1000
+CONCURRENT_REQUESTS_PER_DOMAIN = 4
+# CONCURRENT_REQUESTS_PER_IP = 8
+
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
 AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 2
+AUTOTHROTTLE_TARGET_CONCURRENCY = 4
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True

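
With AUTOTHROTTLE_ENABLED = True, the new DOWNLOAD_DELAY = 3 and AUTOTHROTTLE_MAX_DELAY = 500 act as lower and upper bounds on the per-domain delay, and AUTOTHROTTLE_TARGET_CONCURRENCY = 4 steers the crawler toward roughly four requests in flight per server. A minimal sketch of the adjustment rule as the Scrapy documentation describes it (a simplified illustration, not project code):

# Simplified sketch of the AutoThrottle delay adjustment from the Scrapy docs,
# using the values added in the diff above. Not project code.
DOWNLOAD_DELAY = 3            # floor for the per-slot delay
AUTOTHROTTLE_MAX_DELAY = 500  # ceiling for the per-slot delay
AUTOTHROTTLE_TARGET_CONCURRENCY = 4

def next_delay(previous_delay: float, latency: float) -> float:
    # Target delay that would keep about 4 requests in parallel to one server.
    target = latency / AUTOTHROTTLE_TARGET_CONCURRENCY
    # Next delay is the average of the previous delay and the target,
    # clamped between DOWNLOAD_DELAY and AUTOTHROTTLE_MAX_DELAY.
    delay = (previous_delay + target) / 2
    return min(max(delay, DOWNLOAD_DELAY), AUTOTHROTTLE_MAX_DELAY)

# Example: after a 6-second response while the delay was 5 s,
# the next delay becomes max(3, (5 + 6/4) / 2) = 3.25 s.
print(next_delay(5.0, 6.0))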
@@ -93,3 +95,6 @@ DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
+
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 7
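
These two settings are consumed by Scrapy's built-in RetryMiddleware: a response whose status is in RETRY_HTTP_CODES (now including the 4XX codes 400, 401, 404 and 429) is rescheduled up to RETRY_TIMES = 7 times on top of the first attempt. To see how often a page was retried, the middleware keeps the attempt count in request meta; a hypothetical callback sketch (names are illustrative, not taken from this repository):

# Hypothetical snippet, not part of the diff: Scrapy's RetryMiddleware stores the
# attempt count in request meta under 'retry_times', so a callback can log it.
def parse_page(self, response):  # illustrative callback name
    retries = response.meta.get('retry_times', 0)
    if retries:
        self.logger.info('Fetched %s after %d retries', response.url, retries)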
@@ -31,8 +31,7 @@ class ParishesSpider(CrawlSpider):

    def __init__(self, *args, **kwargs):
        super(ParishesSpider, self).__init__(*args, **kwargs)
-        self.original_url = kwargs.get('url')
-        self.start_urls = [requests.get(self.original_url).url]
+        self.start_urls = [kwargs.get('url')]
        self.allowed_domains = _get_allowed_domains(self.start_urls)

    def parse_start_url(self, response):
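
With this change the spider no longer resolves redirects up front via requests.get(); the URL passed in as a spider argument becomes the start URL as-is, and Scrapy's default redirect handling deals with any redirects during the crawl. For reference, keyword arguments such as url reach __init__ through Scrapy's spider-argument mechanism; a hypothetical driver sketch follows (the project actually launches crawls through spider-commands.txt, and the import path below is an assumption):

# Hypothetical driver sketch; the project drives crawls via spider-commands.txt,
# and the module path of the spider is assumed, not shown in this diff.
from scrapy.crawler import CrawlerProcess

from parishwebsites.spiders.parishes import ParishesSpider  # assumed path

process = CrawlerProcess()
# Keyword arguments to crawl() are forwarded to the spider's __init__,
# which is where kwargs.get('url') picks up the start URL.
process.crawl(ParishesSpider, url='http://example.com')
process.start()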
@@ -41,17 +40,18 @@ class ParishesSpider(CrawlSpider):
        previous_url = response.meta[
            'previous_url'] if 'previous_url' in response.meta else ''

-        if not is_binary_string(response.text.encode('utf-8')[:2048]):
+        if not is_binary_string(response.body[:2048]):
            yield {
                "url": response.url,
                "depth": response.meta['depth'],
                "button_text": link_text,
                "previous_url": previous_url,
-                "original_start_url": self.original_url,
                "start_url": self.start_urls[0],
                "domain": self.allowed_domains[0],
                "content": response.text
            }
+        else:
+            self.logger.info('Content at {} is not text.'.format(response.url))

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
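
The binary check now looks at the first 2048 bytes of the raw response.body instead of re-encoding response.text, which can fail or mis-detect on non-text payloads. The is_binary_string helper itself is defined outside this diff; a common control-character heuristic it may correspond to (an assumption, not the project's actual implementation) is:

# Assumed implementation of a binary-content heuristic; the project's real
# is_binary_string helper is defined outside this diff.
textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})

def is_binary_string(data: bytes) -> bool:
    # Treat data as binary if it contains bytes outside the usual text range.
    return bool(data.translate(None, textchars))

print(is_binary_string(b'<html>hello</html>'))    # False
print(is_binary_string(b'%PDF-1.4\x00\x01\x02'))  # True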
@@ -73,7 +73,7 @@ class ParishesSpider(CrawlSpider):
    def closed(self, reason):
        if reason == 'finished':
            with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.original_url, file=f)
+                print(self.start_urls[0], file=f)
        else:
            with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.original_url, file=f)
+                print(self.start_urls[0], file=f)
File diff suppressed because it is too large.
@@ -1,12 +1,18 @@
 #!/usr/bin/env python3
 import sys

+
+def is_blacklisted(line, blacklisted_domains):
+    for domain in blacklisted_domains:
+        if domain in line:
+            return True
+    return False
+
+
 with open(sys.argv[1]) as f:
     blacklisted_domains = [line.rstrip('\n') for line in f]

 for line in sys.stdin:
-    for domain in blacklisted_domains:
-        if domain not in line:
+    if not is_blacklisted(line, blacklisted_domains):
         print(line, end='')

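
The rewrite above fixes the filtering logic: the old nested loop printed a line once for every blacklisted domain it did not contain, so lines were duplicated and a line matching one domain still leaked through for all the others. With is_blacklisted(), each input line is printed at most once, and only when it matches no blacklisted domain. A small, self-contained comparison (toy data, not from the repository):

# Toy comparison of the old and new filtering logic; the data is made up.
blacklisted_domains = ['facebook.com', 'youtube.com']
lines = ['http://parish.example.pl\n', 'http://facebook.com/parish\n']

def old_filter(lines):
    out = []
    for line in lines:
        for domain in blacklisted_domains:
            if domain not in line:
                out.append(line)  # bug: emitted once per non-matching domain
    return out

def is_blacklisted(line, blacklisted_domains):
    return any(domain in line for domain in blacklisted_domains)

def new_filter(lines):
    return [line for line in lines if not is_blacklisted(line, blacklisted_domains)]

print(old_filter(lines))  # clean line twice, plus the facebook line once
print(new_filter(lines))  # only the clean line, once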