Tune download settings. Enable dummy cache with 14 days of expiration.

Fix generation of spider commands.
Append the redirected domain to the allowed domains.
Configure loggers.
Add more meta info to *processed.txt.
Enhance the view-raw-data Python jsonlines viewer.
siulkilulki 2018-04-15 12:17:35 +02:00
parent a5cb3a090f
commit e9c4dcd743
5 changed files with 98 additions and 22 deletions

View File

@@ -1,15 +1,15 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 40
-.PHONY: all update data clean clean-data
+JOBS := 100
+.PHONY: all update data clean clean-data clean-cache
 
 all: data
 
 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	rm -f parishwebsites/*processed.txt
-	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt
 
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
@@ -28,3 +28,6 @@ clean:
 clean-data:
 	rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
+
+clean-cache:
+	rm -rf parishwebsites/.scrapy/httpcache
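
For context, a quick sketch of how the updated targets are driven from the repository root (only paths already present in the Makefile are assumed):

$ make data          # rebuild spider-commands.txt if needed, then run the crawls with parallel --jobs 100
$ make clean-cache   # drop the persistent Scrapy HTTP cache at parishwebsites/.scrapy/httpcache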

View File

@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 while IFS='$\n' read -r url; do
-    echo "scrapy crawl parishes -a url=\"$url\" -t jsonlines -o data/`echo "$url" | sed -Ee 's@/|:@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    filename="`echo "$url" | sed -Ee 's@/|:|\?|\!|\*|\(|\)|=|'"'"'|\+|;|,|\@|#|\[|\]|\$|&@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    echo "scrapy crawl parishes -a url=\"$url\" -a filename=\"$filename\" -t jsonlines -o \"data/$filename\" 2> \"logs/$filename\" "
 done
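
As an illustration with a hypothetical input URL, the rewritten loop now emits a command that passes the sanitized filename to the spider and redirects its log, roughly:

$ echo 'http://www.example.com/parish?id=1' | parishwebsites/generate_spider_commands.sh
scrapy crawl parishes -a url="http://www.example.com/parish?id=1" -a filename="example.comparishid1" -t jsonlines -o "data/example.comparishid1" 2> "logs/example.comparishid1"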

View File

@@ -14,19 +14,19 @@ BOT_NAME = 'parishwebsites'
 SPIDER_MODULES = ['parishwebsites.spiders']
 NEWSPIDER_MODULE = 'parishwebsites.spiders'
 FEED_EXPORT_ENCODING = 'utf-8'
-LOG_LEVEL = 'INFO'
+LOG_LEVEL = 'DEBUG'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+TELNETCONSOLE_ENABLED = False
 # Override the default request headers:
 #DEFAULT_REQUEST_HEADERS = {
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0
 # The download delay setting will honor only one of:
 DOWNLOAD_TIMEOUT = 1000
 CONCURRENT_REQUESTS_PER_DOMAIN = 4
@@ -80,21 +80,21 @@ AUTOTHROTTLE_START_DELAY = 5
 AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 4
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 5
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 1209600
+HTTPCACHE_DIR = 'httpcache'
+HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
+HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
 DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
-RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
-RETRY_TIMES = 7
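
For reference, 1209600 seconds is 60 * 60 * 24 * 14, i.e. a 14-day expiration for the DBM-backed cache that Scrapy keeps under .scrapy/httpcache (which is what the new clean-cache target in the Makefile removes). A single run can still bypass the cache with a command-line setting override, e.g. (URL hypothetical):

$ scrapy crawl parishes -a url="http://www.example.com/" -s HTTPCACHE_ENABLED=False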

View File

@@ -7,14 +7,67 @@ import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
 from binaryornot.helpers import is_binary_string
+import logging
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+import time
+
+
+def requests_retry_session(
+        retries=3,
+        backoff_factor=1,
+        status_forcelist=(500, 502, 503, 504, 408, 400, 404, 429, 401),
+        session=None,
+):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
+
+def get_redirected_url(url):
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    t0 = time.time()
+    final_url = None
+    try:
+        final_url = requests_retry_session().get(
+            url,
+            timeout=30
+        ).url
+    except Exception as e:
+        logger.debug('Getting redirect url failed: {}'.format(e))
+    else:
+        logger.debug(f'Redirect url: {final_url}')
+    finally:
+        t1 = time.time()
+        logger.debug('Getting redirect url took: {} seconds'.format(t1 - t0))
+    return final_url
+
+
 def _get_allowed_domains(urls):
-    domains = []
-    for url in urls:
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+
+    def extract_domain(url):
         ext = tldextract.extract(url)
         domain = '.'.join(ext).lstrip('.').rstrip('.')
         domain = re.sub('www.', '', domain)
-        domains.append(domain)
+        return domain
+
+    domains = set()
+    for url in urls:
+        domain = extract_domain(url)
+        domains.add(domain)
+        redirected_domain = extract_domain(get_redirected_url(url))
+        if redirected_domain:
+            domains.add(redirected_domain)
+    domains = list(domains)
+    logger.debug('Allowed domains: {}'.format(domains))
     return domains
 
 def get_deny_domains():
@@ -22,6 +75,20 @@ def get_deny_domains():
         blacklisted_domains = [line.rstrip('\n') for line in f]
     return blacklisted_domains
 
+
+def configure_loggers():
+    logger = logging.getLogger('chardet.charsetprober')
+    logger.setLevel(logging.INFO)
+    logger = logging.getLogger('scrapy.core.scraper')
+    logger.setLevel(logging.INFO)
+    logger = logging.getLogger('binaryornot.helpers')
+    logger.setLevel(logging.INFO)
+    logger = logging.getLogger('scrapy.spidermiddlewares.depth')
+    logger.setLevel(logging.INFO)
+
+
 class ParishesSpider(CrawlSpider):
     name = "parishes"
     rules = (Rule(
@@ -30,8 +97,10 @@ class ParishesSpider(CrawlSpider):
         follow=True), )
 
     def __init__(self, *args, **kwargs):
+        configure_loggers()
         super(ParishesSpider, self).__init__(*args, **kwargs)
         self.start_urls = [kwargs.get('url')]
+        self.filename = kwargs.get('filename')
         self.allowed_domains = _get_allowed_domains(self.start_urls)
 
     def parse_start_url(self, response):
@@ -71,9 +140,10 @@ class ParishesSpider(CrawlSpider):
             yield rule.process_request(r)
 
     def closed(self, reason):
+        fileinfo = '{}\t{}'.format(self.start_urls[0], self.filename)
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
         else:
             with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
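
To illustrate the new spider argument and the extra metadata (URL and filename hypothetical): a crawl is now launched with both -a url and -a filename, and on a finished crawl closed() appends a tab-separated "<start url>\t<filename>" line to processed.txt:

$ cd parishwebsites
$ scrapy crawl parishes -a url="http://www.example.com/" -a filename="example.com" -t jsonlines -o "data/example.com" 2> "logs/example.com"
$ tail -n 1 processed.txt
http://www.example.com/	example.com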

View File

@@ -18,11 +18,13 @@ def main():
     text_maker.ignore_images = True
     writer = jsonlines.Writer(sys.stdout)
     # text_maker.wrap_links = False
-    # text_maker.strong_mark = ''
+    text_maker.strong_mark = ''
     with jsonlines.open(sys.argv[1]) as reader:
         for parish in reader:
             parish = convert_html_to_text(parish, text_maker)
+            parish_content = parish.pop('content')
             pprint.pprint(parish)
+            print(parish_content)
 
 if __name__ == '__main__':
     main()
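
Usage stays the same: the viewer takes a crawled JSON Lines file as its only argument and now pretty-prints the metadata while dumping the extracted page text separately. The script name and path below are assumptions, since the diff header does not show them:

$ python view_raw_data.py data/example.com | less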