Tune download settings. Enable dummy cache with 14 days of expiration.

Fix generation of spider commands.
Append redirected domain to allowed domains.
Configure loggers.
Add more meta info to *processed.txt.
Enhance the view-raw-data Python jsonlines viewer.
siulkilulki 2018-04-15 12:17:35 +02:00
parent a5cb3a090f
commit e9c4dcd743
5 changed files with 98 additions and 22 deletions
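Note: the HTTPCACHE_EXPIRATION_SECS value of 1209600 seconds set in settings.py below corresponds to 14 days; a quick sanity check in Python:

# Cache expiration from parishwebsites/settings.py, expressed in days.
expiration_secs = 1209600
print(expiration_secs / (24 * 60 * 60))  # -> 14.0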

View File

@@ -1,15 +1,15 @@
SHELL := /bin/bash
PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
include /tmp/makeenv
JOBS := 40
JOBS := 100
.PHONY: all update data clean clean-data
.PHONY: all update data clean clean-data clean-cache
all: data
data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
rm -f parishwebsites/*processed.txt
cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt
parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
@@ -28,3 +28,6 @@ clean:
clean-data:
rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
clean-cache:
rm -rf parishwebsites/.scrapy/httpcache

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env bash
while IFS='$\n' read -r url; do
echo "scrapy crawl parishes -a url=\"$url\" -t jsonlines -o data/`echo "$url" | sed -Ee 's@/|:@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
filename="`echo "$url" | sed -Ee 's@/|:|\?|\!|\*|\(|\)|=|'"'"'|\+|;|,|\@|#|\[|\]|\$|&@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
echo "scrapy crawl parishes -a url=\"$url\" -a filename=\"$filename\" -t jsonlines -o \"data/$filename\" 2> \"logs/$filename\" "
done
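To illustrate the filename sanitization above, here is a rough Python equivalent of the sed pipeline, applied to a hypothetical URL (the URL and the resulting name are illustrative, not taken from the real input data):

import re

def spider_filename(url):
    # Mirrors the sed pipeline in generate_spider_commands.sh: drop characters that
    # are awkward in file names, then the leading "http" remnant and a leading "www.".
    name = re.sub(r"[/:?!*()='+;,@#\[\]$&]", "", url)
    name = re.sub(r"^http", "", name)
    name = re.sub(r"^www\.", "", name)
    return name

print(spider_filename("http://www.parafia.example.pl/kontakt"))
# -> parafia.example.plkontakt, so the generated command writes its feed to
#    data/parafia.example.plkontakt and its log to logs/parafia.example.plkontakt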

View File

@@ -14,19 +14,19 @@ BOT_NAME = 'parishwebsites'
SPIDER_MODULES = ['parishwebsites.spiders']
NEWSPIDER_MODULE = 'parishwebsites.spiders'
FEED_EXPORT_ENCODING = 'utf-8'
LOG_LEVEL = 'INFO'
LOG_LEVEL = 'DEBUG'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
DOWNLOAD_TIMEOUT = 1000
CONCURRENT_REQUESTS_PER_DOMAIN = 4
@@ -80,21 +80,21 @@ AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 500
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 4
AUTOTHROTTLE_TARGET_CONCURRENCY = 1
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = True
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
RETRY_TIMES = 5
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 1209600
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
DEPTH_LIMIT = 3
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
RETRY_TIMES = 7
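A quick way to confirm which values are in effect after these edits is to load the project settings from Python; a minimal sketch, assuming it is run from the project directory where scrapy.cfg lives:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.getbool('HTTPCACHE_ENABLED'))         # True
print(settings.getint('HTTPCACHE_EXPIRATION_SECS'))  # 1209600
print(settings.getint('DOWNLOAD_DELAY'))             # 0
print(settings.getint('DEPTH_LIMIT'))                # 3
print(settings.getlist('RETRY_HTTP_CODES'))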

View File

@@ -7,14 +7,67 @@ import requests
from scrapy import signals
from scrapy.http import HtmlResponse
from binaryornot.helpers import is_binary_string
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time
def requests_retry_session(
retries=3,
backoff_factor=1,
status_forcelist=(500, 502, 503, 504, 408, 400, 404, 429, 401),
session=None,
):
session = session or requests.Session()
retry = Retry(
total=retries,
read=retries,
connect=retries,
backoff_factor=backoff_factor,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
return session
def get_redirected_url(url):
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
t0 = time.time()
final_url = None
try:
final_url = requests_retry_session().get(
url,
timeout=30
).url
except Exception as e:
logger.debug('Getting redirect url failed: {}'.format(e))
else:
logger.debug(f'Redirect url: {final_url}')
finally:
t1 = time.time()
logger.debug('Getting redirect url took: {} seconds'.format(t1 - t0))
return final_url
def _get_allowed_domains(urls):
domains = []
for url in urls:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def extract_domain(url):
ext = tldextract.extract(url)
domain = '.'.join(ext).lstrip('.').rstrip('.')
domain = re.sub('www.', '', domain)
domains.append(domain)
return domain
domains = set()
for url in urls:
domain = extract_domain(url)
domains.add(domain)
redirected_domain = extract_domain(get_redirected_url(url))
if redirected_domain:
domains.add(redirected_domain)
domains = list(domains)
logger.debug('Allowed domains: {}'.format(domains))
return domains
def get_deny_domains():
@@ -22,6 +75,20 @@ def get_deny_domains():
blacklisted_domains = [line.rstrip('\n') for line in f]
return blacklisted_domains
def configure_loggers():
logger = logging.getLogger('chardet.charsetprober')
logger.setLevel(logging.INFO)
logger = logging.getLogger('scrapy.core.scraper')
logger.setLevel(logging.INFO)
logger = logging.getLogger('binaryornot.helpers')
logger.setLevel(logging.INFO)
logger = logging.getLogger('scrapy.spidermiddlewares.depth')
logger.setLevel(logging.INFO)
class ParishesSpider(CrawlSpider):
name = "parishes"
rules = (Rule(
@@ -30,8 +97,10 @@ class ParishesSpider(CrawlSpider):
follow=True), )
def __init__(self, *args, **kwargs):
configure_loggers()
super(ParishesSpider, self).__init__(*args, **kwargs)
self.start_urls = [kwargs.get('url')]
self.filename = kwargs.get('filename')
self.allowed_domains = _get_allowed_domains(self.start_urls)
def parse_start_url(self, response):
@@ -71,9 +140,10 @@ class ParishesSpider(CrawlSpider):
yield rule.process_request(r)
def closed(self, reason):
fileinfo = '{}\t{}'.format(self.start_urls[0], self.filename)
if reason == 'finished':
with open('./processed.txt', mode='a', encoding='utf-8') as f:
print(self.start_urls[0], file=f)
print(fileinfo, file=f)
else:
with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
print(self.start_urls[0], file=f)
print(fileinfo, file=f)
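With this change every line of processed.txt and not-processed.txt carries the start URL and the output filename separated by a tab. A minimal sketch of reading that metadata back (hypothetical downstream use, not part of this commit):

# Each line now looks like "<start_url>\t<filename>".
with open('parishwebsites/processed.txt', encoding='utf-8') as f:
    for line in f:
        url, filename = line.rstrip('\n').split('\t')
        print(url, '->', 'data/' + filename)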

View File

@@ -18,11 +18,13 @@ def main():
text_maker.ignore_images = True
writer = jsonlines.Writer(sys.stdout)
# text_maker.wrap_links = False
# text_maker.strong_mark = ''
text_maker.strong_mark = ''
with jsonlines.open(sys.argv[1]) as reader:
for parish in reader:
parish = convert_html_to_text(parish, text_maker)
parish_content = parish.pop('content')
pprint.pprint(parish)
print(parish_content)
if __name__ == '__main__':
main()
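For a quick smoke test of the viewer, one can write a one-record jsonlines file in the shape the spider emits and pass it as the first argument; only the content key is actually used by the viewer here, the other field is illustrative:

import jsonlines

# Hypothetical sample record: content holds the raw HTML the viewer converts to text.
with jsonlines.open('/tmp/sample-parish.jsonl', mode='w') as writer:
    writer.write({
        'url': 'http://www.parafia.example.pl/kontakt',
        'content': '<h1>Parafia</h1><p>Msze swiete: niedziela 10:00</p>',
    })
# Then run the viewer with /tmp/sample-parish.jsonl as sys.argv[1].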