Tune download settings. Enable the dummy-policy HTTP cache with 14 days of expiration.
Fix generation of spider commands. Append the redirected domain to the allowed domains. Configure loggers. Add more meta info to *processed.txt. Enhance the Python JSON Lines viewer for raw data.
parent a5cb3a090f
commit e9c4dcd743

Makefile (9 changed lines)
@@ -1,15 +1,15 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 40
+JOBS := 100
 
-.PHONY: all update data clean clean-data
+.PHONY: all update data clean clean-data clean-cache
 
 all: data
 
 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	rm -f parishwebsites/*processed.txt
-	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt
 
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
@@ -28,3 +28,6 @@ clean:
 
 clean-data:
 	rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
+
+clean-cache:
+	rm -rf parishwebsites/.scrapy/httpcache

parishwebsites/generate_spider_commands.sh

@@ -1,5 +1,6 @@
 #!/usr/bin/env bash
 
 while IFS='$\n' read -r url; do
-echo "scrapy crawl parishes -a url=\"$url\" -t jsonlines -o data/`echo "$url" | sed -Ee 's@/|:@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+filename="`echo "$url" | sed -Ee 's@/|:|\?|\!|\*|\(|\)|=|'"'"'|\+|;|,|\@|#|\[|\]|\$|&@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+echo "scrapy crawl parishes -a url=\"$url\" -a filename=\"$filename\" -t jsonlines -o \"data/$filename\" 2> \"logs/$filename\""
 done
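
For clarity, here is a hedged Python restatement of that sed pipeline (the URL and the resulting name below are made-up examples, not values from the repository):

    import re

    # Mirrors generate_spider_commands.sh: strip the blacklisted punctuation,
    # then the leading "http" and "www." prefixes, to build the per-site filename.
    url = 'http://www.example.com/parish?id=1'
    filename = re.sub(r"[/:?!*()='+;,@#\[\]$&]", '', url)  # drop blacklisted punctuation
    filename = re.sub(r'^http', '', filename)              # drop the leading scheme word
    filename = re.sub(r'^www\.', '', filename)             # drop a leading "www."
    print(filename)  # -> example.comparishid1

The generated command now uses that name twice: as the -a filename spider argument and as the data/ and logs/ output paths.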
|
@ -14,19 +14,19 @@ BOT_NAME = 'parishwebsites'
|
|||||||
SPIDER_MODULES = ['parishwebsites.spiders']
|
SPIDER_MODULES = ['parishwebsites.spiders']
|
||||||
NEWSPIDER_MODULE = 'parishwebsites.spiders'
|
NEWSPIDER_MODULE = 'parishwebsites.spiders'
|
||||||
FEED_EXPORT_ENCODING = 'utf-8'
|
FEED_EXPORT_ENCODING = 'utf-8'
|
||||||
LOG_LEVEL = 'INFO'
|
LOG_LEVEL = 'DEBUG'
|
||||||
|
|
||||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||||
#USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
|
#USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
|
||||||
|
|
||||||
# Obey robots.txt rules
|
# Obey robots.txt rules
|
||||||
ROBOTSTXT_OBEY = True
|
ROBOTSTXT_OBEY = False
|
||||||
|
|
||||||
# Disable cookies (enabled by default)
|
# Disable cookies (enabled by default)
|
||||||
#COOKIES_ENABLED = False
|
#COOKIES_ENABLED = False
|
||||||
|
|
||||||
# Disable Telnet Console (enabled by default)
|
# Disable Telnet Console (enabled by default)
|
||||||
#TELNETCONSOLE_ENABLED = False
|
TELNETCONSOLE_ENABLED = False
|
||||||
|
|
||||||
# Override the default request headers:
|
# Override the default request headers:
|
||||||
#DEFAULT_REQUEST_HEADERS = {
|
#DEFAULT_REQUEST_HEADERS = {
|
||||||
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0
 # The download delay setting will honor only one of:
 DOWNLOAD_TIMEOUT = 1000
 CONCURRENT_REQUESTS_PER_DOMAIN = 4
@@ -80,21 +80,21 @@ AUTOTHROTTLE_START_DELAY = 5
 AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 4
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True
 
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 5
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 1209600
+HTTPCACHE_DIR = 'httpcache'
+HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
+HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
 DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
 
-RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
-RETRY_TIMES = 7
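
Two details of the new cache block are easy to miss, so here is a hedged restatement with the arithmetic spelled out (values copied from the hunk above; no HTTPCACHE_POLICY is set, so Scrapy's default DummyPolicy applies, which is the "dummy cache" the commit message refers to):

    HTTPCACHE_ENABLED = True
    HTTPCACHE_EXPIRATION_SECS = 14 * 24 * 60 * 60  # = 1209600, cached responses expire after 14 days
    HTTPCACHE_DIR = 'httpcache'                    # lives under .scrapy/, removed by `make clean-cache`
    HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES # never cache responses that are going to be retried
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'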

@@ -7,14 +7,67 @@ import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
 from binaryornot.helpers import is_binary_string
+import logging
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+import time
+
+
+def requests_retry_session(
+        retries=3,
+        backoff_factor=1,
+        status_forcelist=(500, 502, 503, 504, 408, 400, 404, 429, 401),
+        session=None,
+):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
+
+def get_redirected_url(url):
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    t0 = time.time()
+    final_url = None
+    try:
+        final_url = requests_retry_session().get(
+            url,
+            timeout=30
+        ).url
+    except Exception as e:
+        logger.debug('Getting redirect url failed: {}'.format(e))
+    else:
+        logger.debug(f'Redirect url: {final_url}')
+    finally:
+        t1 = time.time()
+        logger.debug('Getting redirect url took: {} seconds'.format(t1 - t0))
+    return final_url
 
 def _get_allowed_domains(urls):
-    domains = []
-    for url in urls:
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    def extract_domain(url):
         ext = tldextract.extract(url)
         domain = '.'.join(ext).lstrip('.').rstrip('.')
         domain = re.sub('www.', '', domain)
-        domains.append(domain)
+        return domain
+    domains = set()
+    for url in urls:
+        domain = extract_domain(url)
+        domains.add(domain)
+        redirected_domain = extract_domain(get_redirected_url(url))
+        if redirected_domain:
+            domains.add(redirected_domain)
+    domains = list(domains)
+    logger.debug('Allowed domains: {}'.format(domains))
     return domains
 
 def get_deny_domains():
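
In short, get_redirected_url() follows redirects (with retries and a 30-second timeout) and returns the final URL, or None when every attempt fails; _get_allowed_domains() then whitelists the domain of that final URL alongside the original one. A minimal sketch of the underlying behaviour, with a hypothetical URL:

    import requests

    # What get_redirected_url() boils down to for a well-behaved site; the project
    # version wraps this call with urllib3 retries, a timeout and debug logging.
    final_url = requests.get('http://parafia.example.org', timeout=30).url
    print(final_url)  # URL after following redirects, e.g. the hosting platform a parish moved to

Both the original and the redirected URL are run through tldextract, so the spider is allowed to crawl the domain the site actually lives on, not only the one listed in parishes-with-urls.tsv.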

@@ -22,6 +75,20 @@ def get_deny_domains():
         blacklisted_domains = [line.rstrip('\n') for line in f]
     return blacklisted_domains
 
+def configure_loggers():
+    logger = logging.getLogger('chardet.charsetprober')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.core.scraper')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('binaryornot.helpers')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.spidermiddlewares.depth')
+    logger.setLevel(logging.INFO)
+
+
 class ParishesSpider(CrawlSpider):
     name = "parishes"
     rules = (Rule(
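
Why these four loggers (my reading, not stated in the commit): LOG_LEVEL is now DEBUG, so without capping them the chardet, binaryornot and Scrapy scraper/depth-middleware debug chatter would swamp the per-site logs, while project-level DEBUG messages such as those in get_redirected_url still get through. The same idea in isolation, as a sketch:

    import logging

    logging.basicConfig(level=logging.DEBUG)                           # overall level stays at DEBUG
    logging.getLogger('chardet.charsetprober').setLevel(logging.INFO)  # cap one noisy third-party logger
    logging.getLogger(__name__).debug('still visible at DEBUG')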

@@ -30,8 +97,10 @@ class ParishesSpider(CrawlSpider):
         follow=True), )
 
     def __init__(self, *args, **kwargs):
+        configure_loggers()
         super(ParishesSpider, self).__init__(*args, **kwargs)
         self.start_urls = [kwargs.get('url')]
+        self.filename = kwargs.get('filename')
         self.allowed_domains = _get_allowed_domains(self.start_urls)
 
     def parse_start_url(self, response):
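
A hedged sketch of how the two -a arguments reach the spider: Scrapy passes -a key=value pairs to the spider's __init__ as keyword arguments, so the generated command is roughly equivalent to the following (hypothetical URL and filename; the -t/-o feed-export options are omitted here):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('parishes',
                  url='http://example.com/parish',   # becomes self.start_urls[0]
                  filename='example.comparish')      # becomes self.filename
    process.start()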

@@ -71,9 +140,10 @@ class ParishesSpider(CrawlSpider):
             yield rule.process_request(r)
 
     def closed(self, reason):
+        fileinfo = '{}\t{}'.format(self.start_urls[0], self.filename)
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
         else:
             with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
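
The "more meta info" in *processed.txt is a second, tab-separated column: each line is now "<start URL>\t<output filename>" rather than the bare URL, which makes it easy to map a crawl back to its data/ and logs/ files. A minimal reader sketch (run from the directory that holds the file):

    # Parse the enriched processed.txt / not-processed.txt lines.
    with open('processed.txt', encoding='utf-8') as f:
        for line in f:
            url, filename = line.rstrip('\n').split('\t')
            print(url, '->', 'data/' + filename)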

@@ -18,11 +18,13 @@ def main():
     text_maker.ignore_images = True
     writer = jsonlines.Writer(sys.stdout)
     # text_maker.wrap_links = False
-    # text_maker.strong_mark = ''
+    text_maker.strong_mark = ''
     with jsonlines.open(sys.argv[1]) as reader:
         for parish in reader:
             parish = convert_html_to_text(parish, text_maker)
+            parish_content = parish.pop('content')
             pprint.pprint(parish)
+            print(parish_content)
 
 if __name__ == '__main__':
     main()
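
The viewer change in isolation (a sketch with a made-up record): 'content' is popped out of each JSON Lines record, so the remaining metadata is pretty-printed first and the converted page text is printed after it instead of being buried inside the dict:

    import pprint

    parish = {'url': 'http://example.com/parish',
              'content': 'Parish of St. Example\n\nMass times: ...'}
    parish_content = parish.pop('content')
    pprint.pprint(parish)   # metadata only: {'url': 'http://example.com/parish'}
    print(parish_content)   # then the readable page text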