Tune download settings. Enable dummy cache with 14 days of expiration.
Fix generating spider commands. Append the redirected domain to allowed domains. Configure loggers. Add more meta info to *processed.txt. Enhance the view-raw-data Python jsonlines viewer.
This commit is contained in:
parent a5cb3a090f
commit e9c4dcd743

Makefile: 9 changed lines
@@ -1,15 +1,15 @@
 SHELL := /bin/bash
 PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
-JOBS := 40
+JOBS := 100

-.PHONY: all update data clean clean-data
+.PHONY: all update data clean clean-data clean-cache

 all: data

 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	rm -f parishwebsites/*processed.txt
-	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt

 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
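The spider-commands target pipes the generated scrapy commands through parishwebsites/remove_blacklisted.py with the domain blacklist as its first argument. That script is not part of this diff; a minimal sketch of a filter with that interface (stdin: one generated command per line, argv[1]: blacklist file, stdout: commands whose URL is not blacklisted) could look like this:

#!/usr/bin/env python3
"""Sketch only: drop generated spider commands whose URL mentions a blacklisted domain."""
import sys

def main():
    blacklist_path = sys.argv[1]  # e.g. parishwebsites/domain-blacklist.txt
    with open(blacklist_path, encoding='utf-8') as f:
        blacklisted = [line.strip() for line in f if line.strip()]
    for command in sys.stdin:  # one "scrapy crawl parishes ..." line per URL
        if not any(domain in command for domain in blacklisted):
            sys.stdout.write(command)

if __name__ == '__main__':
    main()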
@@ -28,3 +28,6 @@ clean:

 clean-data:
 	rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
+
+clean-cache:
+	rm -rf parishwebsites/.scrapy/httpcache
@@ -1,5 +1,6 @@
 #!/usr/bin/env bash

 while IFS='$\n' read -r url; do
-    echo "scrapy crawl parishes -a url=\"$url\" -t jsonlines -o data/`echo "$url" | sed -Ee 's@/|:@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    filename="`echo "$url" | sed -Ee 's@/|:|\?|\!|\*|\(|\)|=|'"'"'|\+|;|,|\@|#|\[|\]|\$|&@@g' | sed 's/^http//g' | sed 's/^www\.//g'`"
+    echo "scrapy crawl parishes -a url=\"$url\" -a filename=\"$filename\" -t jsonlines -o \"data/$filename\" 2> \"logs/$filename\" "
 done
@@ -14,19 +14,19 @@ BOT_NAME = 'parishwebsites'
 SPIDER_MODULES = ['parishwebsites.spiders']
 NEWSPIDER_MODULE = 'parishwebsites.spiders'
 FEED_EXPORT_ENCODING = 'utf-8'
-LOG_LEVEL = 'INFO'
+LOG_LEVEL = 'DEBUG'

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

 # Disable cookies (enabled by default)
 #COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+TELNETCONSOLE_ENABLED = False

 # Override the default request headers:
 #DEFAULT_REQUEST_HEADERS = {
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0
 # The download delay setting will honor only one of:
 DOWNLOAD_TIMEOUT = 1000
 CONCURRENT_REQUESTS_PER_DOMAIN = 4
@@ -80,21 +80,21 @@ AUTOTHROTTLE_START_DELAY = 5
 AUTOTHROTTLE_MAX_DELAY = 500
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-AUTOTHROTTLE_TARGET_CONCURRENCY = 4
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1
 # Enable showing throttling stats for every response received:
 # AUTOTHROTTLE_DEBUG = True

+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
+RETRY_TIMES = 5
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 1209600
+HTTPCACHE_DIR = 'httpcache'
+HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
+HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
 DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

-RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
-RETRY_TIMES = 7
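For reference, the new HTTPCACHE_EXPIRATION_SECS value works out to 14 days; a trivial sanity check, using nothing beyond the settings shown above:

HTTPCACHE_EXPIRATION_SECS = 1209600
assert HTTPCACHE_EXPIRATION_SECS == 14 * 24 * 60 * 60  # two weeks, in seconds

With Scrapy's default dummy cache policy, cached responses older than this are re-downloaded instead of being served from the DBM cache.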
@@ -7,14 +7,67 @@ import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
 from binaryornot.helpers import is_binary_string
+import logging
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+import time
+
+
+def requests_retry_session(
+        retries=3,
+        backoff_factor=1,
+        status_forcelist=(500, 502, 503, 504, 408, 400, 404, 429, 401),
+        session=None,
+):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    return session
+
+
+def get_redirected_url(url):
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+    t0 = time.time()
+    final_url = None
+    try:
+        final_url = requests_retry_session().get(
+            url,
+            timeout=30
+        ).url
+    except Exception as e:
+        logger.debug('Getting redirect url failed: {}'.format(e))
+    else:
+        logger.debug(f'Redirect url: {final_url}')
+    finally:
+        t1 = time.time()
+        logger.debug('Getting redirect url took: {} seconds'.format(t1 - t0))
+    return final_url
+

 def _get_allowed_domains(urls):
-    domains = []
-    for url in urls:
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+
+    def extract_domain(url):
         ext = tldextract.extract(url)
         domain = '.'.join(ext).lstrip('.').rstrip('.')
         domain = re.sub('www.', '', domain)
-        domains.append(domain)
+        return domain
+
+    domains = set()
+    for url in urls:
+        domain = extract_domain(url)
+        domains.add(domain)
+        redirected_domain = extract_domain(get_redirected_url(url))
+        if redirected_domain:
+            domains.add(redirected_domain)
+    domains = list(domains)
+    logger.debug('Allowed domains: {}'.format(domains))
     return domains


 def get_deny_domains():
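As context for the change above: if a start URL redirects to a different domain, Scrapy's offsite filtering would drop every request made after the redirect unless that domain is also allowed, which is why the redirected domain is added. The helpers are plain requests code and can be exercised on their own; the URL below is only a placeholder, and this assumes the spider module above is importable:

# Placeholder URL; requests_retry_session and get_redirected_url are defined in the spider module above.
session = requests_retry_session(retries=2, backoff_factor=0.5)
print(session.get('http://example.com', timeout=10).url)  # final URL after any redirects
print(get_redirected_url('http://example.com'))           # same lookup, with debug logging and None on failure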
@@ -22,6 +75,20 @@ def get_deny_domains():
         blacklisted_domains = [line.rstrip('\n') for line in f]
     return blacklisted_domains

+def configure_loggers():
+    logger = logging.getLogger('chardet.charsetprober')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.core.scraper')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('binaryornot.helpers')
+    logger.setLevel(logging.INFO)
+
+    logger = logging.getLogger('scrapy.spidermiddlewares.depth')
+    logger.setLevel(logging.INFO)
+
+
 class ParishesSpider(CrawlSpider):
     name = "parishes"
     rules = (Rule(
@@ -30,8 +97,10 @@ class ParishesSpider(CrawlSpider):
         follow=True), )

     def __init__(self, *args, **kwargs):
+        configure_loggers()
         super(ParishesSpider, self).__init__(*args, **kwargs)
         self.start_urls = [kwargs.get('url')]
+        self.filename = kwargs.get('filename')
         self.allowed_domains = _get_allowed_domains(self.start_urls)

     def parse_start_url(self, response):
@@ -71,9 +140,10 @@ class ParishesSpider(CrawlSpider):
             yield rule.process_request(r)

     def closed(self, reason):
+        fileinfo = '{}\t{}'.format(self.start_urls[0], self.filename)
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
         else:
             with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
-                print(self.start_urls[0], file=f)
+                print(fileinfo, file=f)
@@ -18,11 +18,13 @@ def main():
     text_maker.ignore_images = True
     writer = jsonlines.Writer(sys.stdout)
     # text_maker.wrap_links = False
-    # text_maker.strong_mark = ''
+    text_maker.strong_mark = ''
     with jsonlines.open(sys.argv[1]) as reader:
         for parish in reader:
             parish = convert_html_to_text(parish, text_maker)
+            parish_content = parish.pop('content')
             pprint.pprint(parish)
+            print(parish_content)

 if __name__ == '__main__':
     main()