diff --git a/Makefile b/Makefile
index 9d11b73..4dab6b2 100644
--- a/Makefile
+++ b/Makefile
@@ -3,13 +3,12 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
 JOBS := 6
 
-.PHONY: all clean data
+.PHONY: all update data clean clean-data
 
 all: data
 
 data: parishwebsites/spider-commands.txt
-	rm -rf data
-	parallel --jobs $(JOBS) < $<
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
 
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
@@ -20,5 +19,11 @@ parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.p
 parishes-deon.tsv: scraper/crawl_deon.py
 	scraper/crawl_deon.py > $@ 2> crawl-deon.log
 
+update: environment.yml
+	conda env update -f $<
+
 clean:
 	rm -rf parishes-deon.tsv parishes-with-urls.tsv spider-commands.txt
+
+clean-data:
+	rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
diff --git a/environment.yml b/environment.yml
index 6cb353a..37f92f2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -13,3 +13,4 @@ dependencies:
   - jsonlines
   - ipdb
   - colorama
+  - html2text
diff --git a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
index 5879023..8e0767b 100644
--- a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
+++ b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
@@ -20,7 +20,16 @@ def _get_allowed_domains(urls):
 
 class ParishesSpider(CrawlSpider):
     name = "parishes"
-    rules = (Rule(LinkExtractor(), callback='parse_start_url', follow=True), )
+    deny_regex = [
+        'wikipedia', 'facebook',
+        'http://www\.sluzew\.dominikanie\.pl/nagrania/',
+        'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
+        'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
+    ]
+    rules = (Rule(
+        LinkExtractor(deny=deny_regex),
+        callback='parse_start_url',
+        follow=True), )
 
     def __init__(self, *args, **kwargs):
         super(ParishesSpider, self).__init__(*args, **kwargs)
@@ -37,12 +46,12 @@ class ParishesSpider(CrawlSpider):
         yield {
             "url": response.url,
             "depth": response.meta['depth'],
-            "button_text": link_text
+            "button_text": link_text,
             "previous_url": previous_url,
             "original_start_url": self.original_url,
             "start_url": self.start_urls[0],
             "domain": self.allowed_domains[0],
-            "content": response.text
+            "content": response.text
         }
 
     def _requests_to_follow(self, response):
diff --git a/prepare-environment.sh b/prepare-environment.sh
index c0c7139..509a777 100755
--- a/prepare-environment.sh
+++ b/prepare-environment.sh
@@ -1,10 +1,11 @@
 #!/usr/bin/env bash
+. /home/siulkilulki/pkgs/miniconda3/etc/profile.d/conda.sh
 if conda info --envs | grep -q "polish-masses"; then
     (>&2 echo "Environment exist. Ready to process.")
 else
     (>&2 conda env create -f environment.yml)
 fi
 
-source activate polish-masses
+conda activate polish-masses
 export PYTHONIOENCODING=utf8
 env | sed 's/=/:=/' | sed 's/^/export /'