2018-03-01 18:16:11 +01:00
|
|
|
# Use bash (instead of make's default /bin/sh) for recipes and $(shell ...) calls.
SHELL := /bin/bash
|
|
|
|
# Evaluated once while the makefile is parsed (":=" with $(shell ...)): runs
# ./prepare-environment.sh and redirects its stdout into /tmp/makeenv.
# Because stdout is redirected, $(shell ...) captures nothing, so the variable
# itself ends up empty; only the side effect of writing the file matters.
# NOTE(review): /tmp/makeenv is a fixed path, so concurrent make runs on the
# same machine write the same file - confirm this is acceptable.
PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
|
|
|
|
# Read /tmp/makeenv (written by the prepare-environment.sh call above) as
# makefile text. Plain "include" (no leading "-") means a /tmp/makeenv that is
# missing and cannot be remade is a fatal error.
include /tmp/makeenv
|
2018-04-15 12:17:35 +02:00
|
|
|
# Value passed to `parallel --jobs` in the data and data-add recipes.
# Can be overridden on the command line, e.g. `make data JOBS=20`.
JOBS := 100
|
2018-03-01 18:16:11 +01:00
|
|
|
|
2018-04-15 12:17:35 +02:00
|
|
|
# Targets that name actions rather than files. Declaring them phony keeps a
# file of the same name from making the target look up to date.
# data-add is listed as well: it is a command-style target just like data.
.PHONY: all update data data-add clean clean-data clean-cache
|
2018-03-01 18:16:11 +01:00
|
|
|
|
|
|
|
# Aggregate target: building `all` builds `data`.
# NOTE(review): presumably the default goal, being the first rule visible in
# this file - the included /tmp/makeenv could define an earlier one; confirm.
all: data
|
|
|
|
|
2018-04-16 23:54:03 +02:00
|
|
|
|
|
|
|
# Runs deal-with-not-completed.sh inside parishwebsites/, then feeds
# spider-commands-add.txt to `parallel` on stdin with JOBS jobs.
# Each recipe line runs in its own shell, hence the cd on both lines.
# The recipe uses the automatic variables for the first prerequisite:
# its directory part expands to parishwebsites and its file part to
# spider-commands-add.txt.
data-add: parishwebsites/spider-commands-add.txt \
          parishwebsites/domain-blacklist.txt \
          parishwebsites/deal-with-not-completed.sh
	cd $(<D) && ./deal-with-not-completed.sh
	cd $(<D) && parallel --jobs $(JOBS) < $(<F)
|
|
|
|
|
2018-04-09 23:52:11 +02:00
|
|
|
# Deletes every parishwebsites/*processed.txt file, then feeds
# spider-commands.txt to `parallel` on stdin with JOBS jobs, from inside
# parishwebsites/.
# The directory and file parts of the first prerequisite (parishwebsites and
# spider-commands.txt) are taken from the automatic variables.
data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
	rm -f $(<D)/*processed.txt
	cd $(<D) && parallel --jobs $(JOBS) < $(<F)
|
2018-03-01 18:16:11 +01:00
|
|
|
|
2018-04-09 23:52:11 +02:00
|
|
|
# Generates the spider command list from parishes-with-urls.tsv:
#   column 3, minus the first (header) line, keeping only lines containing
#   "http" -> generate_spider_commands.sh -> sort -u ->
#   remove_blacklisted.py <domain-blacklist.txt> -> remove_duplicate_commands.py
#
# pipefail makes a failure in ANY pipeline stage fail the recipe; without it
# only the last stage's exit status counts and a broken upstream stage would
# silently yield a truncated list. Note that this also fails the recipe when
# grep finds no "http" line at all (grep exits 1 on no match).
# The result is written to a temporary file and renamed only on success, so a
# failed or interrupted run never leaves a partial target that looks up to date.
# pipefail requires bash as the recipe shell.
parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
	set -o pipefail && \
	cut -f3 $< | tail -n +2 | grep http \
	  | parishwebsites/generate_spider_commands.sh \
	  | sort -u \
	  | parishwebsites/remove_blacklisted.py $(word 2,$^) \
	  | parishwebsites/remove_duplicate_commands.py > $@.tmp
	mv -f $@.tmp $@
|
2018-03-01 18:16:11 +01:00
|
|
|
|
|
|
|
# Runs scraper/get_parishes_urls.py with -a apikey.txt and -p parishes-deon.tsv.
# stdout is APPENDED (>>) to the target, so a re-run adds to whatever the file
# already contains; stderr overwrites get-parishes-urls.log.
# The prerequisites are picked by position: first = API key file,
# second = parish list, last = the script itself.
parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
	$(lastword $^) -a $(firstword $^) -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
|
2018-03-01 18:16:11 +01:00
|
|
|
|
|
|
|
# Runs scraper/crawl_deon.py (the only prerequisite) and stores its stdout as
# the target; stderr goes to crawl-deon.log.
# The output is written to a temporary file and renamed only when the script
# exits successfully, so a failed or interrupted crawl does not leave a partial
# parishes-deon.tsv that make would treat as up to date. On failure the partial
# output remains in parishes-deon.tsv.tmp for inspection.
parishes-deon.tsv: scraper/crawl_deon.py
	$< > $@.tmp 2> crawl-deon.log
	mv -f $@.tmp $@
|
|
|
|
|
2018-03-11 18:02:31 +01:00
|
|
|
# Updates the conda environment from environment.yml (the first prerequisite).
update: environment.yml
	conda env update --file $<
|
|
|
|
|
2018-03-01 18:16:11 +01:00
|
|
|
# Removes the files generated by the rules in this makefile:
# parishes-deon.tsv, parishes-with-urls.tsv and
# parishwebsites/spider-commands.txt.
# The spider command list is generated under parishwebsites/, so that path is
# what must be removed. The root-level spider-commands.txt named by the
# original recipe is still removed as well, so existing checkouts that have
# such a file keep being cleaned.
clean:
	rm -rf parishes-deon.tsv parishes-with-urls.tsv \
	       parishwebsites/spider-commands.txt spider-commands.txt
|
2018-03-11 18:02:31 +01:00
|
|
|
|
|
|
|
# Removes parishwebsites/data, parishwebsites/processed.txt and
# parishwebsites/crawler-log.txt. The three paths are spelled out one by one.
clean-data:
	rm -rf parishwebsites/data \
	       parishwebsites/processed.txt \
	       parishwebsites/crawler-log.txt
|
2018-04-15 12:17:35 +02:00
|
|
|
|
|
|
|
# Recursively removes the parishwebsites/.scrapy/httpcache directory.
# $(RM) is make's predefined "rm -f".
clean-cache:
	$(RM) -r parishwebsites/.scrapy/httpcache
|