diff --git a/AAAI96-155.pdf b/AAAI96-155.pdf
deleted file mode 100644
index a67be6b..0000000
Binary files a/AAAI96-155.pdf and /dev/null differ
diff --git a/Makefile b/Makefile
index e69de29..9d11b73 100644
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,24 @@
+SHELL := /bin/bash
+PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
+include /tmp/makeenv
+JOBS := 6
+
+.PHONY: all clean data
+
+all: data
+
+data: parishwebsites/spider-commands.txt
+	rm -rf data
+	parallel --jobs $(JOBS) < $<
+
+parishwebsites/spider-commands.txt: parishes-with-urls.tsv
+	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
+
+parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
+	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) > $@ 2> get-parishes-urls.log
+
+parishes-deon.tsv: scraper/crawl_deon.py
+	scraper/crawl_deon.py > $@ 2> crawl-deon.log
+
+clean:
+	rm -rf parishes-deon.tsv parishes-with-urls.tsv spider-commands.txt
diff --git a/dev-requirements.in b/dev-requirements.in
deleted file mode 100644
index 998c563..0000000
--- a/dev-requirements.in
+++ /dev/null
@@ -1,3 +0,0 @@
-pip-tools
-ipdb
-pytest
diff --git a/dev-requirements.txt b/dev-requirements.txt
deleted file mode 100644
index a744629..0000000
--- a/dev-requirements.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-#
-# This file is autogenerated by pip-compile
-# To update, run:
-#
-# pip-compile --output-file dev-requirements.txt dev-requirements.in
-#
-click==6.7 # via pip-tools
-decorator==4.0.11 # via ipython, traitlets
-first==2.0.1 # via pip-tools
-ipdb==0.10.2
-ipython-genutils==0.2.0 # via traitlets
-ipython==5.3.0 # via ipdb
-pexpect==4.2.1 # via ipython
-pickleshare==0.7.4 # via ipython
-pip-tools==1.9.0
-prompt-toolkit==1.0.14 # via ipython
-ptyprocess==0.5.1 # via pexpect
-py==1.4.34 # via pytest
-pygments==2.2.0 # via ipython
-pytest==3.1.2
-simplegeneric==0.8.1 # via ipython
-six==1.10.0 # via pip-tools, prompt-toolkit, traitlets
-traitlets==4.3.2 # via ipython
-wcwidth==0.1.7 # via prompt-toolkit
diff --git a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
index 363aca9..5879023 100644
--- a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
+++ b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
@@ -1,4 +1,3 @@
-from bs4 import BeautifulSoup
 import scrapy
 import tldextract
 import re
@@ -30,32 +29,20 @@ class ParishesSpider(CrawlSpider):
         self.allowed_domains = _get_allowed_domains(self.start_urls)
 
     def parse_start_url(self, response):
-        soup = BeautifulSoup(response.text, 'lxml')
-        [
-            s.extract()
-            for s in soup(['style', 'script', '[document]', 'head', 'title'])
-        ]
         link_text = response.meta[
             'link_text'] if 'link_text' in response.meta else ''
-        button_soup = BeautifulSoup(link_text, 'lxml')
-        [
-            s.extract()
-            for s in button_soup(
-                ['style', 'script', '[document]', 'head', 'title'])
-        ]
-
         previous_url = response.meta[
             'previous_url'] if 'previous_url' in response.meta else ''
 
         yield {
             "url": response.url,
             "depth": response.meta['depth'],
-            "button_text": button_soup.get_text(separator='\n', strip=True),
+            "button_text": link_text,
             "previous_url": previous_url,
             "original_start_url": self.original_url,
             "start_url": self.start_urls[0],
             "domain": self.allowed_domains[0],
-            "content": soup.get_text(separator='\n', strip=True)
+            "content": response.text
         }
 
     def _requests_to_follow(self, response):
@@ -79,3 +66,6 @@
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
                 print(self.original_url, file=f)
+        else:
+            with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
+                print(self.original_url, file=f)
diff --git a/scraper/prepare-environment.sh b/prepare-environment.sh
similarity index 100%
rename from scraper/prepare-environment.sh
rename to prepare-environment.sh
diff --git a/requirements.in b/requirements.in
deleted file mode 100644
index cc018e7..0000000
--- a/requirements.in
+++ /dev/null
@@ -1,4 +0,0 @@
-requests
-beautifulsoup4
-selenium
-lxml
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 60333a3..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# This file is autogenerated by pip-compile
-# To update, run:
-#
-# pip-compile --output-file requirements.txt requirements.in
-#
-beautifulsoup4==4.6.0
-lxml==3.8.0
-requests==2.13.0
-selenium==3.4.3
diff --git a/scraper/Makefile b/scraper/Makefile
index bde90a1..a737fe8 100644
--- a/scraper/Makefile
+++ b/scraper/Makefile
@@ -5,7 +5,10 @@ include /tmp/makeenv
 
 .PHONY: all clean
 
-all: parishes-deon.txt
+all: parishes-with-urls.tsv
+
+parishes-with-urls.tsv: apikey.txt parishes-deon.tsv
+	./get_parishes_urls.py -a $< -p $(word 2,$^) > $@
 
 parishes-deon.tsv:
 	./crawl_deon.py > $@
@@ -21,7 +24,7 @@ annotated-poznan.tsv:
 	./crawl_poznan.py > $@ 2> error_poznan
 
 clean:
-	rm -rf parishes-deon.txt
+	rm -rf parishes-deon.tsv
 
 update:
 	conda env update -f ../environment.yml
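
Note on the spider change above: parse_start_url now yields the raw HTML in "content" and the raw link text in "button_text", so the tag-stripping that the spider used to do must happen downstream if plain text is still wanted. Below is a minimal sketch of such a post-processing step; it assumes the crawl is exported as JSON lines and that beautifulsoup4/lxml are available in the consuming environment (they were removed from this project's requirements here), and the script name and field handling are illustrative only.

# post_process_content.py -- hypothetical helper, not part of this change.
# Reads spider output as JSON lines on stdin and replaces the raw HTML in
# "content" (and any markup in "button_text") with newline-separated text,
# mirroring the extraction logic removed from parse_start_url.
import json
import sys

from bs4 import BeautifulSoup


def html_to_text(html):
    """Drop non-visible tags and return the visible text, one chunk per line."""
    soup = BeautifulSoup(html, 'lxml')
    for tag in soup(['style', 'script', '[document]', 'head', 'title']):
        tag.extract()
    return soup.get_text(separator='\n', strip=True)


if __name__ == '__main__':
    for line in sys.stdin:
        record = json.loads(line)
        record['content'] = html_to_text(record.get('content', ''))
        record['button_text'] = html_to_text(record.get('button_text', ''))
        print(json.dumps(record, ensure_ascii=False))

Usage would be something like `python post_process_content.py < crawl.jl > crawl-text.jl`, where the file names are placeholders for whatever the spider commands actually write.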