Code refactorings.
This commit is contained in:
parent 0070ffe07d
commit b433a5e297
BIN	AAAI96-155.pdf
Binary file not shown.
24	Makefile
@@ -0,0 +1,24 @@
+SHELL := /bin/bash
+PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
+include /tmp/makeenv
+JOBS := 6
+
+.PHONY: all clean data
+
+all: data
+
+data: parishwebsites/spider-commands.txt
+	rm -rf data
+	parallel --jobs $(JOBS) < $<
+
+parishwebsites/spider-commands.txt: parishes-with-urls.tsv
+	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
+
+parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
+	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) > $@ 2> get-parishes-urls.log
+
+parishes-deon.tsv: scraper/crawl_deon.py
+	scraper/crawl_deon.py > $@ 2> crawl-deon.log
+
+clean:
+	rm -rf parishes-deon.tsv parishes-with-urls.tsv spider-commands.txt
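For reference, the spider-commands recipe is plain Unix plumbing: take column 3 of the TSV, skip the header row, keep rows containing a URL, turn each into a spider command, and deduplicate. A rough Python equivalent is sketched below; the column index and header row are read off the recipe itself, and since generate_spider_commands.sh is not shown in this commit, the sketch stops at collecting the unique URLs it would receive.

#!/usr/bin/env python3
# Rough Python equivalent of the spider-commands pipeline above.
# Assumed (from the recipe, not documented anywhere): the TSV has a
# header row and its third column holds the parish URL.
import csv
import sys

def unique_spider_urls(tsv_path):
    urls = set()
    with open(tsv_path, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)                 # tail -n +2: skip the header row
        for row in reader:
            url = row[2] if len(row) > 2 else ''
            if 'http' in url:              # grep http
                urls.add(url)              # set membership = sort -u's dedup
    return sorted(urls)

if __name__ == '__main__':
    # Each of these would be piped into generate_spider_commands.sh,
    # whose output format is not part of this diff.
    for url in unique_spider_urls(sys.argv[1]):
        print(url)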
3	dev-requirements.in
@@ -1,3 +0,0 @@
-pip-tools
-ipdb
-pytest
24	dev-requirements.txt
@@ -1,24 +0,0 @@
-#
-# This file is autogenerated by pip-compile
-# To update, run:
-#
-#    pip-compile --output-file dev-requirements.txt dev-requirements.in
-#
-click==6.7                 # via pip-tools
-decorator==4.0.11          # via ipython, traitlets
-first==2.0.1               # via pip-tools
-ipdb==0.10.2
-ipython-genutils==0.2.0    # via traitlets
-ipython==5.3.0             # via ipdb
-pexpect==4.2.1             # via ipython
-pickleshare==0.7.4         # via ipython
-pip-tools==1.9.0
-prompt-toolkit==1.0.14     # via ipython
-ptyprocess==0.5.1          # via pexpect
-py==1.4.34                 # via pytest
-pygments==2.2.0            # via ipython
-pytest==3.1.2
-simplegeneric==0.8.1       # via ipython
-six==1.10.0                # via pip-tools, prompt-toolkit, traitlets
-traitlets==4.3.2           # via ipython
-wcwidth==0.1.7             # via prompt-toolkit
@@ -1,4 +1,3 @@
-from bs4 import BeautifulSoup
 import scrapy
 import tldextract
 import re
@@ -30,32 +29,20 @@ class ParishesSpider(CrawlSpider):
         self.allowed_domains = _get_allowed_domains(self.start_urls)
 
     def parse_start_url(self, response):
-        soup = BeautifulSoup(response.text, 'lxml')
-        [
-            s.extract()
-            for s in soup(['style', 'script', '[document]', 'head', 'title'])
-        ]
         link_text = response.meta[
             'link_text'] if 'link_text' in response.meta else ''
-        button_soup = BeautifulSoup(link_text, 'lxml')
-        [
-            s.extract()
-            for s in button_soup(
-                ['style', 'script', '[document]', 'head', 'title'])
-        ]
-
         previous_url = response.meta[
             'previous_url'] if 'previous_url' in response.meta else ''
 
         yield {
             "url": response.url,
             "depth": response.meta['depth'],
-            "button_text": button_soup.get_text(separator='\n', strip=True),
+            "button_text": link_text,
             "previous_url": previous_url,
             "original_start_url": self.original_url,
             "start_url": self.start_urls[0],
             "domain": self.allowed_domains[0],
-            "content": soup.get_text(separator='\n', strip=True)
+            "content": response.text
         }
 
     def _requests_to_follow(self, response):
@@ -79,3 +66,6 @@ class ParishesSpider(CrawlSpider):
         if reason == 'finished':
             with open('./processed.txt', mode='a', encoding='utf-8') as f:
                 print(self.original_url, file=f)
+        else:
+            with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
+                print(self.original_url, file=f)
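The parse_start_url change above swaps cleaned text for raw HTML: the inline BeautifulSoup stripping of style/script/head/title tags is deleted, content becomes the raw response.text, and button_text becomes the raw link text. If that cleanup is to be reapplied downstream, a minimal standalone version of the removed pattern (tag list copied verbatim from the deleted lines):

from bs4 import BeautifulSoup

def html_to_text(html):
    # Same cleanup the spider used to do inline: drop non-content tags,
    # then flatten the remainder to newline-separated text.
    soup = BeautifulSoup(html, 'lxml')
    for s in soup(['style', 'script', '[document]', 'head', 'title']):
        s.extract()
    return soup.get_text(separator='\n', strip=True)

if __name__ == '__main__':
    print(html_to_text('<head><title>t</title></head><body><p>Parish news</p></body>'))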
4	requirements.in
@@ -1,4 +0,0 @@
-requests
-beautifulsoup4
-selenium
-lxml
10	requirements.txt
@@ -1,10 +0,0 @@
-#
-# This file is autogenerated by pip-compile
-# To update, run:
-#
-#    pip-compile --output-file requirements.txt requirements.in
-#
-beautifulsoup4==4.6.0
-lxml==3.8.0
-requests==2.13.0
-selenium==3.4.3
scraper/Makefile
@@ -5,7 +5,10 @@ include /tmp/makeenv
 
 .PHONY: all clean
 
-all: parishes-deon.txt
+all: parishes-with-urls.tsv
 
+parishes-with-urls.tsv: apikey.txt parishes-deon.tsv
+	./get_parishes_urls -a $< -p $(word 2,$^)
+
 parishes-deon.tsv:
 	./crawl_deon.py > $@
@@ -21,7 +24,7 @@ annotated-poznan.tsv:
 	./crawl_poznan.py > $@ 2> error_poznan
 
 clean:
-	rm -rf parishes-deon.txt
+	rm -rf parishes-deon.tsv
 
 update:
 	conda env update -f ../environment.yml
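Both Makefiles call get_parishes_urls with -a (API key file) and -p (parish list); the script itself is not in this diff. A hypothetical argparse skeleton matching those two flags, with everything beyond the option letters being a guess:

#!/usr/bin/env python3
# Hypothetical interface for get_parishes_urls, inferred only from the
# Makefile recipes: -a <apikey file> -p <parishes TSV>, results on stdout.
import argparse

def main():
    parser = argparse.ArgumentParser(description='Find parish website URLs')
    parser.add_argument('-a', dest='apikey_file', required=True,
                        help='file containing the search API key (assumed)')
    parser.add_argument('-p', dest='parishes_file', required=True,
                        help='TSV of parishes scraped from deon (assumed)')
    args = parser.parse_args()
    with open(args.apikey_file, encoding='utf-8') as f:
        apikey = f.read().strip()
    # A real implementation would query a search API for each parish and
    # print TSV rows, which the Makefile redirects into parishes-with-urls.tsv.
    print(f'loaded key of length {len(apikey)}; would process {args.parishes_file}')

if __name__ == '__main__':
    main()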