Code refactorings.

This commit is contained in:
Dawid Jurkiewicz 2018-03-01 18:16:11 +01:00
parent 0070ffe07d
commit b433a5e297
9 changed files with 34 additions and 58 deletions

Binary file not shown.

View File

@ -0,0 +1,24 @@
# Pipeline: crawl deon.pl for the parish list, resolve each parish's
# website URL, then spider every site in parallel into ./data.
SHELL := /bin/bash

# prepare-environment.sh emits VAR=value lines; capture them at parse time
# and include them so every recipe sees the prepared environment.
PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
include /tmp/makeenv

# Number of concurrent spider jobs; override with `make JOBS=N`.
JOBS := 6

.PHONY: all clean data
# Remove a half-written target if its recipe fails, so a crashed crawl
# doesn't leave a truncated file that looks up to date.
.DELETE_ON_ERROR:

all: data

# Re-spider every parish website from scratch; results land in ./data.
data: parishwebsites/spider-commands.txt
	rm -rf data
	parallel --jobs $(JOBS) < $<

# One spider command per unique parish URL: column 3 of the TSV,
# header row skipped, non-http entries dropped, de-duplicated.
parishwebsites/spider-commands.txt: parishes-with-urls.tsv
	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@

# Attach website URLs to the crawled parish list (needs the API key file).
parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) > $@ 2> get-parishes-urls.log

# Scrape the parish directory from deon.pl.
parishes-deon.tsv: scraper/crawl_deon.py
	scraper/crawl_deon.py > $@ 2> crawl-deon.log

clean:
	# Path fixed: the generated file lives under parishwebsites/, not the root.
	rm -rf parishes-deon.tsv parishes-with-urls.tsv parishwebsites/spider-commands.txt

View File

@ -1,3 +0,0 @@
pip-tools
ipdb
pytest

View File

@ -1,24 +0,0 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file dev-requirements.txt dev-requirements.in
#
click==6.7 # via pip-tools
decorator==4.0.11 # via ipython, traitlets
first==2.0.1 # via pip-tools
ipdb==0.10.2
ipython-genutils==0.2.0 # via traitlets
ipython==5.3.0 # via ipdb
pexpect==4.2.1 # via ipython
pickleshare==0.7.4 # via ipython
pip-tools==1.9.0
prompt-toolkit==1.0.14 # via ipython
ptyprocess==0.5.1 # via pexpect
py==1.4.34 # via pytest
pygments==2.2.0 # via ipython
pytest==3.1.2
simplegeneric==0.8.1 # via ipython
six==1.10.0 # via pip-tools, prompt-toolkit, traitlets
traitlets==4.3.2 # via ipython
wcwidth==0.1.7 # via prompt-toolkit

View File

@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
import scrapy
import tldextract
import re
@ -30,32 +29,20 @@ class ParishesSpider(CrawlSpider):
self.allowed_domains = _get_allowed_domains(self.start_urls)
def parse_start_url(self, response):
soup = BeautifulSoup(response.text, 'lxml')
[
s.extract()
for s in soup(['style', 'script', '[document]', 'head', 'title'])
]
link_text = response.meta[
'link_text'] if 'link_text' in response.meta else ''
button_soup = BeautifulSoup(link_text, 'lxml')
[
s.extract()
for s in button_soup(
['style', 'script', '[document]', 'head', 'title'])
]
previous_url = response.meta[
'previous_url'] if 'previous_url' in response.meta else ''
yield {
"url": response.url,
"depth": response.meta['depth'],
"button_text": button_soup.get_text(separator='\n', strip=True),
"button_text": link_text
"previous_url": previous_url,
"original_start_url": self.original_url,
"start_url": self.start_urls[0],
"domain": self.allowed_domains[0],
"content": soup.get_text(separator='\n', strip=True)
"content": response.text
}
def _requests_to_follow(self, response):
@ -79,3 +66,6 @@ class ParishesSpider(CrawlSpider):
if reason == 'finished':
with open('./processed.txt', mode='a', encoding='utf-8') as f:
print(self.original_url, file=f)
else:
with open('./not-processed.txt', mode='a', encoding='utf-8') as f:
print(self.original_url, file=f)

View File

@ -1,4 +0,0 @@
requests
beautifulsoup4
selenium
lxml

View File

@ -1,10 +0,0 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file requirements.txt requirements.in
#
beautifulsoup4==4.6.0
lxml==3.8.0
requests==2.13.0
selenium==3.4.3

View File

@ -5,7 +5,10 @@ include /tmp/makeenv
.PHONY: all clean
all: parishes-deon.txt
all: parishes-with-urls.tsv
parishes-with-urls.tsv: apikey.txt parishes-deon.tsv
./get_parishes_urls -a $< -p $(word 2,$^)
parishes-deon.tsv:
./crawl_deon.py > $@
@ -21,7 +24,7 @@ annotated-poznan.tsv:
./crawl_poznan.py > $@ 2> error_poznan
clean:
rm -rf parishes-deon.txt
rm -rf parishes-deon.tsv
update:
conda env update -f ../environment.yml