Switch to pure HTML download. Enhance URL filtering.

Update Makefile.
This commit is contained in:
Dawid Jurkiewicz 2018-03-11 18:02:31 +01:00
parent b433a5e297
commit 3027e1e7cc
4 changed files with 23 additions and 7 deletions

View File

@ -3,13 +3,12 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
include /tmp/makeenv include /tmp/makeenv
JOBS := 6 JOBS := 6
.PHONY: all clean data .PHONY: all update data clean clean-data
all: data all: data
data: parishwebsites/spider-commands.txt data: parishwebsites/spider-commands.txt
rm -rf data cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
parallel --jobs $(JOBS) < $<
parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/spider-commands.txt: parishes-with-urls.tsv
cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@ cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
@ -20,5 +19,11 @@ parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.p
parishes-deon.tsv: scraper/crawl_deon.py parishes-deon.tsv: scraper/crawl_deon.py
scraper/crawl_deon.py > $@ 2> crawl-deon.log scraper/crawl_deon.py > $@ 2> crawl-deon.log
update: environment.yml
conda env update -f $<
clean: clean:
rm -rf parishes-deon.tsv parishes-with-urls.tsv spider-commands.txt rm -rf parishes-deon.tsv parishes-with-urls.tsv spider-commands.txt
clean-data:
rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}

View File

@ -13,3 +13,4 @@ dependencies:
- jsonlines - jsonlines
- ipdb - ipdb
- colorama - colorama
- html2text

View File

@ -20,7 +20,16 @@ def _get_allowed_domains(urls):
class ParishesSpider(CrawlSpider): class ParishesSpider(CrawlSpider):
name = "parishes" name = "parishes"
rules = (Rule(LinkExtractor(), callback='parse_start_url', follow=True), ) deny_regex = [
'wikipedia', 'facebook',
'http://www\.sluzew\.dominikanie\.pl/nagrania/',
'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
]
rules = (Rule(
LinkExtractor(deny=deny_regex),
callback='parse_start_url',
follow=True), )
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super(ParishesSpider, self).__init__(*args, **kwargs) super(ParishesSpider, self).__init__(*args, **kwargs)
@ -37,7 +46,7 @@ class ParishesSpider(CrawlSpider):
yield { yield {
"url": response.url, "url": response.url,
"depth": response.meta['depth'], "depth": response.meta['depth'],
"button_text": link_text "button_text": link_text,
"previous_url": previous_url, "previous_url": previous_url,
"original_start_url": self.original_url, "original_start_url": self.original_url,
"start_url": self.start_urls[0], "start_url": self.start_urls[0],

View File

@ -1,10 +1,11 @@
#!/usr/bin/env bash #!/usr/bin/env bash
. /home/siulkilulki/pkgs/miniconda3/etc/profile.d/conda.sh
if conda info --envs | grep -q "polish-masses"; then if conda info --envs | grep -q "polish-masses"; then
(>&2 echo "Environment exist. Ready to process.") (>&2 echo "Environment exist. Ready to process.")
else else
(>&2 conda env create -f environment.yml) (>&2 conda env create -f environment.yml)
fi fi
source activate polish-masses conda activate polish-masses
export PYTHONIOENCODING=utf8 export PYTHONIOENCODING=utf8
env | sed 's/=/:=/' | sed 's/^/export /' env | sed 's/=/:=/' | sed 's/^/export /'