Switch to pure HTML download. Enhance URL filtering.

Update Makefile.
Dawid Jurkiewicz 2018-03-11 18:02:31 +01:00
parent b433a5e297
commit 3027e1e7cc
4 changed files with 23 additions and 7 deletions

Makefile

@@ -3,13 +3,12 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
 JOBS := 6
-.PHONY: all clean data
+.PHONY: all update data clean clean-data
 all: data
 data: parishwebsites/spider-commands.txt
-	rm -rf data
-	parallel --jobs $(JOBS) < $<
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
@@ -20,5 +19,11 @@ parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.p
 parishes-deon.tsv: scraper/crawl_deon.py
 	scraper/crawl_deon.py > $@ 2> crawl-deon.log
+update: environment.yml
+	conda env update -f $<
 clean:
 	rm -rf parishes-deon.tsv parishes-with-urls.tsv spider-commands.txt
+clean-data:
+	rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
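The reworked data target now runs the generated spider commands in parallel from inside parishwebsites/ and keeps the crawler's stderr in crawler-log.txt, while spider-commands.txt is still built from the third column of parishes-with-urls.tsv (header skipped, http links only, deduplicated). Below is a minimal Python sketch of that same URL selection, assuming only the tab-separated layout implied by cut -f3; extract_parish_urls is a hypothetical helper standing in for the cut | tail | grep | sort -u part of the pipeline, not for generate_spider_commands.sh.

import csv

# Rough equivalent of: cut -f3 parishes-with-urls.tsv | tail -n +2 | grep http | sort -u
# (illustration only; the Makefile additionally pipes the URLs through
#  parishwebsites/generate_spider_commands.sh to build spider-commands.txt)
def extract_parish_urls(tsv_path="parishes-with-urls.tsv"):
    urls = set()
    with open(tsv_path, encoding="utf-8") as handle:
        reader = csv.reader(handle, delimiter="\t")
        next(reader)                                  # tail -n +2: skip the header row
        for row in reader:
            if len(row) >= 3 and "http" in row[2]:    # cut -f3 | grep http
                urls.add(row[2])
    return sorted(urls)                               # sort -u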

environment.yml

@@ -13,3 +13,4 @@ dependencies:
 - jsonlines
 - ipdb
 - colorama
+- html2text
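html2text joins the conda environment; the diff only adds the dependency, so its exact use is not shown here, but the library converts stored HTML into readable plain text. A minimal usage sketch, assuming nothing beyond the library's standard html2text() entry point; the sample markup is made up.

import html2text

# Hypothetical example: turn a fragment of crawled parish HTML into plain text.
sample_html = "<h1>Msze święte</h1><p>Niedziela: <b>8:00</b>, 10:00, 12:00</p>"
print(html2text.html2text(sample_html))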

parishwebsites spider (Scrapy)

@@ -20,7 +20,16 @@ def _get_allowed_domains(urls):
 class ParishesSpider(CrawlSpider):
     name = "parishes"
-    rules = (Rule(LinkExtractor(), callback='parse_start_url', follow=True), )
+    deny_regex = [
+        'wikipedia', 'facebook',
+        'http://www\.sluzew\.dominikanie\.pl/nagrania/',
+        'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
+        'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
+    ]
+    rules = (Rule(
+        LinkExtractor(deny=deny_regex),
+        callback='parse_start_url',
+        follow=True), )

     def __init__(self, *args, **kwargs):
         super(ParishesSpider, self).__init__(*args, **kwargs)
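The new rules wire the deny patterns into Scrapy's LinkExtractor, so links pointing at Wikipedia or Facebook, the Dominican recordings section, ?replytocom= comment duplicates, the swzygmunt galleries, and bare .flv files are never scheduled. A rough illustration of what those patterns reject, using Python's re module rather than Scrapy's internal matching; the example URLs are invented.

import re

deny_regex = [
    'wikipedia', 'facebook',
    'http://www\.sluzew\.dominikanie\.pl/nagrania/',
    'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
    'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
]

def is_denied(url):
    # A link is dropped if any deny pattern matches anywhere in the URL.
    return any(re.search(pattern, url) for pattern in deny_regex)

assert is_denied('https://pl.wikipedia.org/wiki/Parafia')          # social/encyclopedia links
assert is_denied('http://example.org/nagranie.flv')                 # raw video files
assert not is_denied('http://parafia.example.org/ogloszenia.html')  # ordinary parish page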
@@ -37,12 +46,12 @@ class ParishesSpider(CrawlSpider):
         yield {
             "url": response.url,
             "depth": response.meta['depth'],
-            "button_text": link_text
+            "button_text": link_text,
             "previous_url": previous_url,
             "original_start_url": self.original_url,
             "start_url": self.start_urls[0],
             "domain": self.allowed_domains[0],
-            "content": response.text
+            "content": response.text
         }

     def _requests_to_follow(self, response):
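With the switch to pure HTML download, every crawled page is emitted as a single item holding the raw markup (response.text) plus crawl metadata: depth, the text of the followed link, the previous and start URLs, and the domain. A sketch of reading such items back, assuming the spider's output is exported as JSON Lines (jsonlines is in environment.yml); the output path used below is invented.

import jsonlines

# Hypothetical consumer of the crawler output; "parishwebsites/data/example.jl"
# is a made-up path, the real export location is configured outside this diff.
with jsonlines.open("parishwebsites/data/example.jl") as reader:
    for item in reader:
        # "content" holds the page's raw HTML, not a text-converted version.
        print(item["url"], item["depth"], item["domain"], len(item["content"]))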

prepare-environment.sh

@@ -1,10 +1,11 @@
 #!/usr/bin/env bash
+. /home/siulkilulki/pkgs/miniconda3/etc/profile.d/conda.sh

 if conda info --envs | grep -q "polish-masses"; then
     (>&2 echo "Environment exist. Ready to process.")
 else
     (>&2 conda env create -f environment.yml)
 fi
-source activate polish-masses
+conda activate polish-masses
 export PYTHONIOENCODING=utf8
 env | sed 's/=/:=/' | sed 's/^/export /'
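prepare-environment.sh now sources conda's shell hook and uses conda activate in place of the older source activate form, then dumps the activated environment as Makefile-style export lines, which the Makefile reads back through include /tmp/makeenv. A rough Python rendering of what the final env | sed 's/=/:=/' | sed 's/^/export /' step produces; environment_as_make_exports is a hypothetical helper, not part of the repo.

import os

# Each environment variable becomes "export NAME:=value", mirroring
# env | sed 's/=/:=/' | sed 's/^/export /' (sed replaces only the first '=').
def environment_as_make_exports():
    return "\n".join(
        "export {}:={}".format(name, value) for name, value in os.environ.items()
    )

print(environment_as_make_exports())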