Switch to pure HTML download. Enhanced URL filtering.
Update Makefile.
This commit is contained in:
parent
b433a5e297
commit
3027e1e7cc
11
Makefile
11
Makefile
@ -3,13 +3,12 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
|
|||||||
include /tmp/makeenv
|
include /tmp/makeenv
|
||||||
JOBS := 6
|
JOBS := 6
|
||||||
|
|
||||||
.PHONY: all clean data
|
.PHONY: all update data clean clean-data
|
||||||
|
|
||||||
all: data
|
all: data
|
||||||
|
|
||||||
data: parishwebsites/spider-commands.txt
|
data: parishwebsites/spider-commands.txt
|
||||||
rm -rf data
|
cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
|
||||||
parallel --jobs $(JOBS) < $<
|
|
||||||
|
|
||||||
parishwebsites/spider-commands.txt: parishes-with-urls.tsv
|
parishwebsites/spider-commands.txt: parishes-with-urls.tsv
|
||||||
cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
|
cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
|
||||||
@ -20,5 +19,11 @@ parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.p
|
|||||||
parishes-deon.tsv: scraper/crawl_deon.py
|
parishes-deon.tsv: scraper/crawl_deon.py
|
||||||
scraper/crawl_deon.py > $@ 2> crawl-deon.log
|
scraper/crawl_deon.py > $@ 2> crawl-deon.log
|
||||||
|
|
||||||
|
update: environment.yml
|
||||||
|
conda env update -f $<
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -rf parishes-deon.tsv parishes-with-urls.tsv spider-commands.txt
|
rm -rf parishes-deon.tsv parishes-with-urls.tsv spider-commands.txt
|
||||||
|
|
||||||
|
clean-data:
|
||||||
|
rm -rf parishwebsites/{data,processed.txt,crawler-log.txt}
|
||||||
|
@ -13,3 +13,4 @@ dependencies:
|
|||||||
- jsonlines
|
- jsonlines
|
||||||
- ipdb
|
- ipdb
|
||||||
- colorama
|
- colorama
|
||||||
|
- html2text
|
||||||
|
@ -20,7 +20,16 @@ def _get_allowed_domains(urls):
|
|||||||
|
|
||||||
class ParishesSpider(CrawlSpider):
|
class ParishesSpider(CrawlSpider):
|
||||||
name = "parishes"
|
name = "parishes"
|
||||||
rules = (Rule(LinkExtractor(), callback='parse_start_url', follow=True), )
|
deny_regex = [
|
||||||
|
'wikipedia', 'facebook',
|
||||||
|
'http://www\.sluzew\.dominikanie\.pl/nagrania/',
|
||||||
|
'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
|
||||||
|
'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
|
||||||
|
]
|
||||||
|
rules = (Rule(
|
||||||
|
LinkExtractor(deny=deny_regex),
|
||||||
|
callback='parse_start_url',
|
||||||
|
follow=True), )
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super(ParishesSpider, self).__init__(*args, **kwargs)
|
super(ParishesSpider, self).__init__(*args, **kwargs)
|
||||||
@ -37,7 +46,7 @@ class ParishesSpider(CrawlSpider):
|
|||||||
yield {
|
yield {
|
||||||
"url": response.url,
|
"url": response.url,
|
||||||
"depth": response.meta['depth'],
|
"depth": response.meta['depth'],
|
||||||
"button_text": link_text
|
"button_text": link_text,
|
||||||
"previous_url": previous_url,
|
"previous_url": previous_url,
|
||||||
"original_start_url": self.original_url,
|
"original_start_url": self.original_url,
|
||||||
"start_url": self.start_urls[0],
|
"start_url": self.start_urls[0],
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
. /home/siulkilulki/pkgs/miniconda3/etc/profile.d/conda.sh
|
||||||
if conda info --envs | grep -q "polish-masses"; then
|
if conda info --envs | grep -q "polish-masses"; then
|
||||||
(>&2 echo "Environment exist. Ready to process.")
|
(>&2 echo "Environment exist. Ready to process.")
|
||||||
else
|
else
|
||||||
(>&2 conda env create -f environment.yml)
|
(>&2 conda env create -f environment.yml)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
source activate polish-masses
|
conda activate polish-masses
|
||||||
export PYTHONIOENCODING=utf8
|
export PYTHONIOENCODING=utf8
|
||||||
env | sed 's/=/:=/' | sed 's/^/export /'
|
env | sed 's/=/:=/' | sed 's/^/export /'
|
||||||
|
Loading…
Reference in New Issue
Block a user