Add domain-blacklist.txt, domain filter, modify crawler.
Add binary-or-not checker.

parent 6107a89c78
commit 21ba56a8fa

Makefile (6 changed lines)

@@ -7,11 +7,11 @@ JOBS := 6
 all: data
 
-data: parishwebsites/spider-commands.txt
+data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
 
-parishwebsites/spider-commands.txt: parishes-with-urls.tsv
-	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
+parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
+	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
 
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
 	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log

@@ -1,5 +1,6 @@
 name: polish-masses
 channels:
+- defaults
 - conda-forge
 dependencies:
 - python
@@ -14,3 +15,4 @@ dependencies:
 - ipdb
 - colorama
 - html2text
+- binaryornot

parishwebsites/domain-blacklist.txt (new file, 100 lines)

@@ -0,0 +1,100 @@
+google.pl
+facebook.com
+google.com
+allegro.pl
+onet.pl
+youtube.com
+wp.pl
+wikipedia.org
+gazeta.pl
+olx.pl
+interia.pl
+blogspot.com
+mbank.com.pl
+o2.pl
+wiocha.pl
+filmweb.pl
+gumtree.pl
+yahoo.com
+ceneo.pl
+otomoto.pl
+tvn24.pl
+wykop.pl
+pudelek.pl
+cda.pl
+chomikuj.pl
+sport.pl
+instagram.com
+kwejk.pl
+gemius.pl
+money.pl
+fakt.pl
+ingbank.pl
+googleadservices.com
+biztok.pl
+demotywatory.pl
+home.pl
+twitter.com
+wyborcza.pl
+bycontext.com
+redtube.com
+nocoty.pl
+plotek.pl
+zalukaj.tv
+linkedin.com
+centrum24.pl
+amazon.com
+goldenline.pl
+aliexpress.com
+adcash.com
+orange.pl
+wyborcza.biz
+ipko.pl
+joemonster.org
+redakcja.pl
+nazwa.pl
+bezuzyteczna.pl
+dobreprogramy.pl
+xvideos.com
+tvp.pl
+xhamster.com
+vod.pl
+natemat.pl
+tumblr.com
+play.pl
+microsoft.com
+nk.pl
+bankier.pl
+pracuj.pl
+elektroda.pl
+gem.pl
+pornhub.com
+ankieta-online.pl
+sportowefakty.pl
+wordpress.com
+bet365.com
+przegladsportowy.pl
+showup.tv
+naszemiasto.pl
+stackoverflow.com
+gry.pl
+googleusercontent.com
+gratka.pl
+aliorbank.pl
+imgur.com
+ask.fm
+pclab.pl
+otodom.pl
+infor.pl
+adobe.com
+euro.com.pl
+blox.pl
+torrenty.org
+pekao24.pl
+skapiec.pl
+gameforge.com
+lotto.pl
+zalando.pl
+zumi.pl
+ask.com
+gry-online.pl

@@ -6,7 +6,7 @@ from scrapy.linkextractors import LinkExtractor
 import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
+from binaryornot.helpers import is_binary_string
 
 def _get_allowed_domains(urls):
     domains = []
@@ -17,17 +17,15 @@ def _get_allowed_domains(urls):
         domains.append(domain)
     return domains
 
+def get_deny_domains():
+    with open('domain-blacklist.txt') as f:
+        blacklisted_domains = [line.rstrip('\n') for line in f]
+    return blacklisted_domains
 
 class ParishesSpider(CrawlSpider):
     name = "parishes"
-    deny_regex = [
-        'wikipedia', 'facebook',
-        'http://www\.sluzew\.dominikanie\.pl/nagrania/',
-        'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
-        'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
-    ]
     rules = (Rule(
-        LinkExtractor(deny=deny_regex),
+        LinkExtractor(deny_domains=get_deny_domains()),
         callback='parse_start_url',
         follow=True), )
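
For context: deny_domains is a standard LinkExtractor argument, and it replaces the hand-written deny_regex patterns above. Links whose URL belongs to a listed domain (or one of its subdomains) are discarded before a request is scheduled. A minimal sketch of that behavior, with illustrative domains rather than the full blacklist:

from scrapy.linkextractors import LinkExtractor

# any link pointing at a listed domain or one of its subdomains
# is dropped during link extraction
extractor = LinkExtractor(deny_domains=['facebook.com', 'youtube.com'])
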
@@ -43,16 +41,17 @@ class ParishesSpider(CrawlSpider):
         previous_url = response.meta[
             'previous_url'] if 'previous_url' in response.meta else ''
 
-        yield {
-            "url": response.url,
-            "depth": response.meta['depth'],
-            "button_text": link_text,
-            "previous_url": previous_url,
-            "original_start_url": self.original_url,
-            "start_url": self.start_urls[0],
-            "domain": self.allowed_domains[0],
-            "content": response.text
-        }
+        if not is_binary_string(response.text.encode('utf-8')[:2048]):
+            yield {
+                "url": response.url,
+                "depth": response.meta['depth'],
+                "button_text": link_text,
+                "previous_url": previous_url,
+                "original_start_url": self.original_url,
+                "start_url": self.start_urls[0],
+                "domain": self.allowed_domains[0],
+                "content": response.text
+            }
 
     def _requests_to_follow(self, response):
         if not isinstance(response, HtmlResponse):
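
The new guard uses is_binary_string from the binaryornot package (added to the conda environment above), which applies a chardet-based heuristic to a byte sample; slicing the body to its first 2048 bytes keeps the per-page cost low. A small sketch of the heuristic, with made-up inputs:

from binaryornot.helpers import is_binary_string

is_binary_string('<html>Msze swiete</html>'.encode('utf-8'))  # False: looks like text
is_binary_string(b'\x00\x01\x02\xfe\xff')                     # True: looks binary
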

parishwebsites/remove_blacklisted.py (new executable file, 12 lines)

@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+import sys
+
+# first argument: the blacklist file, one domain per line
+with open(sys.argv[1]) as f:
+    blacklisted_domains = [line.rstrip('\n') for line in f]
+
+# pass a spider command through only if no blacklisted domain occurs in it
+for line in sys.stdin:
+    if not any(domain in line for domain in blacklisted_domains):
+        print(line, end='')
+
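
A quick illustration of the filter's contract; the command strings below are hypothetical stand-ins for generate_spider_commands.sh output, not taken from the repository:

blacklist = ['facebook.com', 'google.pl']
commands = [
    'scrapy crawl parishes -a url=http://parafia.example.pl/\n',         # kept
    'scrapy crawl parishes -a url=http://facebook.com/jakas-parafia\n',  # dropped
]
kept = [c for c in commands if not any(d in c for d in blacklist)]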