Add domain-blacklist.txt, domain filter, modify crawler.

Add binaryornot check so the crawler skips responses that look binary.
This commit is contained in:
Dawid Jurkiewicz 2018-04-09 23:52:11 +02:00
parent 6107a89c78
commit 21ba56a8fa
5 changed files with 134 additions and 21 deletions

View File

@ -7,11 +7,11 @@ JOBS := 6
# Default goal: "all" is satisfied by building the "data" target.
all: data
# NOTE(review): this page is a rendered diff without +/- markers, so the two
# "data:" headers below look like the pre-change and post-change versions of a
# single rule — confirm against the real Makefile. The second header also lists
# the domain blacklist as a prerequisite.
data: parishwebsites/spider-commands.txt
data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
# Recipe: from inside parishwebsites/, feed spider-commands.txt to `parallel`
# with --jobs $(JOBS); stderr is redirected to crawler-log.txt.
cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
# Builds the list of spider commands from the parish URL table.
# NOTE(review): as above, the first header/recipe pair appears to be the
# pre-change version and the second pair the post-change version — confirm.
parishwebsites/spider-commands.txt: parishes-with-urls.tsv
# Recipe: take field 3 of the TSV ($<), drop its first line, keep only lines
# containing "http", pass them through generate_spider_commands.sh and
# de-duplicate with sort -u before writing the target ($@).
cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
# Same pipeline with one extra stage: remove_blacklisted.py receives the second
# prerequisite ($(word 2,$^), i.e. parishwebsites/domain-blacklist.txt) as its
# argument and filters the sorted commands.
cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
# Produces the parish URL table. $< is apikey.txt (passed via -a) and
# $(word 2,$^) is parishes-deon.tsv (passed via -p).
parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
# Note the ">>": output is appended to the target rather than overwriting it;
# stderr goes to get-parishes-urls.log.
scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log

View File

@ -1,5 +1,6 @@
name: polish-masses
channels:
- defaults
- conda-forge
dependencies:
- python
@ -14,3 +15,4 @@ dependencies:
- ipdb
- colorama
- html2text
- binaryornot

View File

@ -0,0 +1,100 @@
google.pl
facebook.com
google.com
allegro.pl
onet.pl
youtube.com
wp.pl
wikipedia.org
gazeta.pl
olx.pl
interia.pl
blogspot.com
mbank.com.pl
o2.pl
wiocha.pl
filmweb.pl
gumtree.pl
yahoo.com
ceneo.pl
otomoto.pl
tvn24.pl
wykop.pl
pudelek.pl
cda.pl
chomikuj.pl
sport.pl
instagram.com
kwejk.pl
gemius.pl
money.pl
fakt.pl
ingbank.pl
googleadservices.com
biztok.pl
demotywatory.pl
home.pl
twitter.com
wyborcza.pl
bycontext.com
redtube.com
nocoty.pl
plotek.pl
zalukaj.tv
linkedin.com
centrum24.pl
amazon.com
goldenline.pl
aliexpress.com
adcash.com
orange.pl
wyborcza.biz
ipko.pl
joemonster.org
redakcja.pl
nazwa.pl
bezuzyteczna.pl
dobreprogramy.pl
xvideos.com
tvp.pl
xhamster.com
vod.pl
natemat.pl
tumblr.com
play.pl
microsoft.com
nk.pl
bankier.pl
pracuj.pl
elektroda.pl
gem.pl
pornhub.com
ankieta-online.pl
sportowefakty.pl
wordpress.com
bet365.com
przegladsportowy.pl
showup.tv
naszemiasto.pl
stackoverflow.com
gry.pl
googleusercontent.com
gratka.pl
aliorbank.pl
imgur.com
ask.fm
pclab.pl
otodom.pl
infor.pl
adobe.com
euro.com.pl
blox.pl
torrenty.org
pekao24.pl
skapiec.pl
gameforge.com
lotto.pl
zalando.pl
zumi.pl
ask.com
gry-online.pl

View File

@ -6,7 +6,7 @@ from scrapy.linkextractors import LinkExtractor
import requests
from scrapy import signals
from scrapy.http import HtmlResponse
from binaryornot.helpers import is_binary_string
def _get_allowed_domains(urls):
domains = []
@ -17,17 +17,15 @@ def _get_allowed_domains(urls):
domains.append(domain)
return domains
def get_deny_domains():
    """Return the blacklisted domains listed in ``domain-blacklist.txt``.

    The file is opened by a relative path, so it is looked up in the
    process's current working directory. One domain is expected per line.
    Surrounding whitespace is stripped and blank lines are skipped, so a
    stray empty line or trailing space in the file cannot turn into a bogus
    deny entry.

    Returns:
        list[str]: the domain names, in file order.

    Raises:
        OSError: if ``domain-blacklist.txt`` cannot be opened.
    """
    with open('domain-blacklist.txt') as f:
        stripped_lines = (line.strip() for line in f)
        return [domain for domain in stripped_lines if domain]
class ParishesSpider(CrawlSpider):
name = "parishes"
deny_regex = [
'wikipedia', 'facebook',
'http://www\.sluzew\.dominikanie\.pl/nagrania/',
'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
]
rules = (Rule(
LinkExtractor(deny=deny_regex),
LinkExtractor(deny_domains=get_deny_domains()),
callback='parse_start_url',
follow=True), )
@ -43,16 +41,17 @@ class ParishesSpider(CrawlSpider):
previous_url = response.meta[
'previous_url'] if 'previous_url' in response.meta else ''
yield {
"url": response.url,
"depth": response.meta['depth'],
"button_text": link_text,
"previous_url": previous_url,
"original_start_url": self.original_url,
"start_url": self.start_urls[0],
"domain": self.allowed_domains[0],
"content": response.text
}
if not is_binary_string(response.text.encode('utf-8')[:2048]):
yield {
"url": response.url,
"depth": response.meta['depth'],
"button_text": link_text,
"previous_url": previous_url,
"original_start_url": self.original_url,
"start_url": self.start_urls[0],
"domain": self.allowed_domains[0],
"content": response.text
}
def _requests_to_follow(self, response):
if not isinstance(response, HtmlResponse):

View File

@ -0,0 +1,12 @@
#!/usr/bin/env python3
"""Drop input lines that mention a blacklisted domain.

Usage: remove_blacklisted.py BLACKLIST_FILE < lines > filtered_lines

Every line read from standard input is written to standard output exactly
once, unless at least one entry of BLACKLIST_FILE occurs in it, in which case
the line is dropped.
"""
import sys


def load_blacklist(path):
    """Read one blacklist entry per line from ``path``, skipping blank lines."""
    with open(path) as f:
        # An empty entry is a substring of every line and would therefore
        # filter out the whole input, so blank lines must not become entries.
        return [entry for entry in (raw.strip() for raw in f) if entry]


def is_blacklisted(line, blacklisted_domains):
    """Return True if any blacklisted domain occurs in ``line``."""
    # NOTE(review): this is plain substring matching, so a short entry such as
    # "o2.pl" also matches "foo2.pl". Matching on host boundaries would need
    # the exact format of the incoming lines — confirm against the generator.
    return any(domain in line for domain in blacklisted_domains)


def main(argv=None, stdin=None, stdout=None):
    """Filter ``stdin`` to ``stdout`` using the blacklist named in ``argv[1]``.

    All parameters default to their ``sys`` counterparts; they exist so the
    filter can be driven without touching the real process streams.

    Returns:
        int: process exit status (0 on success, 2 on a usage error).
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout

    if len(argv) < 2:
        print('usage: {} BLACKLIST_FILE'.format(argv[0] if argv else
                                                'remove_blacklisted.py'),
              file=sys.stderr)
        return 2

    blacklisted_domains = load_blacklist(argv[1])
    for line in stdin:
        # Decide once per line. The previous nested loop printed the line
        # once for every blacklist entry it did NOT contain, which duplicated
        # clean lines and still let blacklisted ones through.
        if not is_blacklisted(line, blacklisted_domains):
            stdout.write(line)
    return 0


if __name__ == '__main__':
    sys.exit(main())