Add domain-blacklist.txt and a domain filter; modify the crawler.
Add a binary-content check (binaryornot).
commit 21ba56a8fa
parent 6107a89c78
Makefile | 6

@@ -7,11 +7,11 @@ JOBS := 6
 
 all: data
 
-data: parishwebsites/spider-commands.txt
+data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
 
-parishwebsites/spider-commands.txt: parishes-with-urls.tsv
-	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
+parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
+	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
 
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
 	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
@@ -1,5 +1,6 @@
 name: polish-masses
 channels:
 - defaults
+- conda-forge
 dependencies:
 - python
@@ -14,3 +15,4 @@ dependencies:
 - ipdb
 - colorama
 - html2text
+- binaryornot
parishwebsites/domain-blacklist.txt | 100 (new file)

@@ -0,0 +1,100 @@
+google.pl
+facebook.com
+google.com
+allegro.pl
+onet.pl
+youtube.com
+wp.pl
+wikipedia.org
+gazeta.pl
+olx.pl
+interia.pl
+blogspot.com
+mbank.com.pl
+o2.pl
+wiocha.pl
+filmweb.pl
+gumtree.pl
+yahoo.com
+ceneo.pl
+otomoto.pl
+tvn24.pl
+wykop.pl
+pudelek.pl
+cda.pl
+chomikuj.pl
+sport.pl
+instagram.com
+kwejk.pl
+gemius.pl
+money.pl
+fakt.pl
+ingbank.pl
+googleadservices.com
+biztok.pl
+demotywatory.pl
+home.pl
+twitter.com
+wyborcza.pl
+bycontext.com
+redtube.com
+nocoty.pl
+plotek.pl
+zalukaj.tv
+linkedin.com
+centrum24.pl
+amazon.com
+goldenline.pl
+aliexpress.com
+adcash.com
+orange.pl
+wyborcza.biz
+ipko.pl
+joemonster.org
+redakcja.pl
+nazwa.pl
+bezuzyteczna.pl
+dobreprogramy.pl
+xvideos.com
+tvp.pl
+xhamster.com
+vod.pl
+natemat.pl
+tumblr.com
+play.pl
+microsoft.com
+nk.pl
+bankier.pl
+pracuj.pl
+elektroda.pl
+gem.pl
+pornhub.com
+ankieta-online.pl
+sportowefakty.pl
+wordpress.com
+bet365.com
+przegladsportowy.pl
+showup.tv
+naszemiasto.pl
+stackoverflow.com
+gry.pl
+googleusercontent.com
+gratka.pl
+aliorbank.pl
+imgur.com
+ask.fm
+pclab.pl
+otodom.pl
+infor.pl
+adobe.com
+euro.com.pl
+blox.pl
+torrenty.org
+pekao24.pl
+skapiec.pl
+gameforge.com
+lotto.pl
+zalando.pl
+zumi.pl
+ask.com
+gry-online.pl
@@ -6,7 +6,7 @@ from scrapy.linkextractors import LinkExtractor
 import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
 
+from binaryornot.helpers import is_binary_string
 
 def _get_allowed_domains(urls):
     domains = []
@@ -17,17 +17,15 @@ def _get_allowed_domains(urls):
         domains.append(domain)
     return domains
 
+def get_deny_domains():
+    with open('domain-blacklist.txt') as f:
+        blacklisted_domains = [line.rstrip('\n') for line in f]
+    return blacklisted_domains
+
 class ParishesSpider(CrawlSpider):
     name = "parishes"
-    deny_regex = [
-        'wikipedia', 'facebook',
-        'http://www\.sluzew\.dominikanie\.pl/nagrania/',
-        'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
-        'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
-    ]
     rules = (Rule(
-        LinkExtractor(deny=deny_regex),
+        LinkExtractor(deny_domains=get_deny_domains()),
         callback='parse_start_url',
         follow=True), )
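For context, a minimal sketch of what the switch to deny_domains is expected to do, assuming Scrapy is installed; the page and URLs below are hypothetical and not part of this commit:

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Two links on a hypothetical parish page: one internal, one on a blacklisted domain.
body = (b'<html><body>'
        b'<a href="http://parafia.example.pl/msze">Msze</a>'
        b'<a href="http://facebook.com/parafia">Facebook</a>'
        b'</body></html>')
response = HtmlResponse(url='http://parafia.example.pl/', body=body, encoding='utf-8')

# With deny_domains, links whose domain is blacklisted are dropped before being followed.
links = LinkExtractor(deny_domains=['facebook.com']).extract_links(response)
print([link.url for link in links])  # expected: only the parafia.example.pl link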
@@ -43,16 +41,17 @@ class ParishesSpider(CrawlSpider):
         previous_url = response.meta[
             'previous_url'] if 'previous_url' in response.meta else ''
 
-        yield {
-            "url": response.url,
-            "depth": response.meta['depth'],
-            "button_text": link_text,
-            "previous_url": previous_url,
-            "original_start_url": self.original_url,
-            "start_url": self.start_urls[0],
-            "domain": self.allowed_domains[0],
-            "content": response.text
-        }
+        if not is_binary_string(response.text.encode('utf-8')[:2048]):
+            yield {
+                "url": response.url,
+                "depth": response.meta['depth'],
+                "button_text": link_text,
+                "previous_url": previous_url,
+                "original_start_url": self.original_url,
+                "start_url": self.start_urls[0],
+                "domain": self.allowed_domains[0],
+                "content": response.text
+            }
 
     def _requests_to_follow(self, response):
         if not isinstance(response, HtmlResponse):
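A quick sketch of the binaryornot guard added above, assuming the binaryornot package from the updated environment; the sample payloads are hypothetical:

from binaryornot.helpers import is_binary_string

html_page = '<html><body>Parafia example</body></html>'.encode('utf-8')
pdf_like = b'%PDF-1.4' + bytes(range(256))  # crude stand-in for binary content

# The spider inspects only the first 2048 bytes, as in the diff above.
print(is_binary_string(html_page[:2048]))  # expected: False -> item is yielded
print(is_binary_string(pdf_like[:2048]))   # expected: True  -> item is skipped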
parishwebsites/remove_blacklisted.py | 12 (new file, executable)

@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+import sys
+
+with open(sys.argv[1]) as f:
+    blacklisted_domains = [line.rstrip('\n') for line in f]
+
+# Pass through only the lines that mention none of the blacklisted domains.
+for line in sys.stdin:
+    if not any(domain in line for domain in blacklisted_domains):
+        print(line, end='')
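The script acts as a stdin-to-stdout filter, which is how the Makefile pipeline above invokes it. A hypothetical self-test from the repository root, with the spider commands assumed for illustration:

import subprocess

# Two hypothetical spider commands; the facebook.com one should be filtered out.
commands = ('scrapy crawl parishes -a url=http://parafia.example.pl\n'
            'scrapy crawl parishes -a url=http://facebook.com/parafia\n')

result = subprocess.run(
    ['parishwebsites/remove_blacklisted.py', 'parishwebsites/domain-blacklist.txt'],
    input=commands, capture_output=True, text=True, check=True)
print(result.stdout)  # expected: only the parafia.example.pl command remains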