diff --git a/Makefile b/Makefile
index a9c229c..e586289 100644
--- a/Makefile
+++ b/Makefile
@@ -7,11 +7,11 @@ JOBS := 6
 
 all: data
 
-data: parishwebsites/spider-commands.txt
+data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt 2> crawler-log.txt
 
-parishwebsites/spider-commands.txt: parishes-with-urls.tsv
-	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
+parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
+	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
 
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
 	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
diff --git a/environment.yml b/environment.yml
index 37f92f2..558ffa6 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,5 +1,6 @@
 name: polish-masses
 channels:
+  - defaults
   - conda-forge
 dependencies:
   - python
@@ -14,3 +15,4 @@ dependencies:
   - ipdb
   - colorama
   - html2text
+  - binaryornot
diff --git a/parishwebsites/domain-blacklist.txt b/parishwebsites/domain-blacklist.txt
new file mode 100644
index 0000000..1078579
--- /dev/null
+++ b/parishwebsites/domain-blacklist.txt
@@ -0,0 +1,100 @@
+google.pl
+facebook.com
+google.com
+allegro.pl
+onet.pl
+youtube.com
+wp.pl
+wikipedia.org
+gazeta.pl
+olx.pl
+interia.pl
+blogspot.com
+mbank.com.pl
+o2.pl
+wiocha.pl
+filmweb.pl
+gumtree.pl
+yahoo.com
+ceneo.pl
+otomoto.pl
+tvn24.pl
+wykop.pl
+pudelek.pl
+cda.pl
+chomikuj.pl
+sport.pl
+instagram.com
+kwejk.pl
+gemius.pl
+money.pl
+fakt.pl
+ingbank.pl
+googleadservices.com
+biztok.pl
+demotywatory.pl
+home.pl
+twitter.com
+wyborcza.pl
+bycontext.com
+redtube.com
+nocoty.pl
+plotek.pl
+zalukaj.tv
+linkedin.com
+centrum24.pl
+amazon.com
+goldenline.pl
+aliexpress.com
+adcash.com
+orange.pl
+wyborcza.biz
+ipko.pl
+joemonster.org
+redakcja.pl
+nazwa.pl
+bezuzyteczna.pl
+dobreprogramy.pl
+xvideos.com
+tvp.pl
+xhamster.com
+vod.pl
+natemat.pl
+tumblr.com
+play.pl
+microsoft.com
+nk.pl
+bankier.pl
+pracuj.pl
+elektroda.pl
+gem.pl
+pornhub.com
+ankieta-online.pl
+sportowefakty.pl
+wordpress.com
+bet365.com
+przegladsportowy.pl
+showup.tv
+naszemiasto.pl
+stackoverflow.com
+gry.pl
+googleusercontent.com
+gratka.pl
+aliorbank.pl
+imgur.com
+ask.fm
+pclab.pl
+otodom.pl
+infor.pl
+adobe.com
+euro.com.pl
+blox.pl
+torrenty.org
+pekao24.pl
+skapiec.pl
+gameforge.com
+lotto.pl
+zalando.pl
+zumi.pl
+ask.com
+gry-online.pl
diff --git a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
index 8e0767b..af744ee 100644
--- a/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
+++ b/parishwebsites/parishwebsites/spiders/parishes_website_spider.py
@@ -6,7 +6,7 @@ from scrapy.linkextractors import LinkExtractor
 import requests
 from scrapy import signals
 from scrapy.http import HtmlResponse
-
+from binaryornot.helpers import is_binary_string
 
 def _get_allowed_domains(urls):
     domains = []
@@ -17,17 +17,15 @@ def _get_allowed_domains(urls):
         domains.append(domain)
     return domains
 
+def get_deny_domains():
+    with open('domain-blacklist.txt') as f:
+        blacklisted_domains = [line.rstrip('\n') for line in f]
+    return blacklisted_domains
 
 class ParishesSpider(CrawlSpider):
     name = "parishes"
-    deny_regex = [
-        'wikipedia', 'facebook',
-        'http://www\.sluzew\.dominikanie\.pl/nagrania/',
-        'http://pasierbiec.info/parafia-z-sercem/\?replytocom=',
-        'http://www\.swzygmunt\.knc\.pl/(GALLERIES|galerie)', '^http.*\.flv$'
-    ]
     rules = (Rule(
-        LinkExtractor(deny=deny_regex),
+        LinkExtractor(deny_domains=get_deny_domains()),
        callback='parse_start_url',
        follow=True), )
@@ -43,16 +41,17 @@ class ParishesSpider(CrawlSpider):
         previous_url = response.meta[
             'previous_url'] if 'previous_url' in response.meta else ''
 
-        yield {
-            "url": response.url,
-            "depth": response.meta['depth'],
-            "button_text": link_text,
-            "previous_url": previous_url,
-            "original_start_url": self.original_url,
-            "start_url": self.start_urls[0],
-            "domain": self.allowed_domains[0],
-            "content": response.text
-        }
+        if not is_binary_string(response.text.encode('utf-8')[:2048]):
+            yield {
+                "url": response.url,
+                "depth": response.meta['depth'],
+                "button_text": link_text,
+                "previous_url": previous_url,
+                "original_start_url": self.original_url,
+                "start_url": self.start_urls[0],
+                "domain": self.allowed_domains[0],
+                "content": response.text
+            }
 
     def _requests_to_follow(self, response):
         if not isinstance(response, HtmlResponse):
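Not part of the diff: a minimal sketch of what the new binary-content guard in parse_start_url is expected to do, assuming binaryornot's heuristic (NUL bytes and undecodable chunks read as binary). The sample inputs below are made up:

# sketch only -- illustrates the guard added above, not code from this PR
from binaryornot.helpers import is_binary_string

page = 'Msze święte: niedziela 8:00, 10:30'.encode('utf-8')
print(is_binary_string(page[:2048]))    # expected: False -> item is yielded

flv = b'FLV\x01\x05\x00\x00\x00\x09'    # e.g. a video served with a text MIME type
print(is_binary_string(flv[:2048]))     # expected: True -> response is skipped

This content sniffing is presumably also why the old '^http.*\.flv$' deny pattern could be dropped: instead of blacklisting binary-looking URLs by regex, the spider now inspects the first 2 KB of each response body.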
diff --git a/parishwebsites/remove_blacklisted.py b/parishwebsites/remove_blacklisted.py
new file mode 100755
index 0000000..f03ba9c
--- /dev/null
+++ b/parishwebsites/remove_blacklisted.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+"""Drop stdin lines that mention any domain listed in the blacklist file."""
+import sys
+
+# Blacklisted domains, one per line, passed as the first argument.
+with open(sys.argv[1]) as f:
+    blacklisted_domains = [line.rstrip('\n') for line in f]
+
+for line in sys.stdin:
+    # Keep a line only if it mentions no blacklisted domain.
+    if not any(domain in line for domain in blacklisted_domains):
+        print(line, end='')
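A quick way to sanity-check the new filter end to end, mirroring the Makefile pipeline. Everything here except remove_blacklisted.py itself is hypothetical (the blacklist contents, the temp path, and the shape of the spider-command lines), and the snippet assumes it is run from the repository root:

# sketch only -- not code from this PR
import subprocess

# A throwaway two-domain blacklist.
with open('/tmp/blacklist.txt', 'w') as f:
    f.write('facebook.com\ngoogle.pl\n')

# Two made-up spider commands, one of which hits a blacklisted domain.
commands = ('scrapy crawl parishes -a url=http://parafia.example.pl/\n'
            'scrapy crawl parishes -a url=http://facebook.com/some-parish/\n')

result = subprocess.run(
    ['parishwebsites/remove_blacklisted.py', '/tmp/blacklist.txt'],
    input=commands, capture_output=True, text=True, check=True)

print(result.stdout)  # only the parafia.example.pl command should survive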