From 9b76f4e8aa8e4f776aa4d7cea8568c5afc624b63 Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Mon, 16 Apr 2018 23:54:03 +0200
Subject: [PATCH] Add robust recrawling of not completed data. Add annotator.py
 (highlighting hours within context done). Enhance parish2text.py (enable more
 flags, convert button text)

---
 Makefile                                    |   7 +-
 annotator.py                                |  40 ++++++
 extractor/extract.py                        | 151 --------------------
 parishwebsites/convert_content2text.py      |  29 ----
 parishwebsites/deal-with-not-completed.sh   |  24 ++++
 parishwebsites/find-not-completed.sh        |   5 +
 parishwebsites/parish2text.py               |  40 ++++++
 parishwebsites/parishwebsites/settings.py   |   5 +-
 parishwebsites/remove_duplicate_commands.py |  13 ++
 parishwebsites/view_raw_data.py             |  30 ----
 10 files changed, 131 insertions(+), 213 deletions(-)
 create mode 100755 annotator.py
 delete mode 100755 extractor/extract.py
 delete mode 100755 parishwebsites/convert_content2text.py
 create mode 100755 parishwebsites/deal-with-not-completed.sh
 create mode 100755 parishwebsites/find-not-completed.sh
 create mode 100755 parishwebsites/parish2text.py
 create mode 100755 parishwebsites/remove_duplicate_commands.py
 delete mode 100755 parishwebsites/view_raw_data.py

diff --git a/Makefile b/Makefile
index b00c303..3d3353c 100644
--- a/Makefile
+++ b/Makefile
@@ -7,12 +7,17 @@ JOBS := 100
 
 all: data
 
+
+data-add: parishwebsites/spider-commands-add.txt parishwebsites/domain-blacklist.txt parishwebsites/deal-with-not-completed.sh
+	cd parishwebsites && ./deal-with-not-completed.sh
+	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands-add.txt
+
 data: parishwebsites/spider-commands.txt parishwebsites/domain-blacklist.txt
 	rm -f parishwebsites/*processed.txt
 	cd parishwebsites && parallel --jobs $(JOBS) < spider-commands.txt
 
 parishwebsites/spider-commands.txt: parishes-with-urls.tsv parishwebsites/domain-blacklist.txt
-	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) > $@
+	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u | parishwebsites/remove_blacklisted.py $(word 2,$^) | parishwebsites/remove_duplicate_commands.py > $@
 
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
 	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
diff --git a/annotator.py b/annotator.py
new file mode 100755
index 0000000..41bf6d1
--- /dev/null
+++ b/annotator.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+import jsonlines
+from extractor.find_hours import hours_iterator
+from parishwebsites.parish2text import Parish2Text
+import os
+import random
+
+parish2text = Parish2Text()
+
+CONTEXT = 100
+
+
+
+def process_parish_page(parish_page):
+    content = parish_page.pop('content')
+    for utterance, utterance_colored in hours_iterator(content):
+        print(utterance_colored)
+        import ipdb; ipdb.set_trace()
+
+
+def process_parish_file(parish_reader):
+    for parish_page in parish_reader:
+        parish_page = parish2text.convert(parish_page)
+        process_parish_page(parish_page)
+
+
+def process_directory(directory):
+    for root, dirs, files in os.walk(directory):
+        # random.shuffle(files)
+        for fname in sorted(files):
+            filepath = os.path.join(root, fname)
+            if os.path.getsize(filepath) > 0:
+                with jsonlines.open(filepath) as parish_reader:
+                    process_parish_file(parish_reader)
+
+def main():
+    process_directory('./parishwebsites/data')
+
+if __name__ == '__main__':
+    main()
diff --git a/extractor/extract.py b/extractor/extract.py
deleted file mode 100755
index 4acccfe..0000000
--- a/extractor/extract.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env python3
-from colorama import Fore, Back, Style
-import os
-import jsonlines
-import re
-import pprint
-
-
-class Extractor:
-    def __init__(self, page):
-        "docstring"
-        self.page = page
-        self.content = page['content']
-        self.header = self.wrap_with_name_group(
-            'header',
-            'porządek mszy (świętych|św|św\.)|msz[ea][ \n]+([śs]wi[eę]t[ea]|św|św\.)'
-        )
-
-        self.sunday_title = self.wrap_with_name_group(
-            'sunday_title',
-            'niedziel[a|e][ \n]+i[ \n]+(dni[ \n]+(świąteczne|św|św\.)|święta)'
-            '|niedziel[ea]'
-            '|porządek świąteczny')
-        #'|święta'
-        self.sunday_masses = self.wrap_with_name_group(
-            'sunday_masses', '.*[^\d]\d{1,2}[^\d].*?')
-        self.everyday_title = self.wrap_with_name_group(
-            'everyday_title', 'dzień powszedni'
-            '|dni powszednie'
-            '|w tygodniu'
-            '|porządek zwykły'
-            '|od poniedziałku do soboty')
-        self.everyday_masses = self.wrap_with_name_group(
-            'everyday_masses',
-            '(.*?[^\d\n]?\d{1,2}[^\d\n]?.*?\n)+') # \n lub koniec stringa
-
-    def wrap_with_name_group(self, name, pattern):
-        return '(?P<{}>{})'.format(name, pattern)
-
-    def extract(self, search_space=None):
-        if not search_space:
-            search_space = self.content
-        header_match = re.search(self.header, search_space, re.I)
-        if not header_match:
-            return None
-        search_space = search_space[header_match.end():]
-
-        sunday_title_match = re.search(self.sunday_title, search_space, re.I)
-        if not sunday_title_match:
-            return None
-        if re.search(self.header, search_space[:sunday_title_match.start()],
-                     re.I): # found header closer to sunday title
-            return self.extract(search_space)
-        if sunday_title_match.start() > 50:
-            return self.extract(search_space[sunday_title_match.end()])
-
-        everyday_title_match = re.search(self.everyday_title, search_space,
-                                         re.I)
-        if not everyday_title_match:
-            return None
-        sunday_masses_hours = search_space[sunday_title_match.end():
-                                           everyday_title_match.start()]
-        if not re.search(self.sunday_masses, sunday_masses_hours,
-                         re.DOTALL | re.I):
-            return None
-        if len(sunday_masses_hours) > 500:
-            return self.extract(search_space[sunday_title_match.end():])
-        everyday_masses_match = re.search(
-            self.everyday_masses, search_space[everyday_title_match.end():],
-            re.I)
-        if not everyday_masses_match:
-            return None
-        if everyday_masses_match.start() > 150:
-            return self.extract(search_space[sunday_title_match.end():])
-
-        whole_result = header_match.group(
-            0) + search_space[:everyday_masses_match.end() +
-                              everyday_title_match.end()]
-        groups = (header_match.group(0), sunday_title_match.group(0),
-                  sunday_masses_hours, everyday_title_match.group(0),
-                  everyday_masses_match.group(0))
-        # print(whole_result)
-        # print(groups)
-        # obsłużyć # TODO:
-        # w dni powszednie (w roku szkolnym) - górny kościół
-        # 6:30, 7:00, 8:00, 18:00
-        # w dni powszednie (czas wakacji) - górny kościół
-        # 7:00, 8:00, 18:00
-
-        print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[
-            'url'], self.page['depth'], self.page['button_text']))
-        return whole_result, groups
-
-
-def process_directory(directory):
-    found = 0
-    not_found = 0
-    for root, dirs, files in os.walk(directory):
-        for fname in files:
-            filepath = os.path.join(root, fname)
-            if os.path.getsize(filepath) > 0:
-                with jsonlines.open(filepath) as reader:
-                    # print(filepath)
-                    if process_parish(reader):
-                        found += 1
-                    else:
-                        not_found += 1
-                    # print('found: {}\nnot_found: {}'.format(found, not_found))
-            else:
-                pass # empty file
-
-
-def color_match(whole_match, groups, background, colors, style):
-    for i in range(len(groups)):
-        whole_match = whole_match.replace(
-            groups[i], colors[i] + background + style + groups[i] +
-            Style.RESET_ALL + background + style, 1)
-    return whole_match + Style.RESET_ALL
-
-
-def process_parish(reader):
-    for page in sorted(reader, key=lambda x: x['depth']): #sort by depth
-        extractor = Extractor(page)
-        result = extractor.extract()
-        if result:
-            whole_result, groups = result
-            if whole_result not in page['content']:
-                import ipdb
-                ipdb.set_trace()
-            pretty_text = page['content'].replace(
-                whole_result,
-                color_match(whole_result, groups, Back.BLACK, [
-                    Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
-                ], Style.BRIGHT))
-            print(pretty_text)
-            import ipdb
-            ipdb.set_trace()
-            return True
-        else:
-            return False
-        # import ipdb
-        # ipdb.set_trace()
-        pass
-
-
-def main():
-    process_directory('./parishwebsites/data-final')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/parishwebsites/convert_content2text.py b/parishwebsites/convert_content2text.py
deleted file mode 100755
index 754c0a7..0000000
--- a/parishwebsites/convert_content2text.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python3
-import jsonlines
-import sys
-import html2text
-
-
-def convert_html_to_text(parish, text_maker):
-    html = parish['content']
-    text = text_maker.handle(html)
-    parish['content'] = text
-    return parish
-
-
-def main():
-    text_maker = html2text.HTML2Text()
-    text_maker.ignore_links = True
-    text_maker.ignore_images = True
-    writer = jsonlines.Writer(sys.stdout)
-    # text_maker.wrap_links = False
-    # text_maker.strong_mark = ''
-    with jsonlines.open(sys.argv[1]) as reader:
-        for parish in reader:
-            parish = convert_html_to_text(parish, text_maker)
-            writer.write(parish)
-    writer.close()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/parishwebsites/deal-with-not-completed.sh b/parishwebsites/deal-with-not-completed.sh
new file mode 100755
index 0000000..035e757
--- /dev/null
+++ b/parishwebsites/deal-with-not-completed.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+./find-not-completed.sh > not-completed
+# cat duplicate-data >> not-completed
+#removes not truly finished in processed.txt
+grep -v -f <(cat not-completed | sed -e 's@^@\t@' | sed -e 's@$@\$@') processed.txt | sponge processed.txt
+
+#appends filenames from spider-commands.txt which are not in processed.txt
+comm -13 <(cut -f2 processed.txt | sort -u) <(grep -o 'data/.*" 2>' spider-commands.txt | sed -Ee 's@data/|" 2>@@g' | sort) >> not-completed
+
+sort -u not-completed | sponge not-completed
+
+# remove data connected with not-completed e.g. logs/ data/
+
+echo data directory file count: `ls -1 data | wc -l`
+cd data && xargs rm -f < ../not-completed
+cd ..
+echo data directory file count: `ls -1 data | wc -l`
+echo logs directory file count: `ls -1 logs | wc -l`
+cd logs && xargs rm -f < ../not-completed
+cd ..
+echo logs directory file count: `ls -1 logs | wc -l`
+
+grep -f <(cat not-completed | sed -e 's@^@"data/'@ | sed -e 's@$@"@') spider-commands.txt > spider-commands-add.txt
diff --git a/parishwebsites/find-not-completed.sh b/parishwebsites/find-not-completed.sh
new file mode 100755
index 0000000..7d81d25
--- /dev/null
+++ b/parishwebsites/find-not-completed.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+(grep -r "No space left on device" logs | sort -u | grep "^logs/.*:OSError" -o | sed -Ee 's@^logs/|:OSError$@@g' | sort -u &&\
+ grep -r 'Received SIGTERM' logs/ | grep '^logs/.*:20' -o | sed -Ee 's@^logs/|:20$@@g' | sort -u &&\
+ find data -empty -type f | sed -e 's@data/@@' | sort
+) | sort -u
diff --git a/parishwebsites/parish2text.py b/parishwebsites/parish2text.py
new file mode 100755
index 0000000..7fb93a0
--- /dev/null
+++ b/parishwebsites/parish2text.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+import jsonlines
+import sys
+import html2text
+import pprint
+import re
+
+class Parish2Text():
+    def __init__(self):
+        "docstring"
+        self.text_maker = html2text.HTML2Text()
+        self.text_maker.ignore_links = True
+        self.text_maker.ignore_images = True
+        self.text_maker.images_to_alt = True
+        self.text_maker.strong_mark = ''
+        self.text_maker.ul_item_mark = ''
+        self.text_maker.emphasis_mark = ''
+        self.text_maker.ignore_tables = True
+
+    def convert(self, parish):
+        parish['content'] = self.text_maker.handle(parish['content'])
+        parish['button_text'] = self.text_maker.handle(parish['button_text'])
+        parish['button_text'] = ' '.join(re.sub('[\W_]+', ' ', parish['button_text']).split())
+        return parish
+
+
+def main():
+    parish2text = Parish2Text()
+    writer = jsonlines.Writer(sys.stdout)
+    # text_maker.wrap_links = False
+    reader = jsonlines.Reader((line.rstrip('\n') for line in sys.stdin))
+    for parish in reader:
+        parish = parish2text.convert(parish)
+        parish_content = parish.pop('content')
+        pprint.pprint(parish)
+        print(parish_content)
+    reader.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/parishwebsites/parishwebsites/settings.py b/parishwebsites/parishwebsites/settings.py
index 3d340b8..bccc109 100644
--- a/parishwebsites/parishwebsites/settings.py
+++ b/parishwebsites/parishwebsites/settings.py
@@ -17,7 +17,7 @@ FEED_EXPORT_ENCODING = 'utf-8'
 LOG_LEVEL = 'DEBUG'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'parishwebsites (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -85,7 +85,7 @@ AUTOTHROTTLE_TARGET_CONCURRENCY = 1
 # AUTOTHROTTLE_DEBUG = True
 
 RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 404, 429, 401]
-RETRY_TIMES = 5
+RETRY_TIMES = 3
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 HTTPCACHE_ENABLED = True
@@ -93,6 +93,7 @@ HTTPCACHE_EXPIRATION_SECS = 1209600
 HTTPCACHE_DIR = 'httpcache'
 HTTPCACHE_IGNORE_HTTP_CODES = RETRY_HTTP_CODES
 HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
+# HTTPCACHE_GZIP = 'True'
 DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
diff --git a/parishwebsites/remove_duplicate_commands.py b/parishwebsites/remove_duplicate_commands.py
new file mode 100755
index 0000000..6309e33
--- /dev/null
+++ b/parishwebsites/remove_duplicate_commands.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+import sys
+import re
+
+d = {}
+for line in sys.stdin:
+    line = line.rstrip('\n')
+    id_ = re.search('"(data/.*)" 2>', line).group(1)
+    d[id_] = line
+
+for line in d.values():
+    print(line)
+
diff --git a/parishwebsites/view_raw_data.py b/parishwebsites/view_raw_data.py
deleted file mode 100755
index 86a3259..0000000
--- a/parishwebsites/view_raw_data.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env python3
-import jsonlines
-import sys
-import html2text
-import pprint
-
-
-def convert_html_to_text(parish, text_maker):
-    html = parish['content']
-    text = text_maker.handle(html)
-    parish['content'] = text
-    return parish
-
-
-def main():
-    text_maker = html2text.HTML2Text()
-    text_maker.ignore_links = True
-    text_maker.ignore_images = True
-    writer = jsonlines.Writer(sys.stdout)
-    # text_maker.wrap_links = False
-    text_maker.strong_mark = ''
-    with jsonlines.open(sys.argv[1]) as reader:
-        for parish in reader:
-            parish = convert_html_to_text(parish, text_maker)
-            parish_content = parish.pop('content')
-            pprint.pprint(parish)
-            print(parish_content)
-
-if __name__ == '__main__':
-    main()