From 1f6b1e6ffeecf19d71372522aa18109ee5769565 Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Mon, 14 May 2018 01:51:40 +0200
Subject: [PATCH] Get utterance extraction and pickling working

Get the conversion of parish pages from HTML to text working (html2text).
Add a parish2text goal to the Makefile.
Switch extract_rule_based and get_utterances to the non-HTML (text) parishes.
Enhance find_hours.py.
Wrap render_template in make_response in webapp/app.py.
---
 Makefile                      |   3 +
 extract_rule_based.py         |  11 +--
 extractor/find_hours.py       |   9 +-
 get_utterances.py             | 162 ++++++++++++++++++++++++++++++++++
 parishwebsites/parish2text.py |  29 ++++--
 test.py                       |  73 ---------------
 utils/iterator.py             |  11 +--
 webapp/app.py                 |  14 +--
 8 files changed, 210 insertions(+), 102 deletions(-)
 create mode 100755 get_utterances.py
 delete mode 100755 test.py

diff --git a/Makefile b/Makefile
index 3d3353c..30ee207 100644
--- a/Makefile
+++ b/Makefile
@@ -7,6 +7,9 @@ JOBS := 100
 
 all: data
 
+parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
+	mkdir -p parishwebsites/{text-data,text-data-logs}
+	cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt
 data-add: parishwebsites/spider-commands-add.txt parishwebsites/domain-blacklist.txt parishwebsites/deal-with-not-completed.sh
 	cd parishwebsites && ./deal-with-not-completed.sh
diff --git a/extract_rule_based.py b/extract_rule_based.py
index d8a1738..6c780dc 100755
--- a/extract_rule_based.py
+++ b/extract_rule_based.py
@@ -88,6 +88,7 @@ class Extractor:
         groups = (header_match.group(0), sunday_title_match.group(0),
                   sunday_masses_hours, everyday_title_match.group(0),
                   everyday_masses_match.group(0))
+        print(sunday_masses_hours + '\n' + everyday_masses_match.group(0))
         # print(whole_result)
         # print(groups)
         # TODO: handle this
@@ -109,10 +110,12 @@ def process_directory(directory):
         for fname in sorted(files):
             filepath = os.path.join(root, fname)
             if os.path.getsize(filepath) > 0:
-                if process_parish(iterator.parish_page_iterator(filepath)):
-                    print(filepath)
+                if process_parish(
+                        iterator.parish_page_iterator(filepath, html=False)):
+                    print('found: {}'.format(filepath))
                     found += 1
                 else:
+                    print('missed: {}'.format(filepath))
                     not_found += 1
         print('found: {}\nnot_found: {}'.format(found, not_found))
     else:
@@ -146,14 +149,12 @@ def process_parish(reader):
             # print(page['depth'])
             # print(page['url'])
             # print(page['button_text'])
-            # import ipdb
-            # ipdb.set_trace()
             return True
     return False
 
 
 def main():
-    process_directory('./parishwebsites/data')
+    process_directory('./parishwebsites/text-data')
 
 
 if __name__ == '__main__':
diff --git a/extractor/find_hours.py b/extractor/find_hours.py
index 7eabcac..b3e4e1b 100755
--- a/extractor/find_hours.py
+++ b/extractor/find_hours.py
@@ -1,16 +1,17 @@
 import re
 from colorama import Fore, Back, Style
 
-hour_regex = re.compile('\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
+hour_regex = re.compile(
+    '(0[6-9]|1\d|2[0-2])[:.](oo|[0-5]\d)|6|7|8|9|1\d|2[0-2]')
 
 
 def borders_ok(text, start, end):
     text = ' ' + text + ' '
     before_start_char = text[start]
     after_end_char = text[end + 1]
-    if (before_start_char.isspace()
-            or before_start_char == ',') and (after_end_char.isspace()
-                                              or after_end_char in ',;'):
+    if ((before_start_char.isspace() or before_start_char in ',(/')
+            and (after_end_char.isspace() or after_end_char in ',;)/')
+            and (before_start_char != '(' or after_end_char != ')')):
         return True
     else:
         return False
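
A quick sketch (not part of the patch) of what the tightened hour_regex
accepts: a full hour now needs a plausible hour and minute part, while bare
6-22 still match on their own:

    import re

    hour_regex = re.compile(
        '(0[6-9]|1\d|2[0-2])[:.](oo|[0-5]\d)|6|7|8|9|1\d|2[0-2]')
    for text in ['18.30', '07:oo', '25.99', 'suma o 9']:
        m = hour_regex.search(text)
        print(text, '->', m.group(0) if m else None)
    # 18.30 -> 18.30, 07:oo -> 07:oo, 25.99 -> 9, suma o 9 -> 9
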
diff --git a/get_utterances.py b/get_utterances.py
new file mode 100755
index 0000000..d8d4d6e
--- /dev/null
+++ b/get_utterances.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+import urllib.parse
+import logging
+import redis
+from utils import iterator
+from extractor.find_hours import hours_iterator
+import re
+import pickle
+
+
+# r = redis.StrictRedis(host='localhost', port=6379, db=0)
+class Utterance():
+    def __init__(self, utterance, url, button_text, depth, filename, line_no):
+        '''One extracted utterance plus the page it came from.'''
+        self.utterance = utterance
+        self.url = url
+        self.button_text = button_text
+        self.depth = depth
+        self.filename = filename
+        self.line_no = line_no
+
+
+def add_utterances(parish_page, parish_path, utterances):
+    utterances_nr = 0
+    # enumerate from 1 so the return value is the number of utterances added
+    content = parish_page['content']
+    for utterances_nr, utterance in enumerate(hours_iterator(content), 1):
+        utterance_inst = Utterance(
+            utterance, parish_page['url'], parish_page['button_text'],
+            parish_page['depth'], parish_path, parish_page['line_no'])
+        utterances.append(utterance_inst)
+    return utterances_nr
+
+
+def has_mass_metadata(url, button_text, page):
+    path = urllib.parse.urlparse(url).path
+    url_suffix = path.rsplit('/', 1)[1] if '/' in path else path
+    regex = re.compile(
+        'msze|nabo[żz]e[ńn]stw(a|(?=\W\d)|$)|porz[ąa]dek($|\.htm)|porz[aą]dek.(liturgi|mszy)|(rozk[lł]ad|plan|godziny|uk[lł]ad|harmonogram|grafik|rozpiska).mszy',
+        flags=re.IGNORECASE)
+    bad_regex = re.compile(
+        'nabo[zż]e[nń]stwa.(majowe|wielk|czerwcowe|maryjne|pasyjne|pokutne|fatimskie|do|ro[żz]a|czterdzie|w.wielk)',
+        re.IGNORECASE)
+    url_match = regex.search(url_suffix)
+    bad_url_match = bad_regex.search(url_suffix)
+    button_match = regex.search(button_text)
+    bad_button_match = bad_regex.search(button_text)
+    if url_match and button_match and not (bad_button_match or bad_url_match):
+        # print('both - url_match: {}'.format(url_match.group(0)))
+        # print('button_match: {}'.format(button_match.group(0)))
+        return True
+    elif url_match and not bad_url_match:
+        # print('url_match: {}'.format(url_match.group(0)))
+        return True
+    elif button_match and not bad_button_match:
+        # print('button_match: {}'.format(button_match.group(0)))
+        return True
+    return False
+
+
+def remove_http_www(url):
+    url = re.sub('^https?://', '', url)
+    return re.sub('^www\.', '', url)
+
+
+def gather_parish_pages(parish_path, unique_urls):
+    parish_pages = {}
+    for page_nr, parish_page in enumerate(
+            iterator.parish_page_iterator(parish_path, html=False)):
+        url = remove_http_www(parish_page['url'])
+        button_text = parish_page['button_text']
+        if url not in unique_urls and has_mass_metadata(
+                url, button_text, parish_page):
+            unique_urls.add(url)
+            parish_page['line_no'] = page_nr
+            parish_pages[url] = parish_page
+    return parish_pages
+
+
+def get_best_parish_pages(parish_pages, n=3):
+    def pop_best_and_clear(pages):
+        shortest_url = min(pages.keys(), key=len)
+        best = pages.pop(shortest_url)
+        for key in list(pages.keys()):
+            if key.startswith(shortest_url):
+                del pages[key]
+        return best
+
+    best_n = []
+    for i in range(n):
+        if parish_pages:
+            best_n.append(pop_best_and_clear(parish_pages))
+    return best_n
+
+
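+# Worked example (hypothetical URLs) for get_best_parish_pages above: given
+# {'p.pl': A, 'p.pl/msze': B, 'q.pl/porzadek': C}, the shortest URL 'p.pl'
+# is popped first and every remaining page whose URL starts with it
+# ('p.pl/msze') is dropped, then 'q.pl/porzadek' is popped, yielding [A, C].
+
+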
+def remove_duplicates(utterances):
+    seen = set()
+    res = []
+    for utt in utterances:
+        if utt.utterance not in seen:
+            res.append(utt)
+            seen.add(utt.utterance)
+    return res
+
+
+def load_parishes(directory, extracted_by_rules):
+    utterances = []
+    utterances_count = 0
+    last = 0
+    maximum = 0
+    unique_urls = set()
+    for file_nr, parish_path in enumerate(
+            iterator.parish_path_iterator(directory)):
+        if parish_path in extracted_by_rules:
+            continue
+        # print(parish_path)
+        metadata_count = 0
+        file_utterances = 0
+        parish_pages_dict = gather_parish_pages(parish_path, unique_urls)
+        parish_pages = get_best_parish_pages(parish_pages_dict)
+        maximum = max(len(parish_pages), maximum)
+        for pages_count, parish_page in enumerate(parish_pages):
+            new_utterances = add_utterances(parish_page, parish_path,
+                                            utterances)
+            # if new_utterances > 100:  # TODO: in future, check this value;
+            #     pass                  # if it's too big, don't add them
+            utterances_count += new_utterances
+            file_utterances += new_utterances
+            url = parish_page['url']  # TODO: delete
+            button_text = parish_page['button_text']  # TODO: delete
+            logging.warning('{}\t||| {} ||| {} ||| {}'.format(
+                new_utterances, url, button_text, parish_page['depth']))
+
+        if utterances_count != last:
+            curr_str = 'file: {}, page: {}, utterances: {}'.format(
+                file_nr, parish_page['line_no'], utterances_count)
+            print(curr_str)
+            last = utterances_count
+    print(maximum)
+    return remove_duplicates(utterances)
+
+
+def get_extracted_by_rules(filename):
+    extracted_by_rules = set()
+    with open(filename) as f:
+        for line in f:
+            extracted_by_rules.add(line.rstrip('\n'))
+    return extracted_by_rules
+
+
+def main():
+    extracted_by_rules = get_extracted_by_rules('./extracted-by-rules.txt')
+    utterances = load_parishes('./parishwebsites/text-data',
+                               extracted_by_rules)
+    print(len(utterances))
+    with open('utterances.pkl', 'wb') as f:
+        pickle.dump(utterances, f, pickle.HIGHEST_PROTOCOL)
+    # r.set('foo', 'bar')
+    # print(r.get('foo'))
+
+
+if __name__ == '__main__':
+    main()
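
A minimal sketch (not in this patch) of reading utterances.pkl back. Because
get_utterances.py is run as a script, the objects are pickled under
__main__.Utterance, so alias the class into __main__ before loading:

    import pickle
    import __main__
    import get_utterances
    __main__.Utterance = get_utterances.Utterance  # let pickle find the class

    with open('utterances.pkl', 'rb') as f:
        utterances = pickle.load(f)
    print(len(utterances), utterances[0].url, utterances[0].utterance)
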
diff --git a/parishwebsites/parish2text.py b/parishwebsites/parish2text.py
index 7fb93a0..94af060 100755
--- a/parishwebsites/parish2text.py
+++ b/parishwebsites/parish2text.py
@@ -4,10 +4,13 @@ import sys
 import html2text
 import pprint
 import re
+import logging
+
 
 class Parish2Text():
     def __init__(self):
-        "docstring"
+        '''Don't use this object for a long period of time, because the
+        conversion will slow down. Destroy it after every conversion.'''
         self.text_maker = html2text.HTML2Text()
         self.text_maker.ignore_links = True
         self.text_maker.ignore_images = True
@@ -16,25 +19,33 @@ class Parish2Text():
         self.text_maker.ul_item_mark = ''
         self.text_maker.emphasis_mark = ''
         self.text_maker.ignore_tables = True
-    
+
     def convert(self, parish):
         parish['content'] = self.text_maker.handle(parish['content'])
         parish['button_text'] = self.text_maker.handle(parish['button_text'])
-        parish['button_text'] = ' '.join(re.sub('[\W_]+', ' ', parish['button_text']).split())
+        parish['button_text'] = ' '.join(
+            re.sub('[\W_]+', ' ', parish['button_text']).split())
         return parish
 
 
 def main():
-    parish2text = Parish2Text()
     writer = jsonlines.Writer(sys.stdout)
     # text_maker.wrap_links = False
     reader = jsonlines.Reader((line.rstrip('\n') for line in sys.stdin))
-    for parish in reader:
-        parish = parish2text.convert(parish)
-        parish_content = parish.pop('content')
-        pprint.pprint(parish)
-        print(parish_content)
+    for page_nr, parish_page in enumerate(reader):
+        parish2text = Parish2Text()
+        try:
+            parish_page = parish2text.convert(parish_page)
+        except Exception:
+            logging.warning('page: {}, url: {}'.format(page_nr,
+                                                       parish_page['url']))
+            continue
+        writer.write(parish_page)
+        # parish_content = parish_page.pop('content')
+        # pprint.pprint(parish_page)
+        # print(parish_content)
     reader.close()
 
+
 if __name__ == '__main__':
     main()
diff --git a/test.py b/test.py
deleted file mode 100755
index 1ab463b..0000000
--- a/test.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/usr/bin/env python3
-import redis
-from utils import iterator
-from extractor.find_hours import hours_iterator
-import re
-
-# r = redis.StrictRedis(host='localhost', port=6379, db=0)
-
-
-def add_utterances(content, utterances):
-    utterances_nr = 0
-    for utterances_nr, utterance in enumerate(hours_iterator(content)):
-        utterances.append(utterance)
-    return utterances_nr
-
-
-def has_mass_metadata(url, button_text):
-    regex = re.compile('msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek')
-    url_match = regex.search(url)
-    button_match = regex.search(button_text)
-    if url_match and button_match:
-        print('both - url_metch: {}'.format(url_match.group(0)))
-        print('button_metch: {}'.format(button_match.group(0)))
-        return True
-    elif url_match:
-        print('url_match: {}'.format(url_match.group(0)))
-        return True
-    elif button_match:
-        print('button_match: {}'.format(button_match.group(0)))
-        return True
-    return False
-
-
-def load_parishes(directory):
-    utterances = []
-    utterances_count = 0
-    for file_nr, parish_path in enumerate(
-            iterator.parish_path_iterator(directory)):
-        print(parish_path)
-        metadata_count = 0
-        for page_nr, parish_page in enumerate(
-                iterator.parish_page_iterator(parish_path)):
-            content = parish_page.pop('content')
-            # if page_nr == 0 or has_mass_metadata(parish_page['url'], parish_page['button_text']):
-            if page_nr == 0:
-                utterances_count += add_utterances(content, utterances)
-            if has_mass_metadata(parish_page['url'],
-                                 parish_page['button_text']):
-                metadata_count += 1
-                utterances_count += add_utterances(content, utterances)
-
-                if metadata_count == 1:
-                    break
-
-            if page_nr == 100:
-                print(utterances_count)
-                break
-        print('file: {}, page: {}, utterances: {}'.format(
-            file_nr, page_nr, utterances_count))
-    return {}
-
-
-utterances = {}
-
-
-def main():
-    load_parishes('./parishwebsites/data')
-    # r.set('foo', 'bar')
-    # print(r.get('foo'))
-
-
-if __name__ == '__main__':
-    main()
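
A short usage sketch for Parish2Text (the input record is made up); a fresh
instance per page, as in main(), keeps html2text from slowing down:

    from parishwebsites.parish2text import Parish2Text

    page = {'url': 'http://example.pl/msze',
            'button_text': '<b>Msze św.</b>',
            'content': '<h1>Porządek mszy</h1><p>7.30, 9.00, 11.00</p>'}
    page = Parish2Text().convert(page)
    print(page['button_text'])  # roughly: 'Msze św'
    print(page['content'])      # plain text with the markup stripped
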
diff --git a/utils/iterator.py b/utils/iterator.py
index b8297a1..414825c 100644
--- a/utils/iterator.py
+++ b/utils/iterator.py
@@ -12,13 +12,14 @@ def parish_path_iterator(directory):
             yield filepath
 
 
-def parish_page_iterator(filepath):
+def parish_page_iterator(filepath, html=True):
     with jsonlines.open(filepath) as parish_reader:
-        page_nr = 0
         for parish_page in parish_reader:
-            page_nr += 1
             if 'Maximum execution time of 30 seconds exceeded in' in parish_page[
                     'content']:
                 continue
-            parish2text = Parish2Text()
-            yield parish2text.convert(parish_page)
+            if html:
+                parish2text = Parish2Text()
+                yield parish2text.convert(parish_page)
+            else:
+                yield parish_page
diff --git a/webapp/app.py b/webapp/app.py
index 898157e..1e2c4d3 100644
--- a/webapp/app.py
+++ b/webapp/app.py
@@ -1,4 +1,4 @@
-from flask import Flask, render_template, request
+from flask import Flask, render_template, request, make_response
 import redis
 
 app = Flask(__name__)
@@ -18,11 +18,13 @@ def get_action():
     hour = '12.00'
     left_context = 'Dawno, dawno temu była sobie para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
     right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białego rana.'
-    return render_template(
-        'index.html',
-        hour=hour,
-        left_context=left_context,
-        right_context=right_context)
+    resp = make_response(
+        render_template(
+            'index.html',
+            hour=hour,
+            left_context=left_context,
+            right_context=right_context))
+    return resp
 
 
 @app.route("/", methods=['GET', 'POST'])
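
A sketch of what the make_response wrapper enables; the Cache-Control header
is an assumed example, not part of the patch:

    resp = make_response(
        render_template(
            'index.html',
            hour=hour,
            left_context=left_context,
            right_context=right_context))
    resp.headers['Cache-Control'] = 'no-store'  # e.g. stop caching of contexts
    return resp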