diff --git a/annotator.py b/annotator.py index 41bf6d1..e3d8b7f 100755 --- a/annotator.py +++ b/annotator.py @@ -10,31 +10,35 @@ parish2text = Parish2Text() CONTEXT = 100 - def process_parish_page(parish_page): content = parish_page.pop('content') - for utterance, utterance_colored in hours_iterator(content): + for utterance, utterance_colored in hours_iterator(content, color=True): print(utterance_colored) - import ipdb; ipdb.set_trace() + import ipdb + ipdb.set_trace() def process_parish_file(parish_reader): for parish_page in parish_reader: parish_page = parish2text.convert(parish_page) + import ipdb + ipdb.set_trace() process_parish_page(parish_page) def process_directory(directory): for root, dirs, files in os.walk(directory): - # random.shuffle(files) - for fname in sorted(files): + random.shuffle(files) + for fname in files: filepath = os.path.join(root, fname) if os.path.getsize(filepath) > 0: with jsonlines.open(filepath) as parish_reader: process_parish_file(parish_reader) + def main(): process_directory('./parishwebsites/data') + if __name__ == '__main__': main() diff --git a/environment.yml b/environment.yml index 4db358c..1fc5167 100644 --- a/environment.yml +++ b/environment.yml @@ -1,20 +1,20 @@ name: polish-masses channels: - defaults - - conda-forge dependencies: - python - scrapy - gunicorn - flask + - redis-py + - lxml + - requests + - beautifulsoup4 + - colorama - pip: - - lxml - tldextract - - requests - - beautifulsoup4 - python-google-places - jsonlines - ipdb - - colorama - html2text - binaryornot diff --git a/extractor-rule-based/extract.py b/extract_rule_based.py similarity index 81% rename from extractor-rule-based/extract.py rename to extract_rule_based.py index 4acccfe..d8a1738 100755 --- a/extractor-rule-based/extract.py +++ b/extract_rule_based.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 +from utils import iterator from colorama import Fore, Back, Style import os import jsonlines import re import pprint +import sys class Extractor: @@ 
-38,6 +40,8 @@ class Extractor: return '(?P<{}>{})'.format(name, pattern) def extract(self, search_space=None): + if search_space == '': + return None if not search_space: search_space = self.content header_match = re.search(self.header, search_space, re.I) @@ -51,8 +55,13 @@ class Extractor: if re.search(self.header, search_space[:sunday_title_match.start()], re.I): # found header closer to sunday title return self.extract(search_space) - if sunday_title_match.start() > 50: - return self.extract(search_space[sunday_title_match.end()]) + if sunday_title_match.start( + ) > 50: # sunday_title is too far from the header, so keep searching + try: + return self.extract(search_space[sunday_title_match.end():]) + except Exception: + import ipdb + ipdb.set_trace() everyday_title_match = re.search(self.everyday_title, search_space, re.I) @@ -87,8 +96,9 @@ class Extractor: # w dni powszednie (czas wakacji) - górny kościół # 7:00, 8:00, 18:00 - print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[ - 'url'], self.page['depth'], self.page['button_text'])) + print('url: {}\ndepth: {}\nbutton: {}'.format( + self.page['url'], self.page['depth'], self.page['button_text'])) + sys.stdout.flush() return whole_result, groups @@ -96,16 +106,15 @@ def process_directory(directory): found = 0 not_found = 0 for root, dirs, files in os.walk(directory): - for fname in files: + for fname in sorted(files): filepath = os.path.join(root, fname) if os.path.getsize(filepath) > 0: - with jsonlines.open(filepath) as reader: - # print(filepath) - if process_parish(reader): - found += 1 - else: - not_found += 1 - # print('found: {}\nnot_found: {}'.format(found, not_found)) + if process_parish(iterator.parish_page_iterator(filepath)): + print(filepath) + found += 1 + else: + not_found += 1 + print('found: {}\nnot_found: {}'.format(found, not_found)) else: pass # empty file @@ -125,6 +134,7 @@ def process_parish(reader): if result: whole_result, groups = result if whole_result not in page['content']: + pass 
import ipdb ipdb.set_trace() pretty_text = page['content'].replace( @@ -132,19 +142,18 @@ def process_parish(reader): color_match(whole_result, groups, Back.BLACK, [ Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN ], Style.BRIGHT)) - print(pretty_text) - import ipdb - ipdb.set_trace() - return True - else: - return False + # print(pretty_text) + # print(page['depth']) + # print(page['url']) + # print(page['button_text']) # import ipdb # ipdb.set_trace() - pass + return True + return False def main(): - process_directory('./parishwebsites/data-final') + process_directory('./parishwebsites/data') if __name__ == '__main__': diff --git a/extractor/find_hours.py b/extractor/find_hours.py index 88f8070..7eabcac 100755 --- a/extractor/find_hours.py +++ b/extractor/find_hours.py @@ -3,22 +3,29 @@ from colorama import Fore, Back, Style hour_regex = re.compile('\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9') + def borders_ok(text, start, end): text = ' ' + text + ' ' before_start_char = text[start] after_end_char = text[end + 1] - if (before_start_char.isspace() or before_start_char == ',') and (after_end_char.isspace() or after_end_char in ',;'): + if (before_start_char.isspace() + or before_start_char == ',') and (after_end_char.isspace() + or after_end_char in ',;'): return True else: return False + def get_context(text, start, end, minsize): hour = text[start:end] - prefix = re.sub(' +', ' ', text[:start]).rsplit(' ', maxsplit=minsize+2)[1:] - suffix = re.sub(' +', ' ', text[end:]).split(' ', maxsplit=minsize+2)[:-1] + prefix = re.sub(' +', ' ', text[:start]).rsplit( + ' ', maxsplit=minsize + 2)[1:] + suffix = re.sub(' +', ' ', text[end:]).split( + ' ', maxsplit=minsize + 2)[:-1] return ' '.join(prefix), hour, ' '.join(suffix) -def hours_iterator(text, minsize=20): + +def hours_iterator(text, minsize=20, color=False): for hour_match in hour_regex.finditer(text): start = hour_match.start(0) end = hour_match.end(0) @@ -26,12 +33,15 @@ def hours_iterator(text, minsize=20): 
continue prefix, hour, suffix = get_context(text, start, end, minsize) utterance = f'{prefix}&&&{hour}###{suffix}' - yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN, Style.BRIGHT) + if color: + yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN, + Style.BRIGHT) + else: + yield utterance + # in the classifier, split so that \n is also kept as a separate token def color_hour(prefix, hour, suffix, color, style): return prefix + color + style + hour + Style.RESET_ALL + suffix - - diff --git a/test.py b/test.py new file mode 100755 index 0000000..1ab463b --- /dev/null +++ b/test.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +import redis +from utils import iterator +from extractor.find_hours import hours_iterator +import re + +# r = redis.StrictRedis(host='localhost', port=6379, db=0) + + +def add_utterances(content, utterances): + utterances_nr = 0 + for utterances_nr, utterance in enumerate(hours_iterator(content)): + utterances.append(utterance) + return utterances_nr + + +def has_mass_metadata(url, button_text): + regex = re.compile('msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek') + url_match = regex.search(url) + button_match = regex.search(button_text) + if url_match and button_match: + print('both - url_match: {}'.format(url_match.group(0))) + print('button_match: {}'.format(button_match.group(0))) + return True + elif url_match: + print('url_match: {}'.format(url_match.group(0))) + return True + elif button_match: + print('button_match: {}'.format(button_match.group(0))) + return True + return False + + +def load_parishes(directory): + utterances = [] + utterances_count = 0 + for file_nr, parish_path in enumerate( + iterator.parish_path_iterator(directory)): + print(parish_path) + metadata_count = 0 + for page_nr, parish_page in enumerate( + iterator.parish_page_iterator(parish_path)): + content = parish_page.pop('content') + # if page_nr == 0 or has_mass_metadata(parish_page['url'], parish_page['button_text']): + if page_nr == 0: + utterances_count += 
add_utterances(content, utterances) + if has_mass_metadata(parish_page['url'], + parish_page['button_text']): + metadata_count += 1 + utterances_count += add_utterances(content, utterances) + + if metadata_count == 1: + break + + if page_nr == 100: + print(utterances_count) + break + print('file: {}, page: {}, utterances: {}'.format( + file_nr, page_nr, utterances_count)) + return {} + + +utterances = {} + + +def main(): + load_parishes('./parishwebsites/data') + # r.set('foo', 'bar') + # print(r.get('foo')) + + +if __name__ == '__main__': + main() diff --git a/utils/iterator.py b/utils/iterator.py new file mode 100644 index 0000000..b8297a1 --- /dev/null +++ b/utils/iterator.py @@ -0,0 +1,24 @@ +import os +import jsonlines +import random +from parishwebsites.parish2text import Parish2Text + + +def parish_path_iterator(directory): + for root, dirs, files in os.walk(directory): + for fname in sorted(files): + filepath = os.path.join(root, fname) + if os.path.getsize(filepath) > 0: + yield filepath + + +def parish_page_iterator(filepath): + with jsonlines.open(filepath) as parish_reader: + page_nr = 0 + for parish_page in parish_reader: + page_nr += 1 + if 'Maximum execution time of 30 seconds exceeded in' in parish_page[ + 'content']: + continue + parish2text = Parish2Text() + yield parish2text.convert(parish_page) diff --git a/webapp/app.py b/webapp/app.py index 221bdcd..898157e 100644 --- a/webapp/app.py +++ b/webapp/app.py @@ -1,23 +1,36 @@ from flask import Flask, render_template, request +import redis app = Flask(__name__) + +def load_parishes(directory): + return {} + + +parishes = load_parishes('dir') + + def post_action(): return get_action() + def get_action(): hour = '12.00' left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie ' - right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białego rana.' 
- return render_template('index.html', hour=hour, left_context=left_context, right_context=right_context) + right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białego rana.' + return render_template( + 'index.html', + hour=hour, + left_context=left_context, + right_context=right_context) + @app.route("/", methods=['GET', 'POST']) def root(): if request.method == 'POST': - return post_action() + return post_action() else: - return get_action() - - + return get_action() if __name__ == "__main__": diff --git a/webapp/templates/index.html b/webapp/templates/index.html index d42b77d..388c488 100644 --- a/webapp/templates/index.html +++ b/webapp/templates/index.html @@ -14,7 +14,7 @@
- +

Czy zaznaczono godzinę mszy świętej?

@@ -26,8 +26,8 @@
- - + +
@@ -36,9 +36,37 @@