diff --git a/.gitignore b/.gitignore
index 72364f9..87d0039 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
+*.py.rej
 *$py.class
 
 # C extensions
diff --git a/annotator/app.py b/annotator/app.py
new file mode 100644
index 0000000..34b1c5c
--- /dev/null
+++ b/annotator/app.py
@@ -0,0 +1,9 @@
+from flask import Flask
+app = Flask(__name__)
+
+@app.route("/")
+def hello():
+    return "\n\nHello There!\n\n"
+
+if __name__ == "__main__":
+    app.run(host='0.0.0.0')
diff --git a/annotator/wsgi.py b/annotator/wsgi.py
new file mode 100644
index 0000000..6026b0f
--- /dev/null
+++ b/annotator/wsgi.py
@@ -0,0 +1,4 @@
+from app import app
+
+if __name__ == "__main__":
+    app.run()
diff --git a/environment.yml b/environment.yml
index 558ffa6..4db358c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,6 +5,8 @@ channels:
 dependencies:
   - python
   - scrapy
+  - gunicorn
+  - flask
   - pip:
     - lxml
     - tldextract
diff --git a/extractor-rule-based/extract.py b/extractor-rule-based/extract.py
new file mode 100755
index 0000000..4acccfe
--- /dev/null
+++ b/extractor-rule-based/extract.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+from colorama import Fore, Back, Style
+import os
+import jsonlines
+import re
+import pprint
+
+
+class Extractor:
+    """Rule-based extractor of a mass-schedule fragment from one page record."""
+
+    def __init__(self, page):
+        """Store the page and build the named-group regex fragments."""
+        self.page = page
+        self.content = page['content']
+        self.header = self.wrap_with_name_group(
+            'header',
+            r'porządek mszy (świętych|św|św\.)|msz[ea][ \n]+([śs]wi[eę]t[ea]|św|św\.)'
+        )
+
+        self.sunday_title = self.wrap_with_name_group(
+            'sunday_title',
+            r'niedziel[ae][ \n]+i[ \n]+(dni[ \n]+(świąteczne|św|św\.)|święta)'
+            '|niedziel[ea]'
+            '|porządek świąteczny')
+        #'|święta'
+        self.sunday_masses = self.wrap_with_name_group(
+            'sunday_masses', r'.*[^\d]\d{1,2}[^\d].*?')
+        self.everyday_title = self.wrap_with_name_group(
+            'everyday_title', 'dzień powszedni'
+            '|dni powszednie'
+            '|w tygodniu'
+            '|porządek zwykły'
+            '|od poniedziałku do soboty')
+        self.everyday_masses = self.wrap_with_name_group(
+            'everyday_masses',
+            r'(.*?[^\d\n]?\d{1,2}[^\d\n]?.*?\n)+')  # \n or end of string
+
+    def wrap_with_name_group(self, name, pattern):
+        """Wrap pattern in a regex named group called name."""
+        return '(?P<{}>{})'.format(name, pattern)
+
+    def extract(self, search_space=None):
+        """Find the header, Sunday and everyday sections in search_space.
+
+        Defaults to the whole page content. Prints the page url, depth and
+        button text and returns (whole_result, groups) on success; returns
+        None when any required part is missing.
+        """
+        # Only None means "use the page content"; an empty slice passed by a
+        # recursive call must not restart the search from the beginning.
+        if search_space is None:
+            search_space = self.content
+        header_match = re.search(self.header, search_space, re.I)
+        if not header_match:
+            return None
+        search_space = search_space[header_match.end():]
+
+        sunday_title_match = re.search(self.sunday_title, search_space, re.I)
+        if not sunday_title_match:
+            return None
+        if re.search(self.header, search_space[:sunday_title_match.start()],
+                     re.I):  # found header closer to sunday title
+            return self.extract(search_space)
+        if sunday_title_match.start() > 50:
+            # Title too far from the header: retry on the text after it.
+            return self.extract(search_space[sunday_title_match.end():])
+
+        everyday_title_match = re.search(self.everyday_title, search_space,
+                                         re.I)
+        if not everyday_title_match:
+            return None
+        sunday_masses_hours = search_space[sunday_title_match.end():
+                                           everyday_title_match.start()]
+        if not re.search(self.sunday_masses, sunday_masses_hours,
+                         re.DOTALL | re.I):
+            return None
+        if len(sunday_masses_hours) > 500:
+            return self.extract(search_space[sunday_title_match.end():])
+        everyday_masses_match = re.search(
+            self.everyday_masses, search_space[everyday_title_match.end():],
+            re.I)
+        if not everyday_masses_match:
+            return None
+        if everyday_masses_match.start() > 150:
+            return self.extract(search_space[sunday_title_match.end():])
+
+        whole_result = header_match.group(
+            0) + search_space[:everyday_masses_match.end() +
+                              everyday_title_match.end()]
+        groups = (header_match.group(0), sunday_title_match.group(0),
+                  sunday_masses_hours, everyday_title_match.group(0),
+                  everyday_masses_match.group(0))
+        # print(whole_result)
+        # print(groups)
+        # TODO: handle input such as (Polish sample text):
+        # w dni powszednie (w roku szkolnym) - górny kościół
+        # 6:30, 7:00, 8:00, 18:00
+        # w dni powszednie (czas wakacji) - górny kościół
+        # 7:00, 8:00, 18:00
+
+        print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[
+            'url'], self.page['depth'], self.page['button_text']))
+        return whole_result, groups
+
+
+def process_directory(directory):
+    """Run process_parish on every non-empty file found under directory."""
+    found = 0
+    not_found = 0
+    for root, dirs, files in os.walk(directory):
+        for fname in files:
+            filepath = os.path.join(root, fname)
+            if os.path.getsize(filepath) > 0:
+                with jsonlines.open(filepath) as reader:
+                    # print(filepath)
+                    if process_parish(reader):
+                        found += 1
+                    else:
+                        not_found += 1
+                    # print('found: {}\nnot_found: {}'.format(found, not_found))
+            else:
+                pass  # empty file
+
+
+def color_match(whole_match, groups, background, colors, style):
+    """Color the first occurrence of each group inside whole_match."""
+    for i in range(len(groups)):
+        whole_match = whole_match.replace(
+            groups[i], colors[i] + background + style + groups[i] +
+            Style.RESET_ALL + background + style, 1)
+    return whole_match + Style.RESET_ALL
+
+
+def process_parish(reader):
+    """Print the first page match of a parish (pausing in ipdb); True if found."""
+    for page in sorted(reader, key=lambda x: x['depth']):  #sort by depth
+        extractor = Extractor(page)
+        result = extractor.extract()
+        if result:
+            whole_result, groups = result
+            if whole_result not in page['content']:
+                import ipdb
+                ipdb.set_trace()
+            pretty_text = page['content'].replace(
+                whole_result,
+                color_match(whole_result, groups, Back.BLACK, [
+                    Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
+                ], Style.BRIGHT))
+            print(pretty_text)
+            import ipdb
+            ipdb.set_trace()
+            return True
+    # No page of this parish produced a match.
+    return False
+
+
+def main():
+    process_directory('./parishwebsites/data-final')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/extractor/find_hours.py b/extractor/find_hours.py
new file mode 100755
index 0000000..88f8070
--- /dev/null
+++ b/extractor/find_hours.py
@@ -0,0 +1,39 @@
+import re
+from colorama import Fore, Back, Style
+
+hour_regex = re.compile(r'\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
+
+def borders_ok(text, start, end):
+    """Return True if text[start:end] has whitespace or ',' before it and whitespace, ',' or ';' after it."""
+    text = ' ' + text + ' '
+    before_start_char = text[start]
+    after_end_char = text[end + 1]
+    if (before_start_char.isspace() or before_start_char == ',') and (after_end_char.isspace() or after_end_char in ',;'):
+        return True
+    else:
+        return False
+
+def get_context(text, start, end, minsize):
+    """Return (prefix, hour, suffix): text[start:end] plus at most minsize + 2 space-separated tokens on each side."""
+    hour = text[start:end]
+    prefix = re.sub(' +', ' ', text[:start]).rsplit(' ', maxsplit=minsize+2)[1:]
+    suffix = re.sub(' +', ' ', text[end:]).split(' ', maxsplit=minsize+2)[:-1]
+    return ' '.join(prefix), hour, ' '.join(suffix)
+
+def hours_iterator(text, minsize=20):
+    """Yield (utterance, colored_text) for each hour_regex match in text that passes borders_ok."""
+    for match in hour_regex.finditer(text):
+        start, end = match.span(0)
+        if borders_ok(text, start, end):
+            prefix, hour, suffix = get_context(text, start, end, minsize)
+            utterance = f'{prefix}&&&{hour}###{suffix}'
+            yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN, Style.BRIGHT)
+
+# In the classifier, split so that \n is still kept as a separate token.
+
+
+def color_hour(prefix, hour, suffix, color, style):
+    """Return prefix + hour + suffix with only the hour wrapped in color/style codes."""
+    highlighted = color + style + hour + Style.RESET_ALL
+    return prefix + highlighted + suffix
+