Add basic wsgi app. Rename extractors, change directories.

Add gunicorn and flask to environment.yml. Update .gitignore.
2018-04-27 22:44:15 +02:00 · 2018-04-27 22:44:15 +02:00 · 6982ac2e59
commit 6982ac2e59
parent 9b76f4e8aa
6 changed files with 204 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
+*.py.rej
 *$py.class

 # C extensions
--- a/annotator/app.py
+++ b/annotator/app.py
@ -0,0 +1,9 @@
+from flask import Flask
+app = Flask(__name__)
+
+
+@app.route("/")
+def hello():
+    """Return the placeholder HTML greeting served at the site root."""
+    greeting = "<h1 style='color:blue'>Hello There!</h1>"
+    return greeting
+
+
+if __name__ == "__main__":
+    # Direct execution: start Flask's built-in server on all interfaces.
+    app.run(host='0.0.0.0')
--- a/annotator/wsgi.py
+++ b/annotator/wsgi.py
@ -0,0 +1,4 @@
+from app import app
+
+# Running this file directly starts Flask's built-in server with defaults.
+# NOTE(review): presumably a WSGI server (gunicorn is added to
+# environment.yml in this commit) imports `app` from this module instead --
+# confirm against the deployment setup.
+if __name__ == "__main__":
+    app.run()
--- a/environment.yml
+++ b/environment.yml
@ -5,6 +5,8 @@ channels:
 dependencies:
  - python
  - scrapy
+  - gunicorn
+  - flask
  - pip:
    - lxml
    - tldextract
--- a/extractor-rule-based/extract.py
+++ b/extractor-rule-based/extract.py
@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+from colorama import Fore, Back, Style
+import os
+import jsonlines
+import re
+import pprint
+
+
+class Extractor:
+    """Rule-based extractor of a mass schedule from one page.
+
+    The page text is scanned for, in order: a schedule header, a Sunday
+    title, the Sunday hours, an everyday (weekday) title and the everyday
+    hours.  Each part is described by a regular expression wrapped in a
+    named group; all searches are case-insensitive.
+    """
+
+    # Heuristic limits, in characters, used to reject implausible matches.
+    MAX_HEADER_TO_SUNDAY_GAP = 50
+    MAX_SUNDAY_HOURS_LENGTH = 500
+    MAX_EVERYDAY_HOURS_OFFSET = 150
+
+    def __init__(self, page):
+        """Store the page and build the patterns.
+
+        page: mapping with a 'content' key holding the page text.
+            extract() additionally reads 'url', 'depth' and 'button_text'
+            when it reports a hit.
+        """
+        self.page = page
+        self.content = page['content']
+        # Raw strings keep the regex escapes (\d, \.) intact; 'św\.?' lets
+        # the abbreviation consume its trailing dot.
+        self.header = self.wrap_with_name_group(
+            'header',
+            r'porządek mszy (świętych|św\.?)'
+            r'|msz[ea][ \n]+([śs]wi[eę]t[ea]|św\.?)'
+        )
+
+        # The bare '|święta' alternative is currently disabled.
+        self.sunday_title = self.wrap_with_name_group(
+            'sunday_title',
+            r'niedziel[ae][ \n]+i[ \n]+(dni[ \n]+(świąteczne|św\.?)|święta)'
+            r'|niedziel[ae]'
+            r'|porządek świąteczny')
+        self.sunday_masses = self.wrap_with_name_group(
+            'sunday_masses', r'.*[^\d]\d{1,2}[^\d].*?')
+        self.everyday_title = self.wrap_with_name_group(
+            'everyday_title', r'dzień powszedni'
+            r'|dni powszednie'
+            r'|w tygodniu'
+            r'|porządek zwykły'
+            r'|od poniedziałku do soboty')
+        # TODO: each line must end with '\n'; end of string should be
+        # accepted as well.
+        self.everyday_masses = self.wrap_with_name_group(
+            'everyday_masses',
+            r'(.*?[^\d\n]?\d{1,2}[^\d\n]?.*?\n)+')
+
+    def wrap_with_name_group(self, name, pattern):
+        """Return *pattern* wrapped in a named group called *name*."""
+        return '(?P<{}>{})'.format(name, pattern)
+
+    def extract(self, search_space=None):
+        """Find the mass schedule in the page text.
+
+        search_space: optional text to search instead of the whole page
+            content; a falsy value means "use self.content".
+
+        Returns None when no schedule is found.  Otherwise returns a tuple
+        (whole_result, groups): whole_result is the text from the header
+        through the everyday hours, and groups is the 5-tuple (header,
+        Sunday title, Sunday hours, everyday title, everyday hours).
+        On success the page's url, depth and button text are printed.
+        """
+        if not search_space:
+            search_space = self.content
+
+        # Every retry continues on a strictly shorter remainder of the text,
+        # so the loop terminates.  An empty remainder simply finds no header
+        # instead of restarting from the full content.
+        while True:
+            header_match = re.search(self.header, search_space, re.I)
+            if not header_match:
+                return None
+            search_space = search_space[header_match.end():]
+
+            sunday_title_match = re.search(self.sunday_title, search_space,
+                                           re.I)
+            if not sunday_title_match:
+                return None
+            if re.search(self.header,
+                         search_space[:sunday_title_match.start()], re.I):
+                # Another header lies closer to the Sunday title.
+                continue
+            after_sunday_title = search_space[sunday_title_match.end():]
+            if sunday_title_match.start() > self.MAX_HEADER_TO_SUNDAY_GAP:
+                # Title too far from the header: retry on the text after it.
+                search_space = after_sunday_title
+                continue
+
+            everyday_title_match = re.search(self.everyday_title,
+                                             search_space, re.I)
+            if not everyday_title_match:
+                return None
+            sunday_masses_hours = search_space[sunday_title_match.end():
+                                               everyday_title_match.start()]
+            if not re.search(self.sunday_masses, sunday_masses_hours,
+                             re.DOTALL | re.I):
+                return None
+            if len(sunday_masses_hours) > self.MAX_SUNDAY_HOURS_LENGTH:
+                search_space = after_sunday_title
+                continue
+
+            everyday_masses_match = re.search(
+                self.everyday_masses,
+                search_space[everyday_title_match.end():], re.I)
+            if not everyday_masses_match:
+                return None
+            if (everyday_masses_match.start() >
+                    self.MAX_EVERYDAY_HOURS_OFFSET):
+                search_space = after_sunday_title
+                continue
+            break
+
+        # everyday_masses_match offsets are relative to the text after the
+        # everyday title, hence the sum of both ends.
+        whole_result = header_match.group(0) + search_space[
+            :everyday_masses_match.end() + everyday_title_match.end()]
+        groups = (header_match.group(0), sunday_title_match.group(0),
+                  sunday_masses_hours, everyday_title_match.group(0),
+                  everyday_masses_match.group(0))
+        # TODO: handle schedules that list several everyday variants, e.g.:
+        # w dni powszednie (w roku szkolnym) - górny kościół
+        # 6:30, 7:00, 8:00, 18:00
+        # w dni powszednie (czas wakacji) - górny kościół
+        # 7:00, 8:00, 18:00
+
+        print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[
+            'url'], self.page['depth'], self.page['button_text']))
+        return whole_result, groups
+
+
+def process_directory(directory):
+    """Run process_parish on every non-empty file under *directory*.
+
+    The tree is walked recursively and each non-empty file is opened with
+    jsonlines.
+
+    Returns a tuple (found, not_found): the number of files for which
+    process_parish returned a truthy value and the number for which it
+    did not.  Empty files are skipped and counted in neither.
+    """
+    found = 0
+    not_found = 0
+    for root, _dirs, files in os.walk(directory):
+        for fname in files:
+            filepath = os.path.join(root, fname)
+            if os.path.getsize(filepath) == 0:
+                continue  # empty file, nothing to read
+            with jsonlines.open(filepath) as reader:
+                if process_parish(reader):
+                    found += 1
+                else:
+                    not_found += 1
+    return found, not_found
+
+
+def color_match(whole_match, groups, background, colors, style):
+    """Highlight every group inside *whole_match* with its own color.
+
+    The i-th group is prefixed with colors[i] + background + style and
+    followed by a reset that re-applies background and style.  Only the
+    first occurrence of each group in the progressively rewritten text is
+    replaced.  A final reset is appended to the result.
+    """
+    colored = whole_match
+    for index, group in enumerate(groups):
+        opening = colors[index] + background + style
+        closing = Style.RESET_ALL + background + style
+        colored = colored.replace(group, opening + group + closing, 1)
+    return colored + Style.RESET_ALL
+
+
+def process_parish(reader):
+    """Look for a mass schedule in one parish's pages.
+
+    reader: iterable of page mappings; each needs a 'depth' key for sorting
+        plus the keys read by Extractor.
+
+    Pages are tried in order of increasing depth.  The first page that
+    yields a result is printed with the matched parts colored, and the
+    function returns True.  Returns False when no page yields a result,
+    including when *reader* is empty.
+    """
+    for page in sorted(reader, key=lambda x: x['depth']):
+        result = Extractor(page).extract()
+        if not result:
+            continue  # nothing on this page, try the next one
+        whole_result, groups = result
+        if whole_result not in page['content']:
+            # Debugging aid: the result should be a substring of the page.
+            import ipdb
+            ipdb.set_trace()
+        pretty_text = page['content'].replace(
+            whole_result,
+            color_match(whole_result, groups, Back.BLACK, [
+                Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
+            ], Style.BRIGHT))
+        print(pretty_text)
+        # Interactive pause so the printed match can be inspected.
+        import ipdb
+        ipdb.set_trace()
+        return True
+    return False
+
+
+def main():
+    """Run the extractor over the hard-coded data directory."""
+    process_directory('./parishwebsites/data-final')
+
+
+# Script entry point.
+if __name__ == '__main__':
+    main()
--- a/extractor/find_hours.py
+++ b/extractor/find_hours.py
@ -0,0 +1,37 @@
+import re
+from colorama import Fore, Back, Style
+
+# Hour-like tokens: one or two digits, an optional ':' or '.', then two
+# digits or 'oo' (e.g. "18:00", "7.30", "1800", "7oo"); otherwise two bare
+# digits, or a single digit 6-9.  Raw string keeps the \d escapes intact.
+hour_regex = re.compile(r'\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
+
+def borders_ok(text, start, end):
+    """Tell whether text[start:end] is delimited like a standalone token.
+
+    The character before the span must be whitespace or ','; the character
+    after it must be whitespace, ',' or ';'.  The start and the end of the
+    text count as whitespace.
+
+    The neighbours are read by index instead of building a padded copy of
+    the whole text on every call.
+    """
+    before_start_char = text[start - 1] if start > 0 else ' '
+    after_end_char = text[end] if end < len(text) else ' '
+    return ((before_start_char.isspace() or before_start_char == ',')
+            and (after_end_char.isspace() or after_end_char in ',;'))
+
+def get_context(text, start, end, minsize):
+    """Return (prefix, hour, suffix) context around text[start:end].
+
+    Runs of spaces on both sides are collapsed to one space, and each side
+    is cut to at most minsize + 2 space-separated pieces next to the hour.
+    When a side is shorter than that window it is kept whole, so no word is
+    dropped near the beginning or the end of the text.
+    """
+    window = minsize + 2
+    before = re.sub(' +', ' ', text[:start]).rsplit(' ', maxsplit=window)
+    after = re.sub(' +', ' ', text[end:]).split(' ', maxsplit=window)
+    # The split produced an unsplit remainder only when it hit maxsplit;
+    # drop that remainder, never a genuine word.
+    if len(before) > window:
+        before = before[1:]
+    if len(after) > window:
+        after = after[:-1]
+    return ' '.join(before), text[start:end], ' '.join(after)
+
+def hours_iterator(text, minsize=20):
+    """Yield an (utterance, colored) pair for each hour candidate in *text*.
+
+    Candidates are the matches of hour_regex that pass borders_ok.  The
+    utterance has the form '<prefix>&&&<hour>###<suffix>'; the colored
+    variant is the same context with the hour highlighted by color_hour.
+    *minsize* is forwarded to get_context.
+    """
+    for found in hour_regex.finditer(text):
+        begin, stop = found.span(0)
+        if borders_ok(text, begin, stop):
+            before, hour, after = get_context(text, begin, stop, minsize)
+            plain = f'{before}&&&{hour}###{after}'
+            highlighted = color_hour(before, hour, after, Fore.GREEN,
+                                     Style.BRIGHT)
+            yield plain, highlighted
+
+# TODO: in the classifier, tokenize so that \n also becomes a separate token
+
+
+def color_hour(prefix, hour, suffix, color, style):
+    """Return prefix + hour + suffix with only the hour colored.
+
+    *color* and *style* are emitted before the hour and a reset sequence
+    right after it.
+    """
+    pieces = (prefix, color, style, hour, Style.RESET_ALL, suffix)
+    return ''.join(pieces)
+    
+