Add test.py for data gathering (data for annotation)

Small changes to annotator.py (to be deleted in the near future). Add utils/iterator. Add redis to environment.yml. Rename, adapt and move the rule-based extractor. Adapt find_hours. Yapify webapp app (probably nothing more). Rename buttons in index.html.
2018-05-11 23:12:21 +02:00 · 2018-05-11 23:12:21 +02:00 · 382666c563
commit 382666c563
parent c617018611
8 changed files with 209 additions and 48 deletions
--- a/annotator.py
+++ b/annotator.py
@ -10,31 +10,35 @@ parish2text = Parish2Text()
 CONTEXT = 100
 def process_parish_page(parish_page):
    content = parish_page.pop('content')
-    for utterance, utterance_colored in hours_iterator(content):
+    for utterance, utterance_colored in hours_iterator(content, color=True):
        print(utterance_colored)
-        import ipdb; ipdb.set_trace()
+        import ipdb
        ipdb.set_trace()
 def process_parish_file(parish_reader):
    """Convert each page from ``parish_reader`` and pass it to ``process_parish_page``.

    After every conversion execution stops in an ipdb breakpoint, before
    the converted page is handed over.
    """
    for raw_page in parish_reader:
        converted_page = parish2text.convert(raw_page)
        # Interactive stop: lets the operator inspect the converted page.
        import ipdb
        ipdb.set_trace()
        process_parish_page(converted_page)
 def process_directory(directory):
    for root, dirs, files in os.walk(directory):
-        # random.shuffle(files)
+        random.shuffle(files)
-        for fname in sorted(files):
+        for fname in files:
            filepath = os.path.join(root, fname)
            if os.path.getsize(filepath) > 0:
                with jsonlines.open(filepath) as parish_reader:
                    process_parish_file(parish_reader)
 def main():
    """Run ``process_directory`` on the './parishwebsites/data' directory."""
    data_directory = './parishwebsites/data'
    process_directory(data_directory)
 if __name__ == '__main__':
    main()
--- a/environment.yml
+++ b/environment.yml
@ -1,20 +1,20 @@
 # Conda environment specification for the polish-masses project.
 name: polish-masses
 channels:
  - defaults
  - conda-forge
 # Packages listed directly under `dependencies` are resolved by conda.
 dependencies:
  - python
  - scrapy
  - gunicorn
  - flask
  - redis-py
  - lxml
  - requests
  - beautifulsoup4
  - colorama
  # Packages nested under `pip` are installed with pip.
  # NOTE(review): lxml, requests, beautifulsoup4 and colorama appear both in
  # the conda list above and in the pip list below - confirm whether the
  # duplication is intended.
  - pip:
    - lxml
    - tldextract
    - requests
    - beautifulsoup4
    - python-google-places
    - jsonlines
    - ipdb
    - colorama
    - html2text
    - binaryornot
--- a/extractor-rule-based/extract.py
+++ b/extractor-rule-based/extract.py
@ -1,9 +1,11 @@
 #!/usr/bin/env python3
 from utils import iterator
 from colorama import Fore, Back, Style
 import os
 import jsonlines
 import re
 import pprint
 import sys
 class Extractor:
@ -38,6 +40,8 @@ class Extractor:
        return '(?P<{}>{})'.format(name, pattern)
    def extract(self, search_space=None):
        if search_space == '':
            return None
        if not search_space:
            search_space = self.content
        header_match = re.search(self.header, search_space, re.I)
@ -51,8 +55,13 @@ class Extractor:
        if re.search(self.header, search_space[:sunday_title_match.start()],
                     re.I):  # found header closer to sunday title
            return self.extract(search_space)
-        if sunday_title_match.start() > 50:
+        if sunday_title_match.start(
-            return self.extract(search_space[sunday_title_match.end()])
+        ) > 50:  #sunday_title za daleko header'a wiec szukaj dalej
            try:
                return self.extract(search_space[sunday_title_match.end():])
            except Exception:
                import ipdb
                ipdb.set_trace()
        everyday_title_match = re.search(self.everyday_title, search_space,
                                         re.I)
@ -87,8 +96,9 @@ class Extractor:
        # w dni powszednie (czas wakacji) - górny kościół
        # 7:00, 8:00, 18:00
-        print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[
+        print('url: {}\ndepth: {}\nbutton: {}'.format(
-            'url'], self.page['depth'], self.page['button_text']))
+            self.page['url'], self.page['depth'], self.page['button_text']))
        sys.stdout.flush()
        return whole_result, groups
@ -96,16 +106,15 @@ def process_directory(directory):
    found = 0
    not_found = 0
    for root, dirs, files in os.walk(directory):
-        for fname in files:
+        for fname in sorted(files):
            filepath = os.path.join(root, fname)
            if os.path.getsize(filepath) > 0:
-                with jsonlines.open(filepath) as reader:
+                if process_parish(iterator.parish_page_iterator(filepath)):
-                    # print(filepath)
+                    print(filepath)
-                    if process_parish(reader):
+                    found += 1
-                        found += 1
+                else:
-                    else:
+                    not_found += 1
-                        not_found += 1
+                print('found: {}\nnot_found: {}'.format(found, not_found))
                    # print('found: {}\nnot_found: {}'.format(found, not_found))
            else:
                pass  # empty file
@ -125,6 +134,7 @@ def process_parish(reader):
        if result:
            whole_result, groups = result
            if whole_result not in page['content']:
                pass
                import ipdb
                ipdb.set_trace()
            pretty_text = page['content'].replace(
@ -132,19 +142,18 @@ def process_parish(reader):
                color_match(whole_result, groups, Back.BLACK, [
                    Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
                ], Style.BRIGHT))
-            print(pretty_text)
+            # print(pretty_text)
-            import ipdb
+            # print(page['depth'])
-            ipdb.set_trace()
+            # print(page['url'])
-            return True
+            # print(page['button_text'])
        else:
            return False
            # import ipdb
            # ipdb.set_trace()
-            pass
+            return True
    return False
 def main():
-    process_directory('./parishwebsites/data-final')
+    process_directory('./parishwebsites/data')
 if __name__ == '__main__':
--- a/extractor/find_hours.py
+++ b/extractor/find_hours.py
@ -3,22 +3,29 @@ from colorama import Fore, Back, Style
 # Hour candidate: one or two digits, an optional ':' or '.', then 'oo' or two
 # digits; or a bare two-digit number; or a single digit 6-9.
 # Raw string: '\d' is not a valid escape in a normal string literal.
 hour_regex = re.compile(r'\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
 def borders_ok(text, start, end):
    text = ' ' + text + ' '
    before_start_char = text[start]
    after_end_char = text[end + 1]
-    if (before_start_char.isspace() or before_start_char == ',') and (after_end_char.isspace() or after_end_char in ',;'):
+    if (before_start_char.isspace()
            or before_start_char == ',') and (after_end_char.isspace()
                                              or after_end_char in ',;'):
        return True
    else:
        return False
 def get_context(text, start, end, minsize):
    hour = text[start:end]
-    prefix = re.sub(' +', ' ', text[:start]).rsplit(' ', maxsplit=minsize+2)[1:]
+    prefix = re.sub(' +', ' ', text[:start]).rsplit(
-    suffix = re.sub(' +', ' ', text[end:]).split(' ', maxsplit=minsize+2)[:-1]
+        ' ', maxsplit=minsize + 2)[1:]
    suffix = re.sub(' +', ' ', text[end:]).split(
        ' ', maxsplit=minsize + 2)[:-1]
    return ' '.join(prefix), hour, ' '.join(suffix)
-def hours_iterator(text, minsize=20):
+
 def hours_iterator(text, minsize=20, color=False):
    for hour_match in hour_regex.finditer(text):
        start = hour_match.start(0)
        end = hour_match.end(0)
@ -26,12 +33,15 @@ def hours_iterator(text, minsize=20):
            continue
        prefix, hour, suffix = get_context(text, start, end, minsize)
        utterance = f'{prefix}&&&{hour}###{suffix}'
-        yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN, Style.BRIGHT)
+        if color:
            yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN,
                                        Style.BRIGHT)
        else:
            yield utterance
 # In the classifier, split the text so that \n also ends up as a separate token
 def color_hour(prefix, hour, suffix, color, style):
    """Return ``prefix + hour + suffix`` with the hour wrapped in terminal codes.

    ``color`` and ``style`` are placed directly before ``hour`` and
    ``Style.RESET_ALL`` directly after it.
    """
    highlighted_hour = ''.join((color, style, hour, Style.RESET_ALL))
    return prefix + highlighted_hour + suffix
--- a/test.py
+++ b/test.py
@ -0,0 +1,73 @@
 #!/usr/bin/env python3
 import redis
 from utils import iterator
 from extractor.find_hours import hours_iterator
 import re
 # r = redis.StrictRedis(host='localhost', port=6379, db=0)
 def add_utterances(content, utterances):
    """Append every utterance found in ``content`` to ``utterances``.

    Args:
        content: Text passed to ``hours_iterator``.
        utterances: List that is extended in place with the yielded items.

    Returns:
        The number of utterances appended by this call (0 when none found).
    """
    size_before = len(utterances)
    utterances.extend(hours_iterator(content))
    # Count by list growth; the last enumerate() index is one less than the
    # number of items and cannot tell "one utterance" from "none".
    return len(utterances) - size_before
 def has_mass_metadata(url, button_text):
    """Tell whether the URL or the button text matches the mass keyword pattern.

    Matching is case-insensitive, so capitalised labels such as 'Msze'
    are recognised too. Which side matched is printed for diagnostics.

    Args:
        url: Page URL; ``None`` is treated as an empty string.
        button_text: Button text; ``None`` is treated as an empty string.

    Returns:
        True if the pattern is found in ``url`` or ``button_text``,
        otherwise False.
    """
    regex = re.compile(r'msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek',
                       re.IGNORECASE)
    url_match = regex.search(url or '')
    button_match = regex.search(button_text or '')
    if url_match and button_match:
        print('both - url_match: {}'.format(url_match.group(0)))
        print('button_match: {}'.format(button_match.group(0)))
    elif url_match:
        print('url_match: {}'.format(url_match.group(0)))
    elif button_match:
        print('button_match: {}'.format(button_match.group(0)))
    return bool(url_match or button_match)
 def load_parishes(directory):
    """Collect hour utterances from the parish files found under ``directory``.

    For every parish file the first page is always harvested; any other
    page is harvested only when ``has_mass_metadata`` accepts its URL or
    button text. Scanning of a file stops right after the first page with
    such metadata, or once page number 100 has been reached.

    Args:
        directory: Root directory passed to ``iterator.parish_path_iterator``.

    Returns:
        An empty dict.
        NOTE(review): the collected utterances stay local and are not
        returned - presumably a placeholder until they are stored
        somewhere; confirm.
    """
    utterances = []
    utterances_count = 0
    for file_nr, parish_path in enumerate(
            iterator.parish_path_iterator(directory)):
        print(parish_path)
        for page_nr, parish_page in enumerate(
                iterator.parish_page_iterator(parish_path)):
            content = parish_page.pop('content')
            has_metadata = has_mass_metadata(parish_page['url'],
                                             parish_page['button_text'])
            # One combined condition: a first page that also carries mass
            # metadata must contribute its utterances only once.
            if page_nr == 0 or has_metadata:
                utterances_count += add_utterances(content, utterances)
            if has_metadata:
                # The first page with mass metadata ends this file's scan.
                break
            if page_nr == 100:
                print(utterances_count)
                break
            print('file: {}, page: {}, utterances: {}'.format(
                file_nr, page_nr, utterances_count))
    return {}
 utterances = {}
 def main():
    """Run ``load_parishes`` on the './parishwebsites/data' directory."""
    data_directory = './parishwebsites/data'
    load_parishes(data_directory)
    # Redis round-trip check, currently disabled:
    # r.set('foo', 'bar')
    # print(r.get('foo'))
 if __name__ == '__main__':
    main()
--- a/utils/iterator.py
+++ b/utils/iterator.py
@ -0,0 +1,24 @@
 import os
 import jsonlines
 import random
 from parishwebsites.parish2text import Parish2Text
 def parish_path_iterator(directory):
    """Yield the path of every non-empty file below ``directory``.

    The tree is traversed with ``os.walk``; within each directory the file
    names are visited in sorted order.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        candidate_paths = (
            os.path.join(current_dir, name) for name in sorted(filenames))
        yield from (
            path for path in candidate_paths if os.path.getsize(path) > 0)
 def parish_page_iterator(filepath):
    """Yield the converted pages of the jsonlines file at ``filepath``.

    Pages whose 'content' contains the text
    'Maximum execution time of 30 seconds exceeded in' are skipped. Every
    remaining page is passed through ``Parish2Text().convert`` before it
    is yielded.

    Args:
        filepath: Path of the jsonlines file to read.

    Yields:
        The result of ``Parish2Text().convert(page)`` for each kept page.
    """
    timeout_marker = 'Maximum execution time of 30 seconds exceeded in'
    with jsonlines.open(filepath) as parish_reader:
        for parish_page in parish_reader:
            if timeout_marker in parish_page['content']:
                continue
            # NOTE(review): a new converter is built for every page; it could
            # be hoisted out of the loop if Parish2Text keeps no per-page
            # state - confirm.
            parish2text = Parish2Text()
            yield parish2text.convert(parish_page)
--- a/webapp/app.py
+++ b/webapp/app.py
@ -1,23 +1,36 @@
 from flask import Flask, render_template, request
 import redis
 app = Flask(__name__)
 def load_parishes(directory):
    """Placeholder loader: ignores ``directory`` and returns a new empty dict."""
    parishes = dict()
    return parishes
 parishes = load_parishes('dir')
 def post_action():
    """Return the same response as ``get_action``."""
    response = get_action()
    return response
 def get_action():
    hour = '12.00'
    left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
-    right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białego rana.'
+    right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białegop prana.'
-    return render_template('index.html', hour=hour, left_context=left_context, right_context=right_context)
+    return render_template(
        'index.html',
        hour=hour,
        left_context=left_context,
        right_context=right_context)
@app.route("/", methods=['GET', 'POST'])
 def root():
    if request.method == 'POST':
-       return  post_action()
+        return post_action()
    else:
-       return get_action()
+        return get_action()
 if __name__ == "__main__":
--- a/webapp/templates/index.html
+++ b/webapp/templates/index.html
@ -14,7 +14,7 @@
    <div class="container">
        <div class="container mt-1">
            <div class="row justify-content-start">
-                <button type="button" class="btn btn-warning btn-sm" id="cofnij">Cofinj</button>
+                <button type="button" class="btn btn-warning btn-sm" id="undo">Cofnij</button>
            </div>
            <div class="row justify-content-center">
                <h2>Czy zaznaczono godzinę mszy świętej?</h2>
@ -26,8 +26,8 @@
            </div>
        </div>
        <div class="btn-group d-flex h-mx" role="group">
-            <button type="button" class="btn btn-danger btn-lg w-100" id="nie">Nie</button>
+            <button type="button" class="btn btn-danger btn-lg w-100" id="no">Nie</button>
-            <button type="button" class="btn btn-success btn-lg w-100" id="tak">Tak</button>
+            <button type="button" class="btn btn-success btn-lg w-100" id="yes">Tak</button>
        </div>
    </div>
    <!-- Optional JavaScript -->
@ -36,9 +36,37 @@
    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script>
    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script>
    <script type="text/javascript">
-     $("button#tak").click(function(){
+     $("button#yes").click(function(){
         $.post( "/", {result: "yes"}, function() {
-             console.log( "success" );
+             console.log( "yes button clicked" );
         })
          .done(function() {
              console.log( "second success" );
          })
          .fail(function() {
              console.log( "error" );
          })
          .always(function() {
              console.log( "finished" );
          });
     });
     $("button#no").click(function(){
         $.post( "/", {result: "no"}, function() {
             console.log( "no button clicked" );
         })
          .done(function() {
              console.log( "second success" );
          })
          .fail(function() {
              console.log( "error" );
          })
          .always(function() {
              console.log( "finished" );
          });
     });
     $("button#undo").click(function(){
         $.post( "/", {result: "undo"}, function() {
             console.log( "undo button clicked" );
         })
          .done(function() {
              console.log( "second success" );