From 1f6b1e6ffeecf19d71372522aa18109ee5769565 Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Mon, 14 May 2018 01:51:40 +0200
Subject: [PATCH] Get utterance extraction and pickling working

Get the conversion of parish pages from HTML to text working (html2text).
Add a parish2text goal to the Makefile.
Switch extract_rule_based and get_utterances to the non-HTML (text) parishes.
Enhance find_hours.py.
Wrap render_template in make_response in webapp/app.py.
---
 Makefile                      |   3 +
 extract_rule_based.py         |  11 +--
 extractor/find_hours.py       |   9 +-
 get_utterances.py             | 162 ++++++++++++++++++++++++++++++++++
 parishwebsites/parish2text.py |  29 ++++--
 test.py                       |  73 ---------------
 utils/iterator.py             |  11 +--
 webapp/app.py                 |  14 +--
 8 files changed, 210 insertions(+), 102 deletions(-)
 create mode 100755 get_utterances.py
 delete mode 100755 test.py

diff --git a/Makefile b/Makefile
index 3d3353c..30ee207 100644
--- a/Makefile
+++ b/Makefile
@@ -7,6 +7,9 @@ JOBS := 100
 
 all: data
 
+parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
+	mkdir -p parishwebsites/{text-data,text-data-logs}
+	cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt
 data-add: parishwebsites/spider-commands-add.txt parishwebsites/domain-blacklist.txt parishwebsites/deal-with-not-completed.sh
 	cd parishwebsites && ./deal-with-not-completed.sh
diff --git a/extract_rule_based.py b/extract_rule_based.py
index d8a1738..6c780dc 100755
--- a/extract_rule_based.py
+++ b/extract_rule_based.py
@@ -88,6 +88,7 @@ class Extractor:
         groups = (header_match.group(0), sunday_title_match.group(0),
                   sunday_masses_hours, everyday_title_match.group(0),
                   everyday_masses_match.group(0))
+        print(sunday_masses_hours + '\n' + everyday_masses_match.group(0))
         # print(whole_result)
         # print(groups)
         # TODO: handle this
@@ -109,10 +110,12 @@ def process_directory(directory):
         for fname in sorted(files):
             filepath = os.path.join(root, fname)
             if os.path.getsize(filepath) > 0:
-                if process_parish(iterator.parish_page_iterator(filepath)):
-                    print(filepath)
+                if process_parish(
+                        iterator.parish_page_iterator(filepath, html=False)):
+                    print('found: {}'.format(filepath))
                     found += 1
                 else:
+                    print('missed: {}'.format(filepath))
                     not_found += 1
         print('found: {}\nnot_found: {}'.format(found, not_found))
     else:
@@ -146,14 +149,12 @@ def process_parish(reader):
             # print(page['depth'])
             # print(page['url'])
             # print(page['button_text'])
-            # import ipdb
-            # ipdb.set_trace()
             return True
     return False
 
 
 def main():
-    process_directory('./parishwebsites/data')
+    process_directory('./parishwebsites/text-data')
 
 
 if __name__ == '__main__':
diff --git a/extractor/find_hours.py b/extractor/find_hours.py
index 7eabcac..b3e4e1b 100755
--- a/extractor/find_hours.py
+++ b/extractor/find_hours.py
@@ -1,16 +1,17 @@
 import re
 from colorama import Fore, Back, Style
 
-hour_regex = re.compile('\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
+hour_regex = re.compile(
+    '(0[6-9]|1\d|2[0-2])[:.](oo|[0-5]\d)|6|7|8|9|1\d|2[0-2]')
 
 
 def borders_ok(text, start, end):
     text = ' ' + text + ' '
     before_start_char = text[start]
     after_end_char = text[end + 1]
-    if (before_start_char.isspace()
-            or before_start_char == ',') and (after_end_char.isspace()
-                                              or after_end_char in ',;'):
+    if ((before_start_char.isspace() or before_start_char in ',(/')
+            and (after_end_char.isspace() or after_end_char in ',;)/')
+            and (before_start_char != '(' or after_end_char != ')')):
         return True
     else:
         return False
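
A quick sketch (not part of the patch) of what the tightened hour_regex
accepts: a full hour now needs a plausible hour and minute part, while bare
6-22 still match on their own:

    import re

    hour_regex = re.compile(
        '(0[6-9]|1\d|2[0-2])[:.](oo|[0-5]\d)|6|7|8|9|1\d|2[0-2]')
    for text in ['18.30', '07:oo', '25.99', 'suma o 9']:
        m = hour_regex.search(text)
        print(text, '->', m.group(0) if m else None)
    # 18.30 -> 18.30, 07:oo -> 07:oo, 25.99 -> 9, suma o 9 -> 9
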
diff --git a/get_utterances.py b/get_utterances.py
new file mode 100755
index 0000000..d8d4d6e
--- /dev/null
+++ b/get_utterances.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+import urllib.parse
+import logging
+import redis
+from utils import iterator
+from extractor.find_hours import hours_iterator
+import re
+import pickle
+
+
+# r = redis.StrictRedis(host='localhost', port=6379, db=0)
+class Utterance():
+    def __init__(self, utterance, url, button_text, depth, filename, line_no):
+        '''One extracted utterance plus the page it came from.'''
+        self.utterance = utterance
+        self.url = url
+        self.button_text = button_text
+        self.depth = depth
+        self.filename = filename
+        self.line_no = line_no
+
+
+def add_utterances(parish_page, parish_path, utterances):
+    utterances_nr = 0
+    # enumerate from 1 so the return value is the number of utterances added
+    content = parish_page['content']
+    for utterances_nr, utterance in enumerate(hours_iterator(content), 1):
+        utterance_inst = Utterance(
+            utterance, parish_page['url'], parish_page['button_text'],
+            parish_page['depth'], parish_path, parish_page['line_no'])
+        utterances.append(utterance_inst)
+    return utterances_nr
+
+
+def has_mass_metadata(url, button_text, page):
+    path = urllib.parse.urlparse(url).path
+    url_suffix = path.rsplit('/', 1)[1] if '/' in path else path
+    regex = re.compile(
+        'msze|nabo[żz]e[ńn]stw(a|(?=\W\d)|$)|porz[ąa]dek($|\.htm)|porz[aą]dek.(liturgi|mszy)|(rozk[lł]ad|plan|godziny|uk[lł]ad|harmonogram|grafik|rozpiska).mszy',
+        flags=re.IGNORECASE)
+    bad_regex = re.compile(
+        'nabo[zż]e[nń]stwa.(majowe|wielk|czerwcowe|maryjne|pasyjne|pokutne|fatimskie|do|ro[żz]a|czterdzie|w.wielk)',
+        re.IGNORECASE)
+    url_match = regex.search(url_suffix)
+    bad_url_match = bad_regex.search(url_suffix)
+    button_match = regex.search(button_text)
+    bad_button_match = bad_regex.search(button_text)
+    if url_match and button_match and not (bad_button_match or bad_url_match):
+        # print('both - url_match: {}'.format(url_match.group(0)))
+        # print('button_match: {}'.format(button_match.group(0)))
+        return True
+    elif url_match and not bad_url_match:
+        # print('url_match: {}'.format(url_match.group(0)))
+        return True
+    elif button_match and not bad_button_match:
+        # print('button_match: {}'.format(button_match.group(0)))
+        return True
+    return False
+
+
+def remove_http_www(url):
+    url = re.sub('^https?://', '', url)
+    return re.sub('^www\.', '', url)
+
+
+def gather_parish_pages(parish_path, unique_urls):
+    parish_pages = {}
+    for page_nr, parish_page in enumerate(
+            iterator.parish_page_iterator(parish_path, html=False)):
+        url = remove_http_www(parish_page['url'])
+        button_text = parish_page['button_text']
+        if url not in unique_urls and has_mass_metadata(
+                url, button_text, parish_page):
+            unique_urls.add(url)
+            parish_page['line_no'] = page_nr
+            parish_pages[url] = parish_page
+    return parish_pages
+
+
+def get_best_parish_pages(parish_pages, n=3):
+    def pop_best_and_clear(pages):
+        shortest_url = min(pages.keys(), key=len)
+        best = pages.pop(shortest_url)
+        for key in list(pages.keys()):
+            if key.startswith(shortest_url):
+                del pages[key]
+        return best
+
+    best_n = []
+    for i in range(n):
+        if parish_pages:
+            best_n.append(pop_best_and_clear(parish_pages))
+    return best_n
+
+
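+# Worked example (hypothetical URLs) for get_best_parish_pages above: given
+# {'p.pl': A, 'p.pl/msze': B, 'q.pl/porzadek': C}, the shortest URL 'p.pl'
+# is popped first and every remaining page whose URL starts with it
+# ('p.pl/msze') is dropped, then 'q.pl/porzadek' is popped, yielding [A, C].
+
+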
+def remove_duplicates(utterances):
+    seen = set()
+    res = []
+    for utt in utterances:
+        if utt.utterance not in seen:
+            res.append(utt)
+            seen.add(utt.utterance)
+    return res
+
+
+def load_parishes(directory, extracted_by_rules):
+    utterances = []
+    utterances_count = 0
+    last = 0
+    maximum = 0
+    unique_urls = set()
+    for file_nr, parish_path in enumerate(
+            iterator.parish_path_iterator(directory)):
+        if parish_path in extracted_by_rules:
+            continue
+        # print(parish_path)
+        metadata_count = 0
+        file_utterances = 0
+        parish_pages_dict = gather_parish_pages(parish_path, unique_urls)
+        parish_pages = get_best_parish_pages(parish_pages_dict)
+        maximum = max(len(parish_pages), maximum)
+        for pages_count, parish_page in enumerate(parish_pages):
+            new_utterances = add_utterances(parish_page, parish_path,
+                                            utterances)
+            # if new_utterances > 100:  # TODO: in future, check this value;
+            #     pass                  # if it's too big, don't add them
+            utterances_count += new_utterances
+            file_utterances += new_utterances
+            url = parish_page['url']  # TODO: delete
+            button_text = parish_page['button_text']  # TODO: delete
+            logging.warning('{}\t||| {} ||| {} ||| {}'.format(
+                new_utterances, url, button_text, parish_page['depth']))
+
+        if utterances_count != last:
+            curr_str = 'file: {}, page: {}, utterances: {}'.format(
+                file_nr, parish_page['line_no'], utterances_count)
+            print(curr_str)
+            last = utterances_count
+    print(maximum)
+    return remove_duplicates(utterances)
+
+
+def get_extracted_by_rules(filename):
+    extracted_by_rules = set()
+    with open(filename) as f:
+        for line in f:
+            extracted_by_rules.add(line.rstrip('\n'))
+    return extracted_by_rules
+
+
+def main():
+    extracted_by_rules = get_extracted_by_rules('./extracted-by-rules.txt')
+    utterances = load_parishes('./parishwebsites/text-data',
+                               extracted_by_rules)
+    print(len(utterances))
+    with open('utterances.pkl', 'wb') as f:
+        pickle.dump(utterances, f, pickle.HIGHEST_PROTOCOL)
+    # r.set('foo', 'bar')
+    # print(r.get('foo'))
+
+
+if __name__ == '__main__':
+    main()
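
A minimal sketch (not in this patch) of reading utterances.pkl back. Because
get_utterances.py is run as a script, the objects are pickled under
__main__.Utterance, so alias the class into __main__ before loading:

    import pickle
    import __main__
    import get_utterances
    __main__.Utterance = get_utterances.Utterance  # let pickle find the class

    with open('utterances.pkl', 'rb') as f:
        utterances = pickle.load(f)
    print(len(utterances), utterances[0].url, utterances[0].utterance)
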
diff --git a/parishwebsites/parish2text.py b/parishwebsites/parish2text.py
index 7fb93a0..94af060 100755
--- a/parishwebsites/parish2text.py
+++ b/parishwebsites/parish2text.py
@@ -4,10 +4,13 @@ import sys
 import html2text
 import pprint
 import re
+import logging
+
 
 class Parish2Text():
     def __init__(self):
-        "docstring"
+        '''Don't use this object for a long period of time, because the
+        conversion will slow down. Destroy it after every conversion.'''
         self.text_maker = html2text.HTML2Text()
         self.text_maker.ignore_links = True
         self.text_maker.ignore_images = True
@@ -16,25 +19,33 @@ class Parish2Text():
         self.text_maker.ul_item_mark = ''
         self.text_maker.emphasis_mark = ''
         self.text_maker.ignore_tables = True
-    
+
     def convert(self, parish):
         parish['content'] = self.text_maker.handle(parish['content'])
         parish['button_text'] = self.text_maker.handle(parish['button_text'])
-        parish['button_text'] = ' '.join(re.sub('[\W_]+', ' ', parish['button_text']).split())
+        parish['button_text'] = ' '.join(
+            re.sub('[\W_]+', ' ', parish['button_text']).split())
         return parish
 
 
 def main():
-    parish2text = Parish2Text()
     writer = jsonlines.Writer(sys.stdout)
     # text_maker.wrap_links = False
     reader = jsonlines.Reader((line.rstrip('\n') for line in sys.stdin))
-    for parish in reader:
-        parish = parish2text.convert(parish)
-        parish_content = parish.pop('content')
-        pprint.pprint(parish)
-        print(parish_content)
+    for page_nr, parish_page in enumerate(reader):
+        parish2text = Parish2Text()
+        try:
+            parish_page = parish2text.convert(parish_page)
+        except Exception:
+            logging.warning('page: {}, url: {}'.format(page_nr,
+                                                       parish_page['url']))
+            continue
+        writer.write(parish_page)
+        # parish_content = parish_page.pop('content')
+        # pprint.pprint(parish_page)
+        # print(parish_content)
     reader.close()
 
+
 if __name__ == '__main__':
     main()
diff --git a/test.py b/test.py
deleted file mode 100755
index 1ab463b..0000000
--- a/test.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/usr/bin/env python3
-import redis
-from utils import iterator
-from extractor.find_hours import hours_iterator
-import re
-
-# r = redis.StrictRedis(host='localhost', port=6379, db=0)
-
-
-def add_utterances(content, utterances):
-    utterances_nr = 0
-    for utterances_nr, utterance in enumerate(hours_iterator(content)):
-        utterances.append(utterance)
-    return utterances_nr
-
-
-def has_mass_metadata(url, button_text):
-    regex = re.compile('msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek')
-    url_match = regex.search(url)
-    button_match = regex.search(button_text)
-    if url_match and button_match:
-        print('both - url_metch: {}'.format(url_match.group(0)))
-        print('button_metch: {}'.format(button_match.group(0)))
-        return True
-    elif url_match:
-        print('url_match: {}'.format(url_match.group(0)))
-        return True
-    elif button_match:
-        print('button_match: {}'.format(button_match.group(0)))
-        return True
-    return False
-
-
-def load_parishes(directory):
-    utterances = []
-    utterances_count = 0
-    for file_nr, parish_path in enumerate(
-            iterator.parish_path_iterator(directory)):
-        print(parish_path)
-        metadata_count = 0
-        for page_nr, parish_page in enumerate(
-                iterator.parish_page_iterator(parish_path)):
-            content = parish_page.pop('content')
-            # if page_nr == 0 or has_mass_metadata(parish_page['url'], parish_page['button_text']):
-            if page_nr == 0:
-                utterances_count += add_utterances(content, utterances)
-            if has_mass_metadata(parish_page['url'],
-                                 parish_page['button_text']):
-                metadata_count += 1
-                utterances_count += add_utterances(content, utterances)
-
-                if metadata_count == 1:
-                    break
-
-            if page_nr == 100:
-                print(utterances_count)
-                break
-        print('file: {}, page: {}, utterances: {}'.format(
-            file_nr, page_nr, utterances_count))
-    return {}
-
-
-utterances = {}
-
-
-def main():
-    load_parishes('./parishwebsites/data')
-    # r.set('foo', 'bar')
-    # print(r.get('foo'))
-
-
-if __name__ == '__main__':
-    main()
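
A short usage sketch for Parish2Text (the input record is made up); a fresh
instance per page, as in main(), keeps html2text from slowing down:

    from parishwebsites.parish2text import Parish2Text

    page = {'url': 'http://example.pl/msze',
            'button_text': '<b>Msze św.</b>',
            'content': '<h1>Porządek mszy</h1><p>7.30, 9.00, 11.00</p>'}
    page = Parish2Text().convert(page)
    print(page['button_text'])  # roughly: 'Msze św'
    print(page['content'])      # plain text with the markup stripped
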
diff --git a/utils/iterator.py b/utils/iterator.py
index b8297a1..414825c 100644
--- a/utils/iterator.py
+++ b/utils/iterator.py
@@ -12,13 +12,14 @@ def parish_path_iterator(directory):
             yield filepath
 
 
-def parish_page_iterator(filepath):
+def parish_page_iterator(filepath, html=True):
     with jsonlines.open(filepath) as parish_reader:
-        page_nr = 0
         for parish_page in parish_reader:
-            page_nr += 1
             if 'Maximum execution time of 30 seconds exceeded in' in parish_page[
                     'content']:
                 continue
-            parish2text = Parish2Text()
-            yield parish2text.convert(parish_page)
+            if html:
+                parish2text = Parish2Text()
+                yield parish2text.convert(parish_page)
+            else:
+                yield parish_page
diff --git a/webapp/app.py b/webapp/app.py
index 898157e..1e2c4d3 100644
--- a/webapp/app.py
+++ b/webapp/app.py
@@ -1,4 +1,4 @@
-from flask import Flask, render_template, request
+from flask import Flask, render_template, request, make_response
 import redis
 
 app = Flask(__name__)
@@ -18,11 +18,13 @@ def get_action():
     hour = '12.00'
     left_context = 'Dawno, dawno temu była sobie para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
     right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białego rana.'
-    return render_template(
-        'index.html',
-        hour=hour,
-        left_context=left_context,
-        right_context=right_context)
+    resp = make_response(
+        render_template(
+            'index.html',
+            hour=hour,
+            left_context=left_context,
+            right_context=right_context))
+    return resp
 
 
 @app.route("/", methods=['GET', 'POST'])
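
A sketch of what the make_response wrapper enables; the Cache-Control header
is an assumed example, not part of the patch:

    resp = make_response(
        render_template(
            'index.html',
            hour=hour,
            left_context=left_context,
            right_context=right_context))
    resp.headers['Cache-Control'] = 'no-store'  # e.g. stop caching of contexts
    return resp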