Working utterances getting/pickling
Converting parishes with html2text now works. Add a parish2text goal to the Makefile. Switch extract_rule_based and get_utterances to the non-HTML (text) parishes. Enhance find_hours.py. Wrap render_template in make_response in webapp/app.py.
This commit is contained in:
parent
382666c563
commit
1f6b1e6ffe
3
Makefile
3
Makefile
@ -7,6 +7,9 @@ JOBS := 100
|
|||||||
|
|
||||||
all: data
|
all: data
|
||||||
|
|
||||||
|
parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
|
||||||
|
mkdir -p parishwebsites/{text-data,text-data-logs}
|
||||||
|
cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt
|
||||||
|
|
||||||
data-add: parishwebsites/spider-commands-add.txt parishwebsites/domain-blacklist.txt parishwebsites/deal-with-not-completed.sh
|
data-add: parishwebsites/spider-commands-add.txt parishwebsites/domain-blacklist.txt parishwebsites/deal-with-not-completed.sh
|
||||||
cd parishwebsites && ./deal-with-not-completed.sh
|
cd parishwebsites && ./deal-with-not-completed.sh
|
||||||
|
@ -88,6 +88,7 @@ class Extractor:
|
|||||||
groups = (header_match.group(0), sunday_title_match.group(0),
|
groups = (header_match.group(0), sunday_title_match.group(0),
|
||||||
sunday_masses_hours, everyday_title_match.group(0),
|
sunday_masses_hours, everyday_title_match.group(0),
|
||||||
everyday_masses_match.group(0))
|
everyday_masses_match.group(0))
|
||||||
|
print(sunday_masses_hours + '\n' + everyday_masses_match.group(0))
|
||||||
# print(whole_result)
|
# print(whole_result)
|
||||||
# print(groups)
|
# print(groups)
|
||||||
# obsłużyć # TODO:
|
# obsłużyć # TODO:
|
||||||
@ -109,10 +110,12 @@ def process_directory(directory):
|
|||||||
for fname in sorted(files):
|
for fname in sorted(files):
|
||||||
filepath = os.path.join(root, fname)
|
filepath = os.path.join(root, fname)
|
||||||
if os.path.getsize(filepath) > 0:
|
if os.path.getsize(filepath) > 0:
|
||||||
if process_parish(iterator.parish_page_iterator(filepath)):
|
if process_parish(
|
||||||
print(filepath)
|
iterator.parish_page_iterator(filepath, html=False)):
|
||||||
|
print('found: {}'.format(filepath))
|
||||||
found += 1
|
found += 1
|
||||||
else:
|
else:
|
||||||
|
print('missed: {}'.format(filepath))
|
||||||
not_found += 1
|
not_found += 1
|
||||||
print('found: {}\nnot_found: {}'.format(found, not_found))
|
print('found: {}\nnot_found: {}'.format(found, not_found))
|
||||||
else:
|
else:
|
||||||
@ -146,14 +149,12 @@ def process_parish(reader):
|
|||||||
# print(page['depth'])
|
# print(page['depth'])
|
||||||
# print(page['url'])
|
# print(page['url'])
|
||||||
# print(page['button_text'])
|
# print(page['button_text'])
|
||||||
# import ipdb
|
|
||||||
# ipdb.set_trace()
|
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
process_directory('./parishwebsites/data')
|
process_directory('./parishwebsites/text-data')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -1,16 +1,17 @@
|
|||||||
import re
|
import re
|
||||||
from colorama import Fore, Back, Style
|
from colorama import Fore, Back, Style
|
||||||
|
|
||||||
hour_regex = re.compile('\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
|
hour_regex = re.compile(
|
||||||
|
'(0[6-9]|1\d|2[0-2])[:.](oo|[0-5]\d)|6|7|8|9|1\d|2[0-2]')
|
||||||
|
|
||||||
|
|
||||||
def borders_ok(text, start, end):
|
def borders_ok(text, start, end):
|
||||||
text = ' ' + text + ' '
|
text = ' ' + text + ' '
|
||||||
before_start_char = text[start]
|
before_start_char = text[start]
|
||||||
after_end_char = text[end + 1]
|
after_end_char = text[end + 1]
|
||||||
if (before_start_char.isspace()
|
if ((before_start_char.isspace() or before_start_char in ',(/')
|
||||||
or before_start_char == ',') and (after_end_char.isspace()
|
and (after_end_char.isspace() or after_end_char in ',;)/')
|
||||||
or after_end_char in ',;'):
|
and (before_start_char != '(' or after_end_char != ')')):
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
162
get_utterances.py
Executable file
162
get_utterances.py
Executable file
@ -0,0 +1,162 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import urllib
|
||||||
|
import logging
|
||||||
|
import redis
|
||||||
|
from utils import iterator
|
||||||
|
from extractor.find_hours import hours_iterator
|
||||||
|
import re
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
|
# r = redis.StrictRedis(host='localhost', port=6379, db=0)
|
||||||
|
class Utterance:
    """A single extracted utterance together with the page it came from.

    Instances are plain attribute containers; ``main`` pickles a list of
    them, so the class must stay importable at module level.
    """

    def __init__(self, utterance, url, button_text, depth, filename, line_no):
        """Store the utterance and its provenance.

        Args:
            utterance: The item yielded by ``hours_iterator`` for the page.
            url: The page's ``'url'`` value.
            button_text: The page's ``'button_text'`` value.
            depth: The page's ``'depth'`` value.
            filename: Path of the parish file the page was read from.
            line_no: The page's ``'line_no'`` value (the page's index
                within the parish file, as set by ``gather_parish_pages``).
        """
        self.utterance = utterance
        self.url = url
        self.button_text = button_text
        self.depth = depth
        self.filename = filename
        self.line_no = line_no

    def __repr__(self):
        """Return a debugging representation listing every attribute."""
        return ('{}(utterance={!r}, url={!r}, button_text={!r}, '
                'depth={!r}, filename={!r}, line_no={!r})').format(
                    type(self).__name__, self.utterance, self.url,
                    self.button_text, self.depth, self.filename,
                    self.line_no)
|
||||||
|
|
||||||
|
|
||||||
|
def add_utterances(parish_page, parish_path, utterances):
    """Append every utterance found on one page to ``utterances``.

    Args:
        parish_page: Mapping with at least the keys ``'content'``,
            ``'url'``, ``'button_text'``, ``'depth'`` and ``'line_no'``.
        parish_path: Path of the parish file the page belongs to; stored
            on each created ``Utterance`` as its ``filename``.
        utterances: List that is extended in place.

    Returns:
        The number of utterances appended for this page (0 when
        ``hours_iterator`` yields nothing).
    """
    content = parish_page['content']
    added = 0
    for utterance in hours_iterator(content):
        utterances.append(
            Utterance(utterance, parish_page['url'],
                      parish_page['button_text'], parish_page['depth'],
                      parish_path, parish_page['line_no']))
        # Count the items themselves: the previous version returned the
        # last enumerate() index, which under-reported by one and could
        # not tell "one utterance" from "none".
        added += 1
    return added
|
||||||
|
|
||||||
|
|
||||||
|
# Keywords (Polish) that mark a URL suffix or button label as pointing at a
# mass / service schedule. Compiled once instead of on every call.
_MASS_METADATA_REGEX = re.compile(
    r'msze'
    r'|nabo[żz]e[ńn]stw(a|(?=\W\d)|$)'
    r'|porz[ąa]dek($|\.htm)'
    r'|porz[aą]dek.(liturgi|mszy)'
    r'|(rozk[lł]ad|plan|godziny|uk[lł]ad|harmonogram|grafik|rozpiska).mszy',
    flags=re.IGNORECASE)

# "nabożeństwa <something>" combinations that must NOT count as a schedule
# page even though they match the pattern above.
_EXCLUDED_METADATA_REGEX = re.compile(
    r'nabo[zż]e[nń]stwa.'
    r'(majowe|wielk|czerwcowe|maryjne|pasyjne|pokutne|fatimskie|do|ro[żz]a'
    r'|czterdzie|w.wielk)',
    re.IGNORECASE)


def has_mass_metadata(url, button_text, page):
    """Tell whether a page's URL or button text suggests a mass schedule.

    A text qualifies when it matches the schedule keywords and does not
    match the exclusion pattern. The page is accepted when either the last
    path segment of ``url`` or ``button_text`` qualifies.

    Args:
        url: Page URL, with or without a scheme.
        button_text: Text of the link/button that led to the page.
        page: Unused; kept so existing callers keep working.

    Returns:
        ``True`` if the URL suffix or the button text qualifies, else
        ``False``.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlparse

    path = urlparse(url).path
    url_suffix = path.rsplit('/', 1)[1] if '/' in path else path

    def qualifies(text):
        return (_MASS_METADATA_REGEX.search(text) is not None
                and _EXCLUDED_METADATA_REGEX.search(text) is None)

    # The previous button-only branch tested ``button_match and not
    # button_match`` and therefore could never fire; it is meant to reject
    # only button texts hit by the exclusion pattern.
    return qualifies(url_suffix) or qualifies(button_text)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_http_www(url):
    """Strip a leading ``http://`` / ``https://`` and then a leading ``www.``.

    Args:
        url: URL string.

    Returns:
        ``url`` without the scheme prefix and without a ``www.`` prefix;
        anything else (including other schemes) is left untouched.
    """
    # Raw strings: ``'\.'`` in a normal literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    without_scheme = re.sub(r'^https?://', '', url)
    return re.sub(r'^www\.', '', without_scheme)
|
||||||
|
|
||||||
|
|
||||||
|
def gather_parish_pages(parish_path, unique_urls):
    """Collect the pages of one parish file that look like mass schedules.

    Pages are read with ``iterator.parish_page_iterator(..., html=False)``.
    A page is kept when its normalised URL (scheme and ``www.`` removed) has
    not been seen before and ``has_mass_metadata`` accepts it.

    Args:
        parish_path: Path of the parish file to read.
        unique_urls: Set of normalised URLs already accepted; updated in
            place with every URL accepted here.

    Returns:
        Dict mapping normalised URL to its page. Each kept page gets a
        ``'line_no'`` entry holding its index in the iteration.
    """
    selected = {}
    pages = iterator.parish_page_iterator(parish_path, html=False)
    for index, page in enumerate(pages):
        normalized_url = remove_http_www(page['url'])
        label = page['button_text']
        if normalized_url in unique_urls:
            continue
        if not has_mass_metadata(normalized_url, label, page):
            continue
        unique_urls.add(normalized_url)
        page['line_no'] = index
        selected[normalized_url] = page
    return selected
|
||||||
|
|
||||||
|
|
||||||
|
def get_best_parish_pages(parish_pages, n=3):
    """Pick up to ``n`` pages, preferring the shortest URLs.

    Repeatedly takes the page with the shortest URL key, then discards
    every remaining page whose URL starts with that key (so sub-pages of an
    already chosen page are not chosen as well).

    Args:
        parish_pages: Dict mapping URL to page. It is consumed: chosen and
            discarded entries are removed from it in place.
        n: Maximum number of pages to return.

    Returns:
        List of the chosen pages, in the order they were picked. Shorter
        than ``n`` when the dict runs out of entries.
    """
    best_n = []
    while parish_pages and len(best_n) < n:
        shortest_url = min(parish_pages, key=len)
        best_n.append(parish_pages.pop(shortest_url))
        # Snapshot the matching keys first; deleting while iterating the
        # dict itself would raise.
        covered = [url for url in parish_pages
                   if url.startswith(shortest_url)]
        for url in covered:
            del parish_pages[url]
    return best_n
|
||||||
|
|
||||||
|
|
||||||
|
def remove_duplicates(utterances):
    """Return the items with a distinct ``utterance`` attribute.

    The first item carrying a given ``utterance`` value is kept and later
    ones are dropped; the relative order of the kept items is unchanged.

    Args:
        utterances: Iterable of objects exposing a hashable ``utterance``
            attribute.

    Returns:
        A new list of the kept objects.
    """
    first_by_text = {}
    for item in utterances:
        # setdefault keeps the earliest object stored for each text.
        first_by_text.setdefault(item.utterance, item)
    return list(first_by_text.values())
|
||||||
|
|
||||||
|
|
||||||
|
def load_parishes(directory, extracted_by_rules):
    """Collect utterances from every parish file under ``directory``.

    For each path yielded by ``iterator.parish_path_iterator`` that is not
    listed in ``extracted_by_rules``, the candidate pages are gathered,
    narrowed with ``get_best_parish_pages`` and fed to ``add_utterances``.
    Progress is reported through ``logging.warning`` (one line per page)
    and ``print``.

    Args:
        directory: Directory passed to ``iterator.parish_path_iterator``.
        extracted_by_rules: Container of parish paths to skip.

    Returns:
        The collected utterances after ``remove_duplicates``.
    """
    utterances = []
    utterances_count = 0
    last = 0
    maximum = 0
    # Shared across files so a URL accepted for one parish file is not
    # accepted again for another.
    unique_urls = set()
    for file_nr, parish_path in enumerate(
            iterator.parish_path_iterator(directory)):
        if parish_path in extracted_by_rules:
            continue
        parish_pages = get_best_parish_pages(
            gather_parish_pages(parish_path, unique_urls))
        maximum = max(len(parish_pages), maximum)
        for parish_page in parish_pages:
            new_utterances = add_utterances(parish_page, parish_path,
                                            utterances)
            # TODO: in future check this value; if it is too big
            # (e.g. > 100) then don't add new_utterances.
            utterances_count += new_utterances
            logging.warning('{}\t||| {} ||| {} ||| {}'.format(
                new_utterances, parish_page['url'],
                parish_page['button_text'], parish_page['depth']))

        # NOTE(review): nesting reconstructed from a flattened listing --
        # this progress line is assumed to run once per file, after the
        # page loop; confirm against the original file.
        # ``parish_page`` is bound here: the count can only have changed
        # if at least one page was processed for this file.
        if utterances_count != last:
            print('file: {}, page: {}, utterances: {}'.format(
                file_nr, parish_page['line_no'], utterances_count))
        last = utterances_count
    print(maximum)
    return remove_duplicates(utterances)
|
||||||
|
|
||||||
|
|
||||||
|
def get_extracted_by_rules(filename):
    """Read a text file into a set of its lines.

    Args:
        filename: Path of the file to read.

    Returns:
        Set holding every line of the file with its trailing newline
        characters removed; other whitespace is preserved.
    """
    with open(filename) as listing:
        return {entry.rstrip('\n') for entry in listing}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Collect utterances from the text parishes and pickle them.

    Parishes listed in ``./extracted-by-rules.txt`` are skipped. The number
    of collected utterances is printed and the list is written to
    ``utterances.pkl``.
    """
    skip_list = get_extracted_by_rules('./extracted-by-rules.txt')
    collected = load_parishes('./parishwebsites/text-data', skip_list)
    print(len(collected))
    with open('utterances.pkl', 'wb') as pickle_file:
        pickle.dump(collected, pickle_file, pickle.HIGHEST_PROTOCOL)


if __name__ == '__main__':
    main()
|
@ -4,10 +4,13 @@ import sys
|
|||||||
import html2text
|
import html2text
|
||||||
import pprint
|
import pprint
|
||||||
import re
|
import re
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
class Parish2Text():
|
class Parish2Text():
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"docstring"
|
'''Don't use this object for long period of time, because convertion
|
||||||
|
will slowdown. Destroy it after every convertion.'''
|
||||||
self.text_maker = html2text.HTML2Text()
|
self.text_maker = html2text.HTML2Text()
|
||||||
self.text_maker.ignore_links = True
|
self.text_maker.ignore_links = True
|
||||||
self.text_maker.ignore_images = True
|
self.text_maker.ignore_images = True
|
||||||
@ -20,21 +23,29 @@ class Parish2Text():
|
|||||||
def convert(self, parish):
|
def convert(self, parish):
|
||||||
parish['content'] = self.text_maker.handle(parish['content'])
|
parish['content'] = self.text_maker.handle(parish['content'])
|
||||||
parish['button_text'] = self.text_maker.handle(parish['button_text'])
|
parish['button_text'] = self.text_maker.handle(parish['button_text'])
|
||||||
parish['button_text'] = ' '.join(re.sub('[\W_]+', ' ', parish['button_text']).split())
|
parish['button_text'] = ' '.join(
|
||||||
|
re.sub('[\W_]+', ' ', parish['button_text']).split())
|
||||||
return parish
|
return parish
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parish2text = Parish2Text()
|
|
||||||
writer = jsonlines.Writer(sys.stdout)
|
writer = jsonlines.Writer(sys.stdout)
|
||||||
# text_maker.wrap_links = False
|
# text_maker.wrap_links = False
|
||||||
reader = jsonlines.Reader((line.rstrip('\n') for line in sys.stdin))
|
reader = jsonlines.Reader((line.rstrip('\n') for line in sys.stdin))
|
||||||
for parish in reader:
|
for page_nr, parish_page in enumerate(reader):
|
||||||
parish = parish2text.convert(parish)
|
parish2text = Parish2Text()
|
||||||
parish_content = parish.pop('content')
|
try:
|
||||||
pprint.pprint(parish)
|
parish_page = parish2text.convert(parish_page)
|
||||||
print(parish_content)
|
except Exception:
|
||||||
|
logging.warning('page: {},url: {}'.format(page_nr,
|
||||||
|
parish_page['url']))
|
||||||
|
continue
|
||||||
|
writer.write(parish_page)
|
||||||
|
# parish_content = parish_page.pop('content')
|
||||||
|
# pprint.pprint(parish_page)
|
||||||
|
# print(parish_content)
|
||||||
reader.close()
|
reader.close()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
73
test.py
73
test.py
@ -1,73 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
import redis
|
|
||||||
from utils import iterator
|
|
||||||
from extractor.find_hours import hours_iterator
|
|
||||||
import re
|
|
||||||
|
|
||||||
# r = redis.StrictRedis(host='localhost', port=6379, db=0)
|
|
||||||
|
|
||||||
|
|
||||||
def add_utterances(content, utterances):
|
|
||||||
utterances_nr = 0
|
|
||||||
for utterances_nr, utterance in enumerate(hours_iterator(content)):
|
|
||||||
utterances.append(utterance)
|
|
||||||
return utterances_nr
|
|
||||||
|
|
||||||
|
|
||||||
def has_mass_metadata(url, button_text):
|
|
||||||
regex = re.compile('msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek')
|
|
||||||
url_match = regex.search(url)
|
|
||||||
button_match = regex.search(button_text)
|
|
||||||
if url_match and button_match:
|
|
||||||
print('both - url_metch: {}'.format(url_match.group(0)))
|
|
||||||
print('button_metch: {}'.format(button_match.group(0)))
|
|
||||||
return True
|
|
||||||
elif url_match:
|
|
||||||
print('url_match: {}'.format(url_match.group(0)))
|
|
||||||
return True
|
|
||||||
elif button_match:
|
|
||||||
print('button_match: {}'.format(button_match.group(0)))
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def load_parishes(directory):
|
|
||||||
utterances = []
|
|
||||||
utterances_count = 0
|
|
||||||
for file_nr, parish_path in enumerate(
|
|
||||||
iterator.parish_path_iterator(directory)):
|
|
||||||
print(parish_path)
|
|
||||||
metadata_count = 0
|
|
||||||
for page_nr, parish_page in enumerate(
|
|
||||||
iterator.parish_page_iterator(parish_path)):
|
|
||||||
content = parish_page.pop('content')
|
|
||||||
# if page_nr == 0 or has_mass_metadata(parish_page['url'], parish_page['button_text']):
|
|
||||||
if page_nr == 0:
|
|
||||||
utterances_count += add_utterances(content, utterances)
|
|
||||||
if has_mass_metadata(parish_page['url'],
|
|
||||||
parish_page['button_text']):
|
|
||||||
metadata_count += 1
|
|
||||||
utterances_count += add_utterances(content, utterances)
|
|
||||||
|
|
||||||
if metadata_count == 1:
|
|
||||||
break
|
|
||||||
|
|
||||||
if page_nr == 100:
|
|
||||||
print(utterances_count)
|
|
||||||
break
|
|
||||||
print('file: {}, page: {}, utterances: {}'.format(
|
|
||||||
file_nr, page_nr, utterances_count))
|
|
||||||
return {}
|
|
||||||
|
|
||||||
|
|
||||||
utterances = {}
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
load_parishes('./parishwebsites/data')
|
|
||||||
# r.set('foo', 'bar')
|
|
||||||
# print(r.get('foo'))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
@ -12,13 +12,14 @@ def parish_path_iterator(directory):
|
|||||||
yield filepath
|
yield filepath
|
||||||
|
|
||||||
|
|
||||||
def parish_page_iterator(filepath):
|
def parish_page_iterator(filepath, html=True):
|
||||||
with jsonlines.open(filepath) as parish_reader:
|
with jsonlines.open(filepath) as parish_reader:
|
||||||
page_nr = 0
|
|
||||||
for parish_page in parish_reader:
|
for parish_page in parish_reader:
|
||||||
page_nr += 1
|
|
||||||
if 'Maximum execution time of 30 seconds exceeded in' in parish_page[
|
if 'Maximum execution time of 30 seconds exceeded in' in parish_page[
|
||||||
'content']:
|
'content']:
|
||||||
continue
|
continue
|
||||||
|
if html:
|
||||||
parish2text = Parish2Text()
|
parish2text = Parish2Text()
|
||||||
yield parish2text.convert(parish_page)
|
yield parish2text.convert(parish_page)
|
||||||
|
else:
|
||||||
|
yield parish_page
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from flask import Flask, render_template, request
|
from flask import Flask, render_template, request, make_response
|
||||||
import redis
|
import redis
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
@ -18,11 +18,13 @@ def get_action():
|
|||||||
hour = '12.00'
|
hour = '12.00'
|
||||||
left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
|
left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
|
||||||
right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białegop prana.'
|
right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białegop prana.'
|
||||||
return render_template(
|
resp = make_response(
|
||||||
|
render_template(
|
||||||
'index.html',
|
'index.html',
|
||||||
hour=hour,
|
hour=hour,
|
||||||
left_context=left_context,
|
left_context=left_context,
|
||||||
right_context=right_context)
|
right_context=right_context))
|
||||||
|
return resp
|
||||||
|
|
||||||
|
|
||||||
@app.route("/", methods=['GET', 'POST'])
|
@app.route("/", methods=['GET', 'POST'])
|
||||||
|
Loading…
Reference in New Issue
Block a user