Add test.py for data gathering (data for annotation)

Small changes to annotator.py (to be deleted in near future)
Add utils/iterator
Add redis to environment.yml
Rename, adapt and move rule based extractor.
Adapt find_hours.
Yapify webapp app (probably nothing more)
Rename buttons in index.html
This commit is contained in:
siulkilulki 2018-05-11 23:12:21 +02:00
parent c617018611
commit 382666c563
8 changed files with 209 additions and 48 deletions

View File

@ -10,31 +10,35 @@ parish2text = Parish2Text()
CONTEXT = 100 CONTEXT = 100
def process_parish_page(parish_page): def process_parish_page(parish_page):
content = parish_page.pop('content') content = parish_page.pop('content')
for utterance, utterance_colored in hours_iterator(content): for utterance, utterance_colored in hours_iterator(content, color=True):
print(utterance_colored) print(utterance_colored)
import ipdb; ipdb.set_trace() import ipdb
ipdb.set_trace()
def process_parish_file(parish_reader): def process_parish_file(parish_reader):
for parish_page in parish_reader: for parish_page in parish_reader:
parish_page = parish2text.convert(parish_page) parish_page = parish2text.convert(parish_page)
import ipdb
ipdb.set_trace()
process_parish_page(parish_page) process_parish_page(parish_page)
def process_directory(directory): def process_directory(directory):
for root, dirs, files in os.walk(directory): for root, dirs, files in os.walk(directory):
# random.shuffle(files) random.shuffle(files)
for fname in sorted(files): for fname in files:
filepath = os.path.join(root, fname) filepath = os.path.join(root, fname)
if os.path.getsize(filepath) > 0: if os.path.getsize(filepath) > 0:
with jsonlines.open(filepath) as parish_reader: with jsonlines.open(filepath) as parish_reader:
process_parish_file(parish_reader) process_parish_file(parish_reader)
def main(): def main():
process_directory('./parishwebsites/data') process_directory('./parishwebsites/data')
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -1,20 +1,20 @@
name: polish-masses name: polish-masses
channels: channels:
- defaults - defaults
- conda-forge
dependencies: dependencies:
- python - python
- scrapy - scrapy
- gunicorn - gunicorn
- flask - flask
- redis-py
- lxml
- requests
- beautifulsoup4
- colorama
- pip: - pip:
- lxml
- tldextract - tldextract
- requests
- beautifulsoup4
- python-google-places - python-google-places
- jsonlines - jsonlines
- ipdb - ipdb
- colorama
- html2text - html2text
- binaryornot - binaryornot

View File

@ -1,9 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from utils import iterator
from colorama import Fore, Back, Style from colorama import Fore, Back, Style
import os import os
import jsonlines import jsonlines
import re import re
import pprint import pprint
import sys
class Extractor: class Extractor:
@ -38,6 +40,8 @@ class Extractor:
return '(?P<{}>{})'.format(name, pattern) return '(?P<{}>{})'.format(name, pattern)
def extract(self, search_space=None): def extract(self, search_space=None):
if search_space == '':
return None
if not search_space: if not search_space:
search_space = self.content search_space = self.content
header_match = re.search(self.header, search_space, re.I) header_match = re.search(self.header, search_space, re.I)
@ -51,8 +55,13 @@ class Extractor:
if re.search(self.header, search_space[:sunday_title_match.start()], if re.search(self.header, search_space[:sunday_title_match.start()],
re.I): # found header closer to sunday title re.I): # found header closer to sunday title
return self.extract(search_space) return self.extract(search_space)
if sunday_title_match.start() > 50: if sunday_title_match.start(
return self.extract(search_space[sunday_title_match.end()]) ) > 50: #sunday_title za daleko header'a wiec szukaj dalej
try:
return self.extract(search_space[sunday_title_match.end():])
except Exception:
import ipdb
ipdb.set_trace()
everyday_title_match = re.search(self.everyday_title, search_space, everyday_title_match = re.search(self.everyday_title, search_space,
re.I) re.I)
@ -87,8 +96,9 @@ class Extractor:
# w dni powszednie (czas wakacji) - górny kościół # w dni powszednie (czas wakacji) - górny kościół
# 7:00, 8:00, 18:00 # 7:00, 8:00, 18:00
print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[ print('url: {}\ndepth: {}\nbutton: {}'.format(
'url'], self.page['depth'], self.page['button_text'])) self.page['url'], self.page['depth'], self.page['button_text']))
sys.stdout.flush()
return whole_result, groups return whole_result, groups
@ -96,16 +106,15 @@ def process_directory(directory):
found = 0 found = 0
not_found = 0 not_found = 0
for root, dirs, files in os.walk(directory): for root, dirs, files in os.walk(directory):
for fname in files: for fname in sorted(files):
filepath = os.path.join(root, fname) filepath = os.path.join(root, fname)
if os.path.getsize(filepath) > 0: if os.path.getsize(filepath) > 0:
with jsonlines.open(filepath) as reader: if process_parish(iterator.parish_page_iterator(filepath)):
# print(filepath) print(filepath)
if process_parish(reader): found += 1
found += 1 else:
else: not_found += 1
not_found += 1 print('found: {}\nnot_found: {}'.format(found, not_found))
# print('found: {}\nnot_found: {}'.format(found, not_found))
else: else:
pass # empty file pass # empty file
@ -125,6 +134,7 @@ def process_parish(reader):
if result: if result:
whole_result, groups = result whole_result, groups = result
if whole_result not in page['content']: if whole_result not in page['content']:
pass
import ipdb import ipdb
ipdb.set_trace() ipdb.set_trace()
pretty_text = page['content'].replace( pretty_text = page['content'].replace(
@ -132,19 +142,18 @@ def process_parish(reader):
color_match(whole_result, groups, Back.BLACK, [ color_match(whole_result, groups, Back.BLACK, [
Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
], Style.BRIGHT)) ], Style.BRIGHT))
print(pretty_text) # print(pretty_text)
import ipdb # print(page['depth'])
ipdb.set_trace() # print(page['url'])
return True # print(page['button_text'])
else:
return False
# import ipdb # import ipdb
# ipdb.set_trace() # ipdb.set_trace()
pass return True
return False
def main(): def main():
process_directory('./parishwebsites/data-final') process_directory('./parishwebsites/data')
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -3,22 +3,29 @@ from colorama import Fore, Back, Style
hour_regex = re.compile('\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9') hour_regex = re.compile('\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
def borders_ok(text, start, end): def borders_ok(text, start, end):
text = ' ' + text + ' ' text = ' ' + text + ' '
before_start_char = text[start] before_start_char = text[start]
after_end_char = text[end + 1] after_end_char = text[end + 1]
if (before_start_char.isspace() or before_start_char == ',') and (after_end_char.isspace() or after_end_char in ',;'): if (before_start_char.isspace()
or before_start_char == ',') and (after_end_char.isspace()
or after_end_char in ',;'):
return True return True
else: else:
return False return False
def get_context(text, start, end, minsize): def get_context(text, start, end, minsize):
hour = text[start:end] hour = text[start:end]
prefix = re.sub(' +', ' ', text[:start]).rsplit(' ', maxsplit=minsize+2)[1:] prefix = re.sub(' +', ' ', text[:start]).rsplit(
suffix = re.sub(' +', ' ', text[end:]).split(' ', maxsplit=minsize+2)[:-1] ' ', maxsplit=minsize + 2)[1:]
suffix = re.sub(' +', ' ', text[end:]).split(
' ', maxsplit=minsize + 2)[:-1]
return ' '.join(prefix), hour, ' '.join(suffix) return ' '.join(prefix), hour, ' '.join(suffix)
def hours_iterator(text, minsize=20):
def hours_iterator(text, minsize=20, color=False):
for hour_match in hour_regex.finditer(text): for hour_match in hour_regex.finditer(text):
start = hour_match.start(0) start = hour_match.start(0)
end = hour_match.end(0) end = hour_match.end(0)
@ -26,12 +33,15 @@ def hours_iterator(text, minsize=20):
continue continue
prefix, hour, suffix = get_context(text, start, end, minsize) prefix, hour, suffix = get_context(text, start, end, minsize)
utterance = f'{prefix}&&&{hour}###{suffix}' utterance = f'{prefix}&&&{hour}###{suffix}'
yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN, Style.BRIGHT) if color:
yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN,
Style.BRIGHT)
else:
yield utterance
# w klasyfikatorze dzielić tak aby jeszcze \n było oddzielnie # w klasyfikatorze dzielić tak aby jeszcze \n było oddzielnie
def color_hour(prefix, hour, suffix, color, style): def color_hour(prefix, hour, suffix, color, style):
return prefix + color + style + hour + Style.RESET_ALL + suffix return prefix + color + style + hour + Style.RESET_ALL + suffix

73
test.py Executable file
View File

@ -0,0 +1,73 @@
#!/usr/bin/env python3
import redis
from utils import iterator
from extractor.find_hours import hours_iterator
import re
# r = redis.StrictRedis(host='localhost', port=6379, db=0)
def add_utterances(content, utterances):
    """Append every utterance found in ``content`` to ``utterances``.

    Args:
        content: Text handed to ``hours_iterator``.
        utterances: Collection with an ``append`` method; extended in place.

    Returns:
        The number of utterances appended by this call (0 when
        ``hours_iterator`` yields nothing).
    """
    appended = 0
    # Counting from 1 makes the final loop value the number of items added;
    # counting from 0 would report one fewer than were actually appended.
    for appended, utterance in enumerate(hours_iterator(content), start=1):
        utterances.append(utterance)
    return appended
def has_mass_metadata(url, button_text):
    """Tell whether a page's URL or button text contains a mass keyword.

    The keywords are the fragments ``msz[eay]``, ``nabo[żz]e[ńn]stw`` and
    ``porz[ąa]dek``. Matching ignores case so that capitalised button labels
    are recognised as well. Which field matched is printed to stdout.

    Args:
        url: Page URL. ``None`` is treated as an empty string.
        button_text: Text of the button that led to the page. ``None`` is
            treated as an empty string.
            NOTE(review): whether either field can really be ``None`` in the
            scraped data is not visible here -- confirm.

    Returns:
        ``True`` when the URL, the button text, or both match; otherwise
        ``False``.
    """
    regex = re.compile('msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek', re.IGNORECASE)
    url_match = regex.search(url or '')
    button_match = regex.search(button_text or '')
    if url_match and button_match:
        print('both - url_match: {}'.format(url_match.group(0)))
        print('button_match: {}'.format(button_match.group(0)))
        return True
    if url_match:
        print('url_match: {}'.format(url_match.group(0)))
        return True
    if button_match:
        print('button_match: {}'.format(button_match.group(0)))
        return True
    return False
def load_parishes(directory):
    """Gather hour utterances from the parish files under ``directory``.

    For each parish file the first yielded page is always scanned. Any page
    whose URL or button text passes ``has_mass_metadata`` is scanned too, and
    the file is abandoned right after the first such page. A file is also
    abandoned once page index 100 is reached. Paths and running totals are
    printed to stdout.

    NOTE(review): the summary line is assumed to be printed once per parish
    file -- confirm against the intended output.

    Args:
        directory: Root directory passed to ``iterator.parish_path_iterator``.

    Returns:
        An empty dict. The collected utterances are only counted, not
        returned, so the return value is a placeholder.
    """
    utterances = []
    utterances_count = 0
    for file_nr, parish_path in enumerate(
            iterator.parish_path_iterator(directory)):
        print(parish_path)
        # Reset per file so the summary below never reports a page number
        # left over from a previous file (or an unbound name) when this file
        # yields no pages at all.
        page_nr = None
        for page_nr, parish_page in enumerate(
                iterator.parish_page_iterator(parish_path)):
            content = parish_page.pop('content')
            mass_related = has_mass_metadata(parish_page['url'],
                                             parish_page['button_text'])
            # One combined test: a first page that is also mass-related must
            # contribute its utterances once, not twice.
            if page_nr == 0 or mass_related:
                utterances_count += add_utterances(content, utterances)
            if mass_related:
                # Only the first mass-related page of a parish is used.
                break
            if page_nr == 100:
                print(utterances_count)
                break
        print('file: {}, page: {}, utterances: {}'.format(
            file_nr, page_nr, utterances_count))
    return {}
utterances = {}
def main():
    """Run the utterance gathering over the local parish data directory."""
    data_directory = './parishwebsites/data'
    load_parishes(data_directory)
    # Redis is not used yet; the calls below are kept as a disabled example.
    # r.set('foo', 'bar')
    # print(r.get('foo'))
if __name__ == '__main__':
main()

24
utils/iterator.py Normal file
View File

@ -0,0 +1,24 @@
import os
import jsonlines
import random
from parishwebsites.parish2text import Parish2Text
def parish_path_iterator(directory):
    """Yield the path of every non-empty file below ``directory``.

    The traversal order is deterministic: files of a directory come first in
    sorted order, then its sub-directories are visited in sorted order.

    Args:
        directory: Root of the tree to walk.

    Yields:
        File paths (``root`` joined with the file name) whose size is
        greater than zero bytes.
    """
    for root, dirs, files in os.walk(directory):
        # os.walk honours in-place changes to ``dirs`` when walking top-down,
        # so sorting here fixes the order in which sub-directories are
        # entered instead of leaving it to the file system.
        dirs.sort()
        for fname in sorted(files):
            filepath = os.path.join(root, fname)
            if os.path.getsize(filepath) > 0:
                yield filepath
def parish_page_iterator(filepath):
    """Yield the converted pages stored in one jsonlines parish file.

    Pages whose ``content`` contains the text
    'Maximum execution time of 30 seconds exceeded in' are skipped. Every
    other page is passed through ``Parish2Text.convert`` and the result is
    yielded.

    Args:
        filepath: Path of the jsonlines file to read.
    """
    timeout_marker = 'Maximum execution time of 30 seconds exceeded in'
    with jsonlines.open(filepath) as parish_reader:
        for parish_page in parish_reader:
            if timeout_marker not in parish_page['content']:
                # A new converter is created for every page that is kept.
                yield Parish2Text().convert(parish_page)

View File

@ -1,23 +1,36 @@
from flask import Flask, render_template, request from flask import Flask, render_template, request
import redis
app = Flask(__name__) app = Flask(__name__)
def load_parishes(directory):
return {}
parishes = load_parishes('dir')
def post_action(): def post_action():
return get_action() return get_action()
def get_action(): def get_action():
hour = '12.00' hour = '12.00'
left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie ' left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białego rana.' right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białegop prana.'
return render_template('index.html', hour=hour, left_context=left_context, right_context=right_context) return render_template(
'index.html',
hour=hour,
left_context=left_context,
right_context=right_context)
@app.route("/", methods=['GET', 'POST']) @app.route("/", methods=['GET', 'POST'])
def root(): def root():
if request.method == 'POST': if request.method == 'POST':
return post_action() return post_action()
else: else:
return get_action() return get_action()
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -14,7 +14,7 @@
<div class="container"> <div class="container">
<div class="container mt-1"> <div class="container mt-1">
<div class="row justify-content-start"> <div class="row justify-content-start">
<button type="button" class="btn btn-warning btn-sm" id="cofnij">Cofinj</button> <button type="button" class="btn btn-warning btn-sm" id="undo">Cofnij</button>
</div> </div>
<div class="row justify-content-center"> <div class="row justify-content-center">
<h2>Czy zaznaczono godzinę mszy świętej?</h2> <h2>Czy zaznaczono godzinę mszy świętej?</h2>
@ -26,8 +26,8 @@
</div> </div>
</div> </div>
<div class="btn-group d-flex h-mx" role="group"> <div class="btn-group d-flex h-mx" role="group">
<button type="button" class="btn btn-danger btn-lg w-100" id="nie">Nie</button> <button type="button" class="btn btn-danger btn-lg w-100" id="no">Nie</button>
<button type="button" class="btn btn-success btn-lg w-100" id="tak">Tak</button> <button type="button" class="btn btn-success btn-lg w-100" id="yes">Tak</button>
</div> </div>
</div> </div>
<!-- Optional JavaScript --> <!-- Optional JavaScript -->
@ -36,9 +36,37 @@
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script> <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script>
<script type="text/javascript"> <script type="text/javascript">
$("button#tak").click(function(){ $("button#yes").click(function(){
$.post( "/", {result: "yes"}, function() { $.post( "/", {result: "yes"}, function() {
console.log( "success" ); console.log( "yes button clicked" );
})
.done(function() {
console.log( "second success" );
})
.fail(function() {
console.log( "error" );
})
.always(function() {
console.log( "finished" );
});
});
$("button#no").click(function(){
$.post( "/", {result: "no"}, function() {
console.log( "no button clicked" );
})
.done(function() {
console.log( "second success" );
})
.fail(function() {
console.log( "error" );
})
.always(function() {
console.log( "finished" );
});
});
$("button#undo").click(function(){
$.post( "/", {result: "undo"}, function() {
console.log( "undo button clicked" );
}) })
.done(function() { .done(function() {
console.log( "second success" ); console.log( "second success" );