Add test.py for data gathering (data for annotation)
Small changes to annotator.py (to be deleted in the near future). Add utils/iterator. Add redis to enviroment.yml. Rename, adapt and move the rule-based extractor. Adapt find_hours. Yapify the webapp app (probably nothing more). Rename buttons in index.html.
This commit is contained in:
parent
c617018611
commit
382666c563
14
annotator.py
14
annotator.py
@ -10,31 +10,35 @@ parish2text = Parish2Text()
|
|||||||
CONTEXT = 100
|
CONTEXT = 100
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def process_parish_page(parish_page):
|
def process_parish_page(parish_page):
|
||||||
content = parish_page.pop('content')
|
content = parish_page.pop('content')
|
||||||
for utterance, utterance_colored in hours_iterator(content):
|
for utterance, utterance_colored in hours_iterator(content, color=True):
|
||||||
print(utterance_colored)
|
print(utterance_colored)
|
||||||
import ipdb; ipdb.set_trace()
|
import ipdb
|
||||||
|
ipdb.set_trace()
|
||||||
|
|
||||||
|
|
||||||
def process_parish_file(parish_reader):
|
def process_parish_file(parish_reader):
|
||||||
for parish_page in parish_reader:
|
for parish_page in parish_reader:
|
||||||
parish_page = parish2text.convert(parish_page)
|
parish_page = parish2text.convert(parish_page)
|
||||||
|
import ipdb
|
||||||
|
ipdb.set_trace()
|
||||||
process_parish_page(parish_page)
|
process_parish_page(parish_page)
|
||||||
|
|
||||||
|
|
||||||
def process_directory(directory):
|
def process_directory(directory):
|
||||||
for root, dirs, files in os.walk(directory):
|
for root, dirs, files in os.walk(directory):
|
||||||
# random.shuffle(files)
|
random.shuffle(files)
|
||||||
for fname in sorted(files):
|
for fname in files:
|
||||||
filepath = os.path.join(root, fname)
|
filepath = os.path.join(root, fname)
|
||||||
if os.path.getsize(filepath) > 0:
|
if os.path.getsize(filepath) > 0:
|
||||||
with jsonlines.open(filepath) as parish_reader:
|
with jsonlines.open(filepath) as parish_reader:
|
||||||
process_parish_file(parish_reader)
|
process_parish_file(parish_reader)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
process_directory('./parishwebsites/data')
|
process_directory('./parishwebsites/data')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
@ -1,20 +1,20 @@
|
|||||||
name: polish-masses
|
name: polish-masses
|
||||||
channels:
|
channels:
|
||||||
- defaults
|
- defaults
|
||||||
- conda-forge
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- python
|
- python
|
||||||
- scrapy
|
- scrapy
|
||||||
- gunicorn
|
- gunicorn
|
||||||
- flask
|
- flask
|
||||||
|
- redis-py
|
||||||
|
- lxml
|
||||||
|
- requests
|
||||||
|
- beautifulsoup4
|
||||||
|
- colorama
|
||||||
- pip:
|
- pip:
|
||||||
- lxml
|
|
||||||
- tldextract
|
- tldextract
|
||||||
- requests
|
|
||||||
- beautifulsoup4
|
|
||||||
- python-google-places
|
- python-google-places
|
||||||
- jsonlines
|
- jsonlines
|
||||||
- ipdb
|
- ipdb
|
||||||
- colorama
|
|
||||||
- html2text
|
- html2text
|
||||||
- binaryornot
|
- binaryornot
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
from utils import iterator
|
||||||
from colorama import Fore, Back, Style
|
from colorama import Fore, Back, Style
|
||||||
import os
|
import os
|
||||||
import jsonlines
|
import jsonlines
|
||||||
import re
|
import re
|
||||||
import pprint
|
import pprint
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
class Extractor:
|
class Extractor:
|
||||||
@ -38,6 +40,8 @@ class Extractor:
|
|||||||
return '(?P<{}>{})'.format(name, pattern)
|
return '(?P<{}>{})'.format(name, pattern)
|
||||||
|
|
||||||
def extract(self, search_space=None):
|
def extract(self, search_space=None):
|
||||||
|
if search_space == '':
|
||||||
|
return None
|
||||||
if not search_space:
|
if not search_space:
|
||||||
search_space = self.content
|
search_space = self.content
|
||||||
header_match = re.search(self.header, search_space, re.I)
|
header_match = re.search(self.header, search_space, re.I)
|
||||||
@ -51,8 +55,13 @@ class Extractor:
|
|||||||
if re.search(self.header, search_space[:sunday_title_match.start()],
|
if re.search(self.header, search_space[:sunday_title_match.start()],
|
||||||
re.I): # found header closer to sunday title
|
re.I): # found header closer to sunday title
|
||||||
return self.extract(search_space)
|
return self.extract(search_space)
|
||||||
if sunday_title_match.start() > 50:
|
if sunday_title_match.start(
|
||||||
return self.extract(search_space[sunday_title_match.end()])
|
) > 50: #sunday_title za daleko header'a wiec szukaj dalej
|
||||||
|
try:
|
||||||
|
return self.extract(search_space[sunday_title_match.end():])
|
||||||
|
except Exception:
|
||||||
|
import ipdb
|
||||||
|
ipdb.set_trace()
|
||||||
|
|
||||||
everyday_title_match = re.search(self.everyday_title, search_space,
|
everyday_title_match = re.search(self.everyday_title, search_space,
|
||||||
re.I)
|
re.I)
|
||||||
@ -87,8 +96,9 @@ class Extractor:
|
|||||||
# w dni powszednie (czas wakacji) - górny kościół
|
# w dni powszednie (czas wakacji) - górny kościół
|
||||||
# 7:00, 8:00, 18:00
|
# 7:00, 8:00, 18:00
|
||||||
|
|
||||||
print('url: {}\ndepth: {}\nbutton: {}'.format(self.page[
|
print('url: {}\ndepth: {}\nbutton: {}'.format(
|
||||||
'url'], self.page['depth'], self.page['button_text']))
|
self.page['url'], self.page['depth'], self.page['button_text']))
|
||||||
|
sys.stdout.flush()
|
||||||
return whole_result, groups
|
return whole_result, groups
|
||||||
|
|
||||||
|
|
||||||
@ -96,16 +106,15 @@ def process_directory(directory):
|
|||||||
found = 0
|
found = 0
|
||||||
not_found = 0
|
not_found = 0
|
||||||
for root, dirs, files in os.walk(directory):
|
for root, dirs, files in os.walk(directory):
|
||||||
for fname in files:
|
for fname in sorted(files):
|
||||||
filepath = os.path.join(root, fname)
|
filepath = os.path.join(root, fname)
|
||||||
if os.path.getsize(filepath) > 0:
|
if os.path.getsize(filepath) > 0:
|
||||||
with jsonlines.open(filepath) as reader:
|
if process_parish(iterator.parish_page_iterator(filepath)):
|
||||||
# print(filepath)
|
print(filepath)
|
||||||
if process_parish(reader):
|
found += 1
|
||||||
found += 1
|
else:
|
||||||
else:
|
not_found += 1
|
||||||
not_found += 1
|
print('found: {}\nnot_found: {}'.format(found, not_found))
|
||||||
# print('found: {}\nnot_found: {}'.format(found, not_found))
|
|
||||||
else:
|
else:
|
||||||
pass # empty file
|
pass # empty file
|
||||||
|
|
||||||
@ -125,6 +134,7 @@ def process_parish(reader):
|
|||||||
if result:
|
if result:
|
||||||
whole_result, groups = result
|
whole_result, groups = result
|
||||||
if whole_result not in page['content']:
|
if whole_result not in page['content']:
|
||||||
|
pass
|
||||||
import ipdb
|
import ipdb
|
||||||
ipdb.set_trace()
|
ipdb.set_trace()
|
||||||
pretty_text = page['content'].replace(
|
pretty_text = page['content'].replace(
|
||||||
@ -132,19 +142,18 @@ def process_parish(reader):
|
|||||||
color_match(whole_result, groups, Back.BLACK, [
|
color_match(whole_result, groups, Back.BLACK, [
|
||||||
Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
|
Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.MAGENTA, Fore.CYAN
|
||||||
], Style.BRIGHT))
|
], Style.BRIGHT))
|
||||||
print(pretty_text)
|
# print(pretty_text)
|
||||||
import ipdb
|
# print(page['depth'])
|
||||||
ipdb.set_trace()
|
# print(page['url'])
|
||||||
return True
|
# print(page['button_text'])
|
||||||
else:
|
|
||||||
return False
|
|
||||||
# import ipdb
|
# import ipdb
|
||||||
# ipdb.set_trace()
|
# ipdb.set_trace()
|
||||||
pass
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
process_directory('./parishwebsites/data-final')
|
process_directory('./parishwebsites/data')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
@ -3,22 +3,29 @@ from colorama import Fore, Back, Style
|
|||||||
|
|
||||||
hour_regex = re.compile('\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
|
hour_regex = re.compile('\d\d?[:.]?(oo|\d\d)|\d\d|6|7|8|9')
|
||||||
|
|
||||||
|
|
||||||
def borders_ok(text, start, end):
|
def borders_ok(text, start, end):
|
||||||
text = ' ' + text + ' '
|
text = ' ' + text + ' '
|
||||||
before_start_char = text[start]
|
before_start_char = text[start]
|
||||||
after_end_char = text[end + 1]
|
after_end_char = text[end + 1]
|
||||||
if (before_start_char.isspace() or before_start_char == ',') and (after_end_char.isspace() or after_end_char in ',;'):
|
if (before_start_char.isspace()
|
||||||
|
or before_start_char == ',') and (after_end_char.isspace()
|
||||||
|
or after_end_char in ',;'):
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_context(text, start, end, minsize):
|
def get_context(text, start, end, minsize):
|
||||||
hour = text[start:end]
|
hour = text[start:end]
|
||||||
prefix = re.sub(' +', ' ', text[:start]).rsplit(' ', maxsplit=minsize+2)[1:]
|
prefix = re.sub(' +', ' ', text[:start]).rsplit(
|
||||||
suffix = re.sub(' +', ' ', text[end:]).split(' ', maxsplit=minsize+2)[:-1]
|
' ', maxsplit=minsize + 2)[1:]
|
||||||
|
suffix = re.sub(' +', ' ', text[end:]).split(
|
||||||
|
' ', maxsplit=minsize + 2)[:-1]
|
||||||
return ' '.join(prefix), hour, ' '.join(suffix)
|
return ' '.join(prefix), hour, ' '.join(suffix)
|
||||||
|
|
||||||
def hours_iterator(text, minsize=20):
|
|
||||||
|
def hours_iterator(text, minsize=20, color=False):
|
||||||
for hour_match in hour_regex.finditer(text):
|
for hour_match in hour_regex.finditer(text):
|
||||||
start = hour_match.start(0)
|
start = hour_match.start(0)
|
||||||
end = hour_match.end(0)
|
end = hour_match.end(0)
|
||||||
@ -26,12 +33,15 @@ def hours_iterator(text, minsize=20):
|
|||||||
continue
|
continue
|
||||||
prefix, hour, suffix = get_context(text, start, end, minsize)
|
prefix, hour, suffix = get_context(text, start, end, minsize)
|
||||||
utterance = f'{prefix}&&&{hour}###{suffix}'
|
utterance = f'{prefix}&&&{hour}###{suffix}'
|
||||||
yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN, Style.BRIGHT)
|
if color:
|
||||||
|
yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN,
|
||||||
|
Style.BRIGHT)
|
||||||
|
else:
|
||||||
|
yield utterance
|
||||||
|
|
||||||
|
|
||||||
# w klasyfikatorze dzielić tak aby jeszcze \n było oddzielnie
|
# w klasyfikatorze dzielić tak aby jeszcze \n było oddzielnie
|
||||||
|
|
||||||
|
|
||||||
def color_hour(prefix, hour, suffix, color, style):
|
def color_hour(prefix, hour, suffix, color, style):
|
||||||
return prefix + color + style + hour + Style.RESET_ALL + suffix
|
return prefix + color + style + hour + Style.RESET_ALL + suffix
|
||||||
|
|
||||||
|
|
||||||
|
73
test.py
Executable file
73
test.py
Executable file
@ -0,0 +1,73 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import redis
|
||||||
|
from utils import iterator
|
||||||
|
from extractor.find_hours import hours_iterator
|
||||||
|
import re
|
||||||
|
|
||||||
|
# r = redis.StrictRedis(host='localhost', port=6379, db=0)
|
||||||
|
|
||||||
|
|
||||||
|
def add_utterances(content, utterances):
    """Append every utterance found in ``content`` to ``utterances``.

    Args:
        content: Text passed to ``hours_iterator``.
        utterances: List extended in place with the yielded utterances.

    Returns:
        The number of utterances appended (0 when nothing was found).
    """
    # The previous implementation returned the last ``enumerate`` index,
    # which under-counted by one and could not tell "no utterances" from
    # "exactly one utterance" (both gave 0).  Callers add this value to a
    # running total, so return the real count.
    found = list(hours_iterator(content))
    utterances.extend(found)
    return len(found)
|
||||||
|
|
||||||
|
|
||||||
|
# Mass-related keywords, accepted with or without Polish diacritics.
# Compiled once at import time instead of on every call; matching ignores
# case so capitalised button labels are recognised too.
_MASS_METADATA_RE = re.compile(r'msz[eay]|nabo[żz]e[ńn]stw|porz[ąa]dek',
                               re.IGNORECASE)


def has_mass_metadata(url, button_text):
    """Tell whether a page's URL or button text contains a mass keyword.

    Prints which of the two inputs matched, and the matched keyword, as a
    diagnostic aid.

    Args:
        url: Page URL; ``None`` or an empty string counts as no match.
        button_text: Text of the button that led to the page; ``None`` or
            an empty string counts as no match.

    Returns:
        True when at least one of the inputs contains a keyword, else False.
    """
    # NOTE(review): treating a missing value as "no match" assumes pages may
    # lack a url/button_text -- confirm against the crawler output.
    url_match = _MASS_METADATA_RE.search(url or '')
    button_match = _MASS_METADATA_RE.search(button_text or '')

    if url_match and button_match:
        print('both - url_match: {}'.format(url_match.group(0)))
        print('button_match: {}'.format(button_match.group(0)))
    elif url_match:
        print('url_match: {}'.format(url_match.group(0)))
    elif button_match:
        print('button_match: {}'.format(button_match.group(0)))

    return bool(url_match or button_match)
|
||||||
|
|
||||||
|
|
||||||
|
def load_parishes(directory):
|
||||||
|
utterances = []
|
||||||
|
utterances_count = 0
|
||||||
|
for file_nr, parish_path in enumerate(
|
||||||
|
iterator.parish_path_iterator(directory)):
|
||||||
|
print(parish_path)
|
||||||
|
metadata_count = 0
|
||||||
|
for page_nr, parish_page in enumerate(
|
||||||
|
iterator.parish_page_iterator(parish_path)):
|
||||||
|
content = parish_page.pop('content')
|
||||||
|
# if page_nr == 0 or has_mass_metadata(parish_page['url'], parish_page['button_text']):
|
||||||
|
if page_nr == 0:
|
||||||
|
utterances_count += add_utterances(content, utterances)
|
||||||
|
if has_mass_metadata(parish_page['url'],
|
||||||
|
parish_page['button_text']):
|
||||||
|
metadata_count += 1
|
||||||
|
utterances_count += add_utterances(content, utterances)
|
||||||
|
|
||||||
|
if metadata_count == 1:
|
||||||
|
break
|
||||||
|
|
||||||
|
if page_nr == 100:
|
||||||
|
print(utterances_count)
|
||||||
|
break
|
||||||
|
print('file: {}, page: {}, utterances: {}'.format(
|
||||||
|
file_nr, page_nr, utterances_count))
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
utterances = {}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
load_parishes('./parishwebsites/data')
|
||||||
|
# r.set('foo', 'bar')
|
||||||
|
# print(r.get('foo'))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
24
utils/iterator.py
Normal file
24
utils/iterator.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import os
|
||||||
|
import jsonlines
|
||||||
|
import random
|
||||||
|
from parishwebsites.parish2text import Parish2Text
|
||||||
|
|
||||||
|
|
||||||
|
def parish_path_iterator(directory):
    """Yield the path of every non-empty file found under ``directory``.

    The tree is traversed with ``os.walk``; within each directory the file
    names are visited in sorted order.  Zero-byte files are skipped.
    """
    for dirpath, _subdirs, filenames in os.walk(directory):
        candidates = (os.path.join(dirpath, name)
                      for name in sorted(filenames))
        for candidate in candidates:
            if os.path.getsize(candidate) > 0:
                yield candidate
|
||||||
|
|
||||||
|
|
||||||
|
def parish_page_iterator(filepath):
    """Yield converted pages read from the jsonlines file at ``filepath``.

    Pages whose ``'content'`` contains the "Maximum execution time of 30
    seconds exceeded" marker are skipped.  Every remaining page is passed
    through ``Parish2Text.convert`` before being yielded.
    """
    timeout_marker = 'Maximum execution time of 30 seconds exceeded in'
    with jsonlines.open(filepath) as reader:
        for raw_page in reader:
            if timeout_marker in raw_page['content']:
                continue
            # A fresh converter is built for every page, exactly as before;
            # whether one instance could be shared is not visible from here.
            converter = Parish2Text()
            yield converter.convert(raw_page)
|
@ -1,23 +1,36 @@
|
|||||||
from flask import Flask, render_template, request
|
from flask import Flask, render_template, request
|
||||||
|
import redis
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def load_parishes(directory):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
parishes = load_parishes('dir')
|
||||||
|
|
||||||
|
|
||||||
def post_action():
|
def post_action():
|
||||||
return get_action()
|
return get_action()
|
||||||
|
|
||||||
|
|
||||||
def get_action():
|
def get_action():
|
||||||
hour = '12.00'
|
hour = '12.00'
|
||||||
left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
|
left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
|
||||||
right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białego rana.'
|
right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białegop prana.'
|
||||||
return render_template('index.html', hour=hour, left_context=left_context, right_context=right_context)
|
return render_template(
|
||||||
|
'index.html',
|
||||||
|
hour=hour,
|
||||||
|
left_context=left_context,
|
||||||
|
right_context=right_context)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/", methods=['GET', 'POST'])
|
@app.route("/", methods=['GET', 'POST'])
|
||||||
def root():
|
def root():
|
||||||
if request.method == 'POST':
|
if request.method == 'POST':
|
||||||
return post_action()
|
return post_action()
|
||||||
else:
|
else:
|
||||||
return get_action()
|
return get_action()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
<div class="container">
|
<div class="container">
|
||||||
<div class="container mt-1">
|
<div class="container mt-1">
|
||||||
<div class="row justify-content-start">
|
<div class="row justify-content-start">
|
||||||
<button type="button" class="btn btn-warning btn-sm" id="cofnij">Cofinj</button>
|
<button type="button" class="btn btn-warning btn-sm" id="undo">Cofnij</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="row justify-content-center">
|
<div class="row justify-content-center">
|
||||||
<h2>Czy zaznaczono godzinę mszy świętej?</h2>
|
<h2>Czy zaznaczono godzinę mszy świętej?</h2>
|
||||||
@ -26,8 +26,8 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="btn-group d-flex h-mx" role="group">
|
<div class="btn-group d-flex h-mx" role="group">
|
||||||
<button type="button" class="btn btn-danger btn-lg w-100" id="nie">Nie</button>
|
<button type="button" class="btn btn-danger btn-lg w-100" id="no">Nie</button>
|
||||||
<button type="button" class="btn btn-success btn-lg w-100" id="tak">Tak</button>
|
<button type="button" class="btn btn-success btn-lg w-100" id="yes">Tak</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<!-- Optional JavaScript -->
|
<!-- Optional JavaScript -->
|
||||||
@ -36,9 +36,37 @@
|
|||||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script>
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script>
|
||||||
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script>
|
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script>
|
||||||
<script type="text/javascript">
|
<script type="text/javascript">
|
||||||
$("button#tak").click(function(){
|
$("button#yes").click(function(){
|
||||||
$.post( "/", {result: "yes"}, function() {
|
$.post( "/", {result: "yes"}, function() {
|
||||||
console.log( "success" );
|
console.log( "yes button clicked" );
|
||||||
|
})
|
||||||
|
.done(function() {
|
||||||
|
console.log( "second success" );
|
||||||
|
})
|
||||||
|
.fail(function() {
|
||||||
|
console.log( "error" );
|
||||||
|
})
|
||||||
|
.always(function() {
|
||||||
|
console.log( "finished" );
|
||||||
|
});
|
||||||
|
});
|
||||||
|
$("button#no").click(function(){
|
||||||
|
$.post( "/", {result: "no"}, function() {
|
||||||
|
console.log( "no button clicked" );
|
||||||
|
})
|
||||||
|
.done(function() {
|
||||||
|
console.log( "second success" );
|
||||||
|
})
|
||||||
|
.fail(function() {
|
||||||
|
console.log( "error" );
|
||||||
|
})
|
||||||
|
.always(function() {
|
||||||
|
console.log( "finished" );
|
||||||
|
});
|
||||||
|
});
|
||||||
|
$("button#undo").click(function(){
|
||||||
|
$.post( "/", {result: "undo"}, function() {
|
||||||
|
console.log( "undo button clicked" );
|
||||||
})
|
})
|
||||||
.done(function() {
|
.done(function() {
|
||||||
console.log( "second success" );
|
console.log( "second success" );
|
||||||
|
Loading…
Reference in New Issue
Block a user