Working annotator. Without abuse handling, but logging actions.

Modify find_hours
Modify get_utterances
Add missing parish2text-commands.sh
Working app.py
Add hash.min.js (fingerprintjs)
modify index.html, make it prettier, add functions and more
This commit is contained in:
siulkilulki 2018-05-15 07:13:09 +02:00
parent 1f6b1e6ffe
commit 95491b20a7
6 changed files with 298 additions and 82 deletions

View File

@ -17,11 +17,19 @@ def borders_ok(text, start, end):
return False return False
def delete_duplicates(text):
    """Squeeze repeated spaces and newlines out of *text*.

    The substitutions run in a fixed order, each on the result of the
    previous one:

    1. runs of spaces become a single space;
    2. an optional single space on either side of a newline is dropped;
    3. runs of five or more newlines are cut down to three;
    4. every pair of consecutive newlines becomes one newline.

    Returns the rewritten string.
    """
    rewrites = (
        (' +', ' '),
        (' ?\n ?', '\n'),
        ('\n{5,}', '\n\n\n'),
        ('\n\n', '\n'),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text
def get_context(text, start, end, minsize): def get_context(text, start, end, minsize):
hour = text[start:end] hour = text[start:end]
prefix = re.sub(' +', ' ', text[:start]).rsplit( prefix = delete_duplicates(text[:start]).rsplit(
' ', maxsplit=minsize + 2)[1:] ' ', maxsplit=minsize + 12)[1:]
suffix = re.sub(' +', ' ', text[end:]).split( suffix = delete_duplicates(text[end:]).split(
' ', maxsplit=minsize + 2)[:-1] ' ', maxsplit=minsize + 2)[:-1]
return ' '.join(prefix), hour, ' '.join(suffix) return ' '.join(prefix), hour, ' '.join(suffix)
@ -33,12 +41,12 @@ def hours_iterator(text, minsize=20, color=False):
if not borders_ok(text, start, end): if not borders_ok(text, start, end):
continue continue
prefix, hour, suffix = get_context(text, start, end, minsize) prefix, hour, suffix = get_context(text, start, end, minsize)
utterance = f'{prefix}&&&{hour}###{suffix}'
if color: if color:
utterance = f'{prefix}&&&{hour}###{suffix}'
yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN, yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN,
Style.BRIGHT) Style.BRIGHT)
else: else:
yield utterance yield prefix, hour, suffix
# w klasyfikatorze dzielić tak aby jeszcze \n było oddzielnie # w klasyfikatorze dzielić tak aby jeszcze \n było oddzielnie

View File

@ -8,7 +8,6 @@ import re
import pickle import pickle
# r = redis.StrictRedis(host='localhost', port=6379, db=0)
class Utterance(): class Utterance():
def __init__(self, utterance, url, button_text, depth, filename, line_no): def __init__(self, utterance, url, button_text, depth, filename, line_no):
"docstring" "docstring"
@ -23,11 +22,26 @@ class Utterance():
def add_utterances(parish_page, parish_path, utterances): def add_utterances(parish_page, parish_path, utterances):
utterances_nr = 0 utterances_nr = 0
content = parish_page['content'] content = parish_page['content']
for utterances_nr, utterance in enumerate(hours_iterator(content)): for utterances_nr, (prefix, hour, suffix) in enumerate(
utterance_inst = Utterance( hours_iterator(content)):
utterance, parish_page['url'], parish_page['button_text'], # utterance_inst = Utterance(
parish_page['depth'], parish_path, parish_page['line_no']) # utterance, parish_page['url'], parish_page['button_text'],
utterances.append(utterance_inst) # parish_page['depth'], parish_path, parish_page['line_no'])
utterance_dict = {
'prefix': prefix,
'hour': hour,
'suffix': suffix,
'url': parish_page['url'],
'button_text': parish_page['button_text'],
'depth': parish_page['depth'],
'filepath': parish_path,
'line_no': parish_page['line_no']
}
# print(prefix)
# print(hour)
# print('-------------------------------------------------')
# print(suffix)
utterances.append(utterance_dict)
return utterances_nr return utterances_nr
@ -96,9 +110,11 @@ def remove_duplicates(utterances):
seen = set() seen = set()
res = [] res = []
for utt in utterances: for utt in utterances:
if utt.utterance not in seen: # TODO: check why with &&&hour### it gives more utterances!!!
utterance_text = utt['prefix'] + utt['hour'] + utt['suffix']
if utterance_text not in seen:
res.append(utt) res.append(utt)
seen.add(utt.utterance) seen.add(utterance_text)
return res return res
@ -148,6 +164,7 @@ def get_extracted_by_rules(filename):
def main(): def main():
# TODO: use argparse and add makefile goal
extracted_by_rules = get_extracted_by_rules('./extracted-by-rules.txt') extracted_by_rules = get_extracted_by_rules('./extracted-by-rules.txt')
utterances = load_parishes('./parishwebsites/text-data', utterances = load_parishes('./parishwebsites/text-data',
extracted_by_rules) extracted_by_rules)

View File

@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Print, one per line, the shell command that runs parish2text.py on each file
# found directly inside the directory given as $1. Nothing is executed here;
# the commands are only written to stdout.
#
# Usage: parish2text-commands.sh INPUT_DIR
for file in "$1"/*; do
    # When the directory is empty the glob stays literal; skip it rather than
    # printing a command for a file that does not exist.
    [ -e "$file" ] || continue
    # Quoted expansions keep paths containing spaces in one piece.
    filename=$(basename -- "$file")
    echo "./parish2text.py < \"$file\" > \"text-data/$filename\" 2> \"text-data-logs/$filename\""
done

View File

@ -1,38 +1,169 @@
from flask import Flask, render_template, request, make_response from flask import Flask, render_template, request, make_response, jsonify
import os
from get_utterances import Utterance
import redis import redis
import pickle
import re
app = Flask(__name__) app = Flask(__name__)
r = redis.StrictRedis(host='localhost', port=6379, db=0)
def load_parishes(directory):
return {}
parishes = load_parishes('dir') def log(msg):
with open('/tmp/tmp', 'w') as f:
print(msg, f, flush=True)
def post_action(): def load_utterances(filename):
return get_action() with open(filename, 'rb') as f:
utterances = pickle.load(f)
return utterances
def get_action(): utterances = load_utterances(
hour = '12.00' '/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl')
left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białegop prana.' UTT_SCORES = 'utterance-scores'
resp = make_response( # log(utterances[0:2])
render_template(
'index.html', #initialize_redis_db
hour=hour, # r.flushdb()
# status = None
# Start-up seeding of Redis. If no 'status' key is stored the database is
# flushed; unless the stored status is 'filled', every utterance index is added
# to the UTT_SCORES sorted set with score 0 and the status is then recorded.
status = r.get('status')
if not status:
    r.flushdb()
else:
    # The client hands back bytes; compare as text below.
    status = status.decode('utf-8')
if status != 'filled':
    log('filling status')
    # NOTE(review): the positional (score, member) form of zadd matches the
    # redis-py 2.x StrictRedis API -- confirm the installed client version.
    for i in range(len(utterances)):
        r.zadd(UTT_SCORES, 0, str(i))
    r.set('status', 'filled')
    status = 'filled'
def get_next():
    """Pick the next utterance to show and return its display fields.

    The candidate is read from the ``UTT_SCORES`` sorted set, ordered by
    ascending score.

    Returns:
        The ``(index, left_context, hour, right_context)`` tuple produced by
        ``get_by_index`` for the chosen utterance.

    Raises:
        IndexError: if the sorted set holds fewer than two members.
    """
    # Only the first two members are needed, so let Redis apply the LIMIT
    # instead of transferring the whole sorted set on every request.
    candidates = r.zrangebyscore(UTT_SCORES, '-inf', 'inf', start=0, num=2)
    # NOTE(review): element [1] is the second-lowest-scored member, not the
    # lowest -- kept as in the original; confirm this is intended.
    index = int(candidates[1])
    # Formatting is shared with get_by_index instead of being duplicated here.
    return get_by_index(index)
def get_next_response(cookie_hash):
index, left_context, hour, right_context = get_next()
resp = jsonify(
index=index,
left_context=left_context, left_context=left_context,
right_context=right_context)) hour=hour,
right_context=right_context)
if cookie_hash:
resp.set_cookie('cookie-hash', cookie_hash, max_age=60 * 60 * 24 * 90)
return resp return resp
def get_by_index(index):
    """Return the display fields of the utterance stored at *index*.

    Args:
        index: position in the module-level ``utterances`` sequence.

    Returns:
        ``(index, left_context, hour, right_context)`` where the three text
        fields are HTML fragments: special characters are escaped and
        newlines are turned into ``<br>``. ``right_context`` is limited to the
        first ten space-separated tokens of the stored suffix.
    """
    import html  # local import so this block does not depend on file-level imports

    def as_html(fragment):
        # The page inserts these values with innerHTML, so text taken from
        # scraped sites must be escaped before the <br> markup is added.
        return html.escape(fragment, quote=False).replace('\n', '<br>')

    entry = utterances[index]
    left_context = as_html(entry['prefix'])
    hour = as_html(entry['hour'])
    right_context = as_html(' '.join(entry['suffix'].split(' ')[:10]))
    return index, left_context, hour, right_context
def get_response_by_index(index, cookie_hash):
    """Build the JSON response describing the utterance at *index*.

    Args:
        index: utterance position passed on to ``get_by_index``.
        cookie_hash: user identifier; when truthy it is (re)sent to the
            client as the ``cookie-hash`` cookie.

    Returns:
        The response object created by ``jsonify`` with the keys ``index``,
        ``left_context``, ``hour`` and ``right_context``.
    """
    shown_index, before, marked, after = get_by_index(index)
    payload = {
        'index': shown_index,
        'left_context': before,
        'hour': marked,
        'right_context': after,
    }
    response = jsonify(**payload)
    if not cookie_hash:
        return response
    # max_age is given in seconds: 90 days.
    response.set_cookie('cookie-hash', cookie_hash, max_age=60 * 60 * 24 * 90)
    return response
def annotate_redis(yesno, index):
    """Record the current user's answer for utterance *index* in Redis.

    Args:
        yesno: the answer marker to store (callers in this file pass ``'y'``
            or ``'n'``).
        index: utterance index, used as the Redis key of that utterance's
            answer string and as the member name in ``UTT_SCORES``.

    Returns:
        The user's ``cookie-hash`` cookie value, or ``None`` when the request
        carries no such cookie (nothing is stored in that case).
    """
    cookie_hash = request.cookies.get('cookie-hash')
    if not cookie_hash:
        return None

    user_key = '{}:{}'.format(cookie_hash, index)
    earlier = r.get(user_key)  # this user's previous answer for the utterance
    if not earlier:
        # First answer from this user: bump the utterance's score and append
        # the marker; r.append returns the new length, so the marker sits at
        # length - 1.
        r.zincrby(UTT_SCORES, index)
        str_index = r.append(index, yesno) - 1
    else:
        log('!!!!!!!!!!!')
        log(earlier)
        str_index = int(earlier.decode('utf-8').split(':')[1])
        log(str_index)
        # Overwrite the earlier marker in place instead of appending.
        r.setrange(index, str_index, yesno)

    r.set(user_key, '{}:{}'.format(yesno, str_index))
    # Per-user history list; the 'undo' action pops from it.
    r.lpush(cookie_hash, '{}:{}:{}'.format(yesno, index, str_index))
    return cookie_hash
def set_cookie(js_hash):
    """Return the identifier to use as the caller's ``cookie-hash``.

    If the request already carries a ``cookie-hash`` cookie it is returned
    as is. Otherwise the identifier previously stored in Redis under
    *js_hash* is reused, or a new random one is generated and stored under
    *js_hash* (only if that key is still unset).

    Args:
        js_hash: browser fingerprint sent by the client.

    Returns:
        The cookie hash as a ``str``.
    """
    # Note from the original author (translated): add new js_hash values to a
    # list keyed by cookie_hash.
    old_cookie_hash = None
    cookie_hash = request.cookies.get('cookie-hash')
    if not cookie_hash:
        old_cookie_hash = r.get(js_hash)
        if not old_cookie_hash:
            cookie_hash = str(
                int.from_bytes(os.urandom(4), byteorder='little'))
            r.set(js_hash, cookie_hash, nx=True)
        elif isinstance(old_cookie_hash, bytes):
            # Redis hands back bytes; str() on bytes would produce the
            # literal "b'123'" instead of the stored identifier "123".
            cookie_hash = old_cookie_hash.decode('utf-8')
        else:
            cookie_hash = str(old_cookie_hash)
    log('old_cookie: {}, cookie: {}'.format(old_cookie_hash, cookie_hash))
    return cookie_hash
def http_post(request):
    """Handle one POST sent by the annotation page.

    Form fields read from *request*: ``action`` and ``hash`` (both required)
    and ``index`` (used by the ``yes`` / ``no`` actions).

    Actions:
        get:  make sure the user has an identifier and return the next
              utterance.
        undo: pop the user's most recent history entry and return that
              utterance again.
        yes / no: store the answer for ``index`` and return the next
              utterance.

    Returns:
        A JSON response for a handled action, or an empty 400 response when
        the action is unknown, the user cookie is missing, there is nothing
        to undo, or ``index`` is not a valid utterance index. A view must not
        return ``None``, so the rejection is made explicit.
    """
    raw_index = request.form.get('index')
    action = request.form['action']
    js_hash = request.form['hash']
    log(request.form)
    log(f'action: {action}')

    answer_markers = {'yes': 'y', 'no': 'n'}

    if action == 'get':
        cookie_hash = set_cookie(js_hash)
        return get_next_response(cookie_hash)

    if action == 'undo':
        cookie_hash = request.cookies.get('cookie-hash')
        if cookie_hash:
            last_action = r.lpop(cookie_hash)
            if last_action:
                # History entries are stored as 'answer:index:position'.
                index = int(last_action.decode('utf-8').split(':')[1])
                return get_response_by_index(index, cookie_hash)
    elif action in answer_markers:
        # A missing or malformed index must not reach Redis: it would be
        # written as a key and as a member of the score set.
        try:
            position = int(raw_index)
        except (TypeError, ValueError):
            position = -1
        if 0 <= position < len(utterances):
            cookie_hash = annotate_redis(answer_markers[action],
                                         str(position))
            if cookie_hash:
                return get_next_response(cookie_hash)

    # Nothing could be done for this request; the page reacts to any failed
    # request by asking the user to reload.
    return make_response('', 400)
def http_get():
    """Render the annotation page served for a plain GET request."""
    page = render_template('index.html')
    return page
@app.route("/", methods=['GET', 'POST']) @app.route("/", methods=['GET', 'POST'])
def root(): def root():
if request.method == 'POST': if request.method == 'POST':
return post_action() return http_post(request)
else: else:
return get_action() return http_get()
if __name__ == "__main__": if __name__ == "__main__":

1
webapp/static/hash.min.js vendored Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,7 @@
<!doctype html> <!doctype html>
<html lang="pl"> <html lang="pl">
<head>
<head>
<!-- Required meta tags --> <!-- Required meta tags -->
<meta charset="utf-8"> <meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=yes"> <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=yes">
@ -9,10 +10,11 @@
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css" integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous"> <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css" integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous">
<title>Annotator mszy świętych</title> <title>Annotator mszy świętych</title>
</head> </head>
<body>
<body>
<div class="container"> <div class="container">
<div class="container mt-1"> <div class="container mt-2">
<div class="row justify-content-start"> <div class="row justify-content-start">
<button type="button" class="btn btn-warning btn-sm" id="undo">Cofnij</button> <button type="button" class="btn btn-warning btn-sm" id="undo">Cofnij</button>
</div> </div>
@ -20,12 +22,11 @@
<h2>Czy zaznaczono godzinę mszy świętej?</h2> <h2>Czy zaznaczono godzinę mszy świętej?</h2>
</div> </div>
<div class="row justify-content-center"> <div class="row justify-content-center">
<div class="jumbotron my-auto"> <div class="jumbotron my-auto w-100" id="content">
<p class="lead">{{ left_context }}<strong class="bg-warning h4">{{ hour }}</strong>{{right_context}}</p>
</div> </div>
</div> </div>
</div> </div>
<div class="btn-group d-flex h-mx" role="group"> <div class="btn-group d-flex mb-1" role="group">
<button type="button" class="btn btn-danger btn-lg w-100" id="no">Nie</button> <button type="button" class="btn btn-danger btn-lg w-100" id="no">Nie</button>
<button type="button" class="btn btn-success btn-lg w-100" id="yes">Tak</button> <button type="button" class="btn btn-success btn-lg w-100" id="yes">Tak</button>
</div> </div>
@ -35,49 +36,102 @@
<script src="https://code.jquery.com/jquery-3.3.1.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script> <script src="https://code.jquery.com/jquery-3.3.1.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script>
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script> <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script>
<script type="text/javascript" src="{{ url_for('static', filename='hash.min.js') }}"></script>
<script type="text/javascript"> <script type="text/javascript">
$("button#yes").click(function(){ /* console.log('index = ' + {{index}}) */
$.post( "/", {result: "yes"}, function() { function update_content(dict) {
console.log( "yes button clicked" ); document.getElementById("content").innerHTML = `<p class="lead">${dict.left_context}<strong class="bg-warning h4">${dict.hour}</strong>${dict.right_context}</p>`
}
function tell_to_refresh() {
document.getElementById("content").innerHTML = '<p class="lead">Please reload page.</p>'
}
var index;
new Fingerprint2().get(function(result, components) {
console.log(result)
$.post("/", {
action: "get",
hash: result,
}, function() {
console.log("first get");
}) })
.done(function() { .done(function(reply) {
console.log( "second success" ); index = reply.index;
update_content(reply);
console.log("content set");
}) })
.fail(function() { .fail(function() {
console.log( "error" ); console.log("error");
tell_to_refresh()
}) })
.always(function() { .always(function() {
console.log( "finished" ); console.log("finished");
}); });
}); $("button#yes").click(function() {
$("button#no").click(function(){ $.post("/", {
$.post( "/", {result: "no"}, function() { action: "yes",
console.log( "no button clicked" ); index: index,
hash: result
}, function() {
console.log("yes button clicked");
}) })
.done(function() { .done(function(reply) {
console.log( "second success" ); index = reply.index;
update_content(reply);
window.scrollTo(0, document.body.scrollHeight);
}) })
.fail(function() { .fail(function() {
console.log( "error" ); console.log("error");
tell_to_refresh()
}) })
.always(function() { .always(function() {
console.log( "finished" ); console.log("finished");
}); });
}); });
$("button#undo").click(function(){ $("button#no").click(function() {
$.post( "/", {result: "undo"}, function() { $.post("/", {
console.log( "undo button clicked" ); action: "no",
index: index,
hash: result
}, function() {
console.log("no button clicked");
}) })
.done(function() { .done(function(reply) {
console.log( "second success" ); index = reply.index;
update_content(reply);
window.scrollTo(0, document.body.scrollHeight);
}) })
.fail(function() { .fail(function() {
console.log( "error" ); console.log("error");
tell_to_refresh()
}) })
.always(function() { .always(function() {
console.log( "finished" ); console.log("finished");
}); });
}); });
$("button#undo").click(function() {
$.post("/", {
action: "undo",
index: index,
hash: result
}, function() {
console.log("undo button clicked");
})
.done(function(reply) {
index = reply.index;
update_content(reply);
console.log("second success");
})
.fail(function() {
console.log("error");
tell_to_refresh()
})
.always(function() {
console.log("finished");
});
});
})
</script> </script>
</body> </body>
</html> </html>