Working annotator. Without abuse handling, but logging actions.
Modify find_hours. Modify get_utterances. Add missing parish2text-commands.sh. Working app.py. Add hash.min.js (fingerprintjs). Modify index.html: make it prettier, add functions, and more.
This commit is contained in:
parent
1f6b1e6ffe
commit
95491b20a7
@ -17,11 +17,19 @@ def borders_ok(text, start, end):
|
||||
return False
|
||||
|
||||
|
||||
def delete_duplicates(text):
|
||||
text = re.sub(' +', ' ', text)
|
||||
text = re.sub(' ?\n ?', '\n', text)
|
||||
text = re.sub('\n{5,}', '\n\n\n', text)
|
||||
text = re.sub('\n\n', '\n', text)
|
||||
return text
|
||||
|
||||
|
||||
def get_context(text, start, end, minsize):
|
||||
hour = text[start:end]
|
||||
prefix = re.sub(' +', ' ', text[:start]).rsplit(
|
||||
' ', maxsplit=minsize + 2)[1:]
|
||||
suffix = re.sub(' +', ' ', text[end:]).split(
|
||||
prefix = delete_duplicates(text[:start]).rsplit(
|
||||
' ', maxsplit=minsize + 12)[1:]
|
||||
suffix = delete_duplicates(text[end:]).split(
|
||||
' ', maxsplit=minsize + 2)[:-1]
|
||||
return ' '.join(prefix), hour, ' '.join(suffix)
|
||||
|
||||
@ -33,12 +41,12 @@ def hours_iterator(text, minsize=20, color=False):
|
||||
if not borders_ok(text, start, end):
|
||||
continue
|
||||
prefix, hour, suffix = get_context(text, start, end, minsize)
|
||||
utterance = f'{prefix}&&&{hour}###{suffix}'
|
||||
if color:
|
||||
utterance = f'{prefix}&&&{hour}###{suffix}'
|
||||
yield utterance, color_hour(prefix, hour, suffix, Fore.GREEN,
|
||||
Style.BRIGHT)
|
||||
else:
|
||||
yield utterance
|
||||
yield prefix, hour, suffix
|
||||
|
||||
|
||||
# w klasyfikatorze dzielić tak aby jeszcze \n było oddzielnie
|
||||
|
@ -8,7 +8,6 @@ import re
|
||||
import pickle
|
||||
|
||||
|
||||
# r = redis.StrictRedis(host='localhost', port=6379, db=0)
|
||||
class Utterance():
|
||||
def __init__(self, utterance, url, button_text, depth, filename, line_no):
|
||||
"docstring"
|
||||
@ -23,11 +22,26 @@ class Utterance():
|
||||
def add_utterances(parish_page, parish_path, utterances):
|
||||
utterances_nr = 0
|
||||
content = parish_page['content']
|
||||
for utterances_nr, utterance in enumerate(hours_iterator(content)):
|
||||
utterance_inst = Utterance(
|
||||
utterance, parish_page['url'], parish_page['button_text'],
|
||||
parish_page['depth'], parish_path, parish_page['line_no'])
|
||||
utterances.append(utterance_inst)
|
||||
for utterances_nr, (prefix, hour, suffix) in enumerate(
|
||||
hours_iterator(content)):
|
||||
# utterance_inst = Utterance(
|
||||
# utterance, parish_page['url'], parish_page['button_text'],
|
||||
# parish_page['depth'], parish_path, parish_page['line_no'])
|
||||
utterance_dict = {
|
||||
'prefix': prefix,
|
||||
'hour': hour,
|
||||
'suffix': suffix,
|
||||
'url': parish_page['url'],
|
||||
'button_text': parish_page['button_text'],
|
||||
'depth': parish_page['depth'],
|
||||
'filepath': parish_path,
|
||||
'line_no': parish_page['line_no']
|
||||
}
|
||||
# print(prefix)
|
||||
# print(hour)
|
||||
# print('-------------------------------------------------')
|
||||
# print(suffix)
|
||||
utterances.append(utterance_dict)
|
||||
return utterances_nr
|
||||
|
||||
|
||||
@ -96,9 +110,11 @@ def remove_duplicates(utterances):
|
||||
seen = set()
|
||||
res = []
|
||||
for utt in utterances:
|
||||
if utt.utterance not in seen:
|
||||
# TODO: check why with &&&hour### it gives more utterances!!!
|
||||
utterance_text = utt['prefix'] + utt['hour'] + utt['suffix']
|
||||
if utterance_text not in seen:
|
||||
res.append(utt)
|
||||
seen.add(utt.utterance)
|
||||
seen.add(utterance_text)
|
||||
return res
|
||||
|
||||
|
||||
@ -148,6 +164,7 @@ def get_extracted_by_rules(filename):
|
||||
|
||||
|
||||
def main():
|
||||
# TODO: use argparse and add makefile goal
|
||||
extracted_by_rules = get_extracted_by_rules('./extracted-by-rules.txt')
|
||||
utterances = load_parishes('./parishwebsites/text-data',
|
||||
extracted_by_rules)
|
||||
|
5
parishwebsites/parish2text-commands.sh
Executable file
5
parishwebsites/parish2text-commands.sh
Executable file
@ -0,0 +1,5 @@
|
||||
#!/usr/bin/env bash
# Print one shell command per entry found in the directory given as $1.
# Each printed command runs ./parish2text.py with that entry on stdin,
# writing stdout to text-data/<name> and stderr to text-data-logs/<name>.
# The commands are only echoed here, not executed.
#
# Usage: parish2text-commands.sh <input-directory>
for file in "$1"/*; do
    # Quote every expansion so paths containing whitespace or glob
    # characters are not word-split; "--" guards names starting with "-".
    filename=$(basename -- "$file")
    echo "./parish2text.py < \"$file\" > \"text-data/$filename\" 2> \"text-data-logs/$filename\""
done
|
167
webapp/app.py
167
webapp/app.py
@ -1,38 +1,169 @@
|
||||
from flask import Flask, render_template, request, make_response
|
||||
from flask import Flask, render_template, request, make_response, jsonify
|
||||
import os
|
||||
from get_utterances import Utterance
|
||||
import redis
|
||||
import pickle
|
||||
import re
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
def load_parishes(directory):
|
||||
return {}
|
||||
r = redis.StrictRedis(host='localhost', port=6379, db=0)
|
||||
|
||||
|
||||
parishes = load_parishes('dir')
|
||||
def log(msg):
    """Append ``msg`` as one line to the debug log file ``/tmp/tmp``.

    Args:
        msg: Any object; it is converted with ``str()`` by ``print``.

    The file is opened in append mode so that consecutive calls accumulate
    instead of each call truncating what the previous one wrote.
    """
    with open('/tmp/tmp', 'a', encoding='utf-8') as f:
        # ``file=`` must be passed by keyword: a positional second argument
        # is treated as another value to print, and the output would go to
        # stdout instead of the log file.
        print(msg, file=f, flush=True)
|
||||
|
||||
|
||||
def post_action():
|
||||
return get_action()
|
||||
def load_utterances(filename):
|
||||
with open(filename, 'rb') as f:
|
||||
utterances = pickle.load(f)
|
||||
return utterances
|
||||
|
||||
|
||||
def get_action():
|
||||
hour = '12.00'
|
||||
left_context = 'Dawno, dawno temu był sobia para młoda, bardzo piękna para młoda. Msza rozpocznie się o godzinie '
|
||||
right_context = '. Następnie para młoda uda się na wesele do Kubusia i będą się bawić do białegop prana.'
|
||||
resp = make_response(
|
||||
render_template(
|
||||
'index.html',
|
||||
hour=hour,
|
||||
utterances = load_utterances(
|
||||
'/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl')
|
||||
|
||||
UTT_SCORES = 'utterance-scores'
|
||||
# log(utterances[0:2])
|
||||
|
||||
#initialize_redis_db
|
||||
# r.flushdb()
|
||||
# status = None
|
||||
status = r.get('status')
|
||||
if status:
|
||||
status = status.decode('utf-8')
|
||||
else:
|
||||
r.flushdb()
|
||||
if status != 'filled':
|
||||
log('filling status')
|
||||
for i in range(len(utterances)):
|
||||
r.zadd(UTT_SCORES, 0, str(i))
|
||||
r.set('status', 'filled')
|
||||
status = 'filled'
|
||||
|
||||
|
||||
def get_next():
|
||||
index = int(r.zrangebyscore(UTT_SCORES, '-inf', 'inf')[1])
|
||||
left_context = utterances[index]['prefix'].replace('\n', '<br>')
|
||||
hour = utterances[index]['hour'].replace('\n', '<br>')
|
||||
right_context = ' '.join(
|
||||
utterances[index]['suffix'].split(' ')[:10]).replace('\n', '<br>')
|
||||
# log('get_next index: {}, score: {}'.format(index,
|
||||
# r.zscore(UTT_SCORES, index)))
|
||||
return index, left_context, hour, right_context
|
||||
|
||||
|
||||
def get_next_response(cookie_hash):
|
||||
index, left_context, hour, right_context = get_next()
|
||||
resp = jsonify(
|
||||
index=index,
|
||||
left_context=left_context,
|
||||
right_context=right_context))
|
||||
hour=hour,
|
||||
right_context=right_context)
|
||||
if cookie_hash:
|
||||
resp.set_cookie('cookie-hash', cookie_hash, max_age=60 * 60 * 24 * 90)
|
||||
return resp
|
||||
|
||||
|
||||
def get_by_index(index):
|
||||
left_context = utterances[index]['prefix'].replace('\n', '<br>')
|
||||
hour = utterances[index]['hour'].replace('\n', '<br>')
|
||||
right_context = ' '.join(
|
||||
utterances[index]['suffix'].split(' ')[:10]).replace('\n', '<br>')
|
||||
# log('get_next index: {}, score: {}'.format(index,
|
||||
# r.zscore(UTT_SCORES, index)))
|
||||
return index, left_context, hour, right_context
|
||||
|
||||
|
||||
def get_response_by_index(index, cookie_hash):
|
||||
index, left_context, hour, right_context = get_by_index(index)
|
||||
resp = jsonify(
|
||||
index=index,
|
||||
left_context=left_context,
|
||||
hour=hour,
|
||||
right_context=right_context)
|
||||
if cookie_hash:
|
||||
resp.set_cookie('cookie-hash', cookie_hash, max_age=60 * 60 * 24 * 90)
|
||||
return resp
|
||||
|
||||
|
||||
def annotate_redis(yesno, index):
|
||||
cookie_hash = request.cookies.get('cookie-hash')
|
||||
if not cookie_hash:
|
||||
return None
|
||||
annotation = r.get('{}:{}'.format(
|
||||
cookie_hash, index)) # previous annotation of utterance by that user
|
||||
if annotation:
|
||||
log('!!!!!!!!!!!')
|
||||
log(annotation)
|
||||
str_index = int(annotation.decode('utf-8').split(':')[1])
|
||||
log(str_index)
|
||||
r.setrange(index, str_index, yesno) #sets str_index to yesno value
|
||||
else:
|
||||
# before = r.zscore(UTT_SCORES, index)
|
||||
r.zincrby(UTT_SCORES, index)
|
||||
# log('incrementing index {}, before_val: {}, value: {}'.format(
|
||||
# index, before, r.zscore(UTT_SCORES, index)))
|
||||
str_index = r.append(index, yesno) - 1
|
||||
r.set('{}:{}'.format(cookie_hash, index), '{}:{}'.format(yesno, str_index))
|
||||
r.lpush(cookie_hash, '{}:{}:{}'.format(yesno, index, str_index))
|
||||
return cookie_hash
|
||||
|
||||
|
||||
def set_cookie(js_hash):
    """Return the cookie hash that identifies the requesting annotator.

    Lookup order:
      1. the ``cookie-hash`` cookie sent with the current request;
      2. the cookie hash previously stored in Redis under ``js_hash``;
      3. a freshly generated random value, which is then stored in Redis
         under ``js_hash`` (only if that key does not exist yet).

    Args:
        js_hash: Browser fingerprint string posted by the client.

    Returns:
        The cookie hash as a ``str``.
    """
    # TODO: append new js_hash values to a list keyed by cookie_hash
    old_cookie_hash = None
    cookie_hash = request.cookies.get('cookie-hash')
    if not cookie_hash:
        old_cookie_hash = r.get(js_hash)
        if not old_cookie_hash:
            cookie_hash = str(
                int.from_bytes(os.urandom(4), byteorder='little'))
            r.set(js_hash, cookie_hash, nx=True)
        elif isinstance(old_cookie_hash, bytes):
            # Redis replies are bytes here; str() on bytes would produce the
            # literal "b'...'" text and hand the user a different identity
            # than the one stored, so decode instead.
            cookie_hash = old_cookie_hash.decode('utf-8')
        else:
            cookie_hash = str(old_cookie_hash)
    log('old_cookie: {}, cookie: {}'.format(old_cookie_hash, cookie_hash))
    return cookie_hash
|
||||
|
||||
|
||||
def http_post(request):
|
||||
index = str(request.form.get('index'))
|
||||
action = request.form['action']
|
||||
js_hash = request.form['hash']
|
||||
log(request.form)
|
||||
log(f'action: {action}')
|
||||
if action == 'get':
|
||||
cookie_hash = set_cookie(js_hash)
|
||||
return get_next_response(cookie_hash)
|
||||
elif action == 'undo':
|
||||
cookie_hash = request.cookies.get('cookie-hash')
|
||||
if cookie_hash:
|
||||
last_action = r.lpop(cookie_hash)
|
||||
if last_action:
|
||||
index = int(last_action.decode('utf-8').split(':')[1])
|
||||
return get_response_by_index(index, cookie_hash)
|
||||
# if no cookie-hash or action list is empty return None
|
||||
elif action == 'yes':
|
||||
cookie_hash = annotate_redis('y', index)
|
||||
if cookie_hash:
|
||||
return get_next_response(cookie_hash)
|
||||
elif action == 'no':
|
||||
cookie_hash = annotate_redis('n', index)
|
||||
if cookie_hash:
|
||||
return get_next_response(cookie_hash)
|
||||
|
||||
|
||||
def http_get():
|
||||
return render_template('index.html')
|
||||
|
||||
|
||||
@app.route("/", methods=['GET', 'POST'])
|
||||
def root():
|
||||
if request.method == 'POST':
|
||||
return post_action()
|
||||
return http_post(request)
|
||||
else:
|
||||
return get_action()
|
||||
return http_get()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
1
webapp/static/hash.min.js
vendored
Normal file
1
webapp/static/hash.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
@ -1,5 +1,6 @@
|
||||
<!doctype html>
|
||||
<html lang="pl">
|
||||
|
||||
<head>
|
||||
<!-- Required meta tags -->
|
||||
<meta charset="utf-8">
|
||||
@ -10,9 +11,10 @@
|
||||
|
||||
<title>Annotator mszy świętych</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="container mt-1">
|
||||
<div class="container mt-2">
|
||||
<div class="row justify-content-start">
|
||||
<button type="button" class="btn btn-warning btn-sm" id="undo">Cofnij</button>
|
||||
</div>
|
||||
@ -20,12 +22,11 @@
|
||||
<h2>Czy zaznaczono godzinę mszy świętej?</h2>
|
||||
</div>
|
||||
<div class="row justify-content-center">
|
||||
<div class="jumbotron my-auto">
|
||||
<p class="lead">{{ left_context }}<strong class="bg-warning h4">{{ hour }}</strong>{{right_context}}</p>
|
||||
<div class="jumbotron my-auto w-100" id="content">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="btn-group d-flex h-mx" role="group">
|
||||
<div class="btn-group d-flex mb-1" role="group">
|
||||
<button type="button" class="btn btn-danger btn-lg w-100" id="no">Nie</button>
|
||||
<button type="button" class="btn btn-success btn-lg w-100" id="yes">Tak</button>
|
||||
</div>
|
||||
@ -35,49 +36,102 @@
|
||||
<script src="https://code.jquery.com/jquery-3.3.1.min.js" integrity="sha256-FgpCb/KJQlLNfOu91ta32o/NMZxltwRo8QtmkMRdAu8=" crossorigin="anonymous"></script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.3/umd/popper.min.js" integrity="sha384-ZMP7rVo3mIykV+2+9J3UJ46jBk0WLaUAdn689aCwoqbBJiSnjAK/l8WvCWPIPm49" crossorigin="anonymous"></script>
|
||||
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js" integrity="sha384-smHYKdLADwkXOn1EmN1qk/HfnUcbVRZyYmZ4qpPea6sjB/pTJ0euyQp0Mk8ck+5T" crossorigin="anonymous"></script>
|
||||
<script type="text/javascript" src="{{ url_for('static', filename='hash.min.js') }}"></script>
|
||||
<script type="text/javascript">
|
||||
$("button#yes").click(function(){
|
||||
$.post( "/", {result: "yes"}, function() {
|
||||
console.log( "yes button clicked" );
|
||||
/* console.log('index = ' + {{index}}) */
|
||||
function update_content(dict) {
|
||||
document.getElementById("content").innerHTML = `<p class="lead">${dict.left_context}<strong class="bg-warning h4">${dict.hour}</strong>${dict.right_context}</p>`
|
||||
}
|
||||
|
||||
function tell_to_refresh() {
|
||||
document.getElementById("content").innerHTML = '<p class="lead">Please reload page.</p>'
|
||||
}
|
||||
var index;
|
||||
new Fingerprint2().get(function(result, components) {
|
||||
console.log(result)
|
||||
$.post("/", {
|
||||
action: "get",
|
||||
hash: result,
|
||||
}, function() {
|
||||
console.log("first get");
|
||||
})
|
||||
.done(function() {
|
||||
console.log( "second success" );
|
||||
.done(function(reply) {
|
||||
index = reply.index;
|
||||
update_content(reply);
|
||||
console.log("content set");
|
||||
})
|
||||
.fail(function() {
|
||||
console.log("error");
|
||||
tell_to_refresh()
|
||||
})
|
||||
.always(function() {
|
||||
console.log("finished");
|
||||
});
|
||||
$("button#yes").click(function() {
|
||||
$.post("/", {
|
||||
action: "yes",
|
||||
index: index,
|
||||
hash: result
|
||||
}, function() {
|
||||
console.log("yes button clicked");
|
||||
})
|
||||
.done(function(reply) {
|
||||
index = reply.index;
|
||||
update_content(reply);
|
||||
window.scrollTo(0, document.body.scrollHeight);
|
||||
})
|
||||
.fail(function() {
|
||||
console.log("error");
|
||||
tell_to_refresh()
|
||||
})
|
||||
.always(function() {
|
||||
console.log("finished");
|
||||
});
|
||||
});
|
||||
$("button#no").click(function() {
|
||||
$.post( "/", {result: "no"}, function() {
|
||||
$.post("/", {
|
||||
action: "no",
|
||||
index: index,
|
||||
hash: result
|
||||
}, function() {
|
||||
console.log("no button clicked");
|
||||
})
|
||||
.done(function() {
|
||||
console.log( "second success" );
|
||||
.done(function(reply) {
|
||||
index = reply.index;
|
||||
update_content(reply);
|
||||
window.scrollTo(0, document.body.scrollHeight);
|
||||
})
|
||||
.fail(function() {
|
||||
console.log("error");
|
||||
tell_to_refresh()
|
||||
})
|
||||
.always(function() {
|
||||
console.log("finished");
|
||||
});
|
||||
});
|
||||
$("button#undo").click(function() {
|
||||
$.post( "/", {result: "undo"}, function() {
|
||||
$.post("/", {
|
||||
action: "undo",
|
||||
index: index,
|
||||
hash: result
|
||||
}, function() {
|
||||
console.log("undo button clicked");
|
||||
})
|
||||
.done(function() {
|
||||
.done(function(reply) {
|
||||
index = reply.index;
|
||||
update_content(reply);
|
||||
console.log("second success");
|
||||
})
|
||||
.fail(function() {
|
||||
console.log("error");
|
||||
tell_to_refresh()
|
||||
})
|
||||
.always(function() {
|
||||
console.log("finished");
|
||||
});
|
||||
});
|
||||
})
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
Loading…
Reference in New Issue
Block a user