Add redis data structures description. Handle banned users.

Rename annotation_stats ---> annotator_console.py Add ban and users stats function in annotator_console.py
2018-05-26 19:07:08 +02:00 · 2018-05-26 19:07:08 +02:00 · 606ebb5260
commit 606ebb5260
parent 626307f135
3 changed files with 332 additions and 127 deletions
--- a/annotation_stats.py
+++ b/annotation_stats.py
@ -1,120 +0,0 @@
 #!/usr/bin/env python3
 import sys
 import argparse
 import redis
 from extractor.find_hours import color_hour
 import pickle
 from colorama import Fore, Back, Style
 import time
 import datetime
 import re
 r = redis.StrictRedis(unix_socket_path='/redis-socket/redis.sock', db=0)
 def load_utterances(filename):
    with open(filename, 'rb') as f:
        utterances = pickle.load(f)
    return utterances
 utterances = load_utterances(
    '/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl')
 def format_time(timestamp):
    return datetime.datetime.fromtimestamp(timestamp).strftime(
        '%H:%M:%S.%f   %Y-%m-%d')
 def investigate_by_cookie(cookie_hash):
    cx = 0
    index_stop = None
    for key in sorted(set(r.scan_iter())):
        key = key.decode('utf-8')
        if ':' in key and not '.' in key and cookie_hash in key:
            if cx != 0:
                cx -= 1
                continue
            index = int(key.split(':')[1])
            if index_stop and index_stop != index:
                continue
            annotation_info = r.get(key).decode('utf-8')
            pprint_utterance(index, annotation_info)
            print(index)
            print(format_time(float(annotation_info.split(':')[2])))
            # print(annotation_info)
            action = input(
                'c: continue, cX: continue Xtimes, number: goto index\n')
            if action.isdigit():
                index_stop = int(action)
            else:
                index_stop = None
            if action[0] == 'c':
                if action[1:]:
                    cx = int(action[1:])
 def pprint_utterance(index, annotation_info=None):
    if not annotation_info:
        annotation_info = ['y']
    color = Fore.GREEN if annotation_info[0] == 'y' else Fore.RED
    print(
        color_hour(utterances[index]['prefix'], utterances[index]['hour'],
                   utterances[index]['suffix'], color))
 def print_stats():
    annotated = set()
    all_count = 0
    for key in set(r.scan_iter()):
        key = key.decode('utf-8')
        if ':' in key and not '.' in key:
            all_count += 1
            index = key.split(':')[1]
            annotated.add(index)
    print('All annotations: {}'.format(all_count))
    print('Annotated utterances: {}'.format(len(annotated)))
 def get_args():
    parser = argparse.ArgumentParser()
    subparser = parser.add_subparsers(dest='cmd')
    parser_stats = subparser.add_parser('stats', help='Show annotation stats.')
    parser_investigate = subparser.add_parser(
        'investigate', help='investigate cookie.')
    parser_investigate.add_argument('cookie', help='User cookie string')
    parser_index = subparser.add_parser('index', help='Print utterance.')
    parser_index.add_argument('index', type=int, help='Utterance index')
    subparser.add_parser('ipdb', help='Get into ipdb.')
    parser_exec = subparser.add_parser('exec', help='Execute redis command.')
    parser_exec.add_argument(
        'redis_command',
        help=
        'Redis command (lowercased). e.g. to get r.zrangebyscore("key", "-inf", "inf", start=0, num=1) pass \'zrangebyscore("key", "-inf", "inf", start=0, num=1)\''
    )
    return parser.parse_args()
 def main():
    args = get_args()
    if args.cmd == 'stats':
        print_stats()
    elif args.cmd == 'investigate':
        investigate_by_cookie(args.cookie)
    elif args.cmd == 'index':
        pprint_utterance(args.index)
    elif args.cmd == 'ipdb':
        import ipdb
        ipdb.set_trace()
    elif args.cmd == 'exec':
        exec('print(r.{})'.format(args.redis_command), {
            'print': print,
            'r': r
        })
 if __name__ == '__main__':
    main()
--- a/annotator_console.py
+++ b/annotator_console.py
@ -0,0 +1,252 @@
 #!/usr/bin/env python3
 import sys
 import argparse
 import redis
 from extractor.find_hours import color_hour
 import pickle
 from colorama import Fore, Back, Style
 import time
 import datetime
 import re
 from collections import defaultdict
 import math
 r = redis.StrictRedis(unix_socket_path='/redis-socket/redis.sock', db=0)
 def load_utterances(filename):
    with open(filename, 'rb') as f:
        utterances = pickle.load(f)
    return utterances
 utterances = load_utterances(
    '/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl')
 def format_time(timestamp):
    return datetime.datetime.fromtimestamp(timestamp).strftime(
        '%H:%M:%S.%f   %Y-%m-%d')
 def is_cookie_index(key):
    if ':' in key and not 'jshash' in key and not '.' in key and r.type(
            key) == b'string':
        return True
 def investigate_by_cookie(cookie_hash):
    cx = 0
    index_stop = None
    for key in sorted(set(r.scan_iter())):
        key = key.decode('utf-8')
        if is_cookie_index(key) and cookie_hash in key:
            if cx != 0:
                cx -= 1
                continue
            index = int(key.split(':')[1])
            if index_stop and index_stop != index:
                continue
            annotation_info = r.get(key).decode('utf-8')
            pprint_utterance(index, annotation_info)
            print(index)
            print(format_time(float(annotation_info.split(':')[2])))
            print(annotation_info)
            action = input(
                'c: continue, cX: continue Xtimes, number: goto index\n')
            if action.isdigit():
                index_stop = int(action)
            else:
                index_stop = None
            if action[0] == 'c':
                if action[1:].isdigit():
                    cx = int(action[1:])
 def pprint_utterance(index, annotation_info=None):
    if not annotation_info:
        annotation_info = ['y']
    color = Fore.GREEN if annotation_info[0] in 'yYt' else Fore.RED
    print(
        color_hour(utterances[index]['prefix'], utterances[index]['hour'],
                   utterances[index]['suffix'], color))
 def print_stats():
    annotated = set()
    all_count = 0
    for key in set(r.scan_iter()):
        key = key.decode('utf-8')
        if is_cookie_index(key) and not r.sismember('banned',
                                                    key.split(':')[0]):
            all_count += 1
            index = key.split(':')[1]
            annotated.add(index)
    print('All annotations: {}'.format(all_count))
    print('Annotated utterances: {}/{}'.format(
        len(annotated), len(utterances)))
 def ban(cookie):
    r.sadd('banned', cookie)
    for key in set(r.scan_iter()):
        key = key.decode('utf-8')
        if is_cookie_index(key) and cookie in key.split(':')[0]:
            user, index = key.split(':')
            annotation = r.get(key).decode('utf-8')
            if annotation[0] in 'yn':
                yesno = annotation[0].translate(str.maketrans('yn', 'tf'))
                r.setrange(key, 0, yesno)
                str_index = int(annotation.split(':')[1])
                r.setrange(index, str_index,
                           yesno)  #sets str_index to yesno value
                r.zincrby('utterance-scores', index, -1)
 def users_stats():
    users_dict = defaultdict(lambda: defaultdict(list))
    users_set = set()
    for key in sorted(set(r.scan_iter())):
        key = key.decode('utf-8')
        if is_cookie_index(key):
            user = key.split(':')[0]
            users_set.add(user)
            res = r.get(key)
            res_list = res.decode('utf-8').split(':')
            if len(res_list) == 4:
                yesno, str_index, timestamp, ip_addr = res_list
            else:
                yesno, str_index, timestamp = res_list
                ip_addr = '0'
            if 'yes_count' not in users_dict[user]:
                users_dict[user]['yes_count'] = 0
            if 'no_count' not in users_dict[user]:
                users_dict[user]['no_count'] = 0
            if yesno in 'yYt':
                users_dict[user]['yes_count'] += 1
            elif yesno in 'nNf':
                users_dict[user]['no_count'] += 1
            users_dict[user]['annotations'].append({
                'yesno':
                yesno,
                'str_index':
                int(str_index),
                'timestamp':
                float(timestamp),
                'ip_addr':
                ip_addr
            })
    for user in users_set:
        users_dict[user]['annotations'] = sorted(
            users_dict[user]['annotations'], key=lambda x: x['timestamp'])
    calculate_avg_annotation_time(users_dict)
    print_sorted(users_dict)
 def calculate_avg_annotation_time(users_dict, max_interval=10):
    for user, user_dict in users_dict.items():
        delta_sum = 0
        divider = 0
        breaks = 0
        for ann_1, ann_2 in zip(user_dict['annotations'],
                                user_dict['annotations'][1:]):
            delta = ann_2['timestamp'] - ann_1['timestamp']
            if delta < 10:
                delta_sum += delta
                divider += 1
            else:
                breaks += 1
        if delta_sum == 0:
            user_dict['avg_time'] = math.inf
        else:
            user_dict['avg_time'] = delta_sum / divider
        user_dict['breaks'] = breaks
 def print_sorted(users_dict, sortby='annotations max'):
    print('\t'.join(
        ['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks',
         'status']))
    if sortby == 'annotations max':
        keyfunc = lambda x: len(x[1]['annotations'])
    for user, user_dict in sorted(
            users_dict.items(), key=keyfunc, reverse=True):
        if user_dict['yes_count'] + user_dict['no_count'] != len(
                user_dict['annotations']):
            import ipdb
            ipdb.set_trace()
        status = 'uncertain'
        if r.sismember('banned', user):
            status = 'banned'
        elif r.sismember('trusted', user):
            status = 'trusted'
        elif r.sismember('trusted-checked', user):
            status = 'trusted-checked'
        print('\t'.join([
            user,
            str(len(user_dict['annotations'])),
            str(user_dict['yes_count']),
            str(user_dict['no_count']),
            str(user_dict['avg_time']),
            str(user_dict['breaks']), status
        ]))
 def 2tsv():
    pass
 def get_args():
    parser = argparse.ArgumentParser()
    subparser = parser.add_subparsers(dest='cmd')
    parser_stats = subparser.add_parser('stats', help='Show annotation stats.')
    parser_investigate = subparser.add_parser(
        'investigate', help='investigate cookie.')
    parser_investigate.add_argument('cookie', help='User cookie string')
    parser_index = subparser.add_parser('index', help='Print utterance.')
    parser_index.add_argument('index', type=int, help='Utterance index')
    subparser.add_parser('ipdb', help='Get into ipdb.')
    parser_exec = subparser.add_parser('exec', help='Execute redis command.')
    parser_exec.add_argument(
        'redis_command',
        help=
        'Redis command (lowercased). e.g. to get r.zrangebyscore("key", "-inf", "inf", start=0, num=1) pass \'zrangebyscore("key", "-inf", "inf", start=0, num=1)\''
    )
    parser_users = subparser.add_parser('users', help='User statistics')
    parser_ban = subparser.add_parser('ban', help='Ban user')
    parser_ban.add_argument('cookie', help='Cookie.')
    parser_2tsv = subparser.add_parser('2tsv', help='Convert data to tsv')
    return parser, parser.parse_args()
 def main():
    parser, args = get_args()
    if args.cmd == 'stats':
        print_stats()
    elif args.cmd == 'investigate':
        investigate_by_cookie(args.cookie)
    elif args.cmd == 'users':
        users_stats()
    elif args.cmd == 'index':
        pprint_utterance(args.index)
    elif args.cmd == 'ban':
        ban(args.cookie)
    elif args.cmd == '2tsv':
        2tsv()
    elif args.cmd == 'ipdb':
        import ipdb
        ipdb.set_trace()
    elif args.cmd == 'exec':
        exec('print(r.{})'.format(args.redis_command), {
            'print': print,
            'r': r
        })
    else:
        print(parser.format_help())
 if __name__ == '__main__':
    main()
--- a/webapp/app.py
+++ b/webapp/app.py
@ -1,3 +1,76 @@
 """
 ### REDIS data structures in annotator:
 STRINGS:
    'status'
        ---> {'filled', None, other_string}
            'filled'
                indicate that utterance-scores are properly filled
                with utterance ids
            None
                clears database
            other non empty value
                adds non existance utterances and sets their score to 0
    '{cookie}:{index}'
        ---> '{yesno}:{str_index}:{timestamp}:{ip_addr}'
    '{ip_addr}:{index}'
        ---> '{yesno}:{str_index}:{timestamp}:{cookie}'
    '{index}'
        ---> '[yYnNtf]*' e.g. 'ynyNYnynYytf'
            y
                uncertain yes
            t
                banned yes
            Y
                trusted yes (not handled yet)
            n
                uncertain no
            N
                trusted no (not handled yet)
            f
                banned no
    'jshash:{js_hash}'
        ---> '{cookie}'
 LISTS:
    '{cookie}'
        ---> '{yesno}:{index}:{str_index}'
    'undo:{cookie}'
        ---> '{yesno}:{index}:{str_index}'
 SETS:
    'ip-cookies:{ip_addr}'
        ---> '{cookie}'
            expires after 3 hours
    'trusted'
        ---> '{cookie}'
            trusted because I know this person
    'trusted-checked'
        ---> '{cookie}'
    'banned'
        ---> '{cookie}'
 SORTED SETS:
    'utterance-scores'
        ---> {index} ---> {nr_of_annotations}
 """
 from flask import Flask, render_template, request, make_response, jsonify
 import secrets
 import time
@ -70,8 +143,8 @@ def get_next(cookie_hash):
        index = find_not_annotated(cookie_hash)
        log('found unannotated index: {}'.format(index))
    left_context, hour, right_context = get_utterance_for_web(index)
-    log('get_next index: {}, score: {}'.format(index,
+    # log('get_next index: {}, score: {}'.format(index,
-                                               r.zscore(UTT_SCORES, index)))
+    #                                            r.zscore(UTT_SCORES, index)))
    return index, left_context, hour, right_context
@ -108,18 +181,18 @@ def get_response_by_index(index, cookie_hash):
 def annotate_redis(yesno, index, ip_addr, cookie_hash):
    # log('annotate: {}'.format(cookie_hash))
    banned = r.sismember('banned', cookie_hash)
    if banned:
        yesno = yesno.translate(str.maketrans('yn', 'tf'))
    timestamp = time.time()
    annotation = r.get('{}:{}'.format(
        cookie_hash, index))  # previous annotation of utterance by that user
    if annotation:
        # log(annotation.decode('utf-8'))
        str_index = int(annotation.decode('utf-8').split(':')[1])
        r.setrange(index, str_index, yesno)  #sets str_index to yesno value
    else:
-        # before = r.zscore(UTT_SCORES, index)
+        if not banned:
            r.zincrby(UTT_SCORES, index)
        # log('incrementing index {}, before_val: {}, value: {}'.format(
        #     index, before, r.zscore(UTT_SCORES, index)))
        str_index = r.append(index, yesno) - 1
    r.set('{}:{}'.format(cookie_hash, index), '{}:{}:{}:{}'.format(
        yesno, str_index, timestamp, ip_addr))