diff --git a/annotation_stats.py b/annotation_stats.py deleted file mode 100755 index f678f22..0000000 --- a/annotation_stats.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python3 -import sys - -import argparse -import redis -from extractor.find_hours import color_hour -import pickle -from colorama import Fore, Back, Style -import time -import datetime -import re - -r = redis.StrictRedis(unix_socket_path='/redis-socket/redis.sock', db=0) - - -def load_utterances(filename): - with open(filename, 'rb') as f: - utterances = pickle.load(f) - return utterances - - -utterances = load_utterances( - '/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl') - - -def format_time(timestamp): - return datetime.datetime.fromtimestamp(timestamp).strftime( - '%H:%M:%S.%f %Y-%m-%d') - - -def investigate_by_cookie(cookie_hash): - cx = 0 - index_stop = None - for key in sorted(set(r.scan_iter())): - key = key.decode('utf-8') - if ':' in key and not '.' in key and cookie_hash in key: - if cx != 0: - cx -= 1 - continue - index = int(key.split(':')[1]) - if index_stop and index_stop != index: - continue - annotation_info = r.get(key).decode('utf-8') - pprint_utterance(index, annotation_info) - print(index) - print(format_time(float(annotation_info.split(':')[2]))) - # print(annotation_info) - action = input( - 'c: continue, cX: continue Xtimes, number: goto index\n') - if action.isdigit(): - index_stop = int(action) - else: - index_stop = None - if action[0] == 'c': - if action[1:]: - cx = int(action[1:]) - - -def pprint_utterance(index, annotation_info=None): - if not annotation_info: - annotation_info = ['y'] - color = Fore.GREEN if annotation_info[0] == 'y' else Fore.RED - print( - color_hour(utterances[index]['prefix'], utterances[index]['hour'], - utterances[index]['suffix'], color)) - - -def print_stats(): - annotated = set() - all_count = 0 - for key in set(r.scan_iter()): - key = key.decode('utf-8') - if ':' in key and not '.' 
in key: - all_count += 1 - index = key.split(':')[1] - annotated.add(index) - print('All annotations: {}'.format(all_count)) - print('Annotated utterances: {}'.format(len(annotated))) - - -def get_args(): - parser = argparse.ArgumentParser() - subparser = parser.add_subparsers(dest='cmd') - parser_stats = subparser.add_parser('stats', help='Show annotation stats.') - parser_investigate = subparser.add_parser( - 'investigate', help='investigate cookie.') - parser_investigate.add_argument('cookie', help='User cookie string') - parser_index = subparser.add_parser('index', help='Print utterance.') - parser_index.add_argument('index', type=int, help='Utterance index') - subparser.add_parser('ipdb', help='Get into ipdb.') - parser_exec = subparser.add_parser('exec', help='Execute redis command.') - parser_exec.add_argument( - 'redis_command', - help= - 'Redis command (lowercased). e.g. to get r.zrangebyscore("key", "-inf", "inf", start=0, num=1) pass \'zrangebyscore("key", "-inf", "inf", start=0, num=1)\'' - ) - - return parser.parse_args() - - -def main(): - args = get_args() - if args.cmd == 'stats': - print_stats() - elif args.cmd == 'investigate': - investigate_by_cookie(args.cookie) - elif args.cmd == 'index': - pprint_utterance(args.index) - elif args.cmd == 'ipdb': - import ipdb - ipdb.set_trace() - elif args.cmd == 'exec': - exec('print(r.{})'.format(args.redis_command), { - 'print': print, - 'r': r - }) - - -if __name__ == '__main__': - main() diff --git a/annotator_console.py b/annotator_console.py new file mode 100755 index 0000000..c560e4e --- /dev/null +++ b/annotator_console.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 +import sys + +import argparse +import redis +from extractor.find_hours import color_hour +import pickle +from colorama import Fore, Back, Style +import time +import datetime +import re +from collections import defaultdict +import math + +r = redis.StrictRedis(unix_socket_path='/redis-socket/redis.sock', db=0) + + +def 
load_utterances(filename): + with open(filename, 'rb') as f: + utterances = pickle.load(f) + return utterances + + +utterances = load_utterances( + '/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl') + + +def format_time(timestamp): + return datetime.datetime.fromtimestamp(timestamp).strftime( + '%H:%M:%S.%f %Y-%m-%d') + + +def is_cookie_index(key): + if ':' in key and not 'jshash' in key and not '.' in key and r.type( + key) == b'string': + return True + + +def investigate_by_cookie(cookie_hash): + cx = 0 + index_stop = None + for key in sorted(set(r.scan_iter())): + key = key.decode('utf-8') + if is_cookie_index(key) and cookie_hash in key: + if cx != 0: + cx -= 1 + continue + index = int(key.split(':')[1]) + if index_stop and index_stop != index: + continue + annotation_info = r.get(key).decode('utf-8') + pprint_utterance(index, annotation_info) + print(index) + print(format_time(float(annotation_info.split(':')[2]))) + print(annotation_info) + action = input( + 'c: continue, cX: continue Xtimes, number: goto index\n') + if action.isdigit(): + index_stop = int(action) + else: + index_stop = None + if action[0] == 'c': + if action[1:].isdigit(): + cx = int(action[1:]) + + +def pprint_utterance(index, annotation_info=None): + if not annotation_info: + annotation_info = ['y'] + color = Fore.GREEN if annotation_info[0] in 'yYt' else Fore.RED + print( + color_hour(utterances[index]['prefix'], utterances[index]['hour'], + utterances[index]['suffix'], color)) + + +def print_stats(): + annotated = set() + all_count = 0 + for key in set(r.scan_iter()): + key = key.decode('utf-8') + if is_cookie_index(key) and not r.sismember('banned', + key.split(':')[0]): + all_count += 1 + index = key.split(':')[1] + annotated.add(index) + print('All annotations: {}'.format(all_count)) + print('Annotated utterances: {}/{}'.format( + len(annotated), len(utterances))) + + +def ban(cookie): + r.sadd('banned', cookie) + for key in set(r.scan_iter()): + key = key.decode('utf-8') + if 
is_cookie_index(key) and cookie in key.split(':')[0]: + user, index = key.split(':') + annotation = r.get(key).decode('utf-8') + if annotation[0] in 'yn': + yesno = annotation[0].translate(str.maketrans('yn', 'tf')) + r.setrange(key, 0, yesno) + str_index = int(annotation.split(':')[1]) + r.setrange(index, str_index, + yesno) #sets str_index to yesno value + r.zincrby('utterance-scores', index, -1) + + +def users_stats(): + users_dict = defaultdict(lambda: defaultdict(list)) + users_set = set() + for key in sorted(set(r.scan_iter())): + key = key.decode('utf-8') + if is_cookie_index(key): + user = key.split(':')[0] + users_set.add(user) + res = r.get(key) + res_list = res.decode('utf-8').split(':') + if len(res_list) == 4: + yesno, str_index, timestamp, ip_addr = res_list + else: + yesno, str_index, timestamp = res_list + ip_addr = '0' + if 'yes_count' not in users_dict[user]: + users_dict[user]['yes_count'] = 0 + if 'no_count' not in users_dict[user]: + users_dict[user]['no_count'] = 0 + if yesno in 'yYt': + users_dict[user]['yes_count'] += 1 + elif yesno in 'nNf': + users_dict[user]['no_count'] += 1 + users_dict[user]['annotations'].append({ + 'yesno': + yesno, + 'str_index': + int(str_index), + 'timestamp': + float(timestamp), + 'ip_addr': + ip_addr + }) + for user in users_set: + users_dict[user]['annotations'] = sorted( + users_dict[user]['annotations'], key=lambda x: x['timestamp']) + calculate_avg_annotation_time(users_dict) + print_sorted(users_dict) + + +def calculate_avg_annotation_time(users_dict, max_interval=10): + for user, user_dict in users_dict.items(): + delta_sum = 0 + divider = 0 + breaks = 0 + for ann_1, ann_2 in zip(user_dict['annotations'], + user_dict['annotations'][1:]): + delta = ann_2['timestamp'] - ann_1['timestamp'] + if delta < 10: + delta_sum += delta + divider += 1 + else: + breaks += 1 + + if delta_sum == 0: + user_dict['avg_time'] = math.inf + else: + user_dict['avg_time'] = delta_sum / divider + user_dict['breaks'] = breaks + + 
+def print_sorted(users_dict, sortby='annotations max'): + print('\t'.join( + ['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', + 'status'])) + if sortby == 'annotations max': + keyfunc = lambda x: len(x[1]['annotations']) + for user, user_dict in sorted( + users_dict.items(), key=keyfunc, reverse=True): + if user_dict['yes_count'] + user_dict['no_count'] != len( + user_dict['annotations']): + import ipdb + ipdb.set_trace() + status = 'uncertain' + if r.sismember('banned', user): + status = 'banned' + elif r.sismember('trusted', user): + status = 'trusted' + elif r.sismember('trusted-checked', user): + status = 'trusted-checked' + print('\t'.join([ + user, + str(len(user_dict['annotations'])), + str(user_dict['yes_count']), + str(user_dict['no_count']), + str(user_dict['avg_time']), + str(user_dict['breaks']), status + ])) + +def 2tsv(): + pass + + + +def get_args(): + parser = argparse.ArgumentParser() + subparser = parser.add_subparsers(dest='cmd') + parser_stats = subparser.add_parser('stats', help='Show annotation stats.') + parser_investigate = subparser.add_parser( + 'investigate', help='investigate cookie.') + parser_investigate.add_argument('cookie', help='User cookie string') + parser_index = subparser.add_parser('index', help='Print utterance.') + parser_index.add_argument('index', type=int, help='Utterance index') + subparser.add_parser('ipdb', help='Get into ipdb.') + parser_exec = subparser.add_parser('exec', help='Execute redis command.') + parser_exec.add_argument( + 'redis_command', + help= + 'Redis command (lowercased). e.g. 
to get r.zrangebyscore("key", "-inf", "inf", start=0, num=1) pass \'zrangebyscore("key", "-inf", "inf", start=0, num=1)\'' + ) + parser_users = subparser.add_parser('users', help='User statistics') + parser_ban = subparser.add_parser('ban', help='Ban user') + parser_ban.add_argument('cookie', help='Cookie.') + + parser_2tsv = subparser.add_parser('2tsv', help='Convert data to tsv') + return parser, parser.parse_args() + + +def main(): + parser, args = get_args() + if args.cmd == 'stats': + print_stats() + elif args.cmd == 'investigate': + investigate_by_cookie(args.cookie) + elif args.cmd == 'users': + users_stats() + elif args.cmd == 'index': + pprint_utterance(args.index) + elif args.cmd == 'ban': + ban(args.cookie) + elif args.cmd == '2tsv': + 2tsv() + elif args.cmd == 'ipdb': + import ipdb + ipdb.set_trace() + elif args.cmd == 'exec': + exec('print(r.{})'.format(args.redis_command), { + 'print': print, + 'r': r + }) + else: + print(parser.format_help()) + + +if __name__ == '__main__': + main() diff --git a/webapp/app.py b/webapp/app.py index 252b40f..92ebfbe 100644 --- a/webapp/app.py +++ b/webapp/app.py @@ -1,3 +1,76 @@ +""" +### REDIS data structures in annotator: + + + STRINGS: + + 'status' + ---> {'filled', None, other_string} + + 'filled' + indicates that utterance-scores are properly filled + with utterance ids + None + clears database + other non-empty value + adds non-existent utterances and sets their score to 0 + + '{cookie}:{index}' + ---> '{yesno}:{str_index}:{timestamp}:{ip_addr}' + + '{ip_addr}:{index}' + ---> '{yesno}:{str_index}:{timestamp}:{cookie}' + + '{index}' + ---> '[yYnNtf]*' e.g. 
'ynyNYnynYytf' + + y + uncertain yes + t + banned yes + Y + trusted yes (not handled yet) + n + uncertain no + N + trusted no (not handled yet) + f + banned no + + 'jshash:{js_hash}' + ---> '{cookie}' + + + LISTS: + + '{cookie}' + ---> '{yesno}:{index}:{str_index}' + + 'undo:{cookie}' + ---> '{yesno}:{index}:{str_index}' + + + SETS: + + 'ip-cookies:{ip_addr}' + ---> '{cookie}' + expires after 3 hours + + 'trusted' + ---> '{cookie}' + trusted because I know this person + + 'trusted-checked' + ---> '{cookie}' + + 'banned' + ---> '{cookie}' + + + SORTED SETS: + 'utterance-scores' + ---> {index} ---> {nr_of_annotations} +""" from flask import Flask, render_template, request, make_response, jsonify import secrets import time @@ -70,8 +143,8 @@ def get_next(cookie_hash): index = find_not_annotated(cookie_hash) log('found unannotated index: {}'.format(index)) left_context, hour, right_context = get_utterance_for_web(index) - log('get_next index: {}, score: {}'.format(index, - r.zscore(UTT_SCORES, index))) + # log('get_next index: {}, score: {}'.format(index, + # r.zscore(UTT_SCORES, index))) return index, left_context, hour, right_context @@ -108,18 +181,18 @@ def get_response_by_index(index, cookie_hash): def annotate_redis(yesno, index, ip_addr, cookie_hash): # log('annotate: {}'.format(cookie_hash)) + banned = r.sismember('banned', cookie_hash) + if banned: + yesno = yesno.translate(str.maketrans('yn', 'tf')) timestamp = time.time() annotation = r.get('{}:{}'.format( cookie_hash, index)) # previous annotation of utterance by that user if annotation: - # log(annotation.decode('utf-8')) str_index = int(annotation.decode('utf-8').split(':')[1]) r.setrange(index, str_index, yesno) #sets str_index to yesno value else: - # before = r.zscore(UTT_SCORES, index) - r.zincrby(UTT_SCORES, index) - # log('incrementing index {}, before_val: {}, value: {}'.format( - # index, before, r.zscore(UTT_SCORES, index))) + if not banned: + r.zincrby(UTT_SCORES, index) str_index = 
r.append(index, yesno) - 1 r.set('{}:{}'.format(cookie_hash, index), '{}:{}:{}:{}'.format( yesno, str_index, timestamp, ip_addr))