mass-scraper/annotator_console.py

#!/usr/bin/env python3
import sys

import argparse
import redis
from extractor.find_hours import color_hour
import pickle
from colorama import Fore, Back, Style
import time
import datetime
import re
from collections import defaultdict
import math

# Module-level Redis client shared by every command in this console.
# It talks to database 0 through the unix socket at /redis-socket/redis.sock.
r = redis.StrictRedis(unix_socket_path='/redis-socket/redis.sock', db=0)


def load_utterances(filename):
    """Return the object unpickled from ``filename``.

    The file is opened in binary mode and passed to ``pickle.load``; the
    resulting object is returned as-is.

    NOTE: unpickling executes arbitrary code embedded in the file, so only
    point this at files you produced yourself.
    """
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Utterances are loaded once, at import time, from a hard-coded absolute path.
# Entries are looked up by utterance index and read through their 'prefix',
# 'hour' and 'suffix' keys (see pprint_utterance).
utterances = load_utterances(
    '/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl')


def format_time(timestamp):
    """Render a POSIX ``timestamp`` as ``HH:MM:SS.ffffff   YYYY-MM-DD``.

    The conversion uses ``datetime.fromtimestamp`` without a timezone
    argument, so the result is expressed in the machine's local time.
    """
    moment = datetime.datetime.fromtimestamp(timestamp)
    return moment.strftime('%H:%M:%S.%f   %Y-%m-%d')


def is_cookie_index(key):
    """Tell whether ``key`` names a per-user annotation (``cookie:index``).

    A key qualifies when it contains ':', contains neither 'jshash' nor '.',
    and Redis reports its value type as a string.

    The cheap textual checks run first, so Redis is only queried for keys
    that already look right.

    Args:
        key: Redis key, already decoded to ``str``.

    Returns:
        bool: always an explicit ``True`` or ``False`` (never ``None``).
    """
    if ':' not in key or 'jshash' in key or '.' in key:
        return False
    return r.type(key) == b'string'


def investigate_by_cookie(cookie_hash):
    """Interactively step through the annotations made by one cookie.

    Every Redis key that is a ``cookie:index`` annotation key and contains
    ``cookie_hash`` is visited in sorted key order. For each visited key the
    utterance is printed (coloured by the stored verdict), followed by its
    index, the formatted annotation timestamp and the raw annotation value.
    The operator is then prompted:

    * ``c`` (or just Enter) -- show the next annotation,
    * ``cX`` -- skip the next X matching annotations,
    * a number -- skip forward until the annotation with that index.

    Iteration only moves forward, so a "goto" to an index that was already
    passed (or does not exist) simply runs to the end.

    Args:
        cookie_hash: full cookie or any substring of it.
    """
    skip_count = 0
    index_stop = None
    for raw_key in sorted(set(r.scan_iter())):
        key = raw_key.decode('utf-8')
        if not (is_cookie_index(key) and cookie_hash in key):
            continue
        if skip_count != 0:
            skip_count -= 1
            continue
        index = int(key.split(':')[1])
        # Compare against None explicitly: 0 is a valid index to jump to.
        if index_stop is not None and index_stop != index:
            continue
        annotation_info = r.get(key).decode('utf-8')
        pprint_utterance(index, annotation_info)
        print(index)
        # The third ':'-separated field of the annotation is its timestamp.
        print(format_time(float(annotation_info.split(':')[2])))
        print(annotation_info)
        action = input(
            'c: continue, cX: continue Xtimes, number: goto index\n')
        index_stop = int(action) if action.isdigit() else None
        # startswith() tolerates an empty answer (plain Enter), which used
        # to crash on action[0].
        if action.startswith('c') and action[1:].isdigit():
            skip_count = int(action[1:])


def pprint_utterance(index, annotation_info=None):
    """Print utterance number ``index`` with its hour highlighted.

    The highlight is green when the first character of ``annotation_info``
    is one of 'y', 'Y', 't' and red otherwise. When no annotation is given
    the utterance is shown as if it had been answered 'y'.
    """
    verdict = annotation_info[0] if annotation_info else 'y'
    highlight = Fore.GREEN if verdict in 'yYt' else Fore.RED
    utterance = utterances[index]
    rendered = color_hour(utterance['prefix'], utterance['hour'],
                          utterance['suffix'], highlight)
    print(rendered)


def print_stats():
    """Print overall annotation counts, ignoring banned users.

    Two lines are printed: the number of annotation keys that belong to
    non-banned cookies, and how many distinct utterance indices those keys
    cover out of ``len(utterances)``.
    """
    covered_indices = set()
    total = 0
    for raw_key in set(r.scan_iter()):
        key = raw_key.decode('utf-8')
        if not is_cookie_index(key):
            continue
        parts = key.split(':')
        if r.sismember('banned', parts[0]):
            continue
        total += 1
        covered_indices.add(parts[1])
    print('All annotations: {}'.format(total))
    print('Annotated utterances: {}/{}'.format(
        len(covered_indices), len(utterances)))


def ban(cookie):
    """Ban ``cookie`` and retract every vote it has cast.

    The cookie is added to the 'banned' set. Then, for each annotation key
    ``<cookie>:<index>`` whose value still starts with 'y' or 'n':

    * the verdict character in the annotation itself is rewritten to
      't' (was 'y') or 'f' (was 'n'),
    * the same character is written into the string stored under the key
      ``<index>`` at the offset recorded in the annotation's second field,
    * the member ``<index>`` of the 'utterance-scores' sorted set is
      decremented by one.

    Annotations already marked 't'/'f' are skipped, so banning the same
    cookie twice does not decrement scores twice.

    Args:
        cookie: the exact cookie to ban. Only keys whose user part equals
            it are touched; a substring match would also rewrite votes of
            unrelated users (and an empty string would match everyone).
    """
    r.sadd('banned', cookie)
    for raw_key in set(r.scan_iter()):
        key = raw_key.decode('utf-8')
        if not is_cookie_index(key) or key.split(':')[0] != cookie:
            continue
        _, index = key.split(':')
        annotation = r.get(key).decode('utf-8')
        if annotation[0] not in 'yn':
            continue
        yesno = 't' if annotation[0] == 'y' else 'f'
        r.setrange(key, 0, yesno)
        str_index = int(annotation.split(':')[1])
        # Mirror the retracted verdict in the per-utterance string.
        r.setrange(index, str_index, yesno)
        # NOTE(review): (name, value, amount) is the redis-py 2.x StrictRedis
        # argument order; redis-py >= 3.0 expects (name, amount, value).
        # Confirm against the installed client version.
        r.zincrby('utterance-scores', index, -1)


def users_stats():
    """Collect per-user annotation statistics and print them as a table.

    Every annotation key (``cookie:index``) is read from Redis. Its value is
    expected to be ``yesno:str_index:timestamp`` with an optional trailing
    ``:ip_addr``; when the address is missing it is recorded as '0'.

    For each user the function gathers:

    * 'yes_count' -- annotations whose verdict is in 'yYt',
    * 'no_count'  -- annotations whose verdict is in 'nNf',
    * 'annotations' -- list of dicts with 'yesno', 'str_index',
      'timestamp' and 'ip_addr', sorted by timestamp.

    The result is passed to ``calculate_avg_annotation_time`` and then to
    ``print_sorted``.
    """
    users_dict = defaultdict(lambda: defaultdict(list))
    for raw_key in sorted(set(r.scan_iter())):
        key = raw_key.decode('utf-8')
        if not is_cookie_index(key):
            continue
        user_dict = users_dict[key.split(':')[0]]
        # maxsplit=3 keeps the trailing address in one piece even when it
        # is an IPv6 address, which itself contains ':'.
        fields = r.get(key).decode('utf-8').split(':', 3)
        if len(fields) == 4:
            yesno, str_index, timestamp, ip_addr = fields
        else:
            yesno, str_index, timestamp = fields
            ip_addr = '0'
        # The inner defaultdict would create lists, so seed counters as ints.
        user_dict.setdefault('yes_count', 0)
        user_dict.setdefault('no_count', 0)
        if yesno in 'yYt':
            user_dict['yes_count'] += 1
        elif yesno in 'nNf':
            user_dict['no_count'] += 1
        user_dict['annotations'].append({
            'yesno': yesno,
            'str_index': int(str_index),
            'timestamp': float(timestamp),
            'ip_addr': ip_addr,
        })
    for user_dict in users_dict.values():
        user_dict['annotations'].sort(key=lambda ann: ann['timestamp'])
    calculate_avg_annotation_time(users_dict)
    print_sorted(users_dict)


def calculate_avg_annotation_time(users_dict, max_interval=10):
    """Add 'avg_time' and 'breaks' to every user's statistics, in place.

    Consecutive annotations (the 'annotations' list is expected to be sorted
    by 'timestamp') are compared pairwise. A gap shorter than
    ``max_interval`` seconds counts as time spent annotating; any other gap
    counts as a break.

    Args:
        users_dict: mapping ``user -> dict`` where each dict has an
            'annotations' list of dicts with a 'timestamp' entry.
        max_interval: gaps of this many seconds or more are treated as
            breaks rather than annotation time.

    Side effects:
        Sets ``user_dict['avg_time']`` to the mean of the short gaps, or
        ``math.inf`` when the user has no short gap at all, and
        ``user_dict['breaks']`` to the number of long gaps.
    """
    for user_dict in users_dict.values():
        annotations = user_dict['annotations']
        delta_sum = 0
        short_gaps = 0
        breaks = 0
        for earlier, later in zip(annotations, annotations[1:]):
            delta = later['timestamp'] - earlier['timestamp']
            if delta < max_interval:
                delta_sum += delta
                short_gaps += 1
            else:
                breaks += 1

        # Decide on the number of samples, not on the sum: a user whose gaps
        # are all exactly 0 seconds has an average of 0, not infinity.
        if short_gaps == 0:
            user_dict['avg_time'] = math.inf
        else:
            user_dict['avg_time'] = delta_sum / short_gaps
        user_dict['breaks'] = breaks


def print_sorted(users_dict, sortby='annotations max'):
    """Print a tab-separated table of per-user statistics.

    Columns: cookie, annotations, yes, no, avg_time, breaks, status.
    ``status`` is 'banned', 'trusted' or 'trusted-checked' when the cookie
    is a member of the Redis set of that name (checked in this order), and
    'uncertain' otherwise.

    Args:
        users_dict: mapping ``user -> dict`` with 'annotations',
            'yes_count', 'no_count', 'avg_time' and 'breaks' entries.
        sortby: ordering of the rows. Only 'annotations max' (most
            annotations first) is supported.

    Raises:
        ValueError: if ``sortby`` is not a supported ordering. Raised before
            anything is printed.
    """
    if sortby == 'annotations max':
        keyfunc = lambda item: len(item[1]['annotations'])
    else:
        raise ValueError('Unsupported sortby: {!r}'.format(sortby))
    print('\t'.join(
        ['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks',
         'status']))
    for user, user_dict in sorted(
            users_dict.items(), key=keyfunc, reverse=True):
        if user_dict['yes_count'] + user_dict['no_count'] != len(
                user_dict['annotations']):
            # Counters disagree with the annotation list (a verdict outside
            # 'yYtnNf' was seen): drop into the debugger to inspect it.
            import ipdb
            ipdb.set_trace()
        status = 'uncertain'
        if r.sismember('banned', user):
            status = 'banned'
        elif r.sismember('trusted', user):
            status = 'trusted'
        elif r.sismember('trusted-checked', user):
            status = 'trusted-checked'
        print('\t'.join([
            user,
            str(len(user_dict['annotations'])),
            str(user_dict['yes_count']),
            str(user_dict['no_count']),
            str(user_dict['avg_time']),
            str(user_dict['breaks']), status
        ]))

def to_tsv():
    """Placeholder for the '2tsv' subcommand; the export is not written yet.

    The function is called ``to_tsv`` because a Python identifier cannot
    start with a digit (``def 2tsv()`` is a SyntaxError). The command-line
    name of the subcommand is still '2tsv'.
    """
    pass


def get_args():
    """Build the command-line parser and parse ``sys.argv``.

    Subcommands (stored in ``args.cmd``): 'stats', 'investigate <cookie>',
    'index <index>', 'ipdb', 'exec <redis_command>', 'users',
    'ban <cookie>' and '2tsv'.

    Returns:
        tuple: ``(parser, args)`` -- the parser is returned as well so the
        caller can print the help text when no subcommand was given.
    """
    parser = argparse.ArgumentParser()
    subparser = parser.add_subparsers(dest='cmd')
    subparser.add_parser('stats', help='Show annotation stats.')
    parser_investigate = subparser.add_parser(
        'investigate', help='investigate cookie.')
    parser_investigate.add_argument('cookie', help='User cookie string')
    parser_index = subparser.add_parser('index', help='Print utterance.')
    parser_index.add_argument('index', type=int, help='Utterance index')
    subparser.add_parser('ipdb', help='Get into ipdb.')
    parser_exec = subparser.add_parser('exec', help='Execute redis command.')
    parser_exec.add_argument(
        'redis_command',
        help=
        'Redis command (lowercased). e.g. to get r.zrangebyscore("key", "-inf", "inf", start=0, num=1) pass \'zrangebyscore("key", "-inf", "inf", start=0, num=1)\''
    )
    subparser.add_parser('users', help='User statistics')
    parser_ban = subparser.add_parser('ban', help='Ban user')
    parser_ban.add_argument('cookie', help='Cookie.')

    subparser.add_parser('2tsv', help='Convert data to tsv')
    return parser, parser.parse_args()


def main():
    """Dispatch the chosen subcommand; print the help when none was given."""
    parser, args = get_args()
    if args.cmd == 'stats':
        print_stats()
    elif args.cmd == 'investigate':
        investigate_by_cookie(args.cookie)
    elif args.cmd == 'users':
        users_stats()
    elif args.cmd == 'index':
        pprint_utterance(args.index)
    elif args.cmd == 'ban':
        ban(args.cookie)
    elif args.cmd == '2tsv':
        to_tsv()
    elif args.cmd == 'ipdb':
        import ipdb
        ipdb.set_trace()
    elif args.cmd == 'exec':
        # SECURITY: this executes arbitrary Python supplied on the command
        # line. Acceptable only because the console is an operator tool;
        # never feed it input that comes from an untrusted source.
        exec('print(r.{})'.format(args.redis_command), {
            'print': print,
            'r': r
        })
    else:
        print(parser.format_help())

# Run the console only when this file is executed as a script.
if __name__ == '__main__':
    main()
Add redis data structures description. Handle banned users. Rename annotation_stats ---> annotator_console.py Add ban and users stats function in annotator_console.py 2018-05-26 19:07:08 +02:00			`#!/usr/bin/env python3`
			`import sys`

			`import argparse`
			`import redis`
			`from extractor.find_hours import color_hour`
			`import pickle`
			`from colorama import Fore, Back, Style`
			`import time`
			`import datetime`
			`import re`
			`from collections import defaultdict`
			`import math`

			`r = redis.StrictRedis(unix_socket_path='/redis-socket/redis.sock', db=0)`


			`def load_utterances(filename):`
			`with open(filename, 'rb') as f:`
			`utterances = pickle.load(f)`
			`return utterances`


			`utterances = load_utterances(`
			`'/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl')`


			`def format_time(timestamp):`
			`return datetime.datetime.fromtimestamp(timestamp).strftime(`
			`'%H:%M:%S.%f %Y-%m-%d')`


			`def is_cookie_index(key):`
			`if ':' in key and not 'jshash' in key and not '.' in key and r.type(`
			`key) == b'string':`
			`return True`


			`def investigate_by_cookie(cookie_hash):`
			`cx = 0`
			`index_stop = None`
			`for key in sorted(set(r.scan_iter())):`
			`key = key.decode('utf-8')`
			`if is_cookie_index(key) and cookie_hash in key:`
			`if cx != 0:`
			`cx -= 1`
			`continue`
			`index = int(key.split(':')[1])`
			`if index_stop and index_stop != index:`
			`continue`
			`annotation_info = r.get(key).decode('utf-8')`
			`pprint_utterance(index, annotation_info)`
			`print(index)`
			`print(format_time(float(annotation_info.split(':')[2])))`
			`print(annotation_info)`
			`action = input(`
			`'c: continue, cX: continue Xtimes, number: goto index\n')`
			`if action.isdigit():`
			`index_stop = int(action)`
			`else:`
			`index_stop = None`
			`if action[0] == 'c':`
			`if action[1:].isdigit():`
			`cx = int(action[1:])`


			`def pprint_utterance(index, annotation_info=None):`
			`if not annotation_info:`
			`annotation_info = ['y']`
			`color = Fore.GREEN if annotation_info[0] in 'yYt' else Fore.RED`
			`print(`
			`color_hour(utterances[index]['prefix'], utterances[index]['hour'],`
			`utterances[index]['suffix'], color))`


			`def print_stats():`
			`annotated = set()`
			`all_count = 0`
			`for key in set(r.scan_iter()):`
			`key = key.decode('utf-8')`
			`if is_cookie_index(key) and not r.sismember('banned',`
			`key.split(':')[0]):`
			`all_count += 1`
			`index = key.split(':')[1]`
			`annotated.add(index)`
			`print('All annotations: {}'.format(all_count))`
			`print('Annotated utterances: {}/{}'.format(`
			`len(annotated), len(utterances)))`


			`def ban(cookie):`
			`r.sadd('banned', cookie)`
			`for key in set(r.scan_iter()):`
			`key = key.decode('utf-8')`
			`if is_cookie_index(key) and cookie in key.split(':')[0]:`
			`user, index = key.split(':')`
			`annotation = r.get(key).decode('utf-8')`
			`if annotation[0] in 'yn':`
			`yesno = annotation[0].translate(str.maketrans('yn', 'tf'))`
			`r.setrange(key, 0, yesno)`
			`str_index = int(annotation.split(':')[1])`
			`r.setrange(index, str_index,`
			`yesno) #sets str_index to yesno value`
			`r.zincrby('utterance-scores', index, -1)`


			`def users_stats():`
			`users_dict = defaultdict(lambda: defaultdict(list))`
			`users_set = set()`
			`for key in sorted(set(r.scan_iter())):`
			`key = key.decode('utf-8')`
			`if is_cookie_index(key):`
			`user = key.split(':')[0]`
			`users_set.add(user)`
			`res = r.get(key)`
			`res_list = res.decode('utf-8').split(':')`
			`if len(res_list) == 4:`
			`yesno, str_index, timestamp, ip_addr = res_list`
			`else:`
			`yesno, str_index, timestamp = res_list`
			`ip_addr = '0'`
			`if 'yes_count' not in users_dict[user]:`
			`users_dict[user]['yes_count'] = 0`
			`if 'no_count' not in users_dict[user]:`
			`users_dict[user]['no_count'] = 0`
			`if yesno in 'yYt':`
			`users_dict[user]['yes_count'] += 1`
			`elif yesno in 'nNf':`
			`users_dict[user]['no_count'] += 1`
			`users_dict[user]['annotations'].append({`
			`'yesno':`
			`yesno,`
			`'str_index':`
			`int(str_index),`
			`'timestamp':`
			`float(timestamp),`
			`'ip_addr':`
			`ip_addr`
			`})`
			`for user in users_set:`
			`users_dict[user]['annotations'] = sorted(`
			`users_dict[user]['annotations'], key=lambda x: x['timestamp'])`
			`calculate_avg_annotation_time(users_dict)`
			`print_sorted(users_dict)`


			`def calculate_avg_annotation_time(users_dict, max_interval=10):`
			`for user, user_dict in users_dict.items():`
			`delta_sum = 0`
			`divider = 0`
			`breaks = 0`
			`for ann_1, ann_2 in zip(user_dict['annotations'],`
			`user_dict['annotations'][1:]):`
			`delta = ann_2['timestamp'] - ann_1['timestamp']`
			`if delta < 10:`
			`delta_sum += delta`
			`divider += 1`
			`else:`
			`breaks += 1`

			`if delta_sum == 0:`
			`user_dict['avg_time'] = math.inf`
			`else:`
			`user_dict['avg_time'] = delta_sum / divider`
			`user_dict['breaks'] = breaks`


			`def print_sorted(users_dict, sortby='annotations max'):`
			`print('\t'.join(`
			`['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks',`
			`'status']))`
			`if sortby == 'annotations max':`
			`keyfunc = lambda x: len(x[1]['annotations'])`
			`for user, user_dict in sorted(`
			`users_dict.items(), key=keyfunc, reverse=True):`
			`if user_dict['yes_count'] + user_dict['no_count'] != len(`
			`user_dict['annotations']):`
			`import ipdb`
			`ipdb.set_trace()`
			`status = 'uncertain'`
			`if r.sismember('banned', user):`
			`status = 'banned'`
			`elif r.sismember('trusted', user):`
			`status = 'trusted'`
			`elif r.sismember('trusted-checked', user):`
			`status = 'trusted-checked'`
			`print('\t'.join([`
			`user,`
			`str(len(user_dict['annotations'])),`
			`str(user_dict['yes_count']),`
			`str(user_dict['no_count']),`
			`str(user_dict['avg_time']),`
			`str(user_dict['breaks']), status`
			`]))`

			`def 2tsv():`
			`pass`



			`def get_args():`
			`parser = argparse.ArgumentParser()`
			`subparser = parser.add_subparsers(dest='cmd')`
			`parser_stats = subparser.add_parser('stats', help='Show annotation stats.')`
			`parser_investigate = subparser.add_parser(`
			`'investigate', help='investigate cookie.')`
			`parser_investigate.add_argument('cookie', help='User cookie string')`
			`parser_index = subparser.add_parser('index', help='Print utterance.')`
			`parser_index.add_argument('index', type=int, help='Utterance index')`
			`subparser.add_parser('ipdb', help='Get into ipdb.')`
			`parser_exec = subparser.add_parser('exec', help='Execute redis command.')`
			`parser_exec.add_argument(`
			`'redis_command',`
			`help=`
			`'Redis command (lowercased). e.g. to get r.zrangebyscore("key", "-inf", "inf", start=0, num=1) pass \'zrangebyscore("key", "-inf", "inf", start=0, num=1)\''`
			`)`
			`parser_users = subparser.add_parser('users', help='User statistics')`
			`parser_ban = subparser.add_parser('ban', help='Ban user')`
			`parser_ban.add_argument('cookie', help='Cookie.')`

			`parser_2tsv = subparser.add_parser('2tsv', help='Convert data to tsv')`
			`return parser, parser.parse_args()`


			`def main():`
			`parser, args = get_args()`
			`if args.cmd == 'stats':`
			`print_stats()`
			`elif args.cmd == 'investigate':`
			`investigate_by_cookie(args.cookie)`
			`elif args.cmd == 'users':`
			`users_stats()`
			`elif args.cmd == 'index':`
			`pprint_utterance(args.index)`
			`elif args.cmd == 'ban':`
			`ban(args.cookie)`
			`elif args.cmd == '2tsv':`
			`2tsv()`
			`elif args.cmd == 'ipdb':`
			`import ipdb`
			`ipdb.set_trace()`
			`elif args.cmd == 'exec':`
			`exec('print(r.{})'.format(args.redis_command), {`
			`'print': print,`
			`'r': r`
			`})`
			`else:`
			`print(parser.format_help())`


			`if __name__ == '__main__':`
			`main()`