mass-scraper/annotator_console.py

#!/usr/bin/env python3
import sys

import argparse
import redis
from extractor.find_hours import color_hour
import pickle
from colorama import Fore, Back, Style
import time
import datetime
import re
from collections import defaultdict, Counter
import math

# Shared Redis client used by every command in this module; it connects over
# a local Unix socket to database 0.
r = redis.StrictRedis(unix_socket_path='/redis-socket/redis.sock', db=0)


def load_utterances(filename):
    """Return the object unpickled from the file at ``filename``.

    The file is opened in binary mode and is closed again before returning.
    NOTE: unpickling can execute arbitrary code, so ``filename`` should only
    ever point at a trusted file.
    """
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Utterances are loaded once, at import time, from a hard-coded absolute path.
# Other functions in this module look entries up by their position here.
utterances = load_utterances(
    '/home/siulkilulki/gitrepos/mass-scraper/utterances.pkl')


def format_time(timestamp):
    """Format a POSIX ``timestamp`` as ``HH:MM:SS.ffffff   YYYY-MM-DD``.

    The timestamp is converted with ``datetime.fromtimestamp`` (local time);
    the time of day and the date are separated by three spaces.
    """
    moment = datetime.datetime.fromtimestamp(timestamp)
    return moment.strftime('%H:%M:%S.%f   %Y-%m-%d')


def is_cookie_index(key):
    """Return True when ``key`` names a per-annotation record in Redis.

    A key qualifies when it contains ``':'``, contains neither ``'jshash'``
    nor ``'.'``, and the value stored under it in Redis is a string. Callers
    in this module split such keys on ``':'`` into a cookie part and an
    utterance index part.

    Always returns a real ``bool`` (the previous version fell through and
    returned ``None`` for non-matching keys).
    """
    # Cheap textual checks first so Redis is only queried for plausible keys.
    if ':' not in key or 'jshash' in key or '.' in key:
        return False
    return r.type(key) == b'string'


def investigate_by_cookie(cookie_hash):
    """Interactively step through the annotations whose key contains ``cookie_hash``.

    Keys are visited in sorted order. For each visited annotation the
    utterance is pretty-printed, followed by its index, the formatted
    annotation timestamp and the raw annotation record. The operator is then
    prompted:

    * ``c``       -- continue to the next annotation,
    * ``cX``      -- continue, skipping the next X matching annotations,
    * a number    -- skip forward until the annotation with that index.

    Any other input (including an empty line) simply continues.
    """
    skip_count = 0
    index_stop = None
    for raw_key in sorted(set(r.scan_iter())):
        key = raw_key.decode('utf-8')
        if not (is_cookie_index(key) and cookie_hash in key):
            continue
        if skip_count != 0:
            skip_count -= 1
            continue
        index = int(key.split(':')[1])
        # Compare against None explicitly so that "goto index 0" works too.
        if index_stop is not None and index_stop != index:
            continue
        annotation_info = r.get(key).decode('utf-8')
        pprint_utterance(index, annotation_info)
        print(index)
        # The third ':'-separated field of the record is the timestamp.
        print(format_time(float(annotation_info.split(':')[2])))
        print(annotation_info)
        action = input(
            'c: continue, cX: continue Xtimes, number: goto index\n')
        index_stop = int(action) if action.isdigit() else None
        # startswith() is safe on an empty answer, unlike action[0].
        if action.startswith('c') and action[1:].isdigit():
            skip_count = int(action[1:])


def pprint_utterance(index, annotation_info=None):
    """Print utterance number ``index`` with its hour highlighted.

    The highlight is green when the first element of ``annotation_info`` is
    one of ``y``, ``Y`` or ``t`` and red otherwise. When ``annotation_info``
    is missing or empty the utterance is shown as if annotated ``y``.
    """
    verdict = annotation_info[0] if annotation_info else 'y'
    if verdict in 'yYt':
        color = Fore.GREEN
    else:
        color = Fore.RED
    utterance = utterances[index]
    highlighted = color_hour(utterance['prefix'], utterance['hour'],
                             utterance['suffix'], color)
    print(highlighted)


def print_stats():
    """Print overall annotation counts.

    Scans every Redis key, keeps the annotation keys whose cookie part is not
    a member of the ``banned`` set, and prints both the number of such
    annotations and how many distinct utterance indices they cover out of
    ``len(utterances)``.
    """
    annotated = set()
    all_count = 0
    for raw_key in set(r.scan_iter()):
        key = raw_key.decode('utf-8')
        if not is_cookie_index(key):
            continue
        fields = key.split(':')
        if r.sismember('banned', fields[0]):
            continue
        all_count += 1
        annotated.add(fields[1])
    print('All annotations: {}'.format(all_count))
    print('Annotated utterances: {}/{}'.format(
        len(annotated), len(utterances)))


def ban(cookie):
    """Ban ``cookie`` and revoke the votes recorded under it.

    ``cookie`` is added to the ``banned`` set. Then, for every annotation key
    whose cookie part contains ``cookie`` and whose record still starts with
    ``y`` or ``n``:

    * the first character of the record is rewritten to ``t`` / ``f``,
    * the character at the record's stored position inside the per-utterance
      string (the Redis key named by the utterance index) is rewritten the
      same way,
    * the utterance's score in the ``utterance-scores`` sorted set is
      decremented by one.

    Records already starting with another character are left alone, so
    running the command twice does not decrement scores twice.
    """
    r.sadd('banned', cookie)
    revoked_marks = str.maketrans('yn', 'tf')
    for raw_key in set(r.scan_iter()):
        key = raw_key.decode('utf-8')
        key_parts = key.split(':')
        if not (is_cookie_index(key) and cookie in key_parts[0]):
            continue
        # Same key parsing as the other commands: second field is the index.
        index = key_parts[1]
        annotation = r.get(key).decode('utf-8')
        if annotation[:1] not in ('y', 'n'):
            continue
        yesno = annotation[0].translate(revoked_marks)
        r.setrange(key, 0, yesno)
        str_index = int(annotation.split(':')[1])
        r.setrange(index, str_index, yesno)  # sets str_index to yesno value
        # Keyword arguments keep the meaning stable across redis-py versions,
        # whose positional order for zincrby differs between 2.x and 3.x.
        r.zincrby('utterance-scores', value=index, amount=-1)


def users_stats():
    """Collect per-user annotation statistics from Redis and print them.

    Every annotation key is read and its ``':'``-separated record is parsed
    into ``yesno``, ``str_index``, ``timestamp`` and, when a fourth field is
    present, ``ip_addr`` (otherwise ``'0'``). For each user (the cookie part
    of the key) this accumulates the latest timestamp, yes/no counters and
    the list of annotations, which is finally ordered by timestamp. The
    result is handed to ``calculate_avg_annotation_time`` and then to
    ``print_sorted``.
    """
    users_dict = defaultdict(lambda: defaultdict(list))
    for raw_key in sorted(set(r.scan_iter())):
        key = raw_key.decode('utf-8')
        if not is_cookie_index(key):
            continue
        fields = r.get(key).decode('utf-8').split(':')
        if len(fields) == 4:
            yesno, str_index, timestamp, ip_addr = fields
        else:
            # Records without a fourth field carry no IP address.
            yesno, str_index, timestamp = fields
            ip_addr = '0'

        stats = users_dict[key.split(':')[0]]
        if 'last_access' in stats:
            stats['last_access'] = max(
                float(timestamp), stats['last_access'])
        else:
            stats['last_access'] = float(timestamp)
        stats.setdefault('yes_count', 0)
        stats.setdefault('no_count', 0)
        if yesno in 'yYt':
            stats['yes_count'] += 1
        elif yesno in 'nNf':
            stats['no_count'] += 1

        record = {
            'yesno': yesno,
            'str_index': int(str_index),
            'timestamp': float(timestamp),
            'ip_addr': ip_addr,
        }
        stats['annotations'].append(record)

    # Order each user's annotations chronologically.
    for stats in users_dict.values():
        stats['annotations'] = sorted(
            stats['annotations'], key=lambda entry: entry['timestamp'])
    calculate_avg_annotation_time(users_dict)
    print_sorted(users_dict)


def calculate_avg_annotation_time(users_dict, max_interval=10):
    """Add ``avg_time`` and ``breaks`` to every user's statistics, in place.

    For each user the gaps between consecutive entries of
    ``user_dict['annotations']`` (expected to be ordered by ``'timestamp'``)
    are examined:

    * gaps shorter than ``max_interval`` count towards the average,
    * all other gaps are counted as breaks.

    Args:
        users_dict: mapping of user -> dict with an ``'annotations'`` list of
            dicts that each have a numeric ``'timestamp'``.
        max_interval: threshold, in the same unit as the timestamps, at or
            above which a gap is treated as a break rather than working time.

    ``avg_time`` is the mean of the short gaps rounded to 4 decimals, or
    ``math.inf`` when there is no short gap to average.
    """
    for user_dict in users_dict.values():
        annotations = user_dict['annotations']
        delta_sum = 0
        short_gaps = 0
        breaks = 0
        for earlier, later in zip(annotations, annotations[1:]):
            delta = later['timestamp'] - earlier['timestamp']
            if delta < max_interval:
                delta_sum += delta
                short_gaps += 1
            else:
                breaks += 1

        # Guard on the divisor, not on the sum: several annotations sharing
        # one timestamp legitimately average to 0.
        if short_gaps == 0:
            user_dict['avg_time'] = math.inf
        else:
            user_dict['avg_time'] = round(delta_sum / short_gaps, 4)
        user_dict['breaks'] = breaks


def print_sorted(users_dict, sortby='annotations max'):
    """Print one tab-separated row of statistics per user.

    Columns: cookie, number of annotations, yes count, no count, average
    annotation time, breaks, status and last access time. The status is
    ``banned``, ``trusted`` or ``trusted-checked`` according to membership of
    the Redis sets of the same names (checked in that order), and
    ``uncertain`` otherwise.

    Args:
        users_dict: mapping of user -> statistics dict holding
            ``annotations``, ``yes_count``, ``no_count``, ``avg_time``,
            ``breaks`` and ``last_access``.
        sortby: ordering to use. Only ``'annotations max'`` (most annotations
            first) is supported.

    Raises:
        ValueError: if ``sortby`` is not a supported ordering.
    """
    if sortby == 'annotations max':

        def keyfunc(item):
            return len(item[1]['annotations'])
    else:
        # Previously this fell through and failed later with an unbound name.
        raise ValueError('Unsupported sortby: {!r}'.format(sortby))

    print('\t'.join([
        'cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'status',
        'last_access'
    ]))
    for user, user_dict in sorted(
            users_dict.items(), key=keyfunc, reverse=True):
        # Consistency check: every annotation should have been counted as
        # either yes or no. Drops into the debugger when that is violated.
        if user_dict['yes_count'] + user_dict['no_count'] != len(
                user_dict['annotations']):
            import ipdb
            ipdb.set_trace()
        status = 'uncertain'
        if r.sismember('banned', user):
            status = 'banned'
        elif r.sismember('trusted', user):
            status = 'trusted'
        elif r.sismember('trusted-checked', user):
            status = 'trusted-checked'
        print('\t'.join([
            user,
            str(len(user_dict['annotations'])),
            str(user_dict['yes_count']),
            str(user_dict['no_count']),
            str(user_dict['avg_time']),
            str(user_dict['breaks']), status,
            format_time(user_dict['last_access'])
        ]))


def redis2tsv():
    """Dump the majority-vote label of every annotated utterance as TSV.

    Prints a header row and then one row per utterance whose Redis
    annotation string (stored under the utterance's index) contains at least
    one ``y``/``n`` vote, case-insensitively. ``is_mass`` is ``yes`` when yes
    votes outnumber no votes and ``no`` in the opposite case; utterances with
    a tie are skipped. Other characters in the annotation string are not
    counted as votes.

    NUL characters are stripped and newlines are escaped as ``\\n`` in the
    prefix, hour and suffix columns.
    """
    print('\t'.join([
        'prefix', 'hour', 'suffix', 'is_mass', 'yes_count', 'no_count', 'url',
        'button_text', 'depth', 'filepath', 'line_no'
    ]))
    # Loop-invariant, so built once.
    trans_table = str.maketrans({'\x00': '', '\n': '\\n'})
    for index, utterance in enumerate(utterances):
        raw_annotations = r.get(index)
        if not raw_annotations:
            continue
        utterance_annotations = raw_annotations.decode('utf-8')
        if not re.search('[yYnN]', utterance_annotations):
            continue
        annotations_counts = Counter(utterance_annotations.lower())
        yes_votes = annotations_counts['y']
        no_votes = annotations_counts['n']
        if yes_votes == no_votes:
            # No majority: leave the utterance out of the export.
            continue
        # A majority of "yes" votes means the utterance is a mass hour.
        is_mass = 'yes' if yes_votes > no_votes else 'no'
        print('\t'.join([
            utterance['prefix'].translate(trans_table),
            utterance['hour'].translate(trans_table),
            utterance['suffix'].translate(trans_table), is_mass,
            str(yes_votes),
            str(no_votes), utterance['url'],
            utterance['button_text'],
            str(utterance['depth']), utterance['filepath'],
            str(utterance['line_no'])
        ]))


def get_args():
    """Build the command-line parser and parse ``sys.argv``.

    The chosen sub-command is stored in ``args.cmd``; the available
    sub-commands are ``stats``, ``investigate``, ``index``, ``ipdb``,
    ``exec``, ``users``, ``ban`` and ``2tsv``.

    Returns:
        A ``(parser, args)`` tuple so the caller can print the help text
        itself when no sub-command was given.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest='cmd')

    commands.add_parser('stats', help='Show annotation stats.')

    investigate_cmd = commands.add_parser(
        'investigate', help='investigate cookie.')
    investigate_cmd.add_argument('cookie', help='User cookie string')

    index_cmd = commands.add_parser('index', help='Print utterance.')
    index_cmd.add_argument('index', type=int, help='Utterance index')

    commands.add_parser('ipdb', help='Get into ipdb.')

    exec_cmd = commands.add_parser('exec', help='Execute redis command.')
    exec_cmd.add_argument(
        'redis_command',
        help=
        'Redis command (lowercased). e.g. to get r.zrangebyscore("key", "-inf", "inf", start=0, num=1) pass \'zrangebyscore("key", "-inf", "inf", start=0, num=1)\''
    )

    commands.add_parser('users', help='User statistics')

    ban_cmd = commands.add_parser('ban', help='Ban user')
    ban_cmd.add_argument('cookie', help='Cookie.')

    commands.add_parser('2tsv', help='Convert data to tsv')

    args = parser.parse_args()
    return parser, args


def main():
    """Parse the command line and run the selected sub-command.

    When no sub-command was given, the parser's help text is printed.
    """
    parser, args = get_args()
    command = args.cmd

    # Sub-commands that take no arguments map straight onto a function.
    argless_commands = {
        'stats': print_stats,
        'users': users_stats,
        '2tsv': redis2tsv,
    }
    if command in argless_commands:
        argless_commands[command]()
        return

    if command == 'investigate':
        investigate_by_cookie(args.cookie)
        return
    if command == 'index':
        pprint_utterance(args.index)
        return
    if command == 'ban':
        ban(args.cookie)
        return
    if command == 'ipdb':
        # Debugger is started here so this function's frame is the one shown.
        import ipdb
        ipdb.set_trace()
        return
    if command == 'exec':
        # NOTE: the argument is executed as Python source, as a method call
        # on the Redis client, with only ``print`` and ``r`` in its globals.
        exec('print(r.{})'.format(args.redis_command), {
            'print': print,
            'r': r
        })
        return

    print(parser.format_help())


# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()