From 7dd903b3b592d51e39ac6ed417149d0bca0fa70c Mon Sep 17 00:00:00 2001 From: siulkilulki Date: Mon, 28 May 2018 15:10:31 +0200 Subject: [PATCH] First version of ml hour classificator. Add last_access field to annotator_console user stats. Add split-data script. Add tsv2fasttext.py Add todos.org. --- Makefile | 21 ++++++++++++++++- annotator_console.py | 17 ++++++++++---- evaluate.py | 55 ++++++++++++++++++++++++++++++++++++++++++++ split-data.sh | 19 +++++++++++++++ todos.org | 19 +++++++++++++++ tsv2fasttext.py | 25 ++++++++++++++++++++ 6 files changed, 150 insertions(+), 6 deletions(-) create mode 100755 evaluate.py create mode 100755 split-data.sh create mode 100644 todos.org create mode 100755 tsv2fasttext.py diff --git a/Makefile b/Makefile index 30ee207..10d7bee 100644 --- a/Makefile +++ b/Makefile @@ -3,10 +3,29 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv) include /tmp/makeenv JOBS := 100 -.PHONY: all update data clean clean-data clean-cache +.PHONY: all update data clean clean-data clean-cache clean-ml all: data +# annotator_console.py needs running redis instance +# move belo +score.txt: predicted.txt test.txt + paste $< <(egrep -o "__label__(yes|no)" $(word 2,$^)) | ./evaluate.py > $@ + +predicted.txt: fs-model.bin test.txt + ./fasttext predict $< $(word 2,$^) > $@ + +fs-model.bin: train.txt + ./fasttext supervised -input $< -output `basename $@ .bin` + +train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh + ./$< 2tsv | ./$(word 2,$^) > all.txt + ./split-data.sh all.txt + rm all.txt + +clean-ml: + rm -f train.txt test.txt dev.txt fs-model* predicted.txt score.txt + parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh mkdir -p parishwebsites/{text-data,text-data-logs} cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt diff --git a/annotator_console.py b/annotator_console.py index 388c808..9dabd63 100755 --- 
a/annotator_console.py +++ b/annotator_console.py @@ -119,6 +119,11 @@ def users_stats(): else: yesno, str_index, timestamp = res_list ip_addr = '0' + if 'last_access' not in users_dict[user]: + users_dict[user]['last_access'] = float(timestamp) + else: + users_dict[user]['last_access'] = max( + float(timestamp), users_dict[user]['last_access']) if 'yes_count' not in users_dict[user]: users_dict[user]['yes_count'] = 0 if 'no_count' not in users_dict[user]: @@ -161,14 +166,15 @@ def calculate_avg_annotation_time(users_dict, max_interval=10): if delta_sum == 0: user_dict['avg_time'] = math.inf else: - user_dict['avg_time'] = delta_sum / divider + user_dict['avg_time'] = round(delta_sum / divider, 4) user_dict['breaks'] = breaks def print_sorted(users_dict, sortby='annotations max'): - print('\t'.join( - ['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', - 'status'])) + print('\t'.join([ + 'cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'status', + 'last_access' + ])) if sortby == 'annotations max': keyfunc = lambda x: len(x[1]['annotations']) for user, user_dict in sorted( @@ -190,7 +196,8 @@ def print_sorted(users_dict, sortby='annotations max'): str(user_dict['yes_count']), str(user_dict['no_count']), str(user_dict['avg_time']), - str(user_dict['breaks']), status + str(user_dict['breaks']), status, + format_time(user_dict['last_access']) ])) diff --git a/evaluate.py b/evaluate.py new file mode 100755 index 0000000..08bff01 --- /dev/null +++ b/evaluate.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +import sys + +positive = 0 +negative = 0 +true_positive = 0 +false_positive = 0 +true_negative = 0 +false_negative = 0 +for line in sys.stdin: + predicted, actual = line.rstrip('\n').split('\t') + if 'yes' in predicted and 'yes' in actual: + true_positive += 1 + positive += 1 + if 'yes' in predicted and 'no' in actual: + false_positive += 1 + negative += 1 + if 'no' in predicted and 'yes' in actual: + false_negative += 1 + positive += 1 + if 'no' in 
predicted and 'no' in actual:
+        true_negative += 1
+        negative += 1
+
+# true positive rate, sensitivity
+recall = true_positive / positive
+
+# true negative rate
+specificity = true_negative / negative
+
+# positive predictive value
+precision = true_positive / (true_positive + false_positive)
+negative_predictive_value = true_negative / (true_negative + false_negative)
+
+# false negative rate
+miss_rate = 1 - recall  # or false_negative / positive
+
+# false positive rate, (negative miss rate)
+fall_out = 1 - specificity  # or false_positive / negative
+
+false_discovery_rate = 1 - precision  # or false_positive / (false_positive + true_positive)
+false_omission_rate = 1 - negative_predictive_value  # or false_negative / (false_negative + true_negative)
+accuracy = (true_positive + true_negative) / (positive + negative)
+
+f1 = 2 * (precision * recall) / (precision + recall)
+mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
+    (true_positive + false_positive) * (true_positive + false_negative) *
+    (true_negative + false_positive) * (true_negative + false_negative))**0.5
+
+print(f"""
+   Recall = {recall}
+Precision = {precision}
+       F1 = {f1}
+ Accuracy = {accuracy}
+""")
diff --git a/split-data.sh b/split-data.sh
new file mode 100755
index 0000000..a292695
--- /dev/null
+++ b/split-data.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+split_data() {
+    split -l $[ $(wc -l "$1" | cut -d" " -f1) * 1 / 5 ] "$1" part- -d
+    mv part-00 "$1_test.txt"
+    mv part-01 "$1_dev.txt"
+    cat part-0* > "$1_train.txt" && rm part-0* $1
+}
+
+grep '__label__yes' "$1" > yes.txt
+grep '__label__no' "$1" > no.txt
+split_data yes.txt
+split_data no.txt
+cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
+cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
+cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
+
+diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
+rm yes.txt* no.txt*
diff --git a/todos.org b/todos.org
new file mode 100644
index 0000000..1f5c747
--- /dev/null +++ b/todos.org @@ -0,0 +1,19 @@ +** Backlog +*** TODO odsiać binarny content +*** TODO przetestować find_new_key +*** TODO napisać testy do annotatora +*** TODO obsłużyć w annotatorze częściowe godziny typu: 7,00 | 7 00 | 700 +*** TODO handle column pages http://parafiawincentegoapaulo.pl/msze.html +*** TODO handle and tags e.g. + Msze od poniedziałku do soboty rano o godzinie 645 ` +*** TODO Change to levelDB database for cache +*** TODO crawling non domain masses + Scraper shouldn't crawl these types of urls: +https://www.tumblr.com/widgets/share/tool/preview?shareSource=legacy&canonicalUrl=&url=http%3A%2F%2Falbert.zgora.pl%2F2014%2F08%2Fbierzmowanie%2F&title=Bierzmowanie + +Came from: +http://albert.zgora.pl/2014/08/bierzmowanie/ +*** TODO dostęp do redisa bez sudo +** W1 +*** TODO dodać ogłoszenia na duzych portalach katolickich +*** TODO alogrytm do odzielenia trusted od untrusted users diff --git a/tsv2fasttext.py b/tsv2fasttext.py new file mode 100755 index 0000000..441c3c4 --- /dev/null +++ b/tsv2fasttext.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +import csv +import sys +import re + + +def preprocess(prefix, hour, suffix): + sentence = prefix + hour + suffix + sentence = re.sub(r'\\n', r' \\n ', sentence) + sentence = re.sub(' +', ' ', sentence) + return sentence + + +def main(): + # csv.reader(sys.stdin, delimiter='\t') + next(sys.stdin) + for line in sys.stdin: + prefix, hour, suffix, is_mass, yes_count, no_count, url, button_text, depth, filepath, line_no = line.rstrip( + '\n').rsplit('\t') + sentence = preprocess(prefix, hour, suffix) + print(f'__label__{is_mass} {sentence}') + + +if __name__ == '__main__': + main()