First version of the ML hour classifier.
Add last_access field to annotator_console user stats. Add split-data script. Add tsv2fasttext.py. Add todos.org.
parent 6ff4f230db
commit 7dd903b3b5
Makefile | 21
@@ -3,10 +3,29 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
 JOBS := 100
 
-.PHONY: all update data clean clean-data clean-cache
+.PHONY: all update data clean clean-data clean-cache clean-ml
 
 all: data
 
 # annotator_console.py needs running redis instance
+# move belo
+score.txt: predicted.txt test.txt
+	paste $< <(egrep -o "__label__(yes|no)" $(word 2,$^)) | ./evaluate.py > $@
+
+predicted.txt: fs-model.bin test.txt
+	./fasttext predict $< $(word 2,$^) > $@
+
+fs-model.bin: train.txt
+	./fasttext supervised -input $< -output `basename $@ .bin`
+
+train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
+	./$< 2tsv | ./$(word 2,$^) > all.txt
+	./split-data.sh all.txt
+	rm all.txt
+
+clean-ml:
+	rm -f train.txt test.txt dev.txt fs-model* predicted.txt score.txt
+
 parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
 	mkdir -p parishwebsites/{text-data,text-data-logs}
 	cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt
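The new targets wire up the whole loop: annotator export, tsv2fasttext.py, split-data.sh, then fastText training, prediction, and scoring. The same train/predict/score chain can be sketched with the fastText Python bindings; this is a sketch assuming the `fasttext` pip package, whereas the commit itself shells out to the ./fasttext binary:

#!/usr/bin/env python3
# Sketch of the Makefile pipeline via the fastText Python bindings
# (assumption: the `fasttext` pip package is installed; the commit
# uses the ./fasttext CLI instead).
import fasttext

model = fasttext.train_supervised(input='train.txt')  # fs-model.bin target
model.save_model('fs-model.bin')

correct = total = 0
with open('test.txt') as f:
    for line in f:
        label, _, text = line.rstrip('\n').partition(' ')
        predicted = model.predict(text)[0][0]  # e.g. '__label__yes'
        correct += predicted == label
        total += 1
print(f'accuracy: {correct / total:.4f}')  # rough stand-in for score.txt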
annotator_console.py

@@ -119,6 +119,11 @@ def users_stats():
         else:
             yesno, str_index, timestamp = res_list
             ip_addr = '0'
+        if 'last_access' not in users_dict[user]:
+            users_dict[user]['last_access'] = float(timestamp)
+        else:
+            users_dict[user]['last_access'] = max(
+                float(timestamp), users_dict[user]['last_access'])
         if 'yes_count' not in users_dict[user]:
             users_dict[user]['yes_count'] = 0
         if 'no_count' not in users_dict[user]:
@@ -161,14 +166,15 @@ def calculate_avg_annotation_time(users_dict, max_interval=10):
         if delta_sum == 0:
             user_dict['avg_time'] = math.inf
         else:
-            user_dict['avg_time'] = delta_sum / divider
+            user_dict['avg_time'] = round(delta_sum / divider, 4)
         user_dict['breaks'] = breaks


 def print_sorted(users_dict, sortby='annotations max'):
-    print('\t'.join(
-        ['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks',
-         'status']))
+    print('\t'.join([
+        'cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'status',
+        'last_access'
+    ]))
     if sortby == 'annotations max':
         keyfunc = lambda x: len(x[1]['annotations'])
     for user, user_dict in sorted(
@@ -190,7 +196,8 @@ def print_sorted(users_dict, sortby='annotations max'):
             str(user_dict['yes_count']),
             str(user_dict['no_count']),
             str(user_dict['avg_time']),
-            str(user_dict['breaks']), status
+            str(user_dict['breaks']), status,
+            format_time(user_dict['last_access'])
         ]))
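The added column calls a format_time helper that is not part of this hunk. A hypothetical sketch of its shape, consistent with last_access being stored as a float Unix timestamp; the real implementation lives elsewhere in annotator_console.py and may differ:

# Hypothetical sketch of the format_time helper referenced above.
import datetime

def format_time(timestamp):
    # last_access is stored as a float Unix timestamp (see users_stats)
    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')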
evaluate.py | 55 (new executable file)
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+import sys
+
+positive = 0
+negative = 0
+true_positive = 0
+false_positive = 0
+true_negative = 0
+false_negative = 0
+for line in sys.stdin:
+    predicted, actual = line.rstrip('\n').split('\t')
+    if 'yes' in predicted and 'yes' in actual:
+        true_positive += 1
+        positive += 1
+    if 'yes' in predicted and 'no' in actual:
+        false_positive += 1
+        negative += 1
+    if 'no' in predicted and 'yes' in actual:
+        false_negative += 1
+        positive += 1
+    if 'no' in predicted and 'no' in actual:
+        true_negative += 1
+        negative += 1
+
+# true positive rate, sensitivity
+recall = true_positive / positive
+
+# true negative rate
+specificity = true_negative / negative
+
+# positive predictive value
+precision = true_positive / (true_positive + false_positive)
+negative_predictive_value = true_negative / (true_negative + false_negative)
+
+# false negative rate
+miss_rate = 1 - recall  # or false_negative / positive
+
+# false positive rate, (negative miss rate)
+fall_out = 1 - specificity  # or false_positive / negative
+
+false_discovery_rate = 1 - precision  # or false_positive / (false_positive + true_positive)
+false_omission_rate = 1 - negative_predictive_value  # or false_negative / (false_negative + true_negative)
+accuracy = (true_positive + true_negative) / (positive + negative)
+
+f1 = 2 * (precision * recall) / (precision + recall)
+mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
+    (true_positive + false_positive) * (true_positive + false_negative) *
+    (true_negative + false_positive) * (true_negative + false_negative))**0.5
+
+print(f"""
+Recall = {recall}
+Precision = {precision}
+F1 = {f1}
+Accuracy = {accuracy}
+""")
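A quick smoke test for evaluate.py: pipe in toy "predicted<TAB>actual" lines (the invented counts below give 8 TP, 2 FN, 1 FP, 9 TN) and check the output against the formulas above:

# Toy check for evaluate.py with invented data: expects recall 0.8,
# precision ~0.889, F1 ~0.842, accuracy 0.85.
import subprocess

lines = (['__label__yes\t__label__yes'] * 8 +  # true positives
         ['__label__no\t__label__yes'] * 2 +   # false negatives
         ['__label__yes\t__label__no'] * 1 +   # false positive
         ['__label__no\t__label__no'] * 9)     # true negatives
out = subprocess.run(['./evaluate.py'], input='\n'.join(lines) + '\n',
                     capture_output=True, text=True).stdout
print(out)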
split-data.sh | 19 (new executable file)
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+split_data() {
+	split -l $(( $(wc -l "$1" | cut -d" " -f1) / 5 )) "$1" part- -d
+	mv part-00 "$1_test.txt"
+	mv part-01 "$1_dev.txt"
+	cat part-0* > "$1_train.txt" && rm part-0* "$1"
+}
+
+grep '__label__yes' "$1" > yes.txt
+grep '__label__no' "$1" > no.txt
+split_data yes.txt
+split_data no.txt
+cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
+cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
+cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
+
+diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
+rm yes.txt* no.txt*
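split-data.sh stratifies by label before splitting: yes.txt and no.txt are each cut into fifths, with the first fifth becoming test, the second dev, and the rest train, and the final diff asserts that no lines were lost. A minimal Python sketch of the same 20/20/60 stratified split; illustrative only, the commit uses the shell script above:

# Label-stratified 20/20/60 split, mirroring split-data.sh.
import random

def fifths(lines):
    n = len(lines) // 5
    return {'test': lines[:n], 'dev': lines[n:2*n], 'train': lines[2*n:]}

with open('all.txt') as f:  # all.txt as produced by the Makefile target
    rows = f.read().splitlines()
parts = {'test': [], 'dev': [], 'train': []}
for label in ('__label__yes', '__label__no'):
    subset = [r for r in rows if label in r]
    for name, chunk in fifths(subset).items():
        parts[name].extend(chunk)
for name, chunk in parts.items():
    random.shuffle(chunk)  # plays the role of shuf
    with open(f'{name}.txt', 'w') as out:
        out.write('\n'.join(chunk) + '\n')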
todos.org | 19 (new file)
@@ -0,0 +1,19 @@
+** Backlog
+*** TODO filter out binary content
+*** TODO test find_new_key
+*** TODO write tests for the annotator
+*** TODO handle partial hour formats in the annotator, e.g.: 7,00 | 7 00 | 700
+*** TODO handle column pages http://parafiawincentegoapaulo.pl/msze.html
+*** TODO handle <sup> and <sub> tags, e.g.
+Msze od poniedziałku do soboty rano o godzinie 6<sup>45 </sup>`
+*** TODO change to a LevelDB database for the cache
+*** TODO crawling non-domain masses
+The scraper shouldn't crawl these kinds of URLs:
+https://www.tumblr.com/widgets/share/tool/preview?shareSource=legacy&canonicalUrl=&url=http%3A%2F%2Falbert.zgora.pl%2F2014%2F08%2Fbierzmowanie%2F&title=Bierzmowanie
+
+Came from:
+http://albert.zgora.pl/2014/08/bierzmowanie/
+*** TODO access to Redis without sudo
+** W1
+*** TODO add announcements on large Catholic portals
+*** TODO algorithm to separate trusted from untrusted users
tsv2fasttext.py | 25 (new executable file)
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import csv
+import sys
+import re
+
+
+def preprocess(prefix, hour, suffix):
+    sentence = prefix + hour + suffix
+    sentence = re.sub(r'\\n', r' \\n ', sentence)  # pad literal "\n" marks so they tokenize separately
+    sentence = re.sub(' +', ' ', sentence)  # collapse runs of spaces
+    return sentence
+
+
+def main():
+    # csv.reader(sys.stdin, delimiter='\t')
+    next(sys.stdin)  # skip the TSV header line
+    for line in sys.stdin:
+        prefix, hour, suffix, is_mass, yes_count, no_count, url, button_text, depth, filepath, line_no = line.rstrip(
+            '\n').split('\t')
+        sentence = preprocess(prefix, hour, suffix)
+        print(f'__label__{is_mass} {sentence}')
+
+
+if __name__ == '__main__':
+    main()
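An illustration of the conversion with invented field values; the real TSV comes from `./annotator_console.py 2tsv`, and only the first four columns matter for the fastText line:

# Hypothetical round-trip through preprocess (field values invented).
from tsv2fasttext import preprocess

row = 'Msze o \t7.30\t i 9.00\tyes\t3\t0\thttp://example.org\tOK\t1\tf.txt\t42'
prefix, hour, suffix, is_mass = row.split('\t')[:4]
print(f'__label__{is_mass} {preprocess(prefix, hour, suffix)}')
# -> __label__yes Msze o 7.30 i 9.00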