First version of the ML hour classifier.

Add last_access field to annotator_console user stats.

Add split-data script.
Add tsv2fasttext.py
Add todos.org.
This commit is contained in:
siulkilulki 2018-05-28 15:10:31 +02:00
parent 6ff4f230db
commit 7dd903b3b5
6 changed files with 150 additions and 6 deletions

View File

@ -3,10 +3,29 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
include /tmp/makeenv include /tmp/makeenv
JOBS := 100 JOBS := 100
.PHONY: all update data clean clean-data clean-cache .PHONY: all update data clean clean-data clean-cache clean-ml
all: data all: data
# annotator_console.py needs running redis instance
# move belo
# NOTE(review): the comment above is truncated in the source ("move belo");
# presumably "move below ..." — confirm the original intent.
# --- fastText hour-classifier pipeline -------------------------------
# Pair each prediction with the gold "__label__(yes|no)" extracted from
# test.txt, then score the pairs with evaluate.py.
# NOTE(review): <(...) is bash process substitution; this recipe needs
# SHELL set to bash — the default /bin/sh would fail. Confirm upstream.
score.txt: predicted.txt test.txt
paste $< <(egrep -o "__label__(yes|no)" $(word 2,$^)) | ./evaluate.py > $@
# Predict labels for test.txt with the trained fastText model.
predicted.txt: fs-model.bin test.txt
./fasttext predict $< $(word 2,$^) > $@
# Train the supervised model; fastText appends ".bin" to the output stem
# itself, hence stripping it with `basename`.
fs-model.bin: train.txt
./fasttext supervised -input $< -output `basename $@ .bin`
# Dump annotations to TSV, convert to fastText format, and split all.txt
# into the three sets (split-data.sh writes train/test/dev.txt).
# NOTE(review): a plain multi-target rule gives each target its own copy
# of the recipe, so under `make -j` this can run three times; a grouped
# target (&:, GNU Make >= 4.3) or a stamp file would be safer.
train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
./$< 2tsv | ./$(word 2,$^) > all.txt
./split-data.sh all.txt
rm all.txt
# Remove all ML build products (clean-ml is added to .PHONY above).
clean-ml:
rm -f train.txt test.txt dev.txt fs-model* predicted.txt score.txt
parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
mkdir -p parishwebsites/{text-data,text-data-logs} mkdir -p parishwebsites/{text-data,text-data-logs}
cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt

View File

@ -119,6 +119,11 @@ def users_stats():
else: else:
yesno, str_index, timestamp = res_list yesno, str_index, timestamp = res_list
ip_addr = '0' ip_addr = '0'
if 'last_access' not in users_dict[user]:
users_dict[user]['last_access'] = float(timestamp)
else:
users_dict[user]['last_access'] = max(
float(timestamp), users_dict[user]['last_access'])
if 'yes_count' not in users_dict[user]: if 'yes_count' not in users_dict[user]:
users_dict[user]['yes_count'] = 0 users_dict[user]['yes_count'] = 0
if 'no_count' not in users_dict[user]: if 'no_count' not in users_dict[user]:
@ -161,14 +166,15 @@ def calculate_avg_annotation_time(users_dict, max_interval=10):
if delta_sum == 0: if delta_sum == 0:
user_dict['avg_time'] = math.inf user_dict['avg_time'] = math.inf
else: else:
user_dict['avg_time'] = delta_sum / divider user_dict['avg_time'] = round(delta_sum / divider, 4)
user_dict['breaks'] = breaks user_dict['breaks'] = breaks
def print_sorted(users_dict, sortby='annotations max'): def print_sorted(users_dict, sortby='annotations max'):
print('\t'.join( print('\t'.join([
['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'status',
'status'])) 'last_access'
]))
if sortby == 'annotations max': if sortby == 'annotations max':
keyfunc = lambda x: len(x[1]['annotations']) keyfunc = lambda x: len(x[1]['annotations'])
for user, user_dict in sorted( for user, user_dict in sorted(
@ -190,7 +196,8 @@ def print_sorted(users_dict, sortby='annotations max'):
str(user_dict['yes_count']), str(user_dict['yes_count']),
str(user_dict['no_count']), str(user_dict['no_count']),
str(user_dict['avg_time']), str(user_dict['avg_time']),
str(user_dict['breaks']), status str(user_dict['breaks']), status,
format_time(user_dict['last_access'])
])) ]))

55
evaluate.py Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env python3
"""Read tab-separated "predicted<TAB>actual" fastText labels from stdin and
print evaluation metrics for the binary yes/no classifier."""
import sys


def _safe_div(numerator, denominator):
    """Divide, returning 0.0 on a zero denominator (e.g. the input stream
    has no positive or no negative examples) instead of raising."""
    return numerator / denominator if denominator else 0.0


def compute_metrics(pairs):
    """Build the confusion matrix over (predicted, actual) label pairs and
    return a dict of derived metrics.

    A label string containing 'yes' counts as positive and one containing
    'no' as negative, matching fastText's '__label__yes' / '__label__no'.
    Returned keys: recall, specificity, precision,
    negative_predictive_value, miss_rate, fall_out, false_discovery_rate,
    false_omission_rate, accuracy, f1, mcc.
    """
    positive = negative = 0
    true_positive = false_positive = 0
    true_negative = false_negative = 0
    for predicted, actual in pairs:
        if 'yes' in predicted and 'yes' in actual:
            true_positive += 1
            positive += 1
        if 'yes' in predicted and 'no' in actual:
            false_positive += 1
            negative += 1
        if 'no' in predicted and 'yes' in actual:
            false_negative += 1
            positive += 1
        if 'no' in predicted and 'no' in actual:
            true_negative += 1
            negative += 1
    # true positive rate, sensitivity
    recall = _safe_div(true_positive, positive)
    # true negative rate
    specificity = _safe_div(true_negative, negative)
    # positive predictive value
    precision = _safe_div(true_positive, true_positive + false_positive)
    negative_predictive_value = _safe_div(
        true_negative, true_negative + false_negative)
    # false negative rate
    miss_rate = 1 - recall  # or false_negative / positive
    # false positive rate
    fall_out = 1 - specificity  # or false_positive / negative
    false_discovery_rate = 1 - precision
    false_omission_rate = 1 - negative_predictive_value
    accuracy = _safe_div(true_positive + true_negative, positive + negative)
    f1 = _safe_div(2 * precision * recall, precision + recall)
    # Matthews correlation coefficient. BUGFIX: the WHOLE numerator
    # (tp*tn - fp*fn) must be divided by the root of the product; the
    # original divided only fp*fn because of operator precedence.
    mcc = _safe_div(
        true_positive * true_negative - false_positive * false_negative,
        ((true_positive + false_positive) * (true_positive + false_negative)
         * (true_negative + false_positive) *
         (true_negative + false_negative))**0.5)
    return {
        'recall': recall,
        'specificity': specificity,
        'precision': precision,
        'negative_predictive_value': negative_predictive_value,
        'miss_rate': miss_rate,
        'fall_out': fall_out,
        'false_discovery_rate': false_discovery_rate,
        'false_omission_rate': false_omission_rate,
        'accuracy': accuracy,
        'f1': f1,
        'mcc': mcc,
    }


def main():
    """Parse stdin lines (predicted TAB actual) and print the summary
    report in the same format as before."""
    pairs = (line.rstrip('\n').split('\t') for line in sys.stdin)
    metrics = compute_metrics(pairs)
    print(f"""
Recall = {metrics['recall']}
Precision = {metrics['precision']}
F1 = {metrics['f1']}
Accuracy = {metrics['accuracy']}
""")


if __name__ == '__main__':
    main()

19
split-data.sh Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Split a fastText-labelled file ("$1") into train/test/dev sets.
# The split is stratified: each label class is divided separately so the
# yes/no balance is preserved in every output file (test and dev each get
# roughly one fifth of a class, train the remaining ~three fifths).

# split_data FILE: cut FILE into ~5 equal line chunks; chunk 00 becomes
# FILE_test.txt, chunk 01 FILE_dev.txt, the rest are merged into
# FILE_train.txt.  FILE itself is removed afterwards.
split_data() {
    # POSIX $(( )) arithmetic instead of the deprecated $[ ] form.
    split -l $(( $(wc -l "$1" | cut -d" " -f1) / 5 )) "$1" part- -d
    mv part-00 "$1_test.txt"
    mv part-01 "$1_dev.txt"
    cat part-0* > "$1_train.txt" && rm part-0* "$1"
}

grep '__label__yes' "$1" > yes.txt
grep '__label__no' "$1" > no.txt
split_data yes.txt
split_data no.txt
# Merge the per-class parts and shuffle so the labels are interleaved.
cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
# Sanity check: the three splits together must contain exactly the input
# lines.  diff prints any discrepancy; it does not abort the script.
diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
rm yes.txt* no.txt*

19
todos.org Normal file
View File

@ -0,0 +1,19 @@
** Backlog
*** TODO odsiać binarny content
*** TODO przetestować find_new_key
*** TODO napisać testy do annotatora
*** TODO obsłużyć w annotatorze częściowe godziny typu: 7,00 | 7 00 | 700
*** TODO handle column pages http://parafiawincentegoapaulo.pl/msze.html
*** TODO handle <sup> and <sub> tags e.g.
Msze od poniedziałku do soboty rano o godzinie 6<sup>45 </sup>`
*** TODO Change to levelDB database for cache
*** TODO crawling non domain masses
Scraper shouldn't crawl these types of urls:
https://www.tumblr.com/widgets/share/tool/preview?shareSource=legacy&canonicalUrl=&url=http%3A%2F%2Falbert.zgora.pl%2F2014%2F08%2Fbierzmowanie%2F&title=Bierzmowanie
Came from:
http://albert.zgora.pl/2014/08/bierzmowanie/
*** TODO dostęp do redisa bez sudo
** W1
*** TODO dodać ogłoszenia na duzych portalach katolickich
*** TODO algorytm do oddzielenia trusted od untrusted users

25
tsv2fasttext.py Executable file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env python3
import csv
import sys
import re
def preprocess(prefix, hour, suffix):
    """Join the context around an hour into one training sentence.

    Literal two-character '\\n' markers in the text are padded with
    spaces so they become standalone tokens, and runs of spaces are
    collapsed to a single space.
    """
    joined = ''.join((prefix, hour, suffix))
    # The raw pattern matches the two characters backslash + 'n' (a
    # literal "\n" marker in the TSV), not an actual newline.
    padded = re.sub(r'\\n', r' \\n ', joined)
    return re.sub(' +', ' ', padded)
def main():
    """Convert annotator TSV on stdin to fastText training lines on
    stdout: one '__label__{is_mass} {sentence}' line per data row."""
    next(sys.stdin)  # drop the header row
    for raw in sys.stdin:
        # Strict unpacking: a row without exactly 11 fields raises.
        (prefix, hour, suffix, is_mass, yes_count, no_count, url,
         button_text, depth, filepath, line_no) = raw.rstrip('\n').rsplit('\t')
        print(f'__label__{is_mass} {preprocess(prefix, hour, suffix)}')


if __name__ == '__main__':
    main()