First version of the ML hour classifier.

Add last_access field to annotator_console user stats.

Add split-data script.
Add tsv2fasttext.py
Add todos.org.
This commit is contained in:
siulkilulki 2018-05-28 15:10:31 +02:00
parent 6ff4f230db
commit 7dd903b3b5
6 changed files with 150 additions and 6 deletions

View File

@ -3,10 +3,29 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
include /tmp/makeenv include /tmp/makeenv
JOBS := 100 JOBS := 100
.PHONY: all update data clean clean-data clean-cache .PHONY: all update data clean clean-data clean-cache clean-ml
all: data all: data
# annotator_console.py needs running redis instance
# move belo
# NOTE(review): the comment above is truncated in the source ("move belo");
# presumably "move below ..." — confirm the original intent.
# --- fastText hour-classifier pipeline -------------------------------
# Pair each prediction with the gold "__label__(yes|no)" extracted from
# test.txt, then score the pairs with evaluate.py.
# NOTE(review): <(...) is bash process substitution; this recipe needs
# SHELL set to bash — the default /bin/sh would fail. Confirm upstream.
score.txt: predicted.txt test.txt
paste $< <(egrep -o "__label__(yes|no)" $(word 2,$^)) | ./evaluate.py > $@
# Predict labels for test.txt with the trained fastText model.
predicted.txt: fs-model.bin test.txt
./fasttext predict $< $(word 2,$^) > $@
# Train the supervised model; fastText appends ".bin" to the output stem
# itself, hence stripping it with `basename`.
fs-model.bin: train.txt
./fasttext supervised -input $< -output `basename $@ .bin`
# Dump annotations to TSV, convert to fastText format, and split all.txt
# into the three sets (split-data.sh writes train/test/dev.txt).
# NOTE(review): a plain multi-target rule gives each target its own copy
# of the recipe, so under `make -j` this can run three times; a grouped
# target (&:, GNU Make >= 4.3) or a stamp file would be safer.
train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
./$< 2tsv | ./$(word 2,$^) > all.txt
./split-data.sh all.txt
rm all.txt
# Remove all ML build products (clean-ml is added to .PHONY above).
clean-ml:
rm -f train.txt test.txt dev.txt fs-model* predicted.txt score.txt
parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
mkdir -p parishwebsites/{text-data,text-data-logs} mkdir -p parishwebsites/{text-data,text-data-logs}
cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt

View File

@ -119,6 +119,11 @@ def users_stats():
else: else:
yesno, str_index, timestamp = res_list yesno, str_index, timestamp = res_list
ip_addr = '0' ip_addr = '0'
if 'last_access' not in users_dict[user]:
users_dict[user]['last_access'] = float(timestamp)
else:
users_dict[user]['last_access'] = max(
float(timestamp), users_dict[user]['last_access'])
if 'yes_count' not in users_dict[user]: if 'yes_count' not in users_dict[user]:
users_dict[user]['yes_count'] = 0 users_dict[user]['yes_count'] = 0
if 'no_count' not in users_dict[user]: if 'no_count' not in users_dict[user]:
@ -161,14 +166,15 @@ def calculate_avg_annotation_time(users_dict, max_interval=10):
if delta_sum == 0: if delta_sum == 0:
user_dict['avg_time'] = math.inf user_dict['avg_time'] = math.inf
else: else:
user_dict['avg_time'] = delta_sum / divider user_dict['avg_time'] = round(delta_sum / divider, 4)
user_dict['breaks'] = breaks user_dict['breaks'] = breaks
def print_sorted(users_dict, sortby='annotations max'): def print_sorted(users_dict, sortby='annotations max'):
print('\t'.join( print('\t'.join([
['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'status',
'status'])) 'last_access'
]))
if sortby == 'annotations max': if sortby == 'annotations max':
keyfunc = lambda x: len(x[1]['annotations']) keyfunc = lambda x: len(x[1]['annotations'])
for user, user_dict in sorted( for user, user_dict in sorted(
@ -190,7 +196,8 @@ def print_sorted(users_dict, sortby='annotations max'):
str(user_dict['yes_count']), str(user_dict['yes_count']),
str(user_dict['no_count']), str(user_dict['no_count']),
str(user_dict['avg_time']), str(user_dict['avg_time']),
str(user_dict['breaks']), status str(user_dict['breaks']), status,
format_time(user_dict['last_access'])
])) ]))

55
evaluate.py Executable file
View File

@ -0,0 +1,55 @@
#!/usr/bin/env python3
"""Read tab-separated "predicted<TAB>actual" fastText labels from stdin and
print evaluation metrics for the binary yes/no classifier."""
import sys


def _safe_div(numerator, denominator):
    """Divide, returning 0.0 on a zero denominator (e.g. the input stream
    has no positive or no negative examples) instead of raising."""
    return numerator / denominator if denominator else 0.0


def compute_metrics(pairs):
    """Build the confusion matrix over (predicted, actual) label pairs and
    return a dict of derived metrics.

    A label string containing 'yes' counts as positive and one containing
    'no' as negative, matching fastText's '__label__yes' / '__label__no'.
    Returned keys: recall, specificity, precision,
    negative_predictive_value, miss_rate, fall_out, false_discovery_rate,
    false_omission_rate, accuracy, f1, mcc.
    """
    positive = negative = 0
    true_positive = false_positive = 0
    true_negative = false_negative = 0
    for predicted, actual in pairs:
        if 'yes' in predicted and 'yes' in actual:
            true_positive += 1
            positive += 1
        if 'yes' in predicted and 'no' in actual:
            false_positive += 1
            negative += 1
        if 'no' in predicted and 'yes' in actual:
            false_negative += 1
            positive += 1
        if 'no' in predicted and 'no' in actual:
            true_negative += 1
            negative += 1
    # true positive rate, sensitivity
    recall = _safe_div(true_positive, positive)
    # true negative rate
    specificity = _safe_div(true_negative, negative)
    # positive predictive value
    precision = _safe_div(true_positive, true_positive + false_positive)
    negative_predictive_value = _safe_div(
        true_negative, true_negative + false_negative)
    # false negative rate
    miss_rate = 1 - recall  # or false_negative / positive
    # false positive rate
    fall_out = 1 - specificity  # or false_positive / negative
    false_discovery_rate = 1 - precision
    false_omission_rate = 1 - negative_predictive_value
    accuracy = _safe_div(true_positive + true_negative, positive + negative)
    f1 = _safe_div(2 * precision * recall, precision + recall)
    # Matthews correlation coefficient. BUGFIX: the WHOLE numerator
    # (tp*tn - fp*fn) must be divided by the root of the product; the
    # original divided only fp*fn because of operator precedence.
    mcc = _safe_div(
        true_positive * true_negative - false_positive * false_negative,
        ((true_positive + false_positive) * (true_positive + false_negative)
         * (true_negative + false_positive) *
         (true_negative + false_negative))**0.5)
    return {
        'recall': recall,
        'specificity': specificity,
        'precision': precision,
        'negative_predictive_value': negative_predictive_value,
        'miss_rate': miss_rate,
        'fall_out': fall_out,
        'false_discovery_rate': false_discovery_rate,
        'false_omission_rate': false_omission_rate,
        'accuracy': accuracy,
        'f1': f1,
        'mcc': mcc,
    }


def main():
    """Parse stdin lines (predicted TAB actual) and print the summary
    report in the same format as before."""
    pairs = (line.rstrip('\n').split('\t') for line in sys.stdin)
    metrics = compute_metrics(pairs)
    print(f"""
Recall = {metrics['recall']}
Precision = {metrics['precision']}
F1 = {metrics['f1']}
Accuracy = {metrics['accuracy']}
""")


if __name__ == '__main__':
    main()

19
split-data.sh Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Split a fastText-labelled file ("$1") into train/test/dev sets.
# The split is stratified: each label class is divided separately so the
# yes/no balance is preserved in every output file (test and dev each get
# roughly one fifth of a class, train the remaining ~three fifths).

# split_data FILE: cut FILE into ~5 equal line chunks; chunk 00 becomes
# FILE_test.txt, chunk 01 FILE_dev.txt, the rest are merged into
# FILE_train.txt.  FILE itself is removed afterwards.
split_data() {
    # POSIX $(( )) arithmetic instead of the deprecated $[ ] form.
    split -l $(( $(wc -l "$1" | cut -d" " -f1) / 5 )) "$1" part- -d
    mv part-00 "$1_test.txt"
    mv part-01 "$1_dev.txt"
    cat part-0* > "$1_train.txt" && rm part-0* "$1"
}

grep '__label__yes' "$1" > yes.txt
grep '__label__no' "$1" > no.txt
split_data yes.txt
split_data no.txt
# Merge the per-class parts and shuffle so the labels are interleaved.
cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
# Sanity check: the three splits together must contain exactly the input
# lines.  diff prints any discrepancy; it does not abort the script.
diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
rm yes.txt* no.txt*

19
todos.org Normal file
View File

@ -0,0 +1,19 @@
** Backlog
*** TODO odsiać binarny content
*** TODO przetestować find_new_key
*** TODO napisać testy do annotatora
*** TODO obsłużyć w annotatorze częściowe godziny typu: 7,00 | 7 00 | 700
*** TODO handle column pages http://parafiawincentegoapaulo.pl/msze.html
*** TODO handle <sup> and <sub> tags e.g.
Msze od poniedziałku do soboty rano o godzinie 6<sup>45 </sup>`
*** TODO Change to levelDB database for cache
*** TODO crawling non domain masses
Scraper shouldn't crawl these types of urls:
https://www.tumblr.com/widgets/share/tool/preview?shareSource=legacy&canonicalUrl=&url=http%3A%2F%2Falbert.zgora.pl%2F2014%2F08%2Fbierzmowanie%2F&title=Bierzmowanie
Came from:
http://albert.zgora.pl/2014/08/bierzmowanie/
*** TODO dostęp do redisa bez sudo
** W1
*** TODO dodać ogłoszenia na duzych portalach katolickich
*** TODO algorytm do oddzielenia trusted od untrusted users

25
tsv2fasttext.py Executable file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env python3
import csv
import sys
import re
def preprocess(prefix, hour, suffix):
    """Join the context around an hour into one training sentence.

    Literal two-character '\\n' markers in the text are padded with
    spaces so they become standalone tokens, and runs of spaces are
    collapsed to a single space.
    """
    joined = ''.join((prefix, hour, suffix))
    # The raw pattern matches the two characters backslash + 'n' (a
    # literal "\n" marker in the TSV), not an actual newline.
    padded = re.sub(r'\\n', r' \\n ', joined)
    return re.sub(' +', ' ', padded)
def main():
    """Convert annotator TSV on stdin to fastText training lines on
    stdout: one '__label__{is_mass} {sentence}' line per data row."""
    next(sys.stdin)  # drop the header row
    for raw in sys.stdin:
        # Strict unpacking: a row without exactly 11 fields raises.
        (prefix, hour, suffix, is_mass, yes_count, no_count, url,
         button_text, depth, filepath, line_no) = raw.rstrip('\n').rsplit('\t')
        print(f'__label__{is_mass} {preprocess(prefix, hour, suffix)}')


if __name__ == '__main__':
    main()