First version of the ML hour classifier.
Add last_access field to annotator_console user stats. Add split-data script. Add tsv2fasttext.py. Add todos.org.
parent 6ff4f230db
commit 7dd903b3b5
Makefile | 21
@@ -3,10 +3,29 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
 include /tmp/makeenv
 JOBS := 100
 
-.PHONY: all update data clean clean-data clean-cache
+.PHONY: all update data clean clean-data clean-cache clean-ml
 
 all: data
 
 # annotator_console.py needs running redis instance
+# move belo
+score.txt: predicted.txt test.txt
+	paste $< <(egrep -o "__label__(yes|no)" $(word 2,$^)) | ./evaluate.py > $@
+
+predicted.txt: fs-model.bin test.txt
+	./fasttext predict $< $(word 2,$^) > $@
+
+fs-model.bin: train.txt
+	./fasttext supervised -input $< -output `basename $@ .bin`
+
+train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
+	./$< 2tsv | ./$(word 2,$^) > all.txt
+	./split-data.sh all.txt
+	rm all.txt
+
+clean-ml:
+	rm -f train.txt test.txt dev.txt fs-model* predicted.txt score.txt
+
 parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
 	mkdir -p parishwebsites/{text-data,text-data-logs}
 	cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt
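The new targets wire up the whole loop: annotator export, tsv2fasttext.py, split-data.sh, then fastText training, prediction, and scoring. The same train/predict/score chain can be sketched with the fastText Python bindings; this is a sketch assuming the `fasttext` pip package, whereas the commit itself shells out to the ./fasttext binary:

#!/usr/bin/env python3
# Sketch of the Makefile pipeline via the fastText Python bindings
# (assumption: the `fasttext` pip package is installed; the commit
# uses the ./fasttext CLI instead).
import fasttext

model = fasttext.train_supervised(input='train.txt')  # fs-model.bin target
model.save_model('fs-model.bin')

correct = total = 0
with open('test.txt') as f:
    for line in f:
        label, _, text = line.rstrip('\n').partition(' ')
        predicted = model.predict(text)[0][0]  # e.g. '__label__yes'
        correct += predicted == label
        total += 1
print(f'accuracy: {correct / total:.4f}')  # rough stand-in for score.txt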
annotator_console.py

@@ -119,6 +119,11 @@ def users_stats():
         else:
             yesno, str_index, timestamp = res_list
             ip_addr = '0'
+        if 'last_access' not in users_dict[user]:
+            users_dict[user]['last_access'] = float(timestamp)
+        else:
+            users_dict[user]['last_access'] = max(
+                float(timestamp), users_dict[user]['last_access'])
         if 'yes_count' not in users_dict[user]:
             users_dict[user]['yes_count'] = 0
         if 'no_count' not in users_dict[user]:
@@ -161,14 +166,15 @@ def calculate_avg_annotation_time(users_dict, max_interval=10):
         if delta_sum == 0:
             user_dict['avg_time'] = math.inf
         else:
-            user_dict['avg_time'] = delta_sum / divider
+            user_dict['avg_time'] = round(delta_sum / divider, 4)
         user_dict['breaks'] = breaks


 def print_sorted(users_dict, sortby='annotations max'):
-    print('\t'.join(
-        ['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks',
-         'status']))
+    print('\t'.join([
+        'cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'status',
+        'last_access'
+    ]))
     if sortby == 'annotations max':
         keyfunc = lambda x: len(x[1]['annotations'])
     for user, user_dict in sorted(
@@ -190,7 +196,8 @@ def print_sorted(users_dict, sortby='annotations max'):
             str(user_dict['yes_count']),
             str(user_dict['no_count']),
             str(user_dict['avg_time']),
-            str(user_dict['breaks']), status
+            str(user_dict['breaks']), status,
+            format_time(user_dict['last_access'])
         ]))
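The added column calls a format_time helper that is not part of this hunk. A hypothetical sketch of its shape, consistent with last_access being stored as a float Unix timestamp; the real implementation lives elsewhere in annotator_console.py and may differ:

# Hypothetical sketch of the format_time helper referenced above.
import datetime

def format_time(timestamp):
    # last_access is stored as a float Unix timestamp (see users_stats)
    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')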
evaluate.py | 55 (new executable file)
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+import sys
+
+positive = 0
+negative = 0
+true_positive = 0
+false_positive = 0
+true_negative = 0
+false_negative = 0
+for line in sys.stdin:
+    predicted, actual = line.rstrip('\n').split('\t')
+    if 'yes' in predicted and 'yes' in actual:
+        true_positive += 1
+        positive += 1
+    if 'yes' in predicted and 'no' in actual:
+        false_positive += 1
+        negative += 1
+    if 'no' in predicted and 'yes' in actual:
+        false_negative += 1
+        positive += 1
+    if 'no' in predicted and 'no' in actual:
+        true_negative += 1
+        negative += 1
+
+# true positive rate, sensitivity
+recall = true_positive / positive
+
+# true negative rate
+specificity = true_negative / negative
+
+# positive predictive value
+precision = true_positive / (true_positive + false_positive)
+negative_predictive_value = true_negative / (true_negative + false_negative)
+
+# false negative rate
+miss_rate = 1 - recall  # or false_negative / positive
+
+# false positive rate, (negative miss rate)
+fall_out = 1 - specificity  # or false_positive / negative
+
+false_discovery_rate = 1 - precision  # or false_positive / (false_positive + true_positive)
+false_omission_rate = 1 - negative_predictive_value  # or false_negative / (false_negative + true_negative)
+accuracy = (true_positive + true_negative) / (positive + negative)
+
+f1 = 2 * (precision * recall) / (precision + recall)
+mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
+    (true_positive + false_positive) * (true_positive + false_negative) *
+    (true_negative + false_positive) * (true_negative + false_negative))**0.5
+
+print(f"""
+Recall = {recall}
+Precision = {precision}
+F1 = {f1}
+Accuracy = {accuracy}
+""")
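A quick smoke test for evaluate.py: pipe in toy "predicted<TAB>actual" lines (the invented counts below give 8 TP, 2 FN, 1 FP, 9 TN) and check the output against the formulas above:

# Toy check for evaluate.py with invented data: expects recall 0.8,
# precision ~0.889, F1 ~0.842, accuracy 0.85.
import subprocess

lines = (['__label__yes\t__label__yes'] * 8 +  # true positives
         ['__label__no\t__label__yes'] * 2 +   # false negatives
         ['__label__yes\t__label__no'] * 1 +   # false positive
         ['__label__no\t__label__no'] * 9)     # true negatives
out = subprocess.run(['./evaluate.py'], input='\n'.join(lines) + '\n',
                     capture_output=True, text=True).stdout
print(out)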
split-data.sh | 19 (new executable file)
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+split_data() {
+	split -l $(( $(wc -l "$1" | cut -d" " -f1) / 5 )) "$1" part- -d
+	mv part-00 "$1_test.txt"
+	mv part-01 "$1_dev.txt"
+	cat part-0* > "$1_train.txt" && rm part-0* "$1"
+}
+
+grep '__label__yes' "$1" > yes.txt
+grep '__label__no' "$1" > no.txt
+split_data yes.txt
+split_data no.txt
+cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
+cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
+cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
+
+diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
+rm yes.txt* no.txt*
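split-data.sh stratifies by label before splitting: yes.txt and no.txt are each cut into fifths, with the first fifth becoming test, the second dev, and the rest train, and the final diff asserts that no lines were lost. A minimal Python sketch of the same 20/20/60 stratified split; illustrative only, the commit uses the shell script above:

# Label-stratified 20/20/60 split, mirroring split-data.sh.
import random

def fifths(lines):
    n = len(lines) // 5
    return {'test': lines[:n], 'dev': lines[n:2*n], 'train': lines[2*n:]}

with open('all.txt') as f:  # all.txt as produced by the Makefile target
    rows = f.read().splitlines()
parts = {'test': [], 'dev': [], 'train': []}
for label in ('__label__yes', '__label__no'):
    subset = [r for r in rows if label in r]
    for name, chunk in fifths(subset).items():
        parts[name].extend(chunk)
for name, chunk in parts.items():
    random.shuffle(chunk)  # plays the role of shuf
    with open(f'{name}.txt', 'w') as out:
        out.write('\n'.join(chunk) + '\n')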
todos.org | 19 (new file)
@@ -0,0 +1,19 @@
+** Backlog
+*** TODO filter out binary content
+*** TODO test find_new_key
+*** TODO write tests for the annotator
+*** TODO handle partial hour formats in the annotator, e.g.: 7,00 | 7 00 | 700
+*** TODO handle column pages http://parafiawincentegoapaulo.pl/msze.html
+*** TODO handle <sup> and <sub> tags, e.g.
+Msze od poniedziałku do soboty rano o godzinie 6<sup>45 </sup>`
+*** TODO change to a LevelDB database for the cache
+*** TODO crawling non-domain masses
+The scraper shouldn't crawl these kinds of URLs:
+https://www.tumblr.com/widgets/share/tool/preview?shareSource=legacy&canonicalUrl=&url=http%3A%2F%2Falbert.zgora.pl%2F2014%2F08%2Fbierzmowanie%2F&title=Bierzmowanie
+
+Came from:
+http://albert.zgora.pl/2014/08/bierzmowanie/
+*** TODO access to Redis without sudo
+** W1
+*** TODO add announcements on large Catholic portals
+*** TODO algorithm to separate trusted from untrusted users
tsv2fasttext.py | 25 (new executable file)
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import csv
+import sys
+import re
+
+
+def preprocess(prefix, hour, suffix):
+    sentence = prefix + hour + suffix
+    sentence = re.sub(r'\\n', r' \\n ', sentence)  # pad literal "\n" marks so they tokenize separately
+    sentence = re.sub(' +', ' ', sentence)  # collapse runs of spaces
+    return sentence
+
+
+def main():
+    # csv.reader(sys.stdin, delimiter='\t')
+    next(sys.stdin)  # skip the TSV header line
+    for line in sys.stdin:
+        prefix, hour, suffix, is_mass, yes_count, no_count, url, button_text, depth, filepath, line_no = line.rstrip(
+            '\n').split('\t')
+        sentence = preprocess(prefix, hour, suffix)
+        print(f'__label__{is_mass} {sentence}')
+
+
+if __name__ == '__main__':
+    main()
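An illustration of the conversion with invented field values; the real TSV comes from `./annotator_console.py 2tsv`, and only the first four columns matter for the fastText line:

# Hypothetical round-trip through preprocess (field values invented).
from tsv2fasttext import preprocess

row = 'Msze o \t7.30\t i 9.00\tyes\t3\t0\thttp://example.org\tOK\t1\tf.txt\t42'
prefix, hour, suffix, is_mass = row.split('\t')[:4]
print(f'__label__{is_mass} {preprocess(prefix, hour, suffix)}')
# -> __label__yes Msze o 7.30 i 9.00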