First version of the ML hour classifier.
Add last_access field to annotator_console user stats. Add split-data script. Add tsv2fasttext.py. Add todos.org.
This commit is contained in:
parent
6ff4f230db
commit
7dd903b3b5
21
Makefile
21
Makefile
@ -3,10 +3,29 @@ PREPARE_ENVIRONMENT := $(shell ./prepare-environment.sh > /tmp/makeenv)
|
|||||||
include /tmp/makeenv
|
include /tmp/makeenv
|
||||||
JOBS := 100
|
JOBS := 100
|
||||||
|
|
||||||
.PHONY: all update data clean clean-data clean-cache
|
.PHONY: all update data clean clean-data clean-cache clean-ml
|
||||||
|
|
||||||
all: data
|
all: data
|
||||||
|
|
||||||
|
# annotator_console.py needs running redis instance
|
||||||
|
# move belo
|
||||||
|
score.txt: predicted.txt test.txt
|
||||||
|
paste $< <(egrep -o "__label__(yes|no)" $(word 2,$^)) | ./evaluate.py > $@
|
||||||
|
|
||||||
|
predicted.txt: fs-model.bin test.txt
|
||||||
|
./fasttext predict $< $(word 2,$^) > $@
|
||||||
|
|
||||||
|
fs-model.bin: train.txt
|
||||||
|
./fasttext supervised -input $< -output `basename $@ .bin`
|
||||||
|
|
||||||
|
train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
|
||||||
|
./$< 2tsv | ./$(word 2,$^) > all.txt
|
||||||
|
./split-data.sh all.txt
|
||||||
|
rm all.txt
|
||||||
|
|
||||||
|
clean-ml:
|
||||||
|
rm -f train.txt test.txt dev.txt fs-model* predicted.txt score.txt
|
||||||
|
|
||||||
parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
|
parish2text: parishwebsites/parish2text.py parishwebsites/parish2text-commands.sh
|
||||||
mkdir -p parishwebsites/{text-data,text-data-logs}
|
mkdir -p parishwebsites/{text-data,text-data-logs}
|
||||||
cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt
|
cd parishwebsites && ./parish2text-commands.sh data > p2t-commands.txt && parallel --jobs -2 < p2t-commands.txt
|
||||||
|
@ -119,6 +119,11 @@ def users_stats():
|
|||||||
else:
|
else:
|
||||||
yesno, str_index, timestamp = res_list
|
yesno, str_index, timestamp = res_list
|
||||||
ip_addr = '0'
|
ip_addr = '0'
|
||||||
|
if 'last_access' not in users_dict[user]:
|
||||||
|
users_dict[user]['last_access'] = float(timestamp)
|
||||||
|
else:
|
||||||
|
users_dict[user]['last_access'] = max(
|
||||||
|
float(timestamp), users_dict[user]['last_access'])
|
||||||
if 'yes_count' not in users_dict[user]:
|
if 'yes_count' not in users_dict[user]:
|
||||||
users_dict[user]['yes_count'] = 0
|
users_dict[user]['yes_count'] = 0
|
||||||
if 'no_count' not in users_dict[user]:
|
if 'no_count' not in users_dict[user]:
|
||||||
@ -161,14 +166,15 @@ def calculate_avg_annotation_time(users_dict, max_interval=10):
|
|||||||
if delta_sum == 0:
|
if delta_sum == 0:
|
||||||
user_dict['avg_time'] = math.inf
|
user_dict['avg_time'] = math.inf
|
||||||
else:
|
else:
|
||||||
user_dict['avg_time'] = delta_sum / divider
|
user_dict['avg_time'] = round(delta_sum / divider, 4)
|
||||||
user_dict['breaks'] = breaks
|
user_dict['breaks'] = breaks
|
||||||
|
|
||||||
|
|
||||||
def print_sorted(users_dict, sortby='annotations max'):
|
def print_sorted(users_dict, sortby='annotations max'):
|
||||||
print('\t'.join(
|
print('\t'.join([
|
||||||
['cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks',
|
'cookie', 'annotations', 'yes', 'no', 'avg_time', 'breaks', 'status',
|
||||||
'status']))
|
'last_access'
|
||||||
|
]))
|
||||||
if sortby == 'annotations max':
|
if sortby == 'annotations max':
|
||||||
keyfunc = lambda x: len(x[1]['annotations'])
|
keyfunc = lambda x: len(x[1]['annotations'])
|
||||||
for user, user_dict in sorted(
|
for user, user_dict in sorted(
|
||||||
@ -190,7 +196,8 @@ def print_sorted(users_dict, sortby='annotations max'):
|
|||||||
str(user_dict['yes_count']),
|
str(user_dict['yes_count']),
|
||||||
str(user_dict['no_count']),
|
str(user_dict['no_count']),
|
||||||
str(user_dict['avg_time']),
|
str(user_dict['avg_time']),
|
||||||
str(user_dict['breaks']), status
|
str(user_dict['breaks']), status,
|
||||||
|
format_time(user_dict['last_access'])
|
||||||
]))
|
]))
|
||||||
|
|
||||||
|
|
||||||
|
55
evaluate.py
Executable file
55
evaluate.py
Executable file
@ -0,0 +1,55 @@
|
|||||||
|
#!/usr/bin/env python3
"""Score binary yes/no predictions read from standard input.

Each input line holds two tab-separated fields: the predicted label and
the actual label.  A field counts as "yes" when it contains the substring
``yes`` and as "no" when it contains the substring ``no``.  Recall,
precision, F1 and accuracy are printed to standard output.
"""
import sys


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0.

    Treating an undefined ratio as 0.0 keeps the script from crashing on
    empty input or on input that contains only one class.
    """
    return numerator / denominator if denominator else 0.0


def count_outcomes(lines):
    """Count confusion-matrix cells from ``predicted<TAB>actual`` lines.

    Args:
        lines: iterable of text lines; a trailing newline is stripped and
            empty lines are skipped.

    Returns:
        Tuple ``(true_positive, false_positive, true_negative,
        false_negative)``.

    Raises:
        ValueError: if a non-empty line does not have exactly two
            tab-separated fields.
    """
    true_positive = false_positive = true_negative = false_negative = 0
    for line in lines:
        line = line.rstrip('\n')
        if not line:
            continue
        predicted, actual = line.split('\t')
        predicted_yes = 'yes' in predicted
        predicted_no = 'no' in predicted
        actual_yes = 'yes' in actual
        actual_no = 'no' in actual
        if predicted_yes and actual_yes:
            true_positive += 1
        if predicted_yes and actual_no:
            false_positive += 1
        if predicted_no and actual_yes:
            false_negative += 1
        if predicted_no and actual_no:
            true_negative += 1
    return true_positive, false_positive, true_negative, false_negative


def compute_metrics(true_positive, false_positive, true_negative,
                    false_negative):
    """Derive classification metrics from confusion-matrix counts.

    Any ratio whose denominator is zero is reported as 0.0.

    Returns:
        Dict with the keys ``recall``, ``specificity``, ``precision``,
        ``negative_predictive_value``, ``miss_rate``, ``fall_out``,
        ``false_discovery_rate``, ``false_omission_rate``, ``accuracy``,
        ``f1`` and ``mcc``.
    """
    positive = true_positive + false_negative
    negative = true_negative + false_positive

    # True positive rate, sensitivity.
    recall = _ratio(true_positive, positive)
    # True negative rate.
    specificity = _ratio(true_negative, negative)
    # Positive predictive value.
    precision = _ratio(true_positive, true_positive + false_positive)
    negative_predictive_value = _ratio(true_negative,
                                       true_negative + false_negative)

    # False negative rate (1 - recall).
    miss_rate = _ratio(false_negative, positive)
    # False positive rate (1 - specificity).
    fall_out = _ratio(false_positive, negative)
    # 1 - precision.
    false_discovery_rate = _ratio(false_positive,
                                  false_positive + true_positive)
    # 1 - negative predictive value.
    false_omission_rate = _ratio(false_negative,
                                 false_negative + true_negative)

    accuracy = _ratio(true_positive + true_negative, positive + negative)
    f1 = _ratio(2 * (precision * recall), precision + recall)

    # Matthews correlation coefficient: the WHOLE difference of products is
    # divided by the square root, not just the second product.
    mcc_denominator = ((true_positive + false_positive) *
                       (true_positive + false_negative) *
                       (true_negative + false_positive) *
                       (true_negative + false_negative))**0.5
    mcc = _ratio(
        true_positive * true_negative - false_positive * false_negative,
        mcc_denominator)

    return {
        'recall': recall,
        'specificity': specificity,
        'precision': precision,
        'negative_predictive_value': negative_predictive_value,
        'miss_rate': miss_rate,
        'fall_out': fall_out,
        'false_discovery_rate': false_discovery_rate,
        'false_omission_rate': false_omission_rate,
        'accuracy': accuracy,
        'f1': f1,
        'mcc': mcc,
    }


def main():
    """Read prediction pairs from stdin and print the summary metrics."""
    metrics = compute_metrics(*count_outcomes(sys.stdin))
    recall = metrics['recall']
    precision = metrics['precision']
    f1 = metrics['f1']
    accuracy = metrics['accuracy']
    print(f"""
Recall = {recall}
Precision = {precision}
F1 = {f1}
Accuracy = {accuracy}
""")


if __name__ == '__main__':
    main()
|
19
split-data.sh
Executable file
19
split-data.sh
Executable file
@ -0,0 +1,19 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Split file "$1" into three files next to it:
#   "$1_test.txt"  - the first fifth of the lines,
#   "$1_dev.txt"   - the second fifth,
#   "$1_train.txt" - everything that remains.
# Temporary part-NN files are created in the current directory and removed
# again; "$1" itself is deleted at the end.
split_data() {
    local src=$1 total chunk
    local -a rest

    # Start from empty outputs so later steps never hit a missing file,
    # even when the input has too few lines to fill every subset.
    : > "${src}_test.txt"
    : > "${src}_dev.txt"
    : > "${src}_train.txt"

    # Reading from stdin makes wc print the bare number (no file name).
    total=$(wc -l < "$src")
    if (( total > 0 )); then
        chunk=$(( total / 5 ))
        # split rejects a chunk size of 0, which happens below 5 lines.
        (( chunk > 0 )) || chunk=1
        split -d -l "$chunk" "$src" part-
        [[ -e part-00 ]] && mv part-00 "${src}_test.txt"
        [[ -e part-01 ]] && mv part-01 "${src}_dev.txt"
        # Whatever parts are left (including a short remainder part)
        # make up the training subset.
        rest=(part-0*)
        if [[ -e ${rest[0]} ]]; then
            cat "${rest[@]}" > "${src}_train.txt" && rm "${rest[@]}"
        fi
    fi
    rm "$src"
}
|
||||||
|
|
||||||
|
# Usage: split-data.sh FILE
# Writes train.txt, test.txt and dev.txt to the current directory.
if [[ $# -ne 1 || ! -r $1 ]]; then
    echo "usage: $0 FILE" >&2
    exit 1
fi

# Split each label separately so every subset gets the same share of
# __label__yes and __label__no lines.
grep '__label__yes' "$1" > yes.txt
grep '__label__no' "$1" > no.txt
split_data yes.txt
split_data no.txt

# Merge the per-label pieces and shuffle each subset.
for subset in train test dev; do
    cat "yes.txt_${subset}.txt" "no.txt_${subset}.txt" | shuf > "${subset}.txt"
done

# Sanity check: any output here is a line that is present in the input but
# not in the subsets, or the other way round.
diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
rm yes.txt* no.txt*
|
19
todos.org
Normal file
19
todos.org
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
** Backlog
|
||||||
|
*** TODO odsiać binarny content
|
||||||
|
*** TODO przetestować find_new_key
|
||||||
|
*** TODO napisać testy do annotatora
|
||||||
|
*** TODO obsłużyć w annotatorze częściowe godziny typu: 7,00 | 7 00 | 700
|
||||||
|
*** TODO handle column pages http://parafiawincentegoapaulo.pl/msze.html
|
||||||
|
*** TODO handle <sup> and <sub> tags e.g.
|
||||||
|
Msze od poniedziałku do soboty rano o godzinie 6<sup>45 </sup>`
|
||||||
|
*** TODO Change to levelDB database for cache
|
||||||
|
*** TODO crawling non domain masses
|
||||||
|
Scraper shouldn't crawl these types of urls:
|
||||||
|
https://www.tumblr.com/widgets/share/tool/preview?shareSource=legacy&canonicalUrl=&url=http%3A%2F%2Falbert.zgora.pl%2F2014%2F08%2Fbierzmowanie%2F&title=Bierzmowanie
|
||||||
|
|
||||||
|
Came from:
|
||||||
|
http://albert.zgora.pl/2014/08/bierzmowanie/
|
||||||
|
*** TODO dostęp do redisa bez sudo
|
||||||
|
** W1
|
||||||
|
*** TODO dodać ogłoszenia na dużych portalach katolickich
|
||||||
|
*** TODO algorytm do oddzielenia trusted od untrusted users
|
25
tsv2fasttext.py
Executable file
25
tsv2fasttext.py
Executable file
@ -0,0 +1,25 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess(prefix, hour, suffix):
    """Join the text around an hour into one normalised sentence.

    The three pieces are concatenated unchanged.  Every literal ``\\n``
    marker (a backslash followed by ``n``) is then padded with a space on
    both sides, and each run of spaces is collapsed to a single space.
    """
    joined = ''.join((prefix, hour, suffix))
    spaced = re.sub(r'\\n', r' \\n ', joined)
    return re.sub(' +', ' ', spaced)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Convert annotation TSV on stdin into fastText lines on stdout.

    The first input line is treated as a header and skipped.  Every other
    line must hold exactly 11 tab-separated fields: prefix, hour, suffix,
    is_mass, yes_count, no_count, url, button_text, depth, filepath and
    line_no.  For each row one line ``__label__<is_mass> <sentence>`` is
    printed, where the sentence is ``preprocess(prefix, hour, suffix)``.

    Raises:
        ValueError: if a data row does not have exactly 11 fields.
    """
    columns = ('prefix', 'hour', 'suffix', 'is_mass', 'yes_count',
               'no_count', 'url', 'button_text', 'depth', 'filepath',
               'line_no')
    # Skip the header; the default keeps empty input from raising
    # StopIteration.
    next(sys.stdin, None)
    # Data rows start on input line 2 because of the header.
    for line_number, line in enumerate(sys.stdin, start=2):
        fields = line.rstrip('\n').split('\t')
        if len(fields) != len(columns):
            raise ValueError(
                f'line {line_number}: expected {len(columns)} '
                f'tab-separated fields, got {len(fields)}')
        prefix, hour, suffix, is_mass = fields[:4]
        sentence = preprocess(prefix, hour, suffix)
        print(f'__label__{is_mass} {sentence}')


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user