Fix mcc score.

Add comments to makefile.
Fix get_utterances condition.
Adjust crawler settings.
Change split-data script.
This commit is contained in:
siulkilulki 2018-06-18 14:56:41 +02:00
parent 7dd903b3b5
commit c99c218436
5 changed files with 24 additions and 16 deletions

View File

@ -16,10 +16,11 @@ predicted.txt: fs-model.bin test.txt
./fasttext predict $< $(word 2,$^) > $@ ./fasttext predict $< $(word 2,$^) > $@
fs-model.bin: train.txt fs-model.bin: train.txt
./fasttext supervised -input $< -output `basename $@ .bin` ./fasttext supervised -input $< -output `basename $@ .bin` #-dim 300 -ws 10 -wordNgrams 2 -loss ns
train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
./$< 2tsv | ./$(word 2,$^) > all.txt ./$< 2tsv | ./$(word 2,$^) > all.txt
# paste -d ' ' <(cut -f1 -d' ' all.txt) <(cut -d' ' -f2- all.txt | ./tokenizer.perl -threads 12 -l pl) | sponge all.txt
./split-data.sh all.txt ./split-data.sh all.txt
rm all.txt rm all.txt

View File

@ -43,7 +43,7 @@ false_omission_rate = 1 - negative_predictive_value # or false_negative / (fals
accuracy = (true_positive + true_negative) / (positive + negative) accuracy = (true_positive + true_negative) / (positive + negative)
f1 = 2 * (precision * recall) / (precision + recall) f1 = 2 * (precision * recall) / (precision + recall)
mcc = (true_positive * true_negative) - (false_positive * false_negative) / ( mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
(true_positive + false_positive) * (true_positive + false_negative) * (true_positive + false_positive) * (true_positive + false_negative) *
(true_negative + false_positive) * (true_negative + false_negative))**0.5 (true_negative + false_positive) * (true_negative + false_negative))**0.5
@ -52,4 +52,5 @@ print(f"""
Precision = {precision} Precision = {precision}
F1 = {f1} F1 = {f1}
Accuracy = {accuracy} Accuracy = {accuracy}
MCC = {mcc}
""") """)

View File

@ -66,7 +66,7 @@ def has_mass_metadata(url, button_text, page):
elif url_match and not bad_url_match: elif url_match and not bad_url_match:
# print('url_match: {}'.format(url_match.group(0))) # print('url_match: {}'.format(url_match.group(0)))
return True return True
elif button_match and not button_match: elif button_match and not bad_button_match:
# print('button_match: {}'.format(button_match.group(0))) # print('button_match: {}'.format(button_match.group(0)))
return True return True
return False return False

View File

@ -58,7 +58,6 @@ TELNETCONSOLE_ENABLED = False
# 'parishwebsites.pipelines.ParishwebsitesPipeline': 300, # 'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
#} #}
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
@ -77,7 +76,7 @@ AUTOTHROTTLE_ENABLED = True
# The initial download delay # The initial download delay
AUTOTHROTTLE_START_DELAY = 5 AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies # The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 500 AUTOTHROTTLE_MAX_DELAY = 300
# The average number of requests Scrapy should be sending in parallel to # The average number of requests Scrapy should be sending in parallel to
# each remote server # each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1 AUTOTHROTTLE_TARGET_CONCURRENCY = 1
@ -98,4 +97,3 @@ DEPTH_LIMIT = 3
# DEPTH_PRIORITY = 1 # DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue' # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue' # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

View File

@ -3,17 +3,25 @@
split_data() { split_data() {
split -l $[ $(wc -l "$1" | cut -d" " -f1) * 1 / 5 ] "$1" part- -d split -l $[ $(wc -l "$1" | cut -d" " -f1) * 1 / 5 ] "$1" part- -d
mv part-00 "$1_test.txt" mv part-00 "$1_test.txt"
mv part-01 "$1_dev.txt" # mv part-01 "$1_dev.txt"
cat part-0* > "$1_train.txt" && rm part-0* $1 cat part-0* > "$1_train.txt" && rm part-0* $1
} }
grep '__label__yes' "$1" > yes.txt # grep '__label__yes' "$1" > yes.txt
grep '__label__no' "$1" > no.txt # grep '__label__no' "$1" > no.txt
split_data yes.txt # split_data yes.txt
split_data no.txt # split_data no.txt
cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt # cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt # cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt # cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort) # diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
rm yes.txt* no.txt* # rm yes.txt* no.txt*
# ---
egrep '__label__(yes|no)' "$1" | shuf --random-source=/dev/urandom > yesno.txt
split_data yesno.txt
mv yesno.txt_train.txt train.txt
mv yesno.txt_test.txt test.txt
# mv yesno.txt_dev.txt > dev.txt