Fix mcc score.
Add comments to makefile. Fix get_utterances condition. Adjust crawler settings. Change split-data script.
This commit is contained in:
parent
7dd903b3b5
commit
c99c218436
3
Makefile
3
Makefile
@ -16,10 +16,11 @@ predicted.txt: fs-model.bin test.txt
|
|||||||
./fasttext predict $< $(word 2,$^) > $@
|
./fasttext predict $< $(word 2,$^) > $@
|
||||||
|
|
||||||
fs-model.bin: train.txt
|
fs-model.bin: train.txt
|
||||||
./fasttext supervised -input $< -output `basename $@ .bin`
|
./fasttext supervised -input $< -output `basename $@ .bin` #-dim 300 -ws 10 -wordNgrams 2 -loss ns
|
||||||
|
|
||||||
train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
|
train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
|
||||||
./$< 2tsv | ./$(word 2,$^) > all.txt
|
./$< 2tsv | ./$(word 2,$^) > all.txt
|
||||||
|
# paste -d ' ' <(cut -f1 -d' ' all.txt) <(cut -d' ' -f2- all.txt | ./tokenizer.perl -threads 12 -l pl) | sponge all.txt
|
||||||
./split-data.sh all.txt
|
./split-data.sh all.txt
|
||||||
rm all.txt
|
rm all.txt
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ false_omission_rate = 1 - negative_predictive_value # or false_negative / (fals
|
|||||||
accuracy = (true_positive + true_negative) / (positive + negative)
|
accuracy = (true_positive + true_negative) / (positive + negative)
|
||||||
|
|
||||||
f1 = 2 * (precision * recall) / (precision + recall)
|
f1 = 2 * (precision * recall) / (precision + recall)
|
||||||
mcc = (true_positive * true_negative) - (false_positive * false_negative) / (
|
mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
|
||||||
(true_positive + false_positive) * (true_positive + false_negative) *
|
(true_positive + false_positive) * (true_positive + false_negative) *
|
||||||
(true_negative + false_positive) * (true_negative + false_negative))**0.5
|
(true_negative + false_positive) * (true_negative + false_negative))**0.5
|
||||||
|
|
||||||
@ -52,4 +52,5 @@ print(f"""
|
|||||||
Precision = {precision}
|
Precision = {precision}
|
||||||
F1 = {f1}
|
F1 = {f1}
|
||||||
Accuracy = {accuracy}
|
Accuracy = {accuracy}
|
||||||
|
MCC = {mcc}
|
||||||
""")
|
""")
|
||||||
|
@ -66,7 +66,7 @@ def has_mass_metadata(url, button_text, page):
|
|||||||
elif url_match and not bad_url_match:
|
elif url_match and not bad_url_match:
|
||||||
# print('url_match: {}'.format(url_match.group(0)))
|
# print('url_match: {}'.format(url_match.group(0)))
|
||||||
return True
|
return True
|
||||||
elif button_match and not button_match:
|
elif button_match and not bad_button_match:
|
||||||
# print('button_match: {}'.format(button_match.group(0)))
|
# print('button_match: {}'.format(button_match.group(0)))
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
@ -58,7 +58,6 @@ TELNETCONSOLE_ENABLED = False
|
|||||||
# 'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
|
# 'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
|
||||||
#}
|
#}
|
||||||
|
|
||||||
|
|
||||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||||
#CONCURRENT_REQUESTS = 32
|
#CONCURRENT_REQUESTS = 32
|
||||||
|
|
||||||
@ -77,7 +76,7 @@ AUTOTHROTTLE_ENABLED = True
|
|||||||
# The initial download delay
|
# The initial download delay
|
||||||
AUTOTHROTTLE_START_DELAY = 5
|
AUTOTHROTTLE_START_DELAY = 5
|
||||||
# The maximum download delay to be set in case of high latencies
|
# The maximum download delay to be set in case of high latencies
|
||||||
AUTOTHROTTLE_MAX_DELAY = 500
|
AUTOTHROTTLE_MAX_DELAY = 300
|
||||||
# The average number of requests Scrapy should be sending in parallel to
|
# The average number of requests Scrapy should be sending in parallel to
|
||||||
# each remote server
|
# each remote server
|
||||||
AUTOTHROTTLE_TARGET_CONCURRENCY = 1
|
AUTOTHROTTLE_TARGET_CONCURRENCY = 1
|
||||||
@ -98,4 +97,3 @@ DEPTH_LIMIT = 3
|
|||||||
# DEPTH_PRIORITY = 1
|
# DEPTH_PRIORITY = 1
|
||||||
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
|
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
|
||||||
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
|
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
|
||||||
|
|
||||||
|
@ -3,17 +3,25 @@
|
|||||||
split_data() {
|
split_data() {
|
||||||
split -l $[ $(wc -l "$1" | cut -d" " -f1) * 1 / 5 ] "$1" part- -d
|
split -l $[ $(wc -l "$1" | cut -d" " -f1) * 1 / 5 ] "$1" part- -d
|
||||||
mv part-00 "$1_test.txt"
|
mv part-00 "$1_test.txt"
|
||||||
mv part-01 "$1_dev.txt"
|
# mv part-01 "$1_dev.txt"
|
||||||
cat part-0* > "$1_train.txt" && rm part-0* $1
|
cat part-0* > "$1_train.txt" && rm part-0* $1
|
||||||
}
|
}
|
||||||
|
|
||||||
grep '__label__yes' "$1" > yes.txt
|
# grep '__label__yes' "$1" > yes.txt
|
||||||
grep '__label__no' "$1" > no.txt
|
# grep '__label__no' "$1" > no.txt
|
||||||
split_data yes.txt
|
# split_data yes.txt
|
||||||
split_data no.txt
|
# split_data no.txt
|
||||||
cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
|
# cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
|
||||||
cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
|
# cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
|
||||||
cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
|
# cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
|
||||||
|
|
||||||
diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
|
# diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
|
||||||
rm yes.txt* no.txt*
|
# rm yes.txt* no.txt*
|
||||||
|
|
||||||
|
|
||||||
|
# ---
|
||||||
|
egrep '__label__(yes|no)' "$1" | shuf --random-source=/dev/urandom > yesno.txt
|
||||||
|
split_data yesno.txt
|
||||||
|
mv yesno.txt_train.txt train.txt
|
||||||
|
mv yesno.txt_test.txt test.txt
|
||||||
|
# mv yesno.txt_dev.txt > dev.txt
|
||||||
|
Loading…
Reference in New Issue
Block a user