Fix mcc score.
Add comments to the Makefile. Fix get_utterances condition. Adjust crawler settings. Change split-data script.
This commit is contained in:
parent
7dd903b3b5
commit
c99c218436
3
Makefile
3
Makefile
@ -16,10 +16,11 @@ predicted.txt: fs-model.bin test.txt
|
||||
./fasttext predict $< $(word 2,$^) > $@
|
||||
|
||||
fs-model.bin: train.txt
|
||||
./fasttext supervised -input $< -output `basename $@ .bin`
|
||||
./fasttext supervised -input $< -output `basename $@ .bin` #-dim 300 -ws 10 -wordNgrams 2 -loss ns
|
||||
|
||||
train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
|
||||
./$< 2tsv | ./$(word 2,$^) > all.txt
|
||||
# paste -d ' ' <(cut -f1 -d' ' all.txt) <(cut -d' ' -f2- all.txt | ./tokenizer.perl -threads 12 -l pl) | sponge all.txt
|
||||
./split-data.sh all.txt
|
||||
rm all.txt
|
||||
|
||||
|
@ -43,7 +43,7 @@ false_omission_rate = 1 - negative_predictive_value # or false_negative / (fals
|
||||
accuracy = (true_positive + true_negative) / (positive + negative)
|
||||
|
||||
f1 = 2 * (precision * recall) / (precision + recall)
|
||||
mcc = (true_positive * true_negative) - (false_positive * false_negative) / (
|
||||
mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
|
||||
(true_positive + false_positive) * (true_positive + false_negative) *
|
||||
(true_negative + false_positive) * (true_negative + false_negative))**0.5
|
||||
|
||||
@ -52,4 +52,5 @@ print(f"""
|
||||
Precision = {precision}
|
||||
F1 = {f1}
|
||||
Accuracy = {accuracy}
|
||||
MCC = {mcc}
|
||||
""")
|
||||
|
@ -66,7 +66,7 @@ def has_mass_metadata(url, button_text, page):
|
||||
elif url_match and not bad_url_match:
|
||||
# print('url_match: {}'.format(url_match.group(0)))
|
||||
return True
|
||||
elif button_match and not button_match:
|
||||
elif button_match and not bad_button_match:
|
||||
# print('button_match: {}'.format(button_match.group(0)))
|
||||
return True
|
||||
return False
|
||||
|
@ -58,7 +58,6 @@ TELNETCONSOLE_ENABLED = False
|
||||
# 'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
|
||||
#}
|
||||
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
|
||||
@ -77,7 +76,7 @@ AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
AUTOTHROTTLE_MAX_DELAY = 500
|
||||
AUTOTHROTTLE_MAX_DELAY = 300
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
AUTOTHROTTLE_TARGET_CONCURRENCY = 1
|
||||
@ -98,4 +97,3 @@ DEPTH_LIMIT = 3
|
||||
# DEPTH_PRIORITY = 1
|
||||
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
|
||||
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
|
||||
|
||||
|
@ -3,17 +3,25 @@
|
||||
split_data() {
|
||||
split -l $[ $(wc -l "$1" | cut -d" " -f1) * 1 / 5 ] "$1" part- -d
|
||||
mv part-00 "$1_test.txt"
|
||||
mv part-01 "$1_dev.txt"
|
||||
# mv part-01 "$1_dev.txt"
|
||||
cat part-0* > "$1_train.txt" && rm part-0* $1
|
||||
}
|
||||
|
||||
grep '__label__yes' "$1" > yes.txt
|
||||
grep '__label__no' "$1" > no.txt
|
||||
split_data yes.txt
|
||||
split_data no.txt
|
||||
cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
|
||||
cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
|
||||
cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
|
||||
# grep '__label__yes' "$1" > yes.txt
|
||||
# grep '__label__no' "$1" > no.txt
|
||||
# split_data yes.txt
|
||||
# split_data no.txt
|
||||
# cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
|
||||
# cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
|
||||
# cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
|
||||
|
||||
diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
|
||||
rm yes.txt* no.txt*
|
||||
# diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
|
||||
# rm yes.txt* no.txt*
|
||||
|
||||
|
||||
# ---
|
||||
egrep '__label__(yes|no)' "$1" | shuf --random-source=/dev/urandom > yesno.txt
|
||||
split_data yesno.txt
|
||||
mv yesno.txt_train.txt train.txt
|
||||
mv yesno.txt_test.txt test.txt
|
||||
# mv yesno.txt_dev.txt > dev.txt
|
||||
|
Loading…
Reference in New Issue
Block a user