From c99c21843699e8a9d90429b814ed95f544bdadd6 Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Mon, 18 Jun 2018 14:56:41 +0200
Subject: [PATCH] Fix mcc score. Add comments to makefile. Fix get_utterances
 condition. Adjust crawler settings. Change split-data script

---
 Makefile                                  |  3 ++-
 evaluate.py                               |  3 ++-
 get_utterances.py                         |  2 +-
 parishwebsites/parishwebsites/settings.py |  4 +---
 split-data.sh                             | 28 +++++++++++++++--------
 5 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 10d7bee..4f298dc 100644
--- a/Makefile
+++ b/Makefile
@@ -16,10 +16,11 @@ predicted.txt: fs-model.bin test.txt
 	./fasttext predict $< $(word 2,$^) > $@
 
 fs-model.bin: train.txt
-	./fasttext supervised -input $< -output `basename $@ .bin`
+	./fasttext supervised -input $< -output `basename $@ .bin` #-dim 300 -ws 10 -wordNgrams 2 -loss ns
 
 train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
 	./$< 2tsv | ./$(word 2,$^) > all.txt
+# paste -d ' ' <(cut -f1 -d' ' all.txt) <(cut -d' ' -f2- all.txt | ./tokenizer.perl -threads 12 -l pl) | sponge all.txt
 	./split-data.sh all.txt
 	rm all.txt
 
diff --git a/evaluate.py b/evaluate.py
index 08bff01..529084d 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -43,7 +43,7 @@ false_omission_rate = 1 - negative_predictive_value # or false_negative / (fals
 accuracy = (true_positive + true_negative) / (positive + negative)
 f1 = 2 * (precision * recall) / (precision + recall)
 
-mcc = (true_positive * true_negative) - (false_positive * false_negative) / (
+mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
     (true_positive + false_positive) * (true_positive + false_negative) *
     (true_negative + false_positive) * (true_negative + false_negative))**0.5
 
@@ -52,4 +52,5 @@ print(f"""
     Precision = {precision}
     F1 = {f1}
     Accuracy = {accuracy}
+    MCC = {mcc}
 """)
diff --git a/get_utterances.py b/get_utterances.py
index cb2fab4..67b2145 100755
--- a/get_utterances.py
+++ b/get_utterances.py
@@ -66,7 +66,7 @@ def has_mass_metadata(url, button_text, page):
     elif url_match and not bad_url_match:
         # print('url_match: {}'.format(url_match.group(0)))
         return True
-    elif button_match and not button_match:
+    elif button_match and not bad_button_match:
         # print('button_match: {}'.format(button_match.group(0)))
         return True
     return False
diff --git a/parishwebsites/parishwebsites/settings.py b/parishwebsites/parishwebsites/settings.py
index bccc109..dd900c8 100644
--- a/parishwebsites/parishwebsites/settings.py
+++ b/parishwebsites/parishwebsites/settings.py
@@ -58,7 +58,6 @@ TELNETCONSOLE_ENABLED = False
 #    'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
 #}
 
-
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
 
@@ -77,7 +76,7 @@ AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-AUTOTHROTTLE_MAX_DELAY = 500
+AUTOTHROTTLE_MAX_DELAY = 300
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
 AUTOTHROTTLE_TARGET_CONCURRENCY = 1
@@ -98,4 +97,3 @@ DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
-
"$1_dev.txt" + # mv part-01 "$1_dev.txt" cat part-0* > "$1_train.txt" && rm part-0* $1 } -grep '__label__yes' "$1" > yes.txt -grep '__label__no' "$1" > no.txt -split_data yes.txt -split_data no.txt -cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt -cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt -cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt +# grep '__label__yes' "$1" > yes.txt +# grep '__label__no' "$1" > no.txt +# split_data yes.txt +# split_data no.txt +# cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt +# cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt +# cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt -diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort) -rm yes.txt* no.txt* +# diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort) +# rm yes.txt* no.txt* + + +# --- +egrep '__label__(yes|no)' "$1" | shuf --random-source=/dev/urandom > yesno.txt +split_data yesno.txt +mv yesno.txt_train.txt train.txt +mv yesno.txt_test.txt test.txt +# mv yesno.txt_dev.txt > dev.txt