Fix mcc score.

Add comments to makefile.
Fix get_utterances condition.
Adjust crawler settings.
Change split-data script.
This commit is contained in:
siulkilulki 2018-06-18 14:56:41 +02:00
parent 7dd903b3b5
commit c99c218436
5 changed files with 24 additions and 16 deletions

View File

@ -16,10 +16,11 @@ predicted.txt: fs-model.bin test.txt
./fasttext predict $< $(word 2,$^) > $@ ./fasttext predict $< $(word 2,$^) > $@
fs-model.bin: train.txt fs-model.bin: train.txt
./fasttext supervised -input $< -output `basename $@ .bin` ./fasttext supervised -input $< -output `basename $@ .bin` #-dim 300 -ws 10 -wordNgrams 2 -loss ns
train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
./$< 2tsv | ./$(word 2,$^) > all.txt ./$< 2tsv | ./$(word 2,$^) > all.txt
# paste -d ' ' <(cut -f1 -d' ' all.txt) <(cut -d' ' -f2- all.txt | ./tokenizer.perl -threads 12 -l pl) | sponge all.txt
./split-data.sh all.txt ./split-data.sh all.txt
rm all.txt rm all.txt

View File

@ -43,7 +43,7 @@ false_omission_rate = 1 - negative_predictive_value # or false_negative / (fals
accuracy = (true_positive + true_negative) / (positive + negative) accuracy = (true_positive + true_negative) / (positive + negative)
f1 = 2 * (precision * recall) / (precision + recall) f1 = 2 * (precision * recall) / (precision + recall)
mcc = (true_positive * true_negative) - (false_positive * false_negative) / ( mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
(true_positive + false_positive) * (true_positive + false_negative) * (true_positive + false_positive) * (true_positive + false_negative) *
(true_negative + false_positive) * (true_negative + false_negative))**0.5 (true_negative + false_positive) * (true_negative + false_negative))**0.5
@ -52,4 +52,5 @@ print(f"""
Precision = {precision} Precision = {precision}
F1 = {f1} F1 = {f1}
Accuracy = {accuracy} Accuracy = {accuracy}
MCC = {mcc}
""") """)

View File

@ -66,7 +66,7 @@ def has_mass_metadata(url, button_text, page):
elif url_match and not bad_url_match: elif url_match and not bad_url_match:
# print('url_match: {}'.format(url_match.group(0))) # print('url_match: {}'.format(url_match.group(0)))
return True return True
elif button_match and not button_match: elif button_match and not bad_button_match:
# print('button_match: {}'.format(button_match.group(0))) # print('button_match: {}'.format(button_match.group(0)))
return True return True
return False return False

View File

@ -58,7 +58,6 @@ TELNETCONSOLE_ENABLED = False
# 'parishwebsites.pipelines.ParishwebsitesPipeline': 300, # 'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
#} #}
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
@ -77,7 +76,7 @@ AUTOTHROTTLE_ENABLED = True
# The initial download delay # The initial download delay
AUTOTHROTTLE_START_DELAY = 5 AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies # The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 500 AUTOTHROTTLE_MAX_DELAY = 300
# The average number of requests Scrapy should be sending in parallel to # The average number of requests Scrapy should be sending in parallel to
# each remote server # each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1 AUTOTHROTTLE_TARGET_CONCURRENCY = 1
@ -98,4 +97,3 @@ DEPTH_LIMIT = 3
# DEPTH_PRIORITY = 1 # DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue' # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue' # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

View File

@ -3,17 +3,25 @@
split_data() { split_data() {
split -l $[ $(wc -l "$1" | cut -d" " -f1) * 1 / 5 ] "$1" part- -d split -l $[ $(wc -l "$1" | cut -d" " -f1) * 1 / 5 ] "$1" part- -d
mv part-00 "$1_test.txt" mv part-00 "$1_test.txt"
mv part-01 "$1_dev.txt" # mv part-01 "$1_dev.txt"
cat part-0* > "$1_train.txt" && rm part-0* $1 cat part-0* > "$1_train.txt" && rm part-0* $1
} }
grep '__label__yes' "$1" > yes.txt # grep '__label__yes' "$1" > yes.txt
grep '__label__no' "$1" > no.txt # grep '__label__no' "$1" > no.txt
split_data yes.txt # split_data yes.txt
split_data no.txt # split_data no.txt
cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt # cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt # cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt # cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort) # diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
rm yes.txt* no.txt* # rm yes.txt* no.txt*
# ---
egrep '__label__(yes|no)' "$1" | shuf --random-source=/dev/urandom > yesno.txt
split_data yesno.txt
mv yesno.txt_train.txt train.txt
mv yesno.txt_test.txt test.txt
# mv yesno.txt_dev.txt > dev.txt