From c99c21843699e8a9d90429b814ed95f544bdadd6 Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Mon, 18 Jun 2018 14:56:41 +0200
Subject: [PATCH] Fix mcc score. Add comments to makefile. Fix get_utterances
 condition. Adjust crawler settings. Change split-data script

---
 Makefile                                  |  3 ++-
 evaluate.py                               |  3 ++-
 get_utterances.py                         |  2 +-
 parishwebsites/parishwebsites/settings.py |  4 +---
 split-data.sh                             | 28 +++++++++++++++--------
 5 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 10d7bee..4f298dc 100644
--- a/Makefile
+++ b/Makefile
@@ -16,10 +16,11 @@ predicted.txt: fs-model.bin test.txt
 	./fasttext predict $< $(word 2,$^) > $@
 
 fs-model.bin: train.txt
-	./fasttext supervised -input $< -output `basename $@ .bin`
+	./fasttext supervised -input $< -output `basename $@ .bin` #-dim 300 -ws 10 -wordNgrams 2 -loss ns
 
 train.txt test.txt dev.txt: ./annotator_console.py tsv2fasttext.py split-data.sh
 	./$< 2tsv | ./$(word 2,$^) > all.txt
+# paste -d ' ' <(cut -f1 -d' ' all.txt) <(cut -d' ' -f2- all.txt | ./tokenizer.perl -threads 12 -l pl) | sponge all.txt
 	./split-data.sh all.txt
 	rm all.txt
 
diff --git a/evaluate.py b/evaluate.py
index 08bff01..529084d 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -43,7 +43,7 @@ false_omission_rate = 1 - negative_predictive_value # or false_negative / (fals
 accuracy = (true_positive + true_negative) / (positive + negative)
 f1 = 2 * (precision * recall) / (precision + recall)
 
-mcc = (true_positive * true_negative) - (false_positive * false_negative) / (
+mcc = ((true_positive * true_negative) - (false_positive * false_negative)) / (
     (true_positive + false_positive) * (true_positive + false_negative) *
     (true_negative + false_positive) * (true_negative + false_negative))**0.5
 
@@ -52,4 +52,5 @@ print(f"""
     Precision = {precision}
     F1 = {f1}
     Accuracy = {accuracy}
+    MCC = {mcc}
 """)
diff --git a/get_utterances.py b/get_utterances.py
index cb2fab4..67b2145 100755
--- a/get_utterances.py
+++ b/get_utterances.py
@@ -66,7 +66,7 @@ def has_mass_metadata(url, button_text, page):
     elif url_match and not bad_url_match:
         # print('url_match: {}'.format(url_match.group(0)))
         return True
-    elif button_match and not button_match:
+    elif button_match and not bad_button_match:
         # print('button_match: {}'.format(button_match.group(0)))
         return True
     return False
diff --git a/parishwebsites/parishwebsites/settings.py b/parishwebsites/parishwebsites/settings.py
index bccc109..dd900c8 100644
--- a/parishwebsites/parishwebsites/settings.py
+++ b/parishwebsites/parishwebsites/settings.py
@@ -58,7 +58,6 @@ TELNETCONSOLE_ENABLED = False
 #    'parishwebsites.pipelines.ParishwebsitesPipeline': 300,
 #}
 
-
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
 
@@ -77,7 +76,7 @@ AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-AUTOTHROTTLE_MAX_DELAY = 500
+AUTOTHROTTLE_MAX_DELAY = 300
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
 AUTOTHROTTLE_TARGET_CONCURRENCY = 1
@@ -98,4 +97,3 @@ DEPTH_LIMIT = 3
 # DEPTH_PRIORITY = 1
 # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
 # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
-
"$1_dev.txt" + # mv part-01 "$1_dev.txt" cat part-0* > "$1_train.txt" && rm part-0* $1 } -grep '__label__yes' "$1" > yes.txt -grep '__label__no' "$1" > no.txt -split_data yes.txt -split_data no.txt -cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt -cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt -cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt +# grep '__label__yes' "$1" > yes.txt +# grep '__label__no' "$1" > no.txt +# split_data yes.txt +# split_data no.txt +# cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt +# cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt +# cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt -diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort) -rm yes.txt* no.txt* +# diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort) +# rm yes.txt* no.txt* + + +# --- +egrep '__label__(yes|no)' "$1" | shuf --random-source=/dev/urandom > yesno.txt +split_data yesno.txt +mv yesno.txt_train.txt train.txt +mv yesno.txt_test.txt test.txt +# mv yesno.txt_dev.txt > dev.txt