Compare commits
No commits in common. "master" and "ISI-39" have entirely different histories.
2	.gitignore	vendored
@@ -1,4 +1,4 @@
local
*~
*.swp
*.bak
104	Makefile	Normal file
@@ -0,0 +1,104 @@
SHELL=/bin/bash
.SECONDARY:
.DELETE_ON_ERROR:


KENLM_DIR=/home/jakub/ISI/kenlm/build/bin

# $< - first prerequisite
# $@ - target

# Top-level targets: per-document probability outputs for dev-0 and test-A.
dev-0/out.tsv: dev-0/local/paranormal_out.tsv dev-0/local/skeptic_out.tsv test-A/out.tsv
	python3 normalize_out.py dev-0/local/paranormal_out.tsv dev-0/local/skeptic_out.tsv $@
	geval -t dev-0 >> score.txt

test-A/out.tsv: test-A/local/paranormal_out.tsv test-A/local/skeptic_out.tsv
	python3 normalize_out.py test-A/local/paranormal_out.tsv test-A/local/skeptic_out.tsv $@


# Reduce the grepped "Total: ..." lines to one log-probability per line.
dev-0/local/paranormal_out.tsv: dev-0/local/paranormal_out_total parse_out.py dev-0/local/
	cat $< | python3 parse_out.py > $@

test-A/local/paranormal_out.tsv: test-A/local/paranormal_out_total parse_out.py test-A/local/
	cat $< | python3 parse_out.py > $@

dev-0/local/skeptic_out.tsv: dev-0/local/skeptic_out_total parse_out.py dev-0/local/
	cat $< | python3 parse_out.py > $@

test-A/local/skeptic_out.tsv: test-A/local/skeptic_out_total parse_out.py test-A/local/
	cat $< | python3 parse_out.py > $@

# Keep only the "Total: ..." scores from the raw KenLM query output.
dev-0/local/paranormal_out_total: dev-0/local/raw_paranormal_out dev-0/local/
	cat $< | egrep -o 'Total: -?[0-9]+(\.[0-9]+)?' > $@

test-A/local/paranormal_out_total: test-A/local/raw_paranormal_out test-A/local/
	cat $< | egrep -o 'Total: -?[0-9]+(\.[0-9]+)?' > $@

dev-0/local/skeptic_out_total: dev-0/local/raw_skeptic_out dev-0/local/
	cat $< | egrep -o 'Total: -?[0-9]+(\.[0-9]+)?' > $@

test-A/local/skeptic_out_total: test-A/local/raw_skeptic_out test-A/local/
	cat $< | egrep -o 'Total: -?[0-9]+(\.[0-9]+)?' > $@

# Score the tokenized inputs against each class language model.
dev-0/local/raw_paranormal_out: local/paranormal.lm.arpa dev-0/local/tokenized.dev-0.in.tsv dev-0/local/
	cat dev-0/local/tokenized.dev-0.in.tsv | $(KENLM_DIR)/query $< > $@

test-A/local/raw_paranormal_out: local/paranormal.lm.arpa test-A/local/tokenized.test-A.in.tsv test-A/local/
	cat test-A/local/tokenized.test-A.in.tsv | $(KENLM_DIR)/query $< > $@

dev-0/local/raw_skeptic_out: local/skeptic.lm.arpa dev-0/local/tokenized.dev-0.in.tsv dev-0/local/
	cat dev-0/local/tokenized.dev-0.in.tsv | $(KENLM_DIR)/query $< > $@

test-A/local/raw_skeptic_out: local/skeptic.lm.arpa test-A/local/tokenized.test-A.in.tsv test-A/local/
	cat test-A/local/tokenized.test-A.in.tsv | $(KENLM_DIR)/query $< > $@

# Tokenize the evaluation inputs.
dev-0/local/tokenized.dev-0.in.tsv: dev-0/in.tsv.xz mytokenize.py mytokenizer.py dev-0/local/
	xzcat $< | python3 mytokenizer.py > $@

test-A/local/tokenized.test-A.in.tsv: test-A/in.tsv.xz mytokenize.py mytokenizer.py test-A/local/
	xzcat $< | python3 mytokenizer.py > $@

test-A/local/:
	mkdir test-A/local/

dev-0/local/:
	mkdir dev-0/local/

#local/paranormal.lm.bin: local/paranormal.lm.arpa
#	$(KENLM_DIR)/build_binary <$< >$@

# Train a 5-gram KenLM model per class.
local/paranormal.lm.arpa: local/x100tokenized.in.paranormal local/
	$(KENLM_DIR)/lmplz -o 5 -S 50% --discount_fallback <$< >$@
# --discount_fallback
local/skeptic.lm.arpa: local/x100tokenized.in.skeptic local/
	$(KENLM_DIR)/lmplz -o 5 -S 50% --discount_fallback <$< >$@

# Replicate the tokenized training text 10x and then 100x before LM training.
local/x100tokenized.in.paranormal: local/x10tokenized.in.paranormal
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x10tokenized.in.paranormal: local/tokenized.in.paranormal local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x100tokenized.in.skeptic: local/x10tokenized.in.skeptic local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/x10tokenized.in.skeptic: local/tokenized.in.skeptic local/
	cat $< $< $< $< $< $< $< $< $< $< > $@

local/tokenized.in.paranormal: local/in.paranormal mytokenize.py mytokenizer.py local/
	cat $< | python3 mytokenizer.py > $@

local/tokenized.in.skeptic: local/in.skeptic mytokenize.py mytokenizer.py local/
	cat $< | python3 mytokenizer.py > $@

# Split the training set by label (1 = paranormal, 0 = skeptic).
local/in.paranormal: train/in.tsv.xz train/expected.tsv local/
	xzcat $< | paste train/expected.tsv - | egrep '^1.*' > $@

local/in.skeptic: train/in.tsv.xz train/expected.tsv local/
	xzcat $< | paste train/expected.tsv - | egrep '^0.*' > $@

local/:
	mkdir local/
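In short, the Makefile splits the training data by label, tokenizes it, trains one 5-gram KenLM model per class, scores every dev-0 and test-A document against both models, and turns the two scores into a probability with normalize_out.py; running `make dev-0/out.tsv` (the first target) drives the whole chain and appends a geval report to score.txt.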
@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).

Output label is `S` and `P`.
Output label is the probability of a paranormal subreddit.

Sources
-------
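As implemented in normalize_out.py (added below), this probability is currently a hard value: 0.7 for documents the paranormal model scores higher and 0.3 otherwise.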
@@ -1 +1 @@
--metric Accuracy --precision 4
--metric Likelihood --metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
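With geval invoked as in the Makefile (`geval -t dev-0 >> score.txt`), these five metrics produce the five lines recorded in score.txt further down; the F0 and F9999999 variants are relabelled Precision and Recall via the <> aliases.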
10544	dev-0/expected.tsv
File diff suppressed because it is too large
10544	dev-0/out.tsv
File diff suppressed because it is too large
32	mytokenize.py	Normal file
@@ -0,0 +1,32 @@
import re


"""
Takes a document and returns a list of tokens.
"""


def tokenize(d):
    d = re.sub(r'(\\n)', '', d)
    p = re.compile('[a-zA-Z\']+')
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
    d = d.lower()
    return p.findall(d)


"""
def tokenize(d):
    d = re.sub(r'(\s+|\\n)', ' ', d)
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
    d = re.sub(r'\d+', 'NUM', d)

    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !").replace("*", " * ")
    d = d.replace("\"", " \" ")

    return re.split(r'\s+', d)
"""
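A minimal usage sketch for the active tokenize() above; the sample sentence and URL are illustrative, not taken from the data:

from mytokenize import tokenize

# Illustrative document; the URL is hypothetical.
doc = "Saw a strange light last night! https://example.com/clip"
print(tokenize(doc))
# URLs collapse to the URL placeholder and everything is lowercased:
# ['saw', 'a', 'strange', 'light', 'last', 'night', 'url']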
11	mytokenizer.py	Normal file
@@ -0,0 +1,11 @@
from mytokenize import tokenize
import lzma  # unused here; the Makefile already decompresses with xzcat before piping to stdin
import sys


for line in sys.stdin:
    tokens = tokenize(line)
    print(" ".join(tokens))
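In the Makefile above this script is only ever used as a stdin filter, e.g. `xzcat dev-0/in.tsv.xz | python3 mytokenizer.py > dev-0/local/tokenized.dev-0.in.tsv`.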
78	normalize_out.py	Normal file
@@ -0,0 +1,78 @@
import sys
import numpy as np


def sigmoid(mean, x):
    # Squash a KenLM total log-probability into (0, 1); the 0.00001 factor
    # flattens the curve and the class mean shifts its argument.
    y = np.true_divide(1.0, (1.0 + np.power(np.e, -1.0*(0.00001*(x+mean)))))
    return y


paranormal_out_path = sys.argv[1]
skeptic_out_path = sys.argv[2]

x_min = 0.0
x_max = -10000.0
paranormal_out = []
with open(paranormal_out_path, 'r') as f:
    for line in f:
        v = float(line.rstrip())
        paranormal_out.append(v)
        if v > x_max:
            x_max = v
        if v < x_min:
            x_min = v

paranormal_mean = np.mean(paranormal_out)
print("p mean: " + str(paranormal_mean))

skeptic_out = []
with open(skeptic_out_path, 'r') as f:
    for line in f:
        v = float(line.rstrip())
        skeptic_out.append(v)
        if v > x_max:
            x_max = v
        if v < x_min:
            x_min = v

skeptic_mean = np.mean(skeptic_out)
print("s mean: " + str(skeptic_mean))

if len(skeptic_out) != len(paranormal_out):
    print("ERROR! s len: %d" % len(skeptic_out), "\tp len: %d" % len(paranormal_out))
    quit(-1)

print("min: %f" % x_min, "\tmax: %f" % x_max)
#print(log_out)

#out_path = "dev-0/out.tsv"
out_path = sys.argv[3]
with open(out_path, 'w') as o:
    for i in range(0, len(skeptic_out)):
        paranormal = sigmoid(paranormal_mean, paranormal_out[i])
        skeptic = sigmoid(skeptic_mean, skeptic_out[i])
        if skeptic > paranormal:
            out_p = 0.3
            #out_p = skeptic
        else:
            out_p = 0.7
            #out_p = paranormal

        o.write(str(out_p) + "\n")
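One property of the decision rule above worth spelling out: sigmoid() is strictly increasing in its argument, so comparing the two sigmoid values is equivalent to comparing the mean-shifted KenLM totals directly. A small sketch of that equivalence, with made-up score and mean values:

import numpy as np

def sigmoid(mean, x):
    return np.true_divide(1.0, (1.0 + np.power(np.e, -1.0*(0.00001*(x+mean)))))

p_score, s_score = -950.0, -910.0   # hypothetical per-document KenLM totals
p_mean, s_mean = -1200.0, -1150.0   # hypothetical class means

# Same boolean either way, because the sigmoid is monotonic.
assert (sigmoid(p_mean, p_score) > sigmoid(s_mean, s_score)) == ((p_score + p_mean) > (s_score + s_mean))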
8	parse_out.py	Normal file
@@ -0,0 +1,8 @@
import sys


for line in sys.stdin:
    log_prob = line.rstrip().split(" ")[1]
    print(log_prob)
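For context, the lines fed to parse_out.py are the "Total: <number>" matches that the Makefile greps out of the KenLM query output, so taking field 1 simply drops the "Total:" prefix. A one-line illustration with a made-up value:

line = "Total: -123.456"              # illustrative input line
print(line.rstrip().split(" ")[1])    # prints: -123.456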
5	score.txt	Normal file
@@ -0,0 +1,5 @@
Likelihood 0.5500
Accuracy 0.7153
F1.0 0.3471
Precision 0.9215
Recall 0.2138
15	scores.txt
@@ -1,15 +0,0 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965
20	solution.py
@@ -1,20 +0,0 @@
import re
import sys

for line in sys.stdin:
    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
        print("P")
    else:
        print("S")




"""

happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story

"""
4	start.sh
@@ -1,4 +0,0 @@
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv

xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
geval -t dev-0 >>scores.txt
10304	test-A/out.tsv
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
579158	train/expected.tsv
File diff suppressed because it is too large
@@ -1,4 +0,0 @@
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt