Compare commits

...

No commits in common. "master" and "ISI-13-date" have entirely different histories.

17 changed files with 305504 additions and 348700 deletions

.gitignore vendored

@@ -1,8 +1,11 @@
in.tsv
model.pkl
*~
*.swp
*.bak
*.pyc
*.o
*.pkl
.DS_Store
.token
.idea


@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
Glitch-in-the-Matrix, conspiracytheories).
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
Sources
-------


@@ -1 +1 @@
---metric Accuracy --precision 4
+--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
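For context (not part of the diff): the F0:N<Precision> and F9999999:N<Recall> entries appear to use extreme beta values of the F-score as stand-ins for precision and recall, which works because F_beta reduces to precision as beta approaches 0 and tends to recall as beta grows. A quick sanity check in plain Python, independent of geval:

def f_beta(precision, recall, beta):
    # generalized F-score: (1 + b^2) * P * R / (b^2 * P + R); denominator must be non-zero
    b2 = beta ** 2
    return (1 + b2) * precision * recall / (b2 * precision + recall)

print(f_beta(0.8, 0.4, 0))        # ~0.8: equals precision (up to float rounding)
print(f_beta(0.8, 0.4, 9999999))  # ~0.4: approaches recall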

File diff suppressed because it is too large

File diff suppressed because it is too large

mytokenize.py Normal file

@@ -0,0 +1,12 @@
import re
"""
Takes a document and returns a list of tokens.
"""
def tokenize(d):
d = re.sub(r'(\s+|\\n)', ' ', d)
d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
d = re.sub(r'\d+', 'NUM', d)
return re.split(r'\s+', d)
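For reference, a quick interactive check of tokenize (the input sentence below is made up):

>>> from mytokenize import tokenize
>>> tokenize("I saw 2 strange lights! More at https://example.com tonight.")
['i', 'saw', 'NUM', 'strange', 'lights', '!', 'more', 'at', 'url', 'tonight', '.']

Note that the URL placeholder comes out lowercased ('url') because lowercasing runs after the URL substitution, while NUM stays uppercase because digit replacement runs last.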

predict.py Normal file

@@ -0,0 +1,10 @@
def predict(weights, word_to_index, tokenized_text):
ypred = weights[0] # bias or w0
for x in tokenized_text:
index = word_to_index[x]
ypred += weights[index] * 1
return ypred
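A toy call with made-up numbers (index 0 holds the bias); a token missing from word_to_index would raise a KeyError, and a repeated token adds its weight once per occurrence:

>>> from predict import predict
>>> weights = [2012.0, 0.5, -0.25]
>>> word_to_index = {"ufo": 1, "night": 2}
>>> predict(weights, word_to_index, ["ufo", "night", "ufo"])
2012.75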


@@ -0,0 +1,83 @@
vocabulary = .... the set of all words ...
word_to_index_mapping = {}
index_to_word_mapping = {}
ix = 1
for w in vocabulary:
    word_to_index_mapping[w] = ix
    index_to_word_mapping[ix] = w
    ix += 1
# initialization
weights = []
for ix in xrange(0, len(vocabulary)+1):
    weights[ix] = random value from the interval (-0.01, 0.01)
or: weights[0] = 2012.0
learning_rate = 0.000001
loss_sum = 0.0
loss_counter = 0
while ....
    d, y = random example from the training set
    # prediction
    y_hat = weights[0]
    for each word w in document d:
        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)
    # loss function
    loss = (y_hat - y)**2.0
    loss_sum += loss
    loss_counter += 1
    if loss_counter % 1000 == 0:
        print(loss_sum / 1000)
        loss_counter = 0
        loss_sum = 0.0
    # learning - weight update
    delta = (y_hat - y) * learning_rate
    weights[0] = weights[0] - delta
    for each word w in document d:
        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta
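Below, for reference, is a minimal runnable Python sketch of the same loop on toy data; the example documents, target years, and step count are made up, not taken from the repository:

import random

# toy corpus: (text, target year)
docs = [
    ("saw a ufo at night", 2008.5),
    ("bought a new camera", 2015.2),
    ("strange lights in the sky", 2011.0),
]

# index 0 is reserved for the bias weight
vocabulary = sorted({w for text, _ in docs for w in text.split()})
word_to_index_mapping = {w: i + 1 for i, w in enumerate(vocabulary)}

weights = [2012.0] + [random.uniform(-0.01, 0.01) for _ in vocabulary]
learning_rate = 0.000001
loss_sum, loss_counter = 0.0, 0

for step in range(100000):
    text, y = random.choice(docs)  # a random training example
    words = text.split()
    # prediction: bias plus the weight of each word occurrence
    y_hat = weights[0] + sum(weights[word_to_index_mapping[w]] for w in words)
    # squared-error loss, averaged and printed every 1000 steps
    loss_sum += (y_hat - y) ** 2.0
    loss_counter += 1
    if loss_counter == 1000:
        print(loss_sum / 1000)
        loss_sum, loss_counter = 0.0, 0
    # SGD update
    delta = (y_hat - y) * learning_rate
    weights[0] -= delta
    for w in words:
        weights[word_to_index_mapping[w]] -= delta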

score.txt Normal file

@@ -0,0 +1 @@
MSE: 373.4577154450468
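For scale, assuming this is the mean squared error of the predicted fractional year (as computed in the training script), the corresponding RMSE is 373.4577... ** 0.5 ≈ 19.3 years.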


@@ -1,15 +0,0 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965


@@ -1,20 +1,126 @@
import re
import sys
import pandas as pd
import pickle
import csv
from random import uniform
-for line in sys.stdin:
-    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
-        print("P")
from predict import predict
from mytokenize import tokenize
#load data:
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
y = train["date"]
temp_y = []
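# approximate conversion from POSIX seconds to a fractional year (assumes 365.25-day years; calendar details are ignored)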
for posix_time in y:
floatyear = 1970 + (posix_time / (60*60*24 * 365.25))
temp_y.append(floatyear)
y = temp_y
print(y[0])
tokenized_texts = []
word_to_index = {}
index_to_word = {}
word_count = {}
learning_rate = 0.000001
for doc in texts:
tokens = tokenize(doc)
tokenized_texts.append(tokens)
for token in tokens:
if token in word_count:
word_count[token] += 1
else:
word_count[token] = 1
#vocabulary sorted from the most frequent words to the least frequent ones
vocabulary = sorted(word_count, key= word_count.get, reverse=True)
for w in vocabulary:
i = len(word_to_index) + 1
word_to_index[w] = i
index_to_word[i] = w
weights = []
for i in range(0, len(vocabulary) + 1):
weights.append(2012 * uniform(-0.01, 0.01))
best_MSE = 408
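# resume from the best model saved so far; this assumes BestLinearRegressionModel.pkl already exists and overwrites the freshly initialized weights (and word_to_index mapping) built above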
model = pickle.load(open("BestLinearRegressionModel.pkl", "rb"))
word_to_index, weights, best_MSE = model
print("Unpickled best model.", "\tbest_MSE: ", best_MSE)
loss_sum = 0.0
loss_counter = 0
for_MSE_sum = 0.0
MSE = 0.0
counter = 0
inc_counter = 0
while(True):
for i in range(0, len(tokenized_texts)):
"""#@@ Obliczanie ypred - start
ypred = weights[0]#bias or w0
for x in tokenized_texts[i]:
index = word_to_index[x]
ypred += weights[index] * index
#@@ Computing ypred - end"""
ypred = predict(weights, word_to_index, tokenized_texts[i])
#@@ Computing the loss
loss = (ypred - y[i]) ** 2.0
loss_sum += loss
for_MSE_sum += loss
loss_counter += 1
if loss_counter == 1000:
#print(str(loss_sum/1000), "\t", str(MSE))
loss_counter = 0
loss_sum = 0.0
#@@ Computing the loss - end
#@@ Learning - weight update
delta = (ypred - y[i]) * learning_rate
weights[0] -= delta
for x in tokenized_texts[i]:
index = word_to_index[x]
weights[index] -= delta
#@@ Learning - weight update - end
temp_MSE = for_MSE_sum / len(tokenized_texts)
for_MSE_sum = 0.0
if best_MSE > temp_MSE:
best_MSE = temp_MSE
model = (word_to_index, weights, best_MSE)
pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
with open("score.txt", "w") as out:
out.write("MSE:\t")
out.write(str(best_MSE))
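# crude learning-rate schedule: every 3rd epoch in which MSE got worse shrinks the rate 10x,
# and every 5th epoch in which it did not raises it by about 11% (division by 0.90)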
if temp_MSE > MSE:
counter += 1
else:
print("S")
inc_counter += 1
if counter > 2:
learning_rate *= 0.1
counter = 0
if inc_counter > 4:
learning_rate /= 0.90
inc_counter = 0
MSE = temp_MSE
print("MSE: " "\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))
"""
with open("dev-0/out.tsv", "w") as out1:
for line in predicted_dev0:
out1.write(str(line))
out1.write("\n")
-happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""
with open("test-A/out.tsv", "w") as out2:
for line in predicted_testA:
out2.write(str(line))
out2.write("\n")
"""


@@ -1,4 +0,0 @@
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
geval -t dev-0 >>scores.txt

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -1,4 +0,0 @@
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt