Compare commits
No commits in common. "master" and "ISI-13-date" have entirely different histories.
master ... ISI-13-date

5  .gitignore  (vendored)
@@ -1,8 +1,11 @@
+in.tsv
+model.pkl
 *~
 *.swp
 *.bak
 *.pyc
 *.o
 *.pkl
 .DS_Store
 .token
+.idea
@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 ,Glitch-in-the-Matrix, conspiracytheories).
 
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 
 Sources
 -------
@@ -1 +1 @@
---metric Accuracy --precision 4
+--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
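One reading of the new geval configuration (not spelled out in the repo): besides Accuracy and F1 it requests two F-scores with extreme beta values. Since

    F_beta = (1 + beta^2) * precision * recall / (beta^2 * precision + recall)

beta = 0 reduces to precision and a very large beta such as 9999999 is numerically indistinguishable from recall, so the two extra metrics are effectively precision and recall; the :N<Precision> and :N<Recall> suffixes presumably just label them that way in the report.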
10544  dev-0/expected.tsv
File diff suppressed because it is too large

10544  dev-0/out.tsv
File diff suppressed because it is too large
12  mytokenize.py  (new file)
@@ -0,0 +1,12 @@
+import re
+
+"""
+Takes a document and returns a list of tokens.
+"""
+def tokenize(d):
+    d = re.sub(r'(\s+|\\n)', ' ', d)
+    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
+    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
+    d = re.sub(r'\d+', 'NUM', d)
+
+    return re.split(r'\s+', d)
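As a quick illustration of what tokenize() returns (the sample sentence below is mine, not taken from the data):

    from mytokenize import tokenize

    doc = "I saw 3 lights over the lake!  See https://example.com/ufo"
    print(tokenize(doc))
    # ['i', 'saw', 'NUM', 'lights', 'over', 'the', 'lake', '!', 'see', 'url', '']

Because lower() runs after the URL substitution, the URL placeholder comes out as 'url', and a trailing empty token appears whenever the document ends in whitespace.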
10  predict.py  (new file)
@@ -0,0 +1,10 @@
+
+
+
+
+def predict(weights, word_to_index, tokenized_text):
+    ypred = weights[0]  # bias or w0
+    for x in tokenized_text:
+        index = word_to_index[x]
+        ypred += weights[index] * 1
+    return ypred
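Read together with mytokenize.py, predict() is a plain dot product of a bag-of-words vector with the weight vector, with weights[0] as the bias. A minimal sketch of how it gets called, using toy values of my own rather than a trained model:

    from predict import predict

    # toy model for illustration only; the real values come from training in solution.py
    weights = [2012.0, 0.3, -0.1]            # [bias, weight("ufo"), weight("the")]
    word_to_index = {"ufo": 1, "the": 2}

    print(predict(weights, word_to_index, ["the", "ufo"]))   # approximately 2012.2 (2012.0 - 0.1 + 0.3)

Note that any token missing from word_to_index raises a KeyError, so in the visible code predict() is only applied to the training texts that built the vocabulary.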
83  regresja liniowa pseudokod.txt  (new file)
@@ -0,0 +1,83 @@
+vocabulary = .... the set of all words ...
+
+word_to_index_mapping = {}
+index_to_word_mapping = {}
+ix = 1
+for w in vocabulary:
+    word_to_index_mapping[w] = ix
+    index_to_word_mapping[ix] = w
+    ix += 1
+
+# initialization
+weights = []
+for ix in xrange(0, len(vocabulary)+1):
+    weights[ix] = a random value from the interval (-0.01, 0.01)
+
+Or weights[0] = 2012.0
+
+learning_rate = 0.000001
+
+loss_sum = 0.0
+loss_sum_counter = 0
+while ....
+    d, y = a random example from the training set
+
+    # prediction
+    y_hat = weights[0]
+    for each word w in document d:
+        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)
+
+    # loss function
+    loss = (y_hat - y)**2.0
+    loss_sum += loss
+    if loss_counter % 1000 == 0:
+        print(loss_sum / 1000)
+        loss_counter = 0
+        loss_sum = 0.0
+
+    # learning - the update
+    delta = (y_hat - y) * learning_rate
+    weights[0] = weights[0] - delta
+    for each word w in document d:
+        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta
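The pseudocode above maps almost one-to-one onto Python. A minimal runnable sketch, with a toy corpus and loop bound of my own choosing (the real data comes from train/in.tsv):

    import random
    from collections import Counter

    # toy corpus of (tokenized document, target year) pairs
    train_set = [
        (["i", "saw", "a", "ufo"], 2012.5),
        (["the", "lights", "were", "a", "lantern"], 2013.1),
    ]

    vocabulary = {w for doc, _ in train_set for w in doc}
    word_to_index_mapping = {w: i for i, w in enumerate(vocabulary, start=1)}

    # initialization: index 0 is the bias, the rest are small random weights
    weights = [random.uniform(-0.01, 0.01) for _ in range(len(vocabulary) + 1)]
    weights[0] = 2012.0
    learning_rate = 0.000001

    for step in range(10000):
        d, y = random.choice(train_set)
        counts = Counter(d)

        # prediction: bias + weight * occurrence count for every word in the document
        y_hat = weights[0]
        for w, c in counts.items():
            y_hat += weights[word_to_index_mapping[w]] * c

        # squared-error loss drives the SGD update
        delta = (y_hat - y) * learning_rate
        weights[0] -= delta
        for w, c in counts.items():
            weights[word_to_index_mapping[w]] -= c * delta

    print(weights[0])   # the learned bias stays near the target scale (~2012-2013)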
15  scores.txt
@@ -1,15 +0,0 @@
-0.6920
-0.6857
-0.6969
-0.6931
-0.6927
-0.6952
-0.6969
-0.6969
-0.6959
-0.6959
-0.6965
-0.6965
-0.6965
-0.6954
-0.6965
126  solution.py
@@ -1,20 +1,126 @@
-import re
-import sys
+import pandas as pd
+import pickle
+import csv
+from random import uniform
 
-for line in sys.stdin:
-    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
-        print("P")
+from predict import predict
+from mytokenize import tokenize
+
+# load data:
+train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
+texts = train["text"]
+y = train["date"]
+temp_y = []
+for posix_time in y:
+    floatyear = 1970 + (posix_time / (60*60*24 * 365.25))
+    temp_y.append(floatyear)
+
+y = temp_y
+print(y[0])
+tokenized_texts = []
+word_to_index = {}
+index_to_word = {}
+word_count = {}
+learning_rate = 0.000001
+
+for doc in texts:
+    tokens = tokenize(doc)
+    tokenized_texts.append(tokens)
+    for token in tokens:
+        if token in word_count:
+            word_count[token] += 1
+        else:
+            word_count[token] = 1
+
+# vocabulary sorted from the most frequent words to the least frequent ones
+vocabulary = sorted(word_count, key=word_count.get, reverse=True)
+for w in vocabulary:
+    i = len(word_to_index) + 1
+    word_to_index[w] = i
+    index_to_word[i] = w
+
+weights = []
+for i in range(0, len(vocabulary) + 1):
+    weights.append(2012 * uniform(-0.01, 0.01))
+
+best_MSE = 408
+
+model = pickle.load(open("BestLinearRegressionModel.pkl", "rb"))
+word_to_index, weights, best_MSE = model
+print("Unpickled best model.", "\tbest_MSE: ", best_MSE)
+
+loss_sum = 0.0
+loss_counter = 0
+for_MSE_sum = 0.0
+MSE = 0.0
+counter = 0
+inc_counter = 0
+while(True):
+    for i in range(0, len(tokenized_texts)):
+        """#@@ Computing ypred - start
+        ypred = weights[0]  # bias or w0
+        for x in tokenized_texts[i]:
+            index = word_to_index[x]
+            ypred += weights[index] * index
+        #@@ Computing ypred - end"""
+        ypred = predict(weights, word_to_index, tokenized_texts[i])
+
+        #@@ Computing the loss
+        loss = (ypred - y[i]) ** 2.0
+        loss_sum += loss
+        for_MSE_sum += loss
+        loss_counter += 1
+        if loss_counter == 1000:
+            #print(str(loss_sum/1000), "\t", str(MSE))
+            loss_counter = 0
+            loss_sum = 0.0
+        #@@ Computing the loss - end
+
+        #@@ Learning - updating the weights
+        delta = (ypred - y[i]) * learning_rate
+        weights[0] -= delta
+        for x in tokenized_texts[i]:
+            index = word_to_index[x]
+            weights[index] -= delta
+        #@@ Learning - updating the weights - end
+
+    temp_MSE = for_MSE_sum / len(tokenized_texts)
+    for_MSE_sum = 0.0
+    if best_MSE > temp_MSE:
+        best_MSE = temp_MSE
+        model = (word_to_index, weights, best_MSE)
+        pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
+        with open("score.txt", "w") as out:
+            out.write("MSE:\t")
+            out.write(str(best_MSE))
+    if temp_MSE > MSE:
+        counter += 1
     else:
-        print("S")
+        inc_counter += 1
+
+    if counter > 2:
+        learning_rate *= 0.1
+        counter = 0
+
+    if inc_counter > 4:
+        learning_rate /= 0.90
+        inc_counter = 0
+    MSE = temp_MSE
+    print("MSE: " "\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))
+
+
 """
-happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
+with open("dev-0/out.tsv", "w") as out1:
+    for line in predicted_dev0:
+        out1.write(str(line))
+        out1.write("\n")
+
+with open("test-A/out.tsv", "w") as out2:
+    for line in predicted_testA:
+        out2.write(str(line))
+        out2.write("\n")
 """
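In the new solution.py the date target is converted from a POSIX timestamp to a fractional year as 1970 + t / (60*60*24*365.25). A quick sanity check of that formula, using a timestamp of my own choosing rather than one from the data:

    # 2013-01-01 00:00:00 UTC as a POSIX timestamp (illustrative value)
    posix_time = 1356998400
    floatyear = 1970 + (posix_time / (60 * 60 * 24 * 365.25))
    print(floatyear)   # ~2013.0007; slightly off because 365.25 days is only an average year length

So predictions, targets, and the reported MSE all live on a fractional-year scale; the starting best_MSE of 408 corresponds to an average error of roughly 20 years.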
4  start.sh
@@ -1,4 +0,0 @@
-xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
-
-xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
-geval -t dev-0 >>scores.txt
10304  test-A/out.tsv
File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

579158  train/expected.tsv
File diff suppressed because it is too large
@@ -1,4 +0,0 @@
-xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
-xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
-comm -23 sortedP sortedS > PsetsubtractionS
-cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt
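For readers less used to the shell pipeline above: it extracts the words of the P-labelled and S-labelled documents into two sorted files, keeps the words that appear only on the paranormal side (comm -23), and counts them. Roughly the same idea in Python, as a sketch of my own using the same file paths:

    import lzma
    import re
    from collections import Counter

    p_words, s_words = Counter(), Counter()
    with open("expected.tsv") as exp, lzma.open("in.tsv.xz", "rt") as texts:
        for label, text in zip(exp, texts):
            words = re.findall(r"[a-zA-Z]+", text)
            (p_words if label.strip() == "P" else s_words).update(words)

    # words that occur only under the P label, most frequent first
    only_p = {w: c for w, c in p_words.items() if w not in s_words}
    for w, c in sorted(only_p.items(), key=lambda kv: -kv[1]):
        print(c, w)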