Compare commits

...

No commits in common. "ISI-13" and "master" have entirely different histories.

19 changed files with 348702 additions and 305528 deletions

.gitignore vendored

@@ -1,11 +1,8 @@
in.tsv
model.pkl
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token
.idea

Binary file not shown.


@@ -5,7 +5,7 @@ Classify a reddit as either from Skeptic subreddit or one of the
"paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
,Glitch-in-the-Matrix, conspiracytheories).
Output label is 0 (for skeptic) and 1 (for paranormal).
Output label is `S` and `P`.
Sources
-------


@@ -1 +1 @@
--metric Accuracy --metric F1 --metric F0:N<Precision> --metric F9999999:N<Recall> --precision 4 --in-header in-header.tsv --out-header out-header.tsv
--metric Accuracy --precision 4
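
The two F-score metrics dropped here were stand-ins for precision and recall: with F_beta = (1 + beta^2) * P * R / (beta^2 * P + R), F_beta tends to P as beta -> 0 and to R as beta -> infinity, so --metric F0:N<Precision> and --metric F9999999:N<Recall> appear to have reported precision and recall under those display names before the config was trimmed to Accuracy alone.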

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,12 +0,0 @@
import re


def tokenize(d):
    """Takes a document and returns a list of tokens."""
    d = re.sub(r'(\s+|\\n)', ' ', d)                    # collapse whitespace and literal "\n"
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)  # mask URLs
    d = d.lower().replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")  # split punctuation off
    d = re.sub(r'\d+', 'NUM', d)                        # mask numbers
    return re.split(r'\s+', d)
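
For reference, a quick check of what the deleted tokenizer produces (the sample sentence is invented; the module is imported as mytokenize, matching the scripts below):

from mytokenize import tokenize

# URLs collapse to "url" (lower() runs after the URL substitution),
# digits become "NUM", and punctuation is split off into its own token.
print(tokenize("I saw 2 lights at www.example.com today!"))
# ['i', 'saw', 'NUM', 'lights', 'at', 'url', 'today', '!']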


@@ -1,13 +0,0 @@
def predict(weights, word_to_index, tokenized_text):
    ypred = weights[0]  # bias or w0
    for x in tokenized_text:
        if x in word_to_index:
            index = word_to_index[x]
            ypred += weights[index] * 1  # each occurrence adds the word's weight once
    return ypred
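
A toy invocation of predict (the weights and vocabulary here are made-up values, not the trained model):

from predict import predict

weights = [0.5, 0.2, -0.3]                  # w0 (bias), then one weight per word
word_to_index = {"ghost": 1, "science": 2}

# "ghost" adds its weight once per occurrence; "lol" is out-of-vocabulary and skipped.
print(predict(weights, word_to_index, ["ghost", "ghost", "lol"]))  # ~0.9 (0.5 + 0.2 + 0.2)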


@@ -1,20 +0,0 @@
import pickle
import sys

from predict import predict
from mytokenize import tokenize

path_pkl = "BestLinearRegressionModel.pkl"
model = pickle.load(open(path_pkl, "rb"))
word_to_index, weights, best_MSE = model
for line in sys.stdin:
    text, date = line.split("\t")
    tokenized_text = tokenize(text)
    ypred = predict(weights, word_to_index, tokenized_text)
    if ypred < 0.5:
        print("0")
    else:
        print("1")


@@ -1,83 +0,0 @@
vocabulary = .... the set of all words ...
word_to_index_mapping = {}
index_to_word_mapping = {}
ix = 1
for w in vocabulary:
    word_to_index_mapping[w] = ix
    index_to_word_mapping[ix] = w
    ix += 1
# initialization
weights = []
for ix in range(0, len(vocabulary) + 1):
    weights.append(random value from the interval (-0.01, 0.01))
# or: weights[0] = 2012.0
learning_rate = 0.000001
loss_sum = 0.0
loss_counter = 0
while ....:
    d, y = random example from the training set
    # prediction
    y_hat = weights[0]
    for each word w in document d:
        y_hat += weights[word_to_index_mapping[w]] * (number of occurrences of w in d)
    # loss function
    loss = (y_hat - y) ** 2.0
    loss_sum += loss
    loss_counter += 1
    if loss_counter % 1000 == 0:
        print(loss_sum / 1000)
        loss_counter = 0
        loss_sum = 0.0
    # learning - update
    delta = (y_hat - y) * learning_rate
    weights[0] = weights[0] - delta
    for each word w in document d:
        weights[word_to_index_mapping[w]] -= (number of occurrences of w in d) * delta
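
Since the constant factor from the squared-error derivative is folded into the learning rate, a single update step checks out numerically (toy numbers, not from the assignment):

# One SGD step on squared error: d/dw0 (y_hat - y)**2 = 2*(y_hat - y);
# the factor 2 is absorbed into learning_rate above.
learning_rate = 0.000001
y, y_hat = 1.0, 3.0                    # toy target and prediction
delta = (y_hat - y) * learning_rate    # 2e-06
w0 = 2012.0
w0 -= delta                            # bias nudged toward the target
print(w0)                              # 2011.999998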


@@ -1 +0,0 @@
MSE: 0.16878347523946954

scores.txt Normal file

@@ -0,0 +1,15 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965


@@ -1,120 +1,20 @@
import os
import re
import sys
import pandas as pd
import pickle
import csv
from random import uniform
from predict import predict
from mytokenize import tokenize

# load data:
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text", "date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
y = pd.read_csv("train/expected.tsv", header=None, names=["isparanormal"])
y = list(y["isparanormal"])
print(y)
print(y[0])
tokenized_texts = []
word_to_index = {}
index_to_word = {}
word_count = {}
learning_rate = 0.000001
for doc in texts:
    tokens = tokenize(doc)
    tokenized_texts.append(tokens)
    for token in tokens:
        if token in word_count:
            word_count[token] += 1
        else:
            word_count[token] = 1
# vocabulary sorted from the most frequent words to the least frequent
vocabulary = sorted(word_count, key=word_count.get, reverse=True)
for w in vocabulary:
    i = len(word_to_index) + 1
    word_to_index[w] = i
    index_to_word[i] = w
weights = []
for i in range(0, len(vocabulary) + 1):
    weights.append(uniform(-0.01, 0.01))
best_MSE = 1800
path_pkl = "BestLinearRegressionModel.pkl"
if os.path.isfile(path_pkl):
    model = pickle.load(open(path_pkl, "rb"))
    word_to_index, weights, best_MSE = model
    print("Unpickled best model.", "\tbest_MSE: ", best_MSE)
loss_sum = 0.0
loss_counter = 0
for_MSE_sum = 0.0
MSE = 0.0
counter = 0
inc_counter = 0
while True:
    for i in range(0, len(tokenized_texts)):
        #@@ Computing ypred - start
        ypred = predict(weights, word_to_index, tokenized_texts[i])
        #@@ Computing ypred - end
        #@@ Computing the loss - start
        loss = (ypred - y[i]) ** 2.0
        loss_sum += loss
        for_MSE_sum += loss
        loss_counter += 1
        #@@ Computing the loss - end
        if loss_counter == 1000:
            # print(str(loss_sum/1000), "\t", str(MSE))
            loss_counter = 0
            loss_sum = 0.0
        #@@ Learning - weight update - start
        delta = (ypred - y[i]) * learning_rate
        weights[0] -= delta
        for x in tokenized_texts[i]:
            index = word_to_index[x]
            weights[index] -= delta
        #@@ Learning - weight update - end
    #@@ Optional strategy to improve training - start
    temp_MSE = for_MSE_sum / len(tokenized_texts)
    for_MSE_sum = 0.0
    if best_MSE > temp_MSE:
        best_MSE = temp_MSE
        model = (word_to_index, weights, best_MSE)
        pickle.dump(model, open("BestLinearRegressionModel.pkl", "wb"))
        with open("score.txt", "w") as out:
            out.write("MSE:\t")
            out.write(str(best_MSE))
    if temp_MSE > MSE:
        counter += 1
    else:
        inc_counter += 1
    if counter > 2:
        learning_rate *= 0.1
        counter = 0
    if inc_counter > 4:
        learning_rate /= 0.90
        inc_counter = 0
    #@@ Optional strategy to improve training - end
    MSE = temp_MSE
    print("MSE: " "\t", "%10.10f" % MSE, "\tLearningRate:\t", "%10.10f" % float(learning_rate))

import re
import sys

for line in sys.stdin:
    if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
        print("P")
    else:
        print("S")
"""
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""


@@ -1,5 +1,4 @@
xzcat dev-0/in.tsv.xz | python3 predict_dev0_testA.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 predict_dev0_testA.py > test-A/out.tsv
geval -t dev-0
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
geval -t dev-0 >>scores.txt

File diff suppressed because it is too large

File diff suppressed because it is too large

train/PssS_c_clean Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

train/s.sh Executable file

@@ -0,0 +1,4 @@
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
comm -23 sortedP sortedS > PsetsubtractionS
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt
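
For readers unfamiliar with comm: comm -23 sortedP sortedS suppresses the lines unique to sortedS (column 2) and the lines common to both files (column 3), leaving only words that occur under the P label and never under S; uniq -c | sort -nr then ranks those P-only words by count. (The '^ P.*' and '^ S.*' patterns assume each pasted line starts with a space before the label.)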