Compare commits

...

7 Commits

Author SHA1 Message Date
Th3NiKo db398db388 Linear regression higher F1.0 lower accuracy 2020-04-06 14:01:32 +02:00
Th3NiKo d7040c9bc6 Linear regression first try 2020-04-06 13:07:14 +02:00
Th3NiKo d6158fa514 Linear regression 1 2020-04-04 19:02:51 +02:00
Th3NiKo 14432fab2d Linear try 2020-04-02 15:45:53 +02:00
Filip Gralinski abba594b01 Update README.md 2020-03-30 18:29:13 +02:00
Filip Gralinski 73a1b8862f Switching to O/1 2020-03-30 18:28:23 +02:00
Filip Gralinski f17f86149c Fix unwanted spaces 2020-03-30 12:30:04 +02:00
8 changed files with 305449 additions and 294864 deletions

README.md

@@ -1,13 +1,13 @@
 Skeptic vs paranormal subreddits
 ================================
 
 Classify a reddit as either from Skeptic subreddit or one of the
 "paranormal" subreddits (Paranormal, UFOs, TheTruthIsHere, Ghosts,
 Glitch-in-the-Matrix, conspiracytheories).
 
-Output label is `S` and `P`.
+Output label is 0 (for skeptic) and 1 (for paranormal).
 
 Sources
 -------
 
 Data taken from <https://archive.org/details/2015_reddit_comments_corpus>.

File diff suppressed because it is too large

5272 dev-0/out.tsv Normal file

File diff suppressed because it is too large

30 predict.py Normal file

@@ -0,0 +1,30 @@
#!/usr/bin/python3
import sys
import pickle
from math import log, exp
from tokenizer import tokenize

# Load the pickled model produced by train.py
model = pickle.load(open("model.pkl", "rb"))
weights, word_to_index_mapping, word_count = model

score_sum = 0
counter = 0
for line in sys.stdin:
    document = line.rstrip()
    fields = document.split('\t')
    document = fields[0]
    terms = tokenize(document)
    # Bias term plus one weighted, log-scaled count feature per token
    y_predicted = weights[0]
    for word in terms:
        y_predicted += weights[word_to_index_mapping.get(word, 0)] * log(word_count.get(word, 0) / len(word_count) + 1)
    score_sum += y_predicted
    counter += 1
    if y_predicted <= 0:
        print(0)
    else:
        print(1)
#print(score_sum / counter)
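
The first commit message mentions a higher F1 at the cost of accuracy. A quick way to see that trade-off is to score dev-0/out.tsv against the gold labels. This is a minimal sketch, assuming the usual challenge layout where dev-0/expected.tsv holds the gold 0/1 labels (that path is an assumption, not part of this diff):

#!/usr/bin/python3
# Hypothetical evaluation sketch: compares predictions in dev-0/out.tsv
# with gold labels in dev-0/expected.tsv (assumed path) and reports
# accuracy and F1 for the positive label "1".
def evaluate(expected_path="dev-0/expected.tsv", predicted_path="dev-0/out.tsv"):
    with open(expected_path) as e, open(predicted_path) as p:
        pairs = [(gold.strip(), out.strip()) for gold, out in zip(e, p)]
    tp = sum(1 for gold, out in pairs if gold == out == "1")
    fp = sum(1 for gold, out in pairs if gold == "0" and out == "1")
    fn = sum(1 for gold, out in pairs if gold == "1" and out == "0")
    accuracy = sum(1 for gold, out in pairs if gold == out) / len(pairs)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return accuracy, f1

print(evaluate())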

5152 test-A/out.tsv Normal file

File diff suppressed because it is too large

21 tokenizer.py Normal file

@@ -0,0 +1,21 @@
#!/usr/bin/python3
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
import string

stop_words = set(stopwords.words('english'))
printable = set(string.printable)

def tokenize(d):
    # Replace every URL with a single placeholder token
    d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
    # Drop literal "\n" sequences and simple markup characters
    d = re.sub(r'\\n', ' ', d)
    d = re.sub(r'\*|\'|\"|\/|~|_|=|-', ' ', d)
    # Keep printable ASCII only, then tokenize, lowercase, and drop stopwords
    d = ''.join(filter(lambda x: x in printable, d))
    tokenized = word_tokenize(d)
    lower = [w.lower() for w in tokenized]
    words = [w for w in lower if w not in stop_words]
    return words
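
A minimal usage sketch for the tokenizer (the sample sentence is made up; the printed list is what the rules above would produce, and the NLTK punkt and stopwords data must be downloaded once beforehand):

#!/usr/bin/python3
# One-time NLTK setup, if the data is not yet present:
#   import nltk; nltk.download('punkt'); nltk.download('stopwords')
from tokenizer import tokenize

print(tokenize("I saw *this* on https://example.com/ufo yesterday!"))
# The URL collapses to the placeholder token and stopwords drop out:
# ['saw', 'thereisasimplelinkinside', 'yesterday', '!']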

110 train.py Normal file

@@ -0,0 +1,110 @@
#!/usr/bin/python3
'''
Linear regression for the paranormal vs. sceptic challenge (2.0.0).

train.py expects two tab-separated columns on stdin:
    label<TAB>document
Commands used to prepare the input: xzcat, paste
'''
import sys
import pickle
import random
from math import log, exp
import collections
from tokenizer import tokenize

def train():
    # Prepare
    vocabulary = set()
    word_to_index_mapping = {}
    index_to_word_mapping = {}
    word_count = collections.defaultdict(int)
    # Arrays x, y used later in the training process
    x = []
    y = []
    learning_rate = 0.000001
    # Read examples from stdin
    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0]
        document = fields[1]
        terms = tokenize(document)
        # Add the document's words to x and its label to y
        # (labels are remapped: P becomes 1, S becomes 0)
        x.append(terms)
        if label == "P":
            y.append(1)
        else:
            y.append(0)
        # Update the vocabulary and count how often each word appears
        for t in terms:
            word_count[t] += 1
            vocabulary.add(t)
    # Index the vocabulary: each word gets its own number
    ix = 1
    for w in vocabulary:
        word_to_index_mapping[w] = ix
        index_to_word_mapping[ix] = w
        ix += 1
    # Initialize weights with random floats from -1.0 to 1.0
    weights = []
    for ix in range(0, len(vocabulary) + 1):
        weights.append(random.uniform(-1.00, 1.00))
    Loss_sum = 0.0
    Loss_sum_counter = 1
    while True:
        choose_random_example = random.randint(0, len(x) - 1)
        actual_x = x[choose_random_example]  # list of words
        actual_y = y[choose_random_example]  # label for this set of words
        # Predict the result
        y_predicted = weights[0]
        # Iterate over all words in the randomly chosen example.
        # Using .get() avoids KeyErrors on missing words; the fallback
        # weight index does not matter, because word_count returns 0 there.
        for word in actual_x:
            y_predicted += weights[word_to_index_mapping.get(word, 0)] * log(word_count.get(word, 0) / len(word_count) + 1)
        # Compute the loss: how good was the prediction?
        Loss = (y_predicted - actual_y) ** 2.0
        # Sum the loss so the running average is easier to follow
        Loss_sum += Loss
        # Report the average loss every 10000 steps
        if Loss_sum_counter % 10000 == 0:
            print(str(Loss_sum_counter) + " " + str(Loss_sum / 10000))
            Loss_sum = 0.0
        Loss_sum_counter += 1
        # Update the weights
        delta = (y_predicted - actual_y) * learning_rate
        weights[0] = weights[0] - delta
        for word in actual_x:
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= (log(word_count[word] / len(word_count) + 1) * delta)
        # Stop after a fixed number of iterations
        if Loss_sum_counter > 7000000:
            break
    # Save only what is needed for prediction
    model = (weights, word_to_index_mapping, word_count)
    pickle.dump(model, open("model.pkl", "wb"))

train()
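
The loop above is plain stochastic gradient descent on the squared error. Below is a compact sketch of the same update step, assuming features arrive as (index, value) pairs; sgd_step is a hypothetical helper for illustration, not part of this commit:

# For a linear model with squared error Loss = (y_pred - y)**2:
#   dLoss/dw_0 = 2 * (y_pred - y)          (bias)
#   dLoss/dw_j = 2 * (y_pred - y) * x_j    (feature weight)
# where x_j = log(word_count[w] / len(word_count) + 1); the constant
# factor 2 is folded into the learning rate, exactly as in train.py.
def sgd_step(weights, features, y, y_pred, learning_rate=0.000001):
    # features: list of (index, x_j) pairs for the words of one example
    delta = (y_pred - y) * learning_rate
    weights[0] -= delta
    for j, x_j in features:
        weights[j] -= x_j * delta
    return weights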

File diff suppressed because it is too large