Linear regression first try

2020-04-06 13:07:14 +02:00 · 2020-04-06 13:07:14 +02:00 · d7040c9bc6
commit d7040c9bc6
parent d6158fa514
3 changed files with 1793 additions and 1785 deletions
--- a/predict.py
+++ b/predict.py
@ -2,7 +2,7 @@

 import sys
 import pickle
-from math import log
+from math import log, exp
 from tokenizer import tokenize

 #Load model
@ -19,8 +19,7 @@ for line in sys.stdin:
    for word in terms:
        y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count))

-
-    if y_predicted <= 0.63:
+    if y_predicted <= 0.5:
        print(0)
    else:
        print(1)
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train.py
+++ b/train.py
@ -1,8 +1,17 @@
 #!/usr/bin/python3

+'''
+Linear regression for paranormal and sceptic challenge 2.0.0
+In order to use train.py you need to pass two columns
+label   document
+separated by \t
+Commands used: xzcat, paste
+'''
+
 import sys
 import pickle
 import random
+from math import log, exp
 import collections
 from tokenizer import tokenize

@ -76,7 +85,7 @@ def train():
        #We will stop after loss reach some value

        if Loss_sum_counter % 10000 == 0:
-            print(Loss_sum / 10000)
+            print(str(Loss_sum_counter) + "   " + str(Loss_sum / 10000))
            Loss_sum = 0.0
        Loss_sum_counter += 1

@ -87,13 +96,13 @@ def train():
            if word in word_to_index_mapping:
                weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)

-        if Loss_sum_counter > 50000000:
+        if Loss_sum_counter > 10000000:
            break
        



-    #We save only things we need for predicion
+    #We save only things we need for prediction
    model = (weights, word_to_index_mapping, word_count)
    pickle.dump(model, open("model.pkl", "wb"))