Linear regression: higher F1.0, lower accuracy

2020-04-06 14:01:32 +02:00 · 2020-04-06 14:01:32 +02:00 · db398db388
commit db398db388
parent d7040c9bc6
4 changed files with 4461 additions and 4456 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/predict.py
+++ b/predict.py
@@ -8,6 +8,8 @@ from tokenizer import tokenize
 #Load model
 model = pickle.load(open("model.pkl","rb"))
 weights, word_to_index_mapping, word_count = model
 sum = 0
 counter = 0
 for line in sys.stdin:
    document = line.rstrip()
@@ -17,9 +19,12 @@ for line in sys.stdin:
    y_predicted = weights[0]
    for word in terms:
-        y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count))
+        y_predicted += weights[word_to_index_mapping.get(word,0)] * log(word_count.get(word,0) / len(word_count) + 1)
-
+    sum += y_predicted
-    if y_predicted <= 0.5:
+    counter += 1
    if y_predicted <= 0:
        print(0)
    else:
        print(1)
 #print(sum / counter)
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train.py
+++ b/train.py
@@ -76,7 +76,7 @@ def train():
        #Using get() lets you handle missing words by substituting a value of your choice
        #The fallback weight index doesn't matter when a word is missing, because word_count will give 0
        for word in actual_x:
-            y_predicted += weights[word_to_index_mapping.get(word,0)] * (word_count.get(word,0) / len(word_count))
+            y_predicted += weights[word_to_index_mapping.get(word,0)] * log(word_count.get(word,0) / len(word_count) + 1)
        #Compute the cost: check how good our prediction was
        Loss = (y_predicted - actual_y) ** 2.0
@@ -94,9 +94,9 @@ def train():
        weights[0] = weights[0] - delta
        for word in actual_x:
            if word in word_to_index_mapping:
-                weights[word_to_index_mapping[word]] -= ((word_count[word] / len(word_count)) * delta)
+                weights[word_to_index_mapping[word]] -= (log(word_count[word] / len(word_count) + 1) * delta)
-        if Loss_sum_counter > 10000000:
+        if Loss_sum_counter > 7000000:
            break