Regression

2020-04-06 19:11:16 +02:00 · 2020-04-06 19:11:16 +02:00 · 0e0d33afb4
commit 0e0d33afb4
parent 72f56d6b42
4 changed files with 294932 additions and 294900 deletions
--- a/code_regression.py
+++ b/code_regression.py
@@ -6,80 +6,95 @@ import re
 from pip._vendor.msgpack.fallback import xrange
 import random

-vocabulary=[]
+vocabulary = []
+
+file_to_save = open("test.tsv", "w", encoding='utf-8')
+

-file_to_save=open("test.tsv","w",encoding='utf-8')
 def define_vocabulary(file_to_learn_new_words):
-    word_counts={'count': defaultdict(int)}
-    with open(file_to_learn_new_words,encoding='utf-8') as in_file:
+    word_counts = {'count': defaultdict(int)}
+    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            text, timestamp = line.rstrip('\n').split('\t')
            tokens = text.lower().split(' ')
            for token in tokens:
-                word_counts['count'][token]+=1
+                word_counts['count'][token] += 1
    return word_counts

+
 def read_input(file_path):
-    read_word_counts={'count': defaultdict(int)}
+    read_word_counts = {'count': defaultdict(int)}
    with open(file_path, encoding='utf-8') as in_file:
        for line in in_file:
            text, timestamp = line.rstrip('\n').split('\t')
            tokens = text.lower().split(' ')
            for token in tokens:
-                read_word_counts['count'][token]+=1
+                read_word_counts['count'][token] += 1
    return read_word_counts

-def training(vocabulary,read_input,expected):
-    learning_rate=0.00001
-    learning_precision=0.0000001
-    weights=[]
-    iteration=0
-    loss_sum=0.0
-    ix=1
+
+def training(vocabulary, read_input, expected):
+    file_to_write = open(expected, 'w+', encoding='utf8')
+    learning_rate = 0.00001
+    learning_precision = 0.0000001
+    weights = []
+    iteration = 0
+    loss_sum = 0.0
+    ix = 1
    readed_words_values = []
    for word in read_input['count']:
        if word not in vocabulary['count']:
-            read_input['count'][word]=0
+            read_input['count'][word] = 0
        readed_words_values.append(read_input['count'][word])
-    for ix in range(0,len(vocabulary['count'])+1):
-        weights.append(random.uniform(-0.001,0.001))
-    #max_iteration=len(vocabulary['count'])+1
-    max_iteration=1000
-    delta=1
-    while (delta>learning_precision and iteration<max_iteration):
-        d,y=random.choice(list(read_input['count'].items())) #d-word, y-value of
-        y_hat=weights[0]
-        i=0
+    for ix in range(0, len(vocabulary['count']) + 1):
+        weights.append(random.uniform(-0.001, 0.001))
+    # max_iteration=len(vocabulary['count'])+1
+    max_iteration = 1000
+    delta = 1
+    while delta > learning_precision and iteration < max_iteration:
+        d, y = random.choice(list(read_input['count'].items()))  # d-word, y-value of
+        y_hat = weights[0]
+        i = 0
        for word_d in d:
            if word_d in vocabulary['count'].keys():
-                #print(vocabulary['count'][d])
-                y_hat+=weights[vocabulary['count'][word_d]]*readed_words_values[i]
-                delta=abs(y_hat-y)*learning_rate
-                weights[0]=weights[0]-delta
-                i+=i
-        i=0
+                # print(vocabulary['count'][d])
+                y_hat += weights[vocabulary['count'][word_d]] * readed_words_values[i]
+                i += 1
+            if y_hat > 0.0:
+                file_to_write.write('1\n')
+            else:
+                file_to_write.write('0\n')
+        i = 0
+        delta = (y_hat - y) * learning_rate
+        weights[0] = weights[0] - delta
        for word_w in d:
            if word_w in vocabulary['count'].keys():
-                weights[vocabulary['count'][word_w]]-=readed_words_values[i]*delta
-                i+=1
-        #print(weights)
+                weights[vocabulary['count'][word_w]] -= readed_words_values[i] * delta
+                i += 1
+        # print(weights)
        print(y_hat)
        print(y)
-        loss=(y_hat-y)**2.0
-        #loss=(y_hat-y)*(y_hat-y)
-        loss_sum+=loss
-        if(iteration%1000==0):
-            print(loss_sum/1000)
-            iteration=0
-            loss_sum=0.0
-        iteration+=1
+        loss = (y_hat - y) ** 2.0
+        # loss=(y_hat-y)*(y_hat-y)
+        loss_sum += loss
+        if (iteration % 1000 == 0):
+            print(loss_sum / 1000)
+            iteration = 0
+            loss_sum = 0.0
+        iteration += 1
+        file_to_write.close
+    return weights, vocabulary
+
+
 def main():
    vocabulary = define_vocabulary('train/in.tsv')
-    readed_words=read_input('dev-0/in.tsv')
-    training(vocabulary,readed_words,'test.tsv')
+    readed_words = read_input('dev-0/in.tsv')
+    readed_words_test_a = read_input('test-A/in.tsv/in.tsv')
+    training(vocabulary, readed_words, 'test.tsv')
+    training(vocabulary,readed_words_test_a, 'test_a.tsv')


-#def cost_function(y_hat,y):
+# def cost_function(y_hat,y):
 #    loss=(y_hat-y)**2.0
 #    loss_sum+=loss
 #    if loss_counter%1000==0:
@@ -88,9 +103,8 @@ def main():
 #        loss_sum=0.0


-
-#def main():
-    # --------------- initialization ---------------------------------
+# def main():
+# --------------- initialization ---------------------------------
 #    vocabulary = define_vocabulary('train/in.tsv')
 #    readed_words=read_input('dev-0/in.tsv')
 #    i=1;
@@ -109,7 +123,7 @@ def main():
 #    max_iterations=len(vocabulary['count'])+1
 #    current_iteration=0
 #    rangeReadedValues=len(readed_words['count'])+1
-    # --------------- prediction -------------------------------------
+# --------------- prediction -------------------------------------
 #    while (delta>precision and current_iteration<max_iterations):
 #        y=random.choice(readed_words_values)
 #        y_hat=weights[0]
@@ -126,4 +140,3 @@ def main():


 main()
-
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/test.tsv
+++ b/test.tsv
@@ -0,0 +1,19 @@
+1
+1
+1
+1
+1
+1
+1
+1
+1
+1
+0
+0
+0
+0
+0
+0
+0
+0
+0
--- a/train/expected.tsv
+++ b/train/expected.tsv