Regression
commit 72f56d6b42 (parent c7241d862d)
@@ -7,23 +7,6 @@ from pip._vendor.msgpack.fallback import xrange
 import random
 
-vocabulary=[]
-#word_to_index_mapping=[]
-#index_to_word_mapping=[]
-
-#file_to_save=open("test.tsv","w",encoding='utf-8')
-#def define_vocabulary(file_to_learn_new_words,expected_path):
-#    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
-#    with open(file_to_learn_new_words, encoding='utf-8') as in_file, open(expected_path, encoding='utf-8') as expected_file:
-#        for line, exp in zip(in_file, expected_file):
-#            class_ = exp.rstrip('\n').replace(' ', '')
-#            text, timestamp = line.rstrip('\n').split('\t')
-#            tokens = text.lower().split(' ')
-#            for token in tokens:
-#                if class_ == 'P':
-#                    word_counts['paranormal'][token] += 1
-#                elif class_ == 'S':
-#                    word_counts['skeptic'][token] += 1
-#    return word_counts
 
 file_to_save=open("test.tsv","w",encoding='utf-8')
 def define_vocabulary(file_to_learn_new_words):
@@ -37,31 +20,109 @@ def define_vocabulary(file_to_learn_new_words):
     return word_counts
 
 def read_input(file_path):
-    word_counts={'count': defaultdict(int)}
+    read_word_counts={'count': defaultdict(int)}
     with open(file_path, encoding='utf-8') as in_file:
         for line in in_file:
             text, timestamp = line.rstrip('\n').split('\t')
             tokens = text.lower().split(' ')
             for token in tokens:
-                word_counts['count'][token]+=1
-    return word_counts
+                read_word_counts['count'][token]+=1
+    return read_word_counts
 
-def main():
-    # --------------- initialization ---------------------------------
-    vocabulary = define_vocabulary('train/in.tsv')
-    i=1;
+def training(vocabulary,read_input,expected):
+    learning_rate=0.00001
+    learning_precision=0.0000001
+    weights=[]
+    testFuckingPython=len(vocabulary['count'])+1
+    for i in range(testFuckingPython):
+        weights.append(random.randrange(0,len(vocabulary['count'])+1))
+    precision=0.00001
+    learning_rate=0.001
+    prev_step_size=1
+    max_iterations=len(vocabulary['count'])
+    current_iteration=0
+    readed_words=read_input("train/in.tsv")
+    iteration=0
+    loss_sum=0.0
+    ix=1
+    readed_words_values = []
+    for word in read_input['count']:
+        if word not in vocabulary['count']:
+            read_input['count'][word]=0
+        readed_words_values.append(read_input['count'][word])
+    for ix in range(0,len(vocabulary['count'])+1):
+        weights.append(random.uniform(-0.001,0.001))
+    #max_iteration=len(vocabulary['count'])+1
+    max_iteration=1000
+    delta=1
+    while (delta>learning_precision and iteration<max_iteration):
+        d,y=random.choice(list(read_input['count'].items())) #d-word, y-value of
+        y_hat=weights[0]
+        i=0
+        for word_d in d:
+            if word_d in vocabulary['count'].keys():
+                #print(vocabulary['count'][d])
+                y_hat+=weights[vocabulary['count'][word_d]]*readed_words_values[i]
+                delta=abs(y_hat-y)*learning_rate
+                weights[0]=weights[0]-delta
+                i+=i
+        i=0
+        for word_w in d:
+            if word_w in vocabulary['count'].keys():
+                weights[vocabulary['count'][word_w]]-=readed_words_values[i]*delta
+                i+=1
+        #print(weights)
+        print(y_hat)
+        print(y)
+        loss=(y_hat-y)**2.0
+        #loss=(y_hat-y)*(y_hat-y)
+        loss_sum+=loss
+        if(iteration%1000==0):
+            print(loss_sum/1000)
+            iteration=0
+            loss_sum=0.0
+        iteration+=1
+
+def main():
+    vocabulary = define_vocabulary('train/in.tsv')
+    readed_words=read_input('dev-0/in.tsv')
+    training(vocabulary,readed_words,'test.tsv')
+
+
+#def cost_function(y_hat,y):
+#    loss=(y_hat-y)**2.0
+#    loss_sum+=loss
+#    if loss_counter%1000==0:
+#        print(loss_sum/1000)
+#        loss_counter=0
+#        loss_sum=0.0
+
+
+
+#def main():
+#    # --------------- initialization ---------------------------------
+#    vocabulary = define_vocabulary('train/in.tsv')
+#    readed_words=read_input('dev-0/in.tsv')
+#    i=1;
+#    weights=[]
+#    readed_words_values=[]
+#    rangeVocabulary=len(vocabulary['count'])+1
+#    for i in range(rangeVocabulary):
#        weights.append(random.randrange(0,len(vocabulary['count'])+1))
+#    for word in readed_words['count']:
+#        if word not in vocabulary['count']:
+#            readed_words['count'][word]=0
+#        readed_words_values.append(readed_words['count'][word])
+#    precision=0.00001
+#    learning_rate=0.00001
+#    delta=1
+#    max_iterations=len(vocabulary['count'])+1
+#    current_iteration=0
+#    rangeReadedValues=len(readed_words['count'])+1
+#    # --------------- prediction -------------------------------------
+#    #while (prev_step_size>precision and current_iteration<max_iterations):
+#    while (delta>precision and current_iteration<max_iterations):
+#        y=random.choice(readed_words_values)
+#        y_hat=weights[0]
+#        i=0
+#        j=0
+#        for i in range(rangeReadedValues):
+#            y_hat+=weights[i]*y
+#        delta=abs(y_hat-y)*learning_rate
+#        weights[0]=weights[0]-delta
+#        for j in range(rangeVocabulary):
+#            weights[j]-=y*delta
+#        print(delta)
+#        current_iteration+=1
 
 main()
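For anyone reading the new training function: it is a stochastic-gradient-descent loop for a bag-of-words linear model, but three details differ from the usual update. The error goes through abs(), so the correction pushes the weights the same direction whether the model overshoots or undershoots; i+=i never advances the feature index past 0; and "for word_d in d" iterates over the characters of the single sampled word d. Below is a minimal sketch of one conventional SGD step. sgd_step, word_to_index, and the parameter names are hypothetical, not part of this commit; the word-to-index map stands in for the commented-out word_to_index_mapping, which looks like the original intent (the committed code indexes weights by a word's count, so distinct words with equal counts share a slot).

def sgd_step(weights, word_to_index, tokens, y, learning_rate=0.001):
    # Linear model: bias weights[0] plus one weight per known token.
    y_hat = weights[0]
    for token in tokens:
        if token in word_to_index:
            y_hat += weights[word_to_index[token]]
    # Keep the sign of the error: abs(y_hat - y) would always nudge the
    # weights the same way regardless of the direction of the mistake.
    error = y_hat - y
    weights[0] -= learning_rate * error
    for token in tokens:
        if token in word_to_index:
            weights[word_to_index[token]] -= learning_rate * error
    return (y_hat - y) ** 2  # squared loss, for the running average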
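A matching driver, again only a sketch under the same assumptions (train and the helper names are hypothetical): it samples one (word, count) pair per step as the committed loop does, and averages the squared loss over exactly 1000 steps. The committed loop instead tests iteration%1000==0 with iteration starting at 0, so the branch fires on the very first pass and divides a single sample by 1000 before any full window has accumulated.

import random

def train(vocabulary, readed_words, learning_rate=0.001, max_iteration=10000):
    # Slot 0 is the bias; every vocabulary word gets its own weight slot.
    word_to_index = {w: i + 1 for i, w in enumerate(vocabulary['count'])}
    weights = [random.uniform(-0.001, 0.001) for _ in range(len(word_to_index) + 1)]
    loss_sum = 0.0
    for iteration in range(1, max_iteration + 1):
        # One (word, count) sample per step, as in the committed loop.
        d, y = random.choice(list(readed_words['count'].items()))
        loss_sum += sgd_step(weights, word_to_index, [d], y, learning_rate)
        if iteration % 1000 == 0:
            print(loss_sum / 1000)  # running average of the squared loss
            loss_sum = 0.0
    return weights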