Rewrite linear regression
This commit is contained in:
parent fa4c673309
commit 8d2a814d44
@@ -1,142 +1,266 @@
import random
import re
from collections import defaultdict


def define_vocabulary(file_to_learn_new_words):
    # Count how often each whitespace-separated token occurs in the corpus.
    word_counts = {'count': defaultdict(int)}
    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            text, timestamp = line.rstrip('\n').split('\t')
            tokens = text.lower().split(' ')
            for token in tokens:
                word_counts['count'][token] += 1
    return word_counts
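
# Illustrative use (a sketch; it assumes the text<TAB>timestamp layout implied
# by the split above, and 'the' is just an example key):
#   vocab = define_vocabulary('train/in.tsv')
#   vocab['count']['the']   # -> number of times 'the' occurs in train/in.tsv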


def tokenize_list(string_input):
    # Remove URLs first, then split on punctuation, digits and whitespace.
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '',
                  string_input.replace('\\n', ' '))
    words = re.split(r';+|,+|\*+|\n+| +|\_+|\%+|\t+|\[+|\]+|\.+|\(+|\)+|\++|\\+|\/+|[0-9]+|\#+|\'+|\"+|\-+|\=+|\&+|\:+|\?+|\!+|\^+|\·+', text)
    # Drop empty strings, single letters and URL remnants, lowercase the rest.
    regex = re.compile(r'http|^[a-zA-Z]$|org')
    filtered_values = [word for word in words if not regex.match(word)]
    filtered_values[:] = (value.lower() for value in filtered_values if len(value) != 0)
    return filtered_values
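
# A rough sanity check for tokenize_list (illustrative; the exact output
# follows from the regexes above):
#   tokenize_list("Hello, world! Visit http://example.org now")
#   -> ['hello', 'world', 'visit', 'now']   # the URL is stripped entirely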


def read_words(input_path):
    # Count the tokens produced by tokenize_list for every line of the file.
    vocabulary = {'count': defaultdict(int)}
    with open(input_path, encoding='utf-8') as infile:
        for line in infile:
            tokens = tokenize_list(line)
            for token in tokens:
                vocabulary['count'][token] += 1
    return vocabulary
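
# Either counter can seed the model (sketch):
#   vocab = read_words('train/in.tsv')   # tokenize_list-based counts
# train() below only uses the keys of vocab['count'] (and how many there are),
# not the counts themselves.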


def train(vocabulary, input_train, expected_train):
    # Stochastic gradient descent for a linear bag-of-words model.
    learning_rate = 0.0001
    learning_precision = 0.00000001
    # Pair every training line with its expected label.
    words_vocabulary = {}
    with open(input_train, encoding='utf-8') as input_file, open(expected_train, encoding='utf-8') as expected_file:
        for line, exp in zip(input_file, expected_file):
            words_vocabulary[line] = int(exp)
    weights = {}
    weight = {}
    delta = 1
    iteration = 0
    loss_sum = 0.0
    error = 10.0
    max_iteration = len(vocabulary['count'])
    # One small random weight per vocabulary token.
    for i in vocabulary['count'].keys():
        weights[i] = random.uniform(-0.01, 0.01)
    while abs(delta) > learning_precision and iteration < max_iteration:
        # d is a raw training line, y its expected label.
        d, y = random.choice(list(words_vocabulary.items()))
        y_hat = 0
        tokens = tokenize_list(d)
        # Each unique token contributes weight * its count in the line.
        for token in set(tokens):
            if token in vocabulary['count']:
                y_hat += weights[token] * tokens.count(token)
        # Squared-loss SGD step: w <- w - lr * (y_hat - y) * x.
        delta = (y_hat - y) * learning_rate
        for word in set(tokens):
            if word in vocabulary['count']:
                weights[word] -= tokens.count(word) * delta
        loss = (y_hat - y) ** 2.0
        loss_sum += loss
        if iteration % 1000 == 0:
            # Keep a snapshot of the weights with the lowest running loss.
            if error > (loss_sum / 1000):
                weight = dict(weights)
                error = loss_sum / 1000
            loss_sum = 0.0
        iteration += 1
    return weight, vocabulary
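
# Worked example of one update (hypothetical numbers, not taken from the data):
# with weights = {'good': 0.2, 'bad': -0.1}, tokens = ['good', 'good', 'bad']
# and y = 1:
#   y_hat = 0.2*2 + (-0.1)*1 = 0.3
#   delta = (0.3 - 1) * learning_rate
#   weights['good'] -= 2 * delta;  weights['bad'] -= 1 * delta
# i.e. ordinary least-squares SGD on bag-of-words counts.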


def prediction(input_path, output_path, weights, vocabulary):
    # Score each line with the learned weights and write a 0/1 decision.
    with open(input_path, encoding='utf-8') as input_file, open(output_path, 'w+', encoding='utf-8') as output_file:
        for line in input_file:
            y_hat = 0
            tokens = tokenize_list(line)
            for token in set(tokens):
                if token in vocabulary['count']:
                    y_hat += weights[token] * tokens.count(token)
            if y_hat > 0.0:
                output_file.write('1\n')
            else:
                output_file.write('0\n')
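
# Decision-rule sketch with a made-up weight vector:
#   weights = {'good': 0.3, 'bad': -0.4}
#   ['good', 'bad', 'bad']  scores 0.3 - 0.8 = -0.5  -> writes '0'
#   ['good', 'good']        scores 0.6               -> writes '1'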


vocabulary = define_vocabulary('train/in.tsv')
weights, words = train(vocabulary, 'train/in.tsv', 'train/expected.tsv')
prediction('dev-0/in.tsv', 'dev-0/out.tsv', weights, words)
prediction('test-A/in.tsv/in.tsv', 'test-A/out.tsv', weights, words)


# from collections import defaultdict
# import math
# import pickle
# import re
#
# from pip._vendor.msgpack.fallback import xrange
# import random
#
# vocabulary = []
#
# file_to_save = open("test.tsv", "w", encoding='utf-8')
#
#
# def define_vocabulary(file_to_learn_new_words):
#     word_counts = {'count': defaultdict(int)}
#     with open(file_to_learn_new_words, encoding='utf-8') as in_file:
#         for line in in_file:
#             text, timestamp = line.rstrip('\n').split('\t')
#             tokens = text.lower().split(' ')
#             for token in tokens:
#                 word_counts['count'][token] += 1
#     return word_counts
#
#
# def read_input(file_path):
#     read_word_counts = {'count': defaultdict(int)}
#     with open(file_path, encoding='utf-8') as in_file:
#         for line in in_file:
#             text, timestamp = line.rstrip('\n').split('\t')
#             tokens = text.lower().split(' ')
#             for token in tokens:
#                 read_word_counts['count'][token] += 1
#     return read_word_counts
#
#
# def training(vocabulary, read_input, expected):
#     file_to_write = open(expected, 'w+', encoding='utf8')
#     file_to_write2 = open('out_y_hat.tsv', 'w+', encoding='utf8')
#     learning_rate = 0.00001
#     learning_precision = 0.0001
#     weights = []
#     iteration = 0
#     loss_sum = 0.0
#     ix = 1
#     readed_words_values = []
#     for word in read_input['count']:
#         if word not in vocabulary['count']:
#             read_input['count'][word] = 0
#         readed_words_values.append(read_input['count'][word])
#     for ix in range(0, len(vocabulary['count']) + 1):
#         weights.append(random.uniform(-0.001, 0.001))
#     # max_iteration=len(vocabulary['count'])+1
#     max_iteration = 10000
#     delta = 1
#     while delta>learning_precision and iteration<max_iteration:
#         d, y = random.choice(list(read_input['count'].items()))  # d-word, y-value of
#         y_hat = weights[0]
#         i = 0
#         for word_d in d:
#             if word_d in vocabulary['count'].keys():
#                 # print(vocabulary['count'][d])
#                 y_hat += weights[vocabulary['count'][word_d]] * readed_words_values[i]
#             i += 1
#         print(f'Y_hat: {y_hat}')
#         file_to_write2.write(f'Y_hat: {y_hat}\n')
#         if y_hat > 0.5:
#             file_to_write.write('1\n')
#         else:
#             file_to_write.write('0\n')
#         i = 0
#         delta = (y_hat - y) * learning_rate
#         weights[0] = weights[0] - delta
#         for word_w in d:
#             if word_w in vocabulary['count'].keys():
#                 weights[vocabulary['count'][word_w]] -= readed_words_values[i] * delta
#             i += 1
#         # print(weights)
#         #print(f'Y: {y}')
#         loss = (y_hat - y) ** 2.0
#         # loss=(y_hat-y)*(y_hat-y)
#         loss_sum += loss
#         if (iteration % 1000 == 0):
#             #print(loss_sum / 1000)
#             iteration = 0
#             loss_sum = 0.0
#         iteration += 1
#     file_to_write.close
#
# def main():
#     vocabulary = define_vocabulary('train/in.tsv')
#     readed_words = read_input('dev-0/in.tsv')
#     readed_words_test_a = read_input('test-A/in.tsv/in.tsv')
#     training(vocabulary, readed_words, 'dev-0/out.tsv')
#     training(vocabulary, readed_words_test_a, 'test-A/out.tsv')
#
#
# # def cost_function(y_hat,y):
# #     loss=(y_hat-y)**2.0
# #     loss_sum+=loss
# #     if loss_counter%1000==0:
# #         print(loss_sum/1000)
# #         loss_counter=0
# #         loss_sum=0.0
#
#
# # def main():
# # --------------- initialization ---------------------------------
# #     vocabulary = define_vocabulary('train/in.tsv')
# #     readed_words=read_input('dev-0/in.tsv')
# #     i=1;
# #     weights=[]
# #     readed_words_values=[]
# #     rangeVocabulary=len(vocabulary['count'])+1
# #     for i in range(rangeVocabulary):
# #         weights.append(random.randrange(0,len(vocabulary['count'])+1))
# #     for word in readed_words['count']:
# #         if word not in vocabulary['count']:
# #             readed_words['count'][word]=0
# #         readed_words_values.append(readed_words['count'][word])
# #     precision=0.00001
# #     learning_rate=0.00001
# #     delta=1
# #     max_iterations=len(vocabulary['count'])+1
# #     current_iteration=0
# #     rangeReadedValues=len(readed_words['count'])+1
# # --------------- prediction -------------------------------------
# #     while (delta>precision and current_iteration<max_iterations):
# #         y=random.choice(readed_words_values)
# #         y_hat=weights[0]
# #         i=0
# #         j=0
# #         for i in range(rangeReadedValues):
# #             y_hat+=weights[i]*y
# #         delta=abs(y_hat-y)*learning_rate
# #         weights[0]=weights[0]-delta
# #         for j in range(rangeVocabulary):
# #             weights[j]-=y*delta
# #         print(delta)
# #         current_iteration+=1
#
#
# main()
dev-0/out.tsv (10544)
File diff suppressed because it is too large. Load Diff
test-A/out.tsv (10304)
File diff suppressed because it is too large. Load Diff