add linear_regression

2020-04-18 20:39:32 +02:00 · 2020-04-18 20:39:32 +02:00 · 5c9327ab4b
commit 5c9327ab4b
parent 6aadccfd7e
1 changed files with 140 additions and 0 deletions
--- a/linear_regression.py
+++ b/linear_regression.py
@ -0,0 +1,140 @@
+import csv
+import re
+import random
+import json
+from math import sqrt
+
def make_dict(path, out_path='dict.txt'):
    """Create the initial word-weight table for a text file and save it as JSON.

    The file is tokenised line by line into runs of word characters and
    apostrophes. Each distinct token, in order of first appearance, is given
    one random starting weight: ``random.random() % 0.2 - 0.1`` rounded to
    two decimals, i.e. a value between -0.1 and 0.1.

    Args:
        path: Text file to collect the vocabulary from.
        out_path: Where the JSON-encoded table is written. Defaults to
            ``'dict.txt'``, the file ``train_model`` loads by default.

    Returns:
        The ``{token: weight}`` dictionary that was written to *out_path*.
    """
    weights = {}
    with open(path) as in_file:
        for line in in_file:
            for word in re.findall(r"[\w']+", line):
                # Only the first occurrence draws a weight, so repeated
                # tokens keep their original value.
                if word not in weights:
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)

    print("dict maked")
    with open(out_path, 'w') as out_file:
        json.dump(weights, out_file)
    return weights
+
def make_posts_list(in_file, limit=1000):
    """Read the leading lines of *in_file* into a list.

    The previous version only incremented its counter in a branch that was
    never reached, so the intended 1000-line cap had no effect and the whole
    file was loaded. The cap is now enforced.

    Args:
        in_file: Path of the text file holding one post per line.
        limit: Maximum number of lines to return (default 1000). Pass
            ``None`` to read the whole file.

    Returns:
        A list of the raw lines, trailing newlines included, in file order.
    """
    posts = []
    with open(in_file) as f:
        for line in f:
            # Stop reading as soon as the cap is reached.
            if limit is not None and len(posts) >= limit:
                break
            posts.append(line)
    return posts
+
def make_exp_list(exp_file):
    """Load the expected values, one per line of *exp_file*, as floats.

    Each line is cut at its newline and the leading part is converted with
    ``float``; a line that is not a valid number raises ``ValueError``.
    """
    with open(exp_file) as handle:
        return [float(row.partition('\n')[0]) for row in handle]
+
def train_model(in_path, exp_path, epochs=10000, lr=0.0000001, w0=2013,
                dict_path='dict.txt', out_path='dict2.txt'):
    """Fit the bag-of-words linear model with per-example updates and save it.

    The prediction for a post is ``w0`` plus the weight of every token in it
    (repeated tokens count each time). After each example, the error
    ``y_hat - y`` scaled by ``lr`` is subtracted from ``w0`` and from the
    weight of each token occurrence. The mean squared error of every epoch
    is printed.

    Args:
        in_path: File with one training post per line.
        exp_path: File with one numeric target per line, aligned with
            *in_path*; surplus lines in either file are ignored.
        epochs: Number of passes over the training data.
        lr: Step size applied to the prediction error.
        w0: Starting value of the bias term.
        dict_path: JSON file with the initial ``{token: weight}`` table.
        out_path: JSON file the trained token weights are written to.

    Returns:
        The trained bias ``w0``. It is not stored in *out_path*, so it is
        returned rather than discarded.

    Raises:
        ValueError: If there is no training example to learn from.
        KeyError: If a post contains a token missing from the weight table.
    """
    with open(dict_path, 'r') as file:
        weights = json.load(file)
    posts = make_posts_list(in_path)
    targets = make_exp_list(exp_path)

    # Tokenise once up front: the token lists never change between epochs,
    # so re-running the regex twice per post per epoch is wasted work.
    # NOTE(review): targets are truncated to int before use — confirm that
    # fractional targets are not expected.
    examples = [
        (re.findall(r"[\w']+", post), int(target))
        for post, target in zip(posts, targets)
    ]
    if not examples:
        # Previously surfaced as a ZeroDivisionError when averaging the loss.
        raise ValueError("no training examples found in %r / %r"
                         % (in_path, exp_path))

    print("Zaczynam")
    for _ in range(epochs):
        loss_sum = 0
        for words, y in examples:
            # Forward pass: bias plus the weight of every token occurrence.
            y_hat = w0
            for word in words:
                y_hat += weights[word]
            loss_sum += (y_hat - y) ** 2

            # Learning step: move the bias and each used weight against
            # the error.
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in words:
                weights[word] -= delta

        # Mean squared error over this epoch.
        print(loss_sum / len(examples))

    with open(out_path, 'w') as file:
        json.dump(weights, file)
    return w0
+
def predict(path, weights_path='dict2.txt'):
    """Score every post in ``<path>/in.tsv`` and write labels to ``<path>/out.tsv``.

    A post's score is the sum of the stored weights of its tokens; tokens
    missing from the weight table are ignored. A score above 0.5 is written
    as ``"1"``, anything else as ``"0"``, one label per output row.

    NOTE(review): the bias ``w0`` learned in ``train_model`` is not stored
    in the weights file, so scores here start from 0 — confirm this is
    intended.

    Args:
        path: Directory containing ``in.tsv``; ``out.tsv`` is created there.
        weights_path: JSON file with the ``{token: weight}`` table. Defaults
            to ``'dict2.txt'``, the file ``train_model`` writes by default.
    """
    with open(weights_path, 'r') as file:
        weights = json.load(file)

    results = []
    with open(path + "/in.tsv") as in_file:
        for post in in_file:
            print("new post" + str(random.randint(0, 10)))
            score = sum(
                weights[word]
                for word in re.findall(r"[\w']+", post)
                if word in weights
            )
            results.append("1" if score > 0.5 else "0")

    # newline='' is what the csv module asks for; without it the writer's
    # own line terminator gets translated a second time on some platforms.
    with open(path + "/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for label in results:
            # writerow expects a sequence of fields; a bare string would be
            # split into one column per character.
            tsv_writer.writerow([label])
+
+#make_dict("train/in.tsv")  # one-off step, left disabled: running it rewrites dict.txt with fresh random weights
+train_model("train/in.tsv", "train/expected.tsv")  # top-level call: training starts whenever this module is executed or imported
+
def check_dev(out_path="dev-0/out.tsv", exp_path="dev-0/expected.tsv"):
    """Print and return the share of predicted lines that match the expected ones.

    The two files are compared line by line, stopping at the end of the
    shorter one. Trailing newlines are ignored, so a final line without a
    line break still compares equal to its counterpart.

    Args:
        out_path: File with the predicted labels, one per line.
        exp_path: File with the expected labels, one per line.

    Returns:
        The fraction of compared lines that are equal, or ``0.0`` when there
        is nothing to compare (previously a ZeroDivisionError).
    """
    total = 0
    correct = 0
    with open(out_path) as out_file, open(exp_path) as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            total += 1
            if out_line.rstrip('\n') == exp_line.rstrip('\n'):
                correct += 1

    accuracy = correct / total if total else 0.0
    print(accuracy)
    return accuracy
+
+#predict("dev-0")
+#predict("test-A")