new solution

dylodylo 2020-05-02 22:45:56 +02:00
parent 5c9327ab4b
commit 4e8abfc83d


@@ -1,140 +1,230 @@
#using the NLTK library, we can do a lot of text preprocessing
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import pickle
#function to split text into words and drop English stopwords
def my_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens
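# Usage sketch (hypothetical input; exact tokens depend on the NLTK data
# installed):
# my_tokenize("Hey you, what are you doing here?") keeps punctuation tokens
# and content words like 'Hey' and 'doing', while stopwords such as 'you'
# and 'are' are removed.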
def post_list(in_file):
    posts = []
    with open(in_file) as f:
        for line in f:
            tokens = my_tokenize(line)
            posts.append(tokens)
    return posts
def exp_list(in_file):
    exps = []
    with open(in_file) as f:
        for line in f:
            exps.append(float(line))
    return exps
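# Usage sketch (hypothetical values; expected.tsv holds one number per line):
# exp_list("train/expected.tsv") -> [1.0, 0.0, 1.0, ...]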
def make_dictionary(posts):
    my_dict = dict()
    for post in posts:
        for t in post:
            if t not in my_dict:
                #small random initial weight: one of -0.1, 0.0 or 0.1
                my_dict[t] = random.randint(-1,1)*0.1
    with open('dict.pickle', 'wb') as handle:
        pickle.dump(my_dict, handle)
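# One-time setup sketch: train() below loads dict.pickle but leaves the call
# that creates it commented out, so on a fresh checkout (an assumption about
# the repo state) it would have to be generated first, e.g.:
# make_dictionary(post_list("train/in.tsv"))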
def train(in_file, exp_file):
    pl = post_list(in_file)
    print("pl created")
    el = exp_list(exp_file)
    print("el created")
    #make_dictionary(pl) #run once to create dict.pickle, then reuse it
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")
    lr = 0.001
    w0 = 0.1
    loss_sum = 0
    loss_sum_counter = 1
    while True:
        for post, y in zip(pl, el):
            #prediction: the bias plus the weight of every token in the post
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]
            loss = (y_hat - y)**2
            loss_sum += loss
            if loss_sum_counter % 10000 == 0:
                #average squared loss over the last 10000 examples
                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
                loss_sum = 0.0
            loss_sum_counter += 1
            #updating weights
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta
        if loss_sum_counter > 7000000:
            break
    #we save only the things we need for prediction
    #(note: the bias w0 is not persisted, only the token weights)
    model = dictionary
    pickle.dump(model, open("model.pkl", "wb"))

train("train/in.tsv", "train/expected.tsv")
# import csv
# import re
# import random
# import json
# from math import sqrt

# def make_dict(path):
#     dict = {}
#     with open(path) as in_file:
#         for line in in_file:
#             for word in re.findall(r"[\w']+", line):
#                 if not word in dict:
#                     weight = round(random.random()%0.2-0.1,2)
#                     dict[word] = weight
#     print("dict made")
#     with open('dict.txt', 'w') as file:
#         json.dump(dict, file)
#     return dict

# def make_posts_list(in_file):
#     posts = []
#     counter = 0
#     with open(in_file) as f:
#         for line in f:
#             if counter < 1000:
#                 posts.append(line)
#             else:
#                 counter +=1
#     return posts

# def make_exp_list(exp_file):
#     exp_list = []
#     with open(exp_file) as f:
#         for exp_line in f:
#             y = exp_line
#             exp_list.append(float(y.split('\n')[0]))
#     return exp_list

# def train_model(in_path, exp_path):
#     with open('dict.txt', 'r') as file:
#         dict = json.load(file)
#     posts = make_posts_list(in_path)
#     exp = make_exp_list(exp_path)
#     w0 = 2013
#     lr = 0.0000001
#     epchos = 0
#     loss_sum = 0
#     last_sum = 10
#     loss_counter = 0
#     print("Starting")
#     while epchos < 10000:
#         loss_cost = 0
#         for in_line, exp_line in zip(posts, exp):
#             loss_counter+=1
#             #a random example from the training set
#             #print("new post" + str(random.randint(0,10)))
#             post = in_line
#             error_rate = 1
#             y = int(exp_line)
#             #loop_counter = 0
#             #while (error_rate > 0.2 and loop_counter < 10000):
#                 #loop_counter +=1
#             y_hat = w0
#             for word in re.findall(r"[\w']+", post):
#                 #dict[word] -= (y_hat - y)*lr
#                 y_hat += dict[word]
#             loss = (y_hat - y)**2
#             loss_sum += loss
#             #error_rate = (y_hat - y)**2
#             # if loop_counter%1000 == 0:
#             #     print(error_rate)
#             # loss_cost += error_rate
#             # if loss_counter%1000==0:
#             #     print(loss_sum/1000)
#             #     loss_sum = 0
#             #learning
#             delta = (y_hat - y) * lr
#             w0 = w0 - delta
#             for word in re.findall(r"[\w']+", post):
#                 dict[word] -= delta
#         real_loss = loss_sum/loss_counter
#         print(real_loss)
#         # if real_loss > last_sum:
#         #     break
#         # else:
#         #     last_sum = real_loss
#         last_sum = real_loss
#         loss_sum = 0
#         loss_counter = 0
#         epchos +=1
#     with open('dict2.txt', 'w') as file:
#         json.dump(dict, file)

# def predict(path):
#     results = []
#     with open('dict2.txt', 'r') as file:
#         dict = json.load(file)
#     with open(path+"/in.tsv") as in_file:
#         for in_line in in_file:
#             print("new post" + str(random.randint(0,10)))
#             post = in_line
#             y=0
#             for word in re.findall(r"[\w']+", post):
#                 if word in dict:
#                     y += dict[word]
#             if y > 0.5:
#                 results.append("1")
#             else:
#                 results.append("0")
#     with open(path+"/out.tsv", 'wt') as tsvfile:
#         tsv_writer = csv.writer(tsvfile, delimiter='\t')
#         for i in results:
#             tsv_writer.writerow(i)

# #make_dict("train/in.tsv")
# #train_model("train/in.tsv", "train/expected.tsv")

# def check_dev():
#     with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
#         counter = 0
#         positive = 0
#         for out_line, exp_line in zip(out_file, exp_file):
#             counter+=1
#             if out_line == exp_line:
#                 positive += 1
#         print(positive/counter)

# #predict("dev-0")
# #predict("test-A")