new solution

2020-05-02 22:45:56 +02:00 · 2020-05-02 22:45:56 +02:00 · 4e8abfc83d
commit 4e8abfc83d
parent 5c9327ab4b
1 changed files with 207 additions and 117 deletions
--- a/linear_regression.py
+++ b/linear_regression.py
@ -1,140 +1,230 @@
-import csv
-import re
+# Using the NLTK library, we can do a lot of text preprocessing.
+import nltk
+from nltk.tokenize import word_tokenize
+#nltk.download('stopwords')
+from nltk.corpus import stopwords
 import random
-import json
-from math import sqrt
+import pickle
+# Function to split text into words

-# Prints ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
-def make_dict(path):
-    dict = {}
-    with open(path) as in_file:
-        for line in in_file:
-            for word in re.findall(r"[\w']+", line):
-                if not word in dict:
-                    weight = round(random.random()%0.2-0.1,2)
-                    dict[word] = weight
-    
-    print("dict maked")
-    with open('dict.txt', 'w') as file:
-        json.dump(dict, file)
-    return dict
+def my_tokenize(text):
+    """Split *text* into NLTK word tokens and drop English stop words.
+
+    Args:
+        text: The raw string to tokenize.
+
+    Returns:
+        A list of the tokens produced by ``word_tokenize`` that are not in
+        NLTK's English stop-word list, in their original order.
+    """
+    tokens = word_tokenize(text)
+    # Build the stop-word set only once and cache it on the function object;
+    # previously the corpus list was reloaded and converted to a set on every
+    # call, i.e. once per input line when called from post_list.
+    stop_words = getattr(my_tokenize, "_stop_words", None)
+    if stop_words is None:
+        stop_words = frozenset(stopwords.words('english'))
+        my_tokenize._stop_words = stop_words
+    return [w for w in tokens if w not in stop_words]

-def make_posts_list(in_file):
-    posts = []
-    counter = 0
+
+def post_list(in_file):
+    # Read in_file line by line and return a list that holds one token list
+    # (as produced by my_tokenize) per input line.
+    # NOTE: the local list below shadows the function's own name inside this body.
+    post_list = []
    with open(in_file) as f:
-            for line in f:
-                if counter < 1000:
-                    posts.append(line)
-                else:
-                    counter +=1
-                
-    return posts
+        for line in f:
+            tokens = my_tokenize(line)
+            post_list.append(tokens)

-def make_exp_list(exp_file):
+    return post_list
+
+
+def exp_list(in_file):
+    # Parse every line of in_file as a float and return the values as a list.
+    # NOTE: the local list below shadows the function's own name inside this body.
    exp_list = []
-    with open(exp_file) as f:
-        for exp_line in f:
-            y = exp_line
-            exp_list.append(float(y.split('\n')[0]))
-
+    with open(in_file) as f:
+        for line in f:
+            # float() ignores surrounding whitespace, so the trailing newline
+            # does not need to be stripped first.
+            exp_list.append(float(line))
+    
    return exp_list

-def train_model(in_path, exp_path):
-    with open('dict.txt', 'r') as file:
-        dict = json.load(file)
-    posts = make_posts_list(in_path)
-    exp = make_exp_list(exp_path)
-    w0 = 2013
-    lr = 0.0000001
-    epchos = 0
+
+def make_dictionary(posts):
+    """Assign an initial weight to every distinct token and pickle the mapping.
+
+    A token seen for the first time gets ``random.randint(-1, 1) * 0.1``
+    (so -0.1, 0.0 or 0.1). The resulting dict is written to ``dict.pickle``
+    in the current working directory; nothing is returned.
+
+    Args:
+        posts: Iterable of token sequences.
+    """
+    weights = {}
+    for token in (tok for tokens in posts for tok in tokens):
+        if token in weights:
+            continue
+        weights[token] = random.randint(-1, 1) * 0.1
+
+    with open('dict.pickle', 'wb') as out:
+        pickle.dump(weights, out)
+
+
+
+def train(in_file, exp_file):
+    # Fit a bag-of-words linear model: the prediction for a post is w0 plus
+    # the sum of the weights of its tokens, updated one example at a time,
+    # and the learned token weights are pickled to model.pkl.
+    #   in_file:  path passed to post_list (one post per line)
+    #   exp_file: path passed to exp_list (one float target per line)
+    pl = post_list(in_file)
+    print("pl created")
+    el = exp_list(exp_file)
+    print("el created")
+    # The dictionary is not rebuilt here; dict.pickle must already exist
+    # (written by an earlier make_dictionary run).
+    #make_dictionary(pl)
+    with open('dict.pickle', 'rb') as f:
+        dictionary = pickle.load(f)
+    print("dict created")
+    lr = 0.001
+    w0 = 0.1
    loss_sum = 0
-    last_sum = 10
-    loss_counter = 0
-    print("Zaczynam")
-    while epchos < 10000:
-        
-        loss_cost = 0            
-        for in_line, exp_line in zip(posts, exp):
-            loss_counter+=1
-            #losowy przykład ze zbioru uczącego
-            #print("new post" + str(random.randint(0,10)))
-            post = in_line
-            error_rate = 1
-            y = int(exp_line)
-            #loop_counter = 0
-            #while (error_rate > 0.2 and loop_counter < 10000):
-                #loop_counter +=1
+    # Counts processed examples across all passes, starting at 1.
+    loss_sum_counter = 1
+
+
+    # Repeat full passes over the data; the stop condition is only checked
+    # after each pass completes.
+    # NOTE(review): if zip(pl, el) yields nothing the counter never advances
+    # and this loop never terminates - confirm inputs are non-empty.
+    while True:
+        for post, y in zip(pl,el):
            y_hat = w0
-            for word in re.findall(r"[\w']+", post):
-                #dict[word] -= (y_hat - y)*lr
-                y_hat += dict[word]
+            # NOTE(review): a token absent from the pickled dictionary raises
+            # KeyError here - confirm dict.pickle was built from the same data.
+            for token in post:
+                y_hat += dictionary[token]
            loss = (y_hat - y)**2
            loss_sum += loss
-            #error_rate = (y_hat - y)**2
-            # if loop_counter%1000 == 0:
-            #     print(error_rate)
-            # loss_cost += error_rate
-            # if loss_counter%1000==0:
-            #     print(loss_sum/1000)
-            #     loss_sum = 0

-            #uczenie
+            # Every 10000 examples print the running example count and the
+            # mean squared error of that window, then reset the accumulator.
+            if loss_sum_counter % 10000 == 0:
+                print(str(loss_sum_counter) + "   " + str(loss_sum / 10000))
+                loss_sum = 0.0
+            loss_sum_counter += 1
+            
+            #updating weights
+            # The same step (error times learning rate) is subtracted from the
+            # bias and from the weight of every token occurrence in the post.
            delta = (y_hat - y) * lr
-            w0 = w0 - delta
-            for word in re.findall(r"[\w']+", post):
-                dict[word] -= delta
+            w0 -= delta
+            for token in post:
+                dictionary[token] -= delta
+                
+
+        if loss_sum_counter > 7000000:
+            break
+            
+    #We save only things we need for prediction
+    # NOTE(review): only the token weights are saved; the trained bias w0 is
+    # not part of the pickled model - confirm the predictor does not need it.
+    model = (dictionary)
+    # NOTE(review): the file object returned by open() is never explicitly closed.
+    pickle.dump(model, open("model.pkl", "wb"))
+
+# Module-level call: training starts as soon as this file is executed or imported.
+train("train/in.tsv", "train/expected.tsv")
+
+# import csv
+# import re
+# import random
+# import json
+# from math import sqrt
+
+# def make_dict(path):
+#     dict = {}
+#     with open(path) as in_file:
+#         for line in in_file:
+#             for word in re.findall(r"[\w']+", line):
+#                 if not word in dict:
+#                     weight = round(random.random()%0.2-0.1,2)
+#                     dict[word] = weight
+    
+#     print("dict maked")
+#     with open('dict.txt', 'w') as file:
+#         json.dump(dict, file)
+#     return dict
+
+# def make_posts_list(in_file):
+#     posts = []
+#     counter = 0
+#     with open(in_file) as f:
+#             for line in f:
+#                 if counter < 1000:
+#                     posts.append(line)
+#                 else:
+#                     counter +=1
+                
+#     return posts
+
+# def make_exp_list(exp_file):
+#     exp_list = []
+#     with open(exp_file) as f:
+#         for exp_line in f:
+#             y = exp_line
+#             exp_list.append(float(y.split('\n')[0]))
+
+#     return exp_list
+
+# def train_model(in_path, exp_path):
+#     with open('dict.txt', 'r') as file:
+#         dict = json.load(file)
+#     posts = make_posts_list(in_path)
+#     exp = make_exp_list(exp_path)
+#     w0 = 2013
+#     lr = 0.0000001
+#     epchos = 0
+#     loss_sum = 0
+#     last_sum = 10
+#     loss_counter = 0
+#     print("Zaczynam")
+#     while epchos < 10000:
+        
+#         loss_cost = 0            
+#         for in_line, exp_line in zip(posts, exp):
+#             loss_counter+=1
+#             #losowy przykład ze zbioru uczącego
+#             #print("new post" + str(random.randint(0,10)))
+#             post = in_line
+#             error_rate = 1
+#             y = int(exp_line)
+#             #loop_counter = 0
+#             #while (error_rate > 0.2 and loop_counter < 10000):
+#                 #loop_counter +=1
+#             y_hat = w0
+#             for word in re.findall(r"[\w']+", post):
+#                 #dict[word] -= (y_hat - y)*lr
+#                 y_hat += dict[word]
+#             loss = (y_hat - y)**2
+#             loss_sum += loss
+#             #error_rate = (y_hat - y)**2
+#             # if loop_counter%1000 == 0:
+#             #     print(error_rate)
+#             # loss_cost += error_rate
+#             # if loss_counter%1000==0:
+#             #     print(loss_sum/1000)
+#             #     loss_sum = 0
+
+#             #uczenie
+#             delta = (y_hat - y) * lr
+#             w0 = w0 - delta
+#             for word in re.findall(r"[\w']+", post):
+#                 dict[word] -= delta

        
-        real_loss = loss_sum/loss_counter
-        print(real_loss)
+#         real_loss = loss_sum/loss_counter
+#         print(real_loss)

-        # if real_loss > last_sum:
-        #     break
-        # else:
-        #     last_sum = real_loss
-        last_sum = real_loss
-        loss_sum = 0
-        loss_counter = 0
-        epchos +=1
-    with open('dict2.txt', 'w') as file:
-        json.dump(dict, file)
+#         # if real_loss > last_sum:
+#         #     break
+#         # else:
+#         #     last_sum = real_loss
+#         last_sum = real_loss
+#         loss_sum = 0
+#         loss_counter = 0
+#         epchos +=1
+#     with open('dict2.txt', 'w') as file:
+#         json.dump(dict, file)

-def predict(path):
-    results = []
-    with open('dict2.txt', 'r') as file:
-        dict = json.load(file)
+# def predict(path):
+#     results = []
+#     with open('dict2.txt', 'r') as file:
+#         dict = json.load(file)

-    with open(path+"/in.tsv") as in_file:
-        for in_line in in_file:
-            print("new post" + str(random.randint(0,10)))
-            post = in_line
-            y=0
-            for word in re.findall(r"[\w']+", post):
-                if word in dict:
-                    y += dict[word]
-            if y > 0.5:
-                results.append("1")
-            else:
-                results.append("0")
+#     with open(path+"/in.tsv") as in_file:
+#         for in_line in in_file:
+#             print("new post" + str(random.randint(0,10)))
+#             post = in_line
+#             y=0
+#             for word in re.findall(r"[\w']+", post):
+#                 if word in dict:
+#                     y += dict[word]
+#             if y > 0.5:
+#                 results.append("1")
+#             else:
+#                 results.append("0")
        
-        with open(path+"/out.tsv", 'wt') as tsvfile:
-            tsv_writer = csv.writer(tsvfile, delimiter='\t')
-            for i in results:
-                tsv_writer.writerow(i)
+#         with open(path+"/out.tsv", 'wt') as tsvfile:
+#             tsv_writer = csv.writer(tsvfile, delimiter='\t')
+#             for i in results:
+#                 tsv_writer.writerow(i)

-#make_dict("train/in.tsv")
-train_model("train/in.tsv", "train/expected.tsv")
+# #make_dict("train/in.tsv")
+# #train_model("train/in.tsv", "train/expected.tsv")

-def check_dev():
-    with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
-        counter = 0
-        positive = 0
-        for out_line, exp_line in zip(out_file, exp_file):
-            counter+=1
-            if out_line == exp_line:
-                positive += 1
-        print(positive/counter)
+# def check_dev():
+#     with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
+#         counter = 0
+#         positive = 0
+#         for out_line, exp_line in zip(out_file, exp_file):
+#             counter+=1
+#             if out_line == exp_line:
+#                 positive += 1
+#         print(positive/counter)

-#predict("dev-0")
-#predict("test-A")
+# #predict("dev-0")
+# #predict("test-A")