commit

2020-05-05 14:51:40 +02:00 · 2020-05-05 14:51:40 +02:00 · cd273579b2
commit cd273579b2
parent bd02ae1b3c
3 changed files with 200073 additions and 200157 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/linear_regression.py
+++ b/linear_regression.py
@ -5,7 +5,10 @@ from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 import random
 import pickle
+import time
 # function to split text into words
+import os
+import csv

 def my_tokenize(text):
    tokens = word_tokenize(text)
@ -16,46 +19,69 @@ def my_tokenize(text):

 def post_list(in_file):
    post_list = []
-    with open(in_file) as f:
-        for line in f:
+    f = open(in_file, encoding="utf8")
+    for i, line in enumerate(f):
        tokens = my_tokenize(line)
        post_list.append(tokens)
-
+        # if i%1000000 == 0:
+        #     name = "posts" + str(i) + ".pickle"
+        #     with open(name, 'wb') as handle:
+        #         pickle.dump(post_list, handle)
+        #     post_list = []
+    f.close()
+    # with open('posts.pickle', 'wb') as handle:
+    #     pickle.dump(post_list, handle)
    return post_list


 def exp_list(in_file):
    exp_list = []
-    with open(in_file) as f:
+    with open(in_file, encoding="utf8") as f:
        for line in f:
            exp_list.append(float(line))

    return exp_list


-def make_dictionary(posts):
+def make_dictionary():
    my_dict = dict()
-    for post in posts:
-        for t in post:
-            if not t in my_dict:
-                my_dict[t] = random.randint(-1,1)*0.1
+    with open('posts1000000.pickle', 'rb') as f:
+        posts = pickle.load(f)
+    with open('posts2000000.pickle', 'rb') as f:
+        posts +=(pickle.load(f))
+    with open('posts3000000.pickle', 'rb') as f:
+        posts +=(pickle.load(f))
+    with open('posts4000000.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+    with open('posts.pickle', 'rb') as f:
+        posts += (pickle.load(f))

-    with open('dict.pickle', 'wb') as handle:
-        pickle.dump(my_dict, handle)
+    # with open("allposts", 'wb') as handle:
+    #     pickle.dump(posts, handle)
+    # for post in posts:
+    #     for t in post:
+    #         if not t in my_dict:
+    #             my_dict[t] = random.randint(-1,1)*0.1
+    #
+    # with open('dict.pickle', 'wb') as handle:
+    #     pickle.dump(my_dict, handle)

+    return posts


 def train(in_file, exp_file):
-    pl = post_list(in_file)
-    print("pl created")
    el = exp_list(exp_file)
    print("el created")
-    #make_dictionary(pl)
+    #pl = post_list(in_file)
+    print("pl created")
+    # with open('posts.pickle', 'rb') as f:
+    #     pl = pickle.load(f)
+    pl = make_dictionary()
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")
-    lr = 0.001
-    w0 = 0.1
+    lr = 0.00000005
+    w0 = 2014
    loss_sum = 0
    loss_sum_counter = 1

@ -80,151 +106,41 @@ def train(in_file, exp_file):
                dictionary[token] -= delta
                

-        if loss_sum_counter > 7000000:
+            if loss_sum_counter > 40000000:
+                break
+        if loss_sum_counter > 40000000:
            break
            
    #We save only things we need for prediction
-    model = (dictionary)
-    pickle.dump(model, open("model.pkl", "wb"))
-
-train("train/in.tsv", "train/expected.tsv")
-
-# import csv
-# import re
-# import random
-# import json
-# from math import sqrt
-
-# def make_dict(path):
-#     dict = {}
-#     with open(path) as in_file:
-#         for line in in_file:
-#             for word in re.findall(r"[\w']+", line):
-#                 if not word in dict:
-#                     weight = round(random.random()%0.2-0.1,2)
-#                     dict[word] = weight
-    
-#     print("dict maked")
-#     with open('dict.txt', 'w') as file:
-#         json.dump(dict, file)
-#     return dict
-
-# def make_posts_list(in_file):
-#     posts = []
-#     counter = 0
-#     with open(in_file) as f:
-#             for line in f:
-#                 if counter < 1000:
-#                     posts.append(line)
-#                 else:
-#                     counter +=1
-                
-#     return posts
-
-# def make_exp_list(exp_file):
-#     exp_list = []
-#     with open(exp_file) as f:
-#         for exp_line in f:
-#             y = exp_line
-#             exp_list.append(float(y.split('\n')[0]))
-
-#     return exp_list
-
-# def train_model(in_path, exp_path):
-#     with open('dict.txt', 'r') as file:
-#         dict = json.load(file)
-#     posts = make_posts_list(in_path)
-#     exp = make_exp_list(exp_path)
-#     w0 = 2013
-#     lr = 0.0000001
-#     epchos = 0
-#     loss_sum = 0
-#     last_sum = 10
-#     loss_counter = 0
-#     print("Zaczynam")
-#     while epchos < 10000:
-        
-#         loss_cost = 0            
-#         for in_line, exp_line in zip(posts, exp):
-#             loss_counter+=1
-#             #losowy przykład ze zbioru uczącego
-#             #print("new post" + str(random.randint(0,10)))
-#             post = in_line
-#             error_rate = 1
-#             y = int(exp_line)
-#             #loop_counter = 0
-#             #while (error_rate > 0.2 and loop_counter < 10000):
-#                 #loop_counter +=1
-#             y_hat = w0
-#             for word in re.findall(r"[\w']+", post):
-#                 #dict[word] -= (y_hat - y)*lr
-#                 y_hat += dict[word]
-#             loss = (y_hat - y)**2
-#             loss_sum += loss
-#             #error_rate = (y_hat - y)**2
-#             # if loop_counter%1000 == 0:
-#             #     print(error_rate)
-#             # loss_cost += error_rate
-#             # if loss_counter%1000==0:
-#             #     print(loss_sum/1000)
-#             #     loss_sum = 0
-
-#             #uczenie
-#             delta = (y_hat - y) * lr
-#             w0 = w0 - delta
-#             for word in re.findall(r"[\w']+", post):
-#                 dict[word] -= delta
+    model = (dictionary, w0)
+    pickle.dump(model, open("model.pickle", "wb"))


-#         real_loss = loss_sum/loss_counter
-#         print(real_loss)
+def predict(path):
+    with open('model.pickle', 'rb') as f:
+        dictionary, w0 = pickle.load(f)
+    pl = post_list(path+"\\in.tsv")
+    print("pl created")
+    exp_list = []
+    for post in pl:
+        y_hat = w0
+        for token in post:
+            try:
+                if token in dictionary:
+                    y_hat += dictionary[token]
+            except KeyError:
+                print("blad")
+        exp_list.append(y_hat)

-#         # if real_loss > last_sum:
-#         #     break
-#         # else:
-#         #     last_sum = real_loss
-#         last_sum = real_loss
-#         loss_sum = 0
-#         loss_counter = 0
-#         epchos +=1
-#     with open('dict2.txt', 'w') as file:
-#         json.dump(dict, file)
+    with open(path+"\\out.tsv", 'wt') as tsvfile:
+            tsv_writer = csv.writer(tsvfile, delimiter='\t')
+            # for i in exp_list:
+            #     tsv_writer.writerow(i)
+            tsv_writer.writerows(map(lambda x: [-x], exp_list))

-# def predict(path):
-#     results = []
-#     with open('dict2.txt', 'r') as file:
-#         dict = json.load(file)

-#     with open(path+"/in.tsv") as in_file:
-#         for in_line in in_file:
-#             print("new post" + str(random.randint(0,10)))
-#             post = in_line
-#             y=0
-#             for word in re.findall(r"[\w']+", post):
-#                 if word in dict:
-#                     y += dict[word]
-#             if y > 0.5:
-#                 results.append("1")
-#             else:
-#                 results.append("0")

-#         with open(path+"/out.tsv", 'wt') as tsvfile:
-#             tsv_writer = csv.writer(tsvfile, delimiter='\t')
-#             for i in results:
-#                 tsv_writer.writerow(i)
+#train("C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\in.tsv", "C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\expected.tsv")
+predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\dev-0")
+predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\test-A")

-# #make_dict("train/in.tsv")
-# #train_model("train/in.tsv", "train/expected.tsv")
-
-# def check_dev():
-#     with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
-#         counter = 0
-#         positive = 0
-#         for out_line, exp_line in zip(out_file, exp_file):
-#             counter+=1
-#             if out_line == exp_line:
-#                 positive += 1
-#         print(positive/counter)
-
-# #predict("dev-0")
-# #predict("test-A")
--- a/test-A/out.tsv
+++ b/test-A/out.tsv