commit cd273579b2
parent bd02ae1b3c

dev-0/out.tsv (200000 changed lines)
File diff suppressed because it is too large
@@ -5,7 +5,10 @@ from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 import random
 import pickle
+import time
 #function to split text into word
+import os
+import csv
 
 def my_tokenize(text):
     tokens = word_tokenize(text)
@@ -16,46 +19,69 @@ def my_tokenize(text):
 
 def post_list(in_file):
     post_list = []
-    with open(in_file) as f:
-        for line in f:
+    f = open(in_file, encoding="utf8")
+    for i, line in enumerate(f):
         tokens = my_tokenize(line)
         post_list.append(tokens)
+        # if i%1000000 == 0:
+        #     name = "posts" + str(i) + ".pickle"
+        #     with open(name, 'wb') as handle:
+        #         pickle.dump(post_list, handle)
+        #         post_list = []
+    f.close()
+    # with open('posts.pickle', 'wb') as handle:
+    #     pickle.dump(post_list, handle)
     return post_list
 
 
 def exp_list(in_file):
     exp_list = []
-    with open(in_file) as f:
+    with open(in_file, encoding="utf8") as f:
         for line in f:
             exp_list.append(float(line))
 
     return exp_list
 
 
-def make_dictionary(posts):
+def make_dictionary():
     my_dict = dict()
-    for post in posts:
-        for t in post:
-            if not t in my_dict:
-                my_dict[t] = random.randint(-1,1)*0.1
-
-    with open('dict.pickle', 'wb') as handle:
-        pickle.dump(my_dict, handle)
+    with open('posts1000000.pickle', 'rb') as f:
+        posts = pickle.load(f)
+    with open('posts2000000.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+    with open('posts3000000.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+    with open('posts4000000.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+    with open('posts.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+
+    # with open("allposts", 'wb') as handle:
+    #     pickle.dump(posts, handle)
+    # for post in posts:
+    #     for t in post:
+    #         if not t in my_dict:
+    #             my_dict[t] = random.randint(-1,1)*0.1
+    #
+    # with open('dict.pickle', 'wb') as handle:
+    #     pickle.dump(my_dict, handle)
+
+    return posts
 
 
 def train(in_file, exp_file):
-    pl = post_list(in_file)
-    print("pl created")
     el = exp_list(exp_file)
     print("el created")
-    #make_dictionary(pl)
+    #pl = post_list(in_file)
+    print("pl created")
+    # with open('posts.pickle', 'rb') as f:
+    #     pl = pickle.load(f)
+    pl = make_dictionary()
     with open('dict.pickle', 'rb') as f:
         dictionary = pickle.load(f)
     print("dict created")
-    lr = 0.001
-    w0 = 0.1
+    lr = 0.00000005
+    w0 = 2014
     loss_sum = 0
     loss_sum_counter = 1
 
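Aside on the hunk above: after this change make_dictionary() no longer initialises any weights; it just reloads and concatenates the pickled token lists and returns them, while train() still reads the weights from a pre-existing dict.pickle. Judging by the lines this commit comments out, that file holds one random starting weight per distinct token. A minimal sketch of how such a file could be (re)built, assuming dict.pickle's layout from the deleted code (the helper name build_dict_pickle is illustrative, not from the repo):

import pickle
import random

def build_dict_pickle(posts, path='dict.pickle'):
    # One starting weight in {-0.1, 0.0, 0.1} per distinct token,
    # mirroring the initialisation this commit comments out.
    my_dict = dict()
    for post in posts:
        for t in post:
            if t not in my_dict:
                my_dict[t] = random.randint(-1, 1) * 0.1
    with open(path, 'wb') as handle:
        pickle.dump(my_dict, handle)
    return my_dict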
@@ -80,151 +106,41 @@ def train(in_file, exp_file):
             dictionary[token] -= delta
 
 
-        if loss_sum_counter > 7000000:
+        if loss_sum_counter > 40000000:
+            break
+        if loss_sum_counter > 40000000:
             break
 
     #We save only things we need for prediction
-    model = (dictionary)
-    pickle.dump(model, open("model.pkl", "wb"))
+    model = (dictionary, w0)
+    pickle.dump(model, open("model.pickle", "wb"))
 
-train("train/in.tsv", "train/expected.tsv")
-
-# import csv
-# import re
-# import random
-# import json
-# from math import sqrt
-
-# def make_dict(path):
-#     dict = {}
-#     with open(path) as in_file:
-#         for line in in_file:
-#             for word in re.findall(r"[\w']+", line):
-#                 if not word in dict:
-#                     weight = round(random.random()%0.2-0.1,2)
-#                     dict[word] = weight
-
-#     print("dict maked")
-#     with open('dict.txt', 'w') as file:
-#         json.dump(dict, file)
-#     return dict
-
-# def make_posts_list(in_file):
-#     posts = []
-#     counter = 0
-#     with open(in_file) as f:
-#         for line in f:
-#             if counter < 1000:
-#                 posts.append(line)
-#             else:
-#                 counter +=1
-
-#     return posts
-
-# def make_exp_list(exp_file):
-#     exp_list = []
-#     with open(exp_file) as f:
-#         for exp_line in f:
-#             y = exp_line
-#             exp_list.append(float(y.split('\n')[0]))
-
-#     return exp_list
-
-# def train_model(in_path, exp_path):
-#     with open('dict.txt', 'r') as file:
-#         dict = json.load(file)
-#     posts = make_posts_list(in_path)
-#     exp = make_exp_list(exp_path)
-#     w0 = 2013
-#     lr = 0.0000001
-#     epchos = 0
-#     loss_sum = 0
-#     last_sum = 10
-#     loss_counter = 0
-#     print("Zaczynam")
-#     while epchos < 10000:
-
-#         loss_cost = 0
-#         for in_line, exp_line in zip(posts, exp):
-#             loss_counter+=1
-#             # a random example from the training set
-#             #print("new post" + str(random.randint(0,10)))
-#             post = in_line
-#             error_rate = 1
-#             y = int(exp_line)
-#             #loop_counter = 0
-#             #while (error_rate > 0.2 and loop_counter < 10000):
-#                 #loop_counter +=1
-#             y_hat = w0
-#             for word in re.findall(r"[\w']+", post):
-#                 #dict[word] -= (y_hat - y)*lr
-#                 y_hat += dict[word]
-#             loss = (y_hat - y)**2
-#             loss_sum += loss
-#             #error_rate = (y_hat - y)**2
-#             # if loop_counter%1000 == 0:
-#             #     print(error_rate)
-#             #     loss_cost += error_rate
-#             # if loss_counter%1000==0:
-#             #     print(loss_sum/1000)
-#             #     loss_sum = 0
-
-#             # learning step
-#             delta = (y_hat - y) * lr
-#             w0 = w0 - delta
-#             for word in re.findall(r"[\w']+", post):
-#                 dict[word] -= delta
-
-#     real_loss = loss_sum/loss_counter
-#     print(real_loss)
-#     # if real_loss > last_sum:
-#     #     break
-#     # else:
-#     #     last_sum = real_loss
-#     last_sum = real_loss
-#     loss_sum = 0
-#     loss_counter = 0
-#     epchos +=1
-#     with open('dict2.txt', 'w') as file:
-#         json.dump(dict, file)
-
-# def predict(path):
-#     results = []
-#     with open('dict2.txt', 'r') as file:
-#         dict = json.load(file)
-
-#     with open(path+"/in.tsv") as in_file:
-#         for in_line in in_file:
-#             print("new post" + str(random.randint(0,10)))
-#             post = in_line
-#             y=0
-#             for word in re.findall(r"[\w']+", post):
-#                 if word in dict:
-#                     y += dict[word]
-#             if y > 0.5:
-#                 results.append("1")
-#             else:
-#                 results.append("0")
-
-#     with open(path+"/out.tsv", 'wt') as tsvfile:
-#         tsv_writer = csv.writer(tsvfile, delimiter='\t')
-#         for i in results:
-#             tsv_writer.writerow(i)
-
-# #make_dict("train/in.tsv")
-# #train_model("train/in.tsv", "train/expected.tsv")
-
-# def check_dev():
-#     with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
-#         counter = 0
-#         positive = 0
-#         for out_line, exp_line in zip(out_file, exp_file):
-#             counter+=1
-#             if out_line == exp_line:
-#                 positive += 1
-#     print(positive/counter)
-
-# #predict("dev-0")
-# #predict("test-A")
+
+def predict(path):
+    with open('model.pickle', 'rb') as f:
+        dictionary, w0 = pickle.load(f)
+    pl = post_list(path+"\\in.tsv")
+    print("pl created")
+    exp_list = []
+    for post in pl:
+        y_hat = w0
+        for token in post:
+            try:
+                if token in dictionary:
+                    y_hat += dictionary[token]
+            except KeyError:
+                print("blad")
+        exp_list.append(y_hat)
+
+    with open(path+"\\out.tsv", 'wt') as tsvfile:
+        tsv_writer = csv.writer(tsvfile, delimiter='\t')
+        # for i in exp_list:
+        #     tsv_writer.writerow(i)
+        tsv_writer.writerows(map(lambda x: [-x], exp_list))
+
+#train("C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\in.tsv", "C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\expected.tsv")
+predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\dev-0")
+predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\test-A")
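The training loop itself sits in the lines the third hunk elides; what is visible (dictionary[token] -= delta, the break threshold, lr and w0) matches the SGD update in the deleted commented-out train_model: predict y_hat = w0 plus the sum of per-token weights, then move w0 and every token weight by delta = (y_hat - y) * lr. A self-contained sketch of one such pass, under those assumptions (the function name sgd_pass and the toy data are illustrative, not from the commit):

import random

def sgd_pass(posts, years, weights, w0, lr=0.00000005):
    # One pass over the data: y_hat = w0 + sum(weights[token]);
    # w0 and each token weight move by delta = (y_hat - y) * lr.
    loss_sum = 0.0
    for tokens, y in zip(posts, years):
        for t in tokens:
            weights.setdefault(t, random.randint(-1, 1) * 0.1)
        y_hat = w0 + sum(weights[t] for t in tokens)
        delta = (y_hat - y) * lr
        w0 -= delta
        for t in tokens:
            weights[t] -= delta
        loss_sum += (y_hat - y) ** 2
    return w0, loss_sum / len(posts)

# Toy usage (illustrative data, not the repo's train/in.tsv):
posts = [["hello", "world"], ["old", "reddit", "post"]]
years = [2014.0, 2009.0]
weights, w0 = {}, 2014.0
w0, mse = sgd_pass(posts, years, weights, w0)
print(w0, mse)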
test-A/out.tsv (200000 changed lines)
File diff suppressed because it is too large
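Both out.tsv diffs are suppressed, so a quick way to sanity-check the 200000 written predictions is to score them against the gold labels, for example with an RMSE over dev-0 (the paths follow the repo layout seen above; the metric choice and the script itself are a sketch, not part of the commit):

import math

def rmse(out_path="dev-0/out.tsv", exp_path="dev-0/expected.tsv"):
    # Root-mean-square error between predicted and expected values.
    with open(out_path) as out_file, open(exp_path) as exp_file:
        pairs = [(float(o), float(e)) for o, e in zip(out_file, exp_file)]
    return math.sqrt(sum((o - e) ** 2 for o, e in pairs) / len(pairs))

print(rmse())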