# guess-reddit-date/linear_regression.py

# Using the NLTK library, we can do a lot of text preprocessing
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import pickle
# Function to split text into words

# English stop words, loaded from the NLTK corpus on first use and then reused.
# Building this set once matters because my_tokenize is called for every
# input line.
_STOP_WORDS_CACHE = None


def my_tokenize(text):
    """Split ``text`` into tokens and drop English stop words.

    Tokenization is delegated to NLTK's ``word_tokenize``; the stop-word
    list comes from ``stopwords.words('english')``.  Filtering is an exact
    string membership test, so no case folding is applied to the tokens.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of the tokens of ``text`` that are not in the stop-word set,
        in their original order.
    """
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Loaded lazily so that merely defining the function does not touch
        # the corpus files.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))

    return [w for w in word_tokenize(text) if w not in _STOP_WORDS_CACHE]


def post_list(in_file):
    """Tokenize a text file line by line.

    Args:
        in_file: Path of the file to read; every line is treated as one post.

    Returns:
        A list with one entry per line of ``in_file``, each entry being the
        token list produced by ``my_tokenize`` for that line.
    """
    with open(in_file) as source:
        return [my_tokenize(raw_line) for raw_line in source]


def exp_list(in_file):
    """Load the expected values, one float per line.

    Args:
        in_file: Path of a text file holding one numeric value per line.

    Returns:
        A list of floats in file order (empty when the file is empty).

    Raises:
        ValueError: If a line cannot be converted with ``float``.
    """
    with open(in_file) as source:
        # Iterating the file yields its lines; float() is applied to each.
        return list(map(float, source))


def make_dictionary(posts):
    """Create the initial token-weight table and pickle it to ``dict.pickle``.

    Every distinct token found in ``posts`` receives one randomly chosen
    starting weight: ``random.randint(-1, 1) * 0.1``.  A token keeps the
    weight drawn at its first occurrence.  The table is written to
    ``dict.pickle`` in the current working directory; nothing is returned.

    Args:
        posts: An iterable of token sequences (one sequence per post).
    """
    weights = {}
    every_token = (token for tokens in posts for token in tokens)
    for token in every_token:
        if token in weights:
            # Already initialised; do not draw another random number.
            continue
        weights[token] = random.randint(-1, 1) * 0.1

    with open('dict.pickle', 'wb') as out:
        pickle.dump(weights, out)


def train(in_file, exp_file):
    """Train the bag-of-tokens linear model and pickle the learned weights.

    The prediction for a post is ``w0`` plus the sum of the weights of its
    tokens (a token occurring several times is counted several times).
    After every sample the bias and the weight of each token in the post are
    moved by ``(y_hat - y) * lr``.  The mean squared error of the last
    10000 samples is printed every 10000 samples.  Training proceeds in whole
    passes over the data and stops after the first pass that ends with more
    than 7000000 samples processed.

    Initial token weights are read from ``dict.pickle`` (see the commented-out
    ``make_dictionary`` call below); the trained weights are written to
    ``model.pkl``.

    Args:
        in_file: Path of the file with one post per line.
        exp_file: Path of the file with one expected value per line.

    Raises:
        ValueError: If either input file yields no samples; previously this
            case looped forever because the sample counter never advanced.
        KeyError: If a post contains a token missing from ``dict.pickle``.
    """
    pl = post_list(in_file)
    print("pl created")
    el = exp_list(exp_file)
    print("el created")
    #make_dictionary(pl)
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")

    # zip() below stops at the shorter list, so one empty input means no
    # sample is ever processed and the stop condition can never be reached.
    if not pl or not el:
        raise ValueError("training data is empty: nothing to train on")

    lr = 0.001
    w0 = 0.1
    report_every = 10000
    max_samples = 7000000
    loss_sum = 0
    loss_sum_counter = 1

    # The limit is checked only between passes, so the final pass always
    # runs to completion.
    while loss_sum_counter <= max_samples:
        for post, y in zip(pl, el):
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]
            loss = (y_hat - y) ** 2
            loss_sum += loss

            if loss_sum_counter % report_every == 0:
                print(str(loss_sum_counter) + "   " + str(loss_sum / report_every))
                loss_sum = 0.0
            loss_sum_counter += 1

            # Updating weights: the same step is applied to the bias and to
            # each token occurrence in the post.
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta

    # We save only the token weights.
    # NOTE(review): the learned bias w0 is not stored in model.pkl - confirm
    # that the prediction code does not need it.
    with open("model.pkl", "wb") as model_file:
        pickle.dump(dictionary, model_file)

# Script entry point: runs the full training on the train/ data set.
# There is no __main__ guard, so this also executes when the module is imported.
train("train/in.tsv", "train/expected.tsv")

# import csv
# import re
# import random
# import json
# from math import sqrt

# def make_dict(path):
#     dict = {}
#     with open(path) as in_file:
#         for line in in_file:
#             for word in re.findall(r"[\w']+", line):
#                 if not word in dict:
#                     weight = round(random.random()%0.2-0.1,2)
#                     dict[word] = weight
    
#     print("dict maked")
#     with open('dict.txt', 'w') as file:
#         json.dump(dict, file)
#     return dict

# def make_posts_list(in_file):
#     posts = []
#     counter = 0
#     with open(in_file) as f:
#             for line in f:
#                 if counter < 1000:
#                     posts.append(line)
#                 else:
#                     counter +=1
                
#     return posts

# def make_exp_list(exp_file):
#     exp_list = []
#     with open(exp_file) as f:
#         for exp_line in f:
#             y = exp_line
#             exp_list.append(float(y.split('\n')[0]))

#     return exp_list

# def train_model(in_path, exp_path):
#     with open('dict.txt', 'r') as file:
#         dict = json.load(file)
#     posts = make_posts_list(in_path)
#     exp = make_exp_list(exp_path)
#     w0 = 2013
#     lr = 0.0000001
#     epchos = 0
#     loss_sum = 0
#     last_sum = 10
#     loss_counter = 0
#     print("Zaczynam")
#     while epchos < 10000:
        
#         loss_cost = 0            
#         for in_line, exp_line in zip(posts, exp):
#             loss_counter+=1
#             # random example from the training set
#             #print("new post" + str(random.randint(0,10)))
#             post = in_line
#             error_rate = 1
#             y = int(exp_line)
#             #loop_counter = 0
#             #while (error_rate > 0.2 and loop_counter < 10000):
#                 #loop_counter +=1
#             y_hat = w0
#             for word in re.findall(r"[\w']+", post):
#                 #dict[word] -= (y_hat - y)*lr
#                 y_hat += dict[word]
#             loss = (y_hat - y)**2
#             loss_sum += loss
#             #error_rate = (y_hat - y)**2
#             # if loop_counter%1000 == 0:
#             #     print(error_rate)
#             # loss_cost += error_rate
#             # if loss_counter%1000==0:
#             #     print(loss_sum/1000)
#             #     loss_sum = 0

#             # learning step
#             delta = (y_hat - y) * lr
#             w0 = w0 - delta
#             for word in re.findall(r"[\w']+", post):
#                 dict[word] -= delta

        
#         real_loss = loss_sum/loss_counter
#         print(real_loss)

#         # if real_loss > last_sum:
#         #     break
#         # else:
#         #     last_sum = real_loss
#         last_sum = real_loss
#         loss_sum = 0
#         loss_counter = 0
#         epchos +=1
#     with open('dict2.txt', 'w') as file:
#         json.dump(dict, file)

# def predict(path):
#     results = []
#     with open('dict2.txt', 'r') as file:
#         dict = json.load(file)

#     with open(path+"/in.tsv") as in_file:
#         for in_line in in_file:
#             print("new post" + str(random.randint(0,10)))
#             post = in_line
#             y=0
#             for word in re.findall(r"[\w']+", post):
#                 if word in dict:
#                     y += dict[word]
#             if y > 0.5:
#                 results.append("1")
#             else:
#                 results.append("0")
        
#         with open(path+"/out.tsv", 'wt') as tsvfile:
#             tsv_writer = csv.writer(tsvfile, delimiter='\t')
#             for i in results:
#                 tsv_writer.writerow(i)

# #make_dict("train/in.tsv")
# #train_model("train/in.tsv", "train/expected.tsv")

# def check_dev():
#     with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
#         counter = 0
#         positive = 0
#         for out_line, exp_line in zip(out_file, exp_file):
#             counter+=1
#             if out_line == exp_line:
#                 positive += 1
#         print(positive/counter)

# #predict("dev-0")
# #predict("test-A")
new solution 2020-05-02 22:45:56 +02:00			`#using NLTK library, we can do lot of text preprocesing`
			`import nltk`
			`from nltk.tokenize import word_tokenize`
			`#nltk.download('stopwords')`
			`from nltk.corpus import stopwords`
add linear_regression 2020-04-18 20:39:32 +02:00			`import random`
new solution 2020-05-02 22:45:56 +02:00			`import pickle`
			`#function to split text into word`

			`def my_tokenize(text):`
			`tokens = word_tokenize(text)`
			`stop_words = set(stopwords.words('english'))`
			`tokens = [w for w in tokens if not w in stop_words]`
			`return tokens`


			`def post_list(in_file):`
			`post_list = []`
add linear_regression 2020-04-18 20:39:32 +02:00			`with open(in_file) as f:`
new solution 2020-05-02 22:45:56 +02:00			`for line in f:`
			`tokens = my_tokenize(line)`
			`post_list.append(tokens)`

			`return post_list`
add linear_regression 2020-04-18 20:39:32 +02:00

new solution 2020-05-02 22:45:56 +02:00			`def exp_list(in_file):`
			`exp_list = []`
			`with open(in_file) as f:`
			`for line in f:`
			`exp_list.append(float(line))`

add linear_regression 2020-04-18 20:39:32 +02:00			`return exp_list`

new solution 2020-05-02 22:45:56 +02:00
			`def make_dictionary(posts):`
			`my_dict = dict()`
			`for post in posts:`
			`for t in post:`
			`if not t in my_dict:`
			`my_dict[t] = random.randint(-1,1)*0.1`

			`with open('dict.pickle', 'wb') as handle:`
			`pickle.dump(my_dict, handle)`



			`def train(in_file, exp_file):`
			`pl = post_list(in_file)`
			`print("pl created")`
			`el = exp_list(exp_file)`
			`print("el created")`
			`#make_dictionary(pl)`
			`with open('dict.pickle', 'rb') as f:`
			`dictionary = pickle.load(f)`
			`print("dict created")`
			`lr = 0.001`
			`w0 = 0.1`
add linear_regression 2020-04-18 20:39:32 +02:00			`loss_sum = 0`
new solution 2020-05-02 22:45:56 +02:00			`loss_sum_counter = 1`


			`while True:`
			`for post, y in zip(pl,el):`
add linear_regression 2020-04-18 20:39:32 +02:00			`y_hat = w0`
new solution 2020-05-02 22:45:56 +02:00			`for token in post:`
			`y_hat += dictionary[token]`
add linear_regression 2020-04-18 20:39:32 +02:00			`loss = (y_hat - y)**2`
			`loss_sum += loss`
new solution 2020-05-02 22:45:56 +02:00
			`if loss_sum_counter % 10000 == 0:`
			`print(str(loss_sum_counter) + " " + str(loss_sum / 10000))`
			`loss_sum = 0.0`
			`loss_sum_counter += 1`

			`#updating weights`
add linear_regression 2020-04-18 20:39:32 +02:00			`delta = (y_hat - y) * lr`
new solution 2020-05-02 22:45:56 +02:00			`w0 -= delta`
			`for token in post:`
			`dictionary[token] -= delta`


			`if loss_sum_counter > 7000000:`
			`break`

			`#We save only things we need for prediction`
			`model = (dictionary)`
			`pickle.dump(model, open("model.pkl", "wb"))`

			`train("train/in.tsv", "train/expected.tsv")`

			`# import csv`
			`# import re`
			`# import random`
			`# import json`
			`# from math import sqrt`

			`# def make_dict(path):`
			`# dict = {}`
			`# with open(path) as in_file:`
			`# for line in in_file:`
			`# for word in re.findall(r"[\w']+", line):`
			`# if not word in dict:`
			`# weight = round(random.random()%0.2-0.1,2)`
			`# dict[word] = weight`

			`# print("dict maked")`
			`# with open('dict.txt', 'w') as file:`
			`# json.dump(dict, file)`
			`# return dict`

			`# def make_posts_list(in_file):`
			`# posts = []`
			`# counter = 0`
			`# with open(in_file) as f:`
			`# for line in f:`
			`# if counter < 1000:`
			`# posts.append(line)`
			`# else:`
			`# counter +=1`

			`# return posts`

			`# def make_exp_list(exp_file):`
			`# exp_list = []`
			`# with open(exp_file) as f:`
			`# for exp_line in f:`
			`# y = exp_line`
			`# exp_list.append(float(y.split('\n')[0]))`

			`# return exp_list`
add linear_regression 2020-04-18 20:39:32 +02:00
new solution 2020-05-02 22:45:56 +02:00			`# def train_model(in_path, exp_path):`
			`# with open('dict.txt', 'r') as file:`
			`# dict = json.load(file)`
			`# posts = make_posts_list(in_path)`
			`# exp = make_exp_list(exp_path)`
			`# w0 = 2013`
			`# lr = 0.0000001`
			`# epchos = 0`
			`# loss_sum = 0`
			`# last_sum = 10`
			`# loss_counter = 0`
			`# print("Zaczynam")`
			`# while epchos < 10000:`
add linear_regression 2020-04-18 20:39:32 +02:00
new solution 2020-05-02 22:45:56 +02:00			`# loss_cost = 0`
			`# for in_line, exp_line in zip(posts, exp):`
			`# loss_counter+=1`
			`# #losowy przykład ze zbioru uczącego`
			`# #print("new post" + str(random.randint(0,10)))`
			`# post = in_line`
			`# error_rate = 1`
			`# y = int(exp_line)`
			`# #loop_counter = 0`
			`# #while (error_rate > 0.2 and loop_counter < 10000):`
			`# #loop_counter +=1`
			`# y_hat = w0`
			`# for word in re.findall(r"[\w']+", post):`
			`# #dict[word] -= (y_hat - y)*lr`
			`# y_hat += dict[word]`
			`# loss = (y_hat - y)**2`
			`# loss_sum += loss`
			`# #error_rate = (y_hat - y)**2`
			`# # if loop_counter%1000 == 0:`
			`# # print(error_rate)`
			`# # loss_cost += error_rate`
			`# # if loss_counter%1000==0:`
			`# # print(loss_sum/1000)`
			`# # loss_sum = 0`

			`# #uczenie`
			`# delta = (y_hat - y) * lr`
			`# w0 = w0 - delta`
			`# for word in re.findall(r"[\w']+", post):`
			`# dict[word] -= delta`

add linear_regression 2020-04-18 20:39:32 +02:00
new solution 2020-05-02 22:45:56 +02:00			`# real_loss = loss_sum/loss_counter`
			`# print(real_loss)`

			`# # if real_loss > last_sum:`
			`# # break`
			`# # else:`
			`# # last_sum = real_loss`
			`# last_sum = real_loss`
			`# loss_sum = 0`
			`# loss_counter = 0`
			`# epchos +=1`
			`# with open('dict2.txt', 'w') as file:`
			`# json.dump(dict, file)`

			`# def predict(path):`
			`# results = []`
			`# with open('dict2.txt', 'r') as file:`
			`# dict = json.load(file)`

			`# with open(path+"/in.tsv") as in_file:`
			`# for in_line in in_file:`
			`# print("new post" + str(random.randint(0,10)))`
			`# post = in_line`
			`# y=0`
			`# for word in re.findall(r"[\w']+", post):`
			`# if word in dict:`
			`# y += dict[word]`
			`# if y > 0.5:`
			`# results.append("1")`
			`# else:`
			`# results.append("0")`

			`# with open(path+"/out.tsv", 'wt') as tsvfile:`
			`# tsv_writer = csv.writer(tsvfile, delimiter='\t')`
			`# for i in results:`
			`# tsv_writer.writerow(i)`

			`# #make_dict("train/in.tsv")`
			`# #train_model("train/in.tsv", "train/expected.tsv")`

			`# def check_dev():`
			`# with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:`
			`# counter = 0`
			`# positive = 0`
			`# for out_line, exp_line in zip(out_file, exp_file):`
			`# counter+=1`
			`# if out_line == exp_line:`
			`# positive += 1`
			`# print(positive/counter)`

			`# #predict("dev-0")`
			`# #predict("test-A")`