# Using the NLTK library, we can do a lot of text preprocessing.
import nltk
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
from nltk.corpus import stopwords
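# Note: word_tokenize also needs the 'punkt' tokenizer data; if it is
# missing, run nltk.download('punkt') once as well.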
import random
import pickle
import time
import os
import csv


# Function to split text into words and drop English stop words.
def my_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens


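# A quick hypothetical check of my_tokenize (assuming the 'punkt' and
# 'stopwords' data are installed). The stop-word comparison is
# case-sensitive, so a capitalised "The" survives while the lowercase
# stop words are dropped:
#
#   my_tokenize("The cat sat on the mat.")
#   -> ['The', 'cat', 'sat', 'mat', '.']
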
def post_list(in_file):
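    """Tokenize every line of in_file; returns one token list per post."""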
    posts = []
    with open(in_file, encoding="utf8") as f:
        for i, line in enumerate(f):
            tokens = my_tokenize(line)
            posts.append(tokens)
            # if i % 1000000 == 0:
            #     name = "posts" + str(i) + ".pickle"
            #     with open(name, 'wb') as handle:
            #         pickle.dump(posts, handle)
            #     posts = []
    # with open('posts.pickle', 'wb') as handle:
    #     pickle.dump(posts, handle)
    return posts


def exp_list(in_file):
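    """Read the expected value (a year) from each line of in_file."""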
    expected = []
    with open(in_file, encoding="utf8") as f:
        for line in f:
            expected.append(float(line))
    return expected


def make_dictionary():
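    """Reload the tokenized posts cached in chunks by post_list().

    Despite the name, the dictionary-building step is commented out
    below; the weights are loaded from dict.pickle in train() instead.
    """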
    my_dict = dict()
    with open('posts1000000.pickle', 'rb') as f:
        posts = pickle.load(f)
    with open('posts2000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts3000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts4000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts.pickle', 'rb') as f:
        posts += pickle.load(f)

    # with open("allposts", 'wb') as handle:
    #     pickle.dump(posts, handle)
    # for post in posts:
    #     for t in post:
    #         if t not in my_dict:
    #             my_dict[t] = random.randint(-1, 1) * 0.1
    #
    # with open('dict.pickle', 'wb') as handle:
    #     pickle.dump(my_dict, handle)

    return posts


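# The model is a bag-of-words linear regression: a prediction is w0 plus
# the weight of every token in the post, and the weights are updated by
# plain SGD on the squared error (lr absorbs the constant factor of 2).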
def train(in_file, exp_file):
    el = exp_list(exp_file)
    print("el created")
    # pl = post_list(in_file)
    print("pl created")
    # with open('posts.pickle', 'rb') as f:
    #     pl = pickle.load(f)
    pl = make_dictionary()
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")

    lr = 0.00000005
    w0 = 2014  # initial bias, roughly in the range of the expected years
    loss_sum = 0
    loss_sum_counter = 1

    while True:
        for post, y in zip(pl, el):
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]

            loss = (y_hat - y) ** 2
            loss_sum += loss

            # Print the average loss over the last 10000 examples.
            if loss_sum_counter % 10000 == 0:
                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
                loss_sum = 0.0
            loss_sum_counter += 1

            # Updating weights.
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta

            if loss_sum_counter > 40000000:
                break
        # Repeated so the outer loop stops too once the limit is reached.
        if loss_sum_counter > 40000000:
            break

    # We save only the things we need for prediction.
    model = (dictionary, w0)
    with open("model.pickle", "wb") as handle:
        pickle.dump(model, handle)


def predict(path):
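    """Load model.pickle and write one prediction per line of in.tsv into out.tsv under path."""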
    with open('model.pickle', 'rb') as f:
        dictionary, w0 = pickle.load(f)
    pl = post_list(path + "\\in.tsv")
    print("pl created")

    predictions = []
    for post in pl:
        y_hat = w0
        # Tokens never seen in training have no weight and are skipped,
        # so no KeyError can occur here.
        for token in post:
            if token in dictionary:
                y_hat += dictionary[token]
        predictions.append(y_hat)

    with open(path + "\\out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # for i in predictions:
        #     tsv_writer.writerow([i])
        # One (negated) prediction per row.
        tsv_writer.writerows(map(lambda x: [-x], predictions))


# train("C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\in.tsv", "C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\expected.tsv")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\dev-0")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\test-A")
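
# A minimal sketch of scoring a single post with the saved model
# (hypothetical usage; assumes model.pickle exists next to this script):
#
#   with open('model.pickle', 'rb') as f:
#       dictionary, w0 = pickle.load(f)
#   tokens = my_tokenize("some reddit post text")
#   year = w0 + sum(dictionary.get(t, 0.0) for t in tokens)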