guess-reddit-date/linear_regression.py

# Using the NLTK library we can do a lot of text preprocessing.
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import pickle
import time
import os
import csv

# Split the text into word tokens and drop English stopwords.
def my_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens
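
# A quick sanity check for my_tokenize (illustrative only; the exact tokens
# depend on the installed NLTK models, and stopword filtering is
# case-sensitive because tokens are never lowercased):
#   my_tokenize("the cat sat on the mat")  ->  ['cat', 'sat', 'mat']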
def post_list(in_file):
    post_list = []
    with open(in_file, encoding="utf8") as f:
        for i, line in enumerate(f):
            tokens = my_tokenize(line)
            post_list.append(tokens)
            # if i % 1000000 == 0:
            #     name = "posts" + str(i) + ".pickle"
            #     with open(name, 'wb') as handle:
            #         pickle.dump(post_list, handle)
            #     post_list = []
    # with open('posts.pickle', 'wb') as handle:
    #     pickle.dump(post_list, handle)
    return post_list
def exp_list(in_file):
    exp_list = []
    with open(in_file, encoding="utf8") as f:
        for line in f:
            exp_list.append(float(line))
    return exp_list
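
# Each line of the expected file is parsed as a single float target; for
# guess-reddit-date this is presumably the posting date expressed as a year,
# which would also explain why w0 is initialised to 2014 in train().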
def make_dictionary():
    # Despite the name, this currently just reloads and concatenates the
    # pickled post chunks; the original weight-initialisation code is kept
    # commented out below.
    my_dict = dict()
    with open('posts1000000.pickle', 'rb') as f:
        posts = pickle.load(f)
    with open('posts2000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts3000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts4000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts.pickle', 'rb') as f:
        posts += pickle.load(f)
    # with open("allposts", 'wb') as handle:
    #     pickle.dump(posts, handle)
    # for post in posts:
    #     for t in post:
    #         if t not in my_dict:
    #             my_dict[t] = random.randint(-1, 1) * 0.1
    #
    # with open('dict.pickle', 'wb') as handle:
    #     pickle.dump(my_dict, handle)
    return posts
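
# The model below is a bag-of-words linear regression: for a given post,
# y_hat = w0 + the sum of the weights of its tokens, fitted with plain SGD on
# the squared loss (y_hat - y)**2. The exact gradient for each active weight
# is 2 * (y_hat - y); the constant factor 2 is simply absorbed into the tiny
# learning rate lr.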
def train(in_file, exp_file):
    el = exp_list(exp_file)
    print("el created")
    #pl = post_list(in_file)
    print("pl created")
    # with open('posts.pickle', 'rb') as f:
    #     pl = pickle.load(f)
    pl = make_dictionary()
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")
    lr = 0.00000005
    w0 = 2014
    loss_sum = 0
    loss_sum_counter = 1
    while True:
        for post, y in zip(pl, el):
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]
            loss = (y_hat - y) ** 2
            loss_sum += loss
            # report the running average loss every 10000 posts
            if loss_sum_counter % 10000 == 0:
                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
                loss_sum = 0.0
            loss_sum_counter += 1
            # updating the weights
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta
            if loss_sum_counter > 40000000:
                break
        if loss_sum_counter > 40000000:
            break
    # we save only the things we need for prediction
    model = (dictionary, w0)
    with open("model.pickle", "wb") as handle:
        pickle.dump(model, handle)
def predict(path):
    with open('model.pickle', 'rb') as f:
        dictionary, w0 = pickle.load(f)
    pl = post_list(os.path.join(path, "in.tsv"))
    print("pl created")
    predictions = []
    for post in pl:
        y_hat = w0
        for token in post:
            # tokens never seen during training simply contribute nothing
            if token in dictionary:
                y_hat += dictionary[token]
        predictions.append(round(y_hat, 0))
    # write each prediction on its own line (newline used as the csv delimiter)
    with open(os.path.join(path, "out.tsv"), 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\n')
        tsv_writer.writerow(predictions)
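
# out.tsv gets one rounded prediction per line, matching in.tsv line for
# line, which appears to be the layout the challenge's evaluation expects.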
#train("C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\in.tsv", "C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\expected.tsv")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\dev-0")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\test-A")