# guess-reddit-date/linear_regression.py

import csv
import re
import random
import json
from math import sqrt

# Tokenisation example: re.findall(r"[\w']+", "Hey, you - what are you doing here!?")
# returns ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here'].
def make_dict(path, out_path='dict.txt'):
    """Build a vocabulary of randomly initialised word weights and save it as JSON.

    Every distinct token (a run of word characters and apostrophes) found in
    the file at *path* receives one random starting weight between -0.1 and
    0.1, rounded to two decimal places.

    Args:
        path: Text file to scan line by line.
        out_path: Destination of the JSON dump. Defaults to ``'dict.txt'``,
            the file name ``train_model`` loads its vocabulary from.

    Returns:
        The mapping of token to initial weight.
    """
    token_pattern = re.compile(r"[\w']+")
    weights = {}
    with open(path) as in_file:
        for line in in_file:
            for word in token_pattern.findall(line):
                if word not in weights:
                    # random() is in [0, 1); the modulo folds it into
                    # [0, 0.2) and the shift centres the weight on zero.
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)

    print("dict maked")
    with open(out_path, 'w') as out_file:
        json.dump(weights, out_file)
    return weights

def make_posts_list(in_file, limit=1000):
    """Return the leading lines of *in_file* as a list of posts.

    The previous implementation only advanced its counter in a branch that
    could never run, so the intended cap on the number of posts was never
    applied. The cap is now enforced and configurable.

    Args:
        in_file: Path of a text file holding one post per line.
        limit: Maximum number of posts to read. ``None`` reads the whole
            file. Defaults to 1000.

    Returns:
        A list of raw lines (line endings included), at most *limit* long.
    """
    posts = []
    with open(in_file) as f:
        for index, line in enumerate(f):
            # Stop reading as soon as the cap is reached.
            if limit is not None and index >= limit:
                break
            posts.append(line)

    return posts

def make_exp_list(exp_file):
    """Load the expected target values, one float per line of *exp_file*.

    Returns:
        A list with one float for every line in the file, in file order.
    """
    with open(exp_file) as handle:
        # Keep only the text in front of the line break before parsing it.
        return [float(row.partition('\n')[0]) for row in handle]

def train_model(in_path, exp_path, *, epochs=10000, lr=0.0000001, w0=2013,
                dict_path='dict.txt', out_path='dict2.txt'):
    """Fit word weights with per-example gradient steps and save them as JSON.

    The prediction for a post is the bias plus the weight of every token
    occurrence in it. After each example the bias and the weight of each
    token occurrence are moved by ``(prediction - target) * lr``. The mean
    squared error of each epoch is printed.

    Args:
        in_path: File with the training posts, read via ``make_posts_list``.
        exp_path: File with the targets, read via ``make_exp_list``.
        epochs: Number of passes over the training pairs.
        lr: Learning rate applied to every update.
        w0: Initial value of the bias term.
        dict_path: JSON file with the initial word weights.
        out_path: JSON file the trained word weights are written to.

    Returns:
        The trained bias. It is not part of the JSON written to *out_path*,
        so it is returned to keep it from being lost.

    Raises:
        ValueError: If there are no (post, target) pairs to train on.
    """
    with open(dict_path, 'r') as file:
        weights = json.load(file)
    posts = make_posts_list(in_path)
    exp = make_exp_list(exp_path)

    # Tokens and targets do not change between epochs, so compute them once
    # instead of re-running the regex twice per post in every epoch.
    # NOTE(review): int() drops any fractional part of the target; confirm
    # that whole-number targets are what the model should be fitted to.
    token_pattern = re.compile(r"[\w']+")
    samples = [
        (token_pattern.findall(post), int(target))
        for post, target in zip(posts, exp)
    ]
    if not samples:
        # The mean loss below would otherwise divide by zero.
        raise ValueError("no training examples: check in_path and exp_path")

    print("Zaczynam")
    for _ in range(epochs):
        loss_sum = 0
        for words, y in samples:
            # Prediction: bias plus the weight of each token occurrence.
            # Tokens missing from the vocabulary count as weight 0.0.
            y_hat = w0
            for word in words:
                y_hat += weights.get(word, 0.0)
            loss_sum += (y_hat - y) ** 2

            # Learning step: a repeated token is updated once per occurrence.
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in words:
                weights[word] = weights.get(word, 0.0) - delta

        print(loss_sum / len(samples))

    with open(out_path, 'w') as file:
        json.dump(weights, file)
    return w0

def predict(path):
    """Score every post in ``<path>/in.tsv`` and write labels to ``<path>/out.tsv``.

    The word weights are loaded from ``dict2.txt``. A post's score is the sum
    of the weights of its known tokens; unknown tokens are ignored. The label
    is ``"1"`` when the score exceeds 0.5 and ``"0"`` otherwise. One label is
    written per input line.

    NOTE(review): ``train_model`` fits targets around a bias of 2013, while
    this function thresholds a bias-free score at 0.5 - confirm that both
    are meant to belong to the same task.

    Args:
        path: Directory that contains ``in.tsv`` and receives ``out.tsv``.
    """
    with open('dict2.txt', 'r') as file:
        weights = json.load(file)

    token_pattern = re.compile(r"[\w']+")
    results = []
    with open(path+"/in.tsv") as in_file:
        for post in in_file:
            # Per-post progress marker, kept from the original output.
            print("new post" + str(random.randint(0,10)))
            y = 0
            for word in token_pattern.findall(post):
                if word in weights:
                    y += weights[word]
            results.append("1" if y > 0.5 else "0")

    # newline='' lets the csv writer control line endings itself, as the csv
    # module requires; otherwise platforms that translate '\n' emit '\r\r\n'.
    with open(path+"/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for label in results:
            # writerow expects a sequence of fields; a bare string would be
            # split into one column per character.
            tsv_writer.writerow([label])

# Script driver, executed whenever this module is run or imported.
# Vocabulary bootstrap (writes dict.txt, which train_model reads); disabled:
#make_dict("train/in.tsv")
train_model(in_path="train/in.tsv", exp_path="train/expected.tsv")

def check_dev(path="dev-0"):
    """Print and return the share of predictions that match the expected labels.

    ``<path>/out.tsv`` and ``<path>/expected.tsv`` are compared line by line;
    comparison stops at the end of the shorter file. Trailing newlines are
    ignored so a missing newline on the last line is not counted as a
    mismatch.

    Args:
        path: Directory holding ``out.tsv`` and ``expected.tsv``. Defaults to
            ``"dev-0"``.

    Returns:
        The fraction of compared lines that are equal, or 0.0 when there is
        nothing to compare (this used to raise ZeroDivisionError).
    """
    with open(path + "/out.tsv") as out_file, open(path + "/expected.tsv") as exp_file:
        total = 0
        matches = 0
        for out_line, exp_line in zip(out_file, exp_file):
            total += 1
            if out_line.rstrip('\n') == exp_line.rstrip('\n'):
                matches += 1

    accuracy = matches / total if total else 0.0
    print(accuracy)
    return accuracy

#predict("dev-0")
#predict("test-A")