guess-reddit-date/linear_regression.py
2020-04-18 20:39:32 +02:00

141 lines
3.9 KiB
Python

import csv
import re
import random
import json
from math import sqrt
# Example of the token list produced by re.findall(r"[\w']+", text): ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
def make_dict(path, out_path='dict.txt'):
    """Build the initial word-weight table for a text file and save it as JSON.

    Every distinct token in *path* (a run of word characters and/or
    apostrophes) is assigned a small random starting weight between -0.1 and
    0.1, rounded to two decimals. Tokens are case-sensitive.

    Args:
        path: Text file to scan, one post per line.
        out_path: Where the JSON table is written. Defaults to ``dict.txt``
            in the current working directory.

    Returns:
        The mapping ``{token: initial_weight}`` that was written to *out_path*.
    """
    find_words = re.compile(r"[\w']+").findall
    weights = {}
    with open(path) as in_file:
        for line in in_file:
            for word in find_words(line):
                # Only the first occurrence draws a weight, so each token
                # consumes exactly one random number.
                if word not in weights:
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)
    print("dict maked")
    with open(out_path, 'w') as out_file:
        json.dump(weights, out_file)
    return weights
def make_posts_list(in_file, limit=1000):
    """Read posts (one per line) from *in_file*, keeping at most *limit* of them.

    The previous implementation compared a counter against 1000 but never
    incremented it on the path that appended a post, so the cap was never
    applied and the whole file was returned. The cap is now enforced.

    Args:
        in_file: Path of the text file to read.
        limit: Maximum number of leading lines to return. Pass ``None`` to
            read the whole file.

    Returns:
        A list of raw lines, each still carrying its trailing newline.
    """
    posts = []
    with open(in_file) as f:
        for line in f:
            # Stop reading as soon as the cap is reached.
            if limit is not None and len(posts) >= limit:
                break
            posts.append(line)
    return posts
def make_exp_list(exp_file):
    """Read the expected target values, one number per line.

    Args:
        exp_file: Path of a text file holding one numeric value per line.

    Returns:
        A list of floats in file order.

    Raises:
        ValueError: If a line (including a blank one) cannot be parsed as a
            number. The message names the file and the 1-based line number.
    """
    values = []
    with open(exp_file) as f:
        for line_no, line in enumerate(f, start=1):
            try:
                # float() ignores surrounding whitespace, including the
                # trailing newline, so no manual splitting is required.
                values.append(float(line))
            except ValueError as err:
                raise ValueError(
                    "{}:{}: expected a number, got {!r}".format(exp_file, line_no, line)
                ) from err
    return values
def train_model(in_path, exp_path, epochs=10000, lr=0.0000001):
    """Fit per-word weights plus an intercept with per-example gradient steps.

    Starting weights are loaded from ``dict.txt``; the trained weights are
    written to ``dict2.txt``. The prediction for a post is the intercept plus
    the weight of every token occurrence in it; after each example the
    intercept and each occurring token's weight are moved by
    ``(prediction - target) * lr``. The mean squared error of each epoch is
    printed.

    Args:
        in_path: Text file with one post per line (read via make_posts_list).
        exp_path: Text file with one target per line (read via make_exp_list).
        epochs: Number of full passes over the training examples.
        lr: Learning rate applied to every update.

    Raises:
        ValueError: If no training examples are available.
        KeyError: If a post contains a token that is missing from ``dict.txt``.
    """
    with open('dict.txt', 'r') as weights_file:
        weights = json.load(weights_file)

    find_words = re.compile(r"[\w']+").findall
    # Tokenise every post and convert every target once, up front: neither
    # changes between epochs, so redoing it on each pass was wasted work.
    # zip() stops at the shorter of the two inputs. int() truncates a
    # fractional target to its whole-number part, as before.
    examples = [
        (find_words(post), int(target))
        for post, target in zip(make_posts_list(in_path), make_exp_list(exp_path))
    ]
    if not examples:
        raise ValueError(
            "no training examples found in {!r} / {!r}".format(in_path, exp_path)
        )

    # Intercept start value; presumably close to the typical target - confirm.
    w0 = 2013
    print("Zaczynam")
    for _ in range(epochs):
        loss_sum = 0
        for words, y in examples:
            # Plain left-to-right accumulation keeps the floating-point
            # result independent of how the built-in sum() adds floats.
            y_hat = w0
            for word in words:
                y_hat += weights[word]
            loss_sum += (y_hat - y) ** 2

            # Learning step: a token that occurs k times is updated k times.
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in words:
                weights[word] -= delta
        print(loss_sum / len(examples))

    # NOTE(review): the trained intercept w0 is not persisted; only the word
    # weights are saved, so dict2.txt alone cannot reproduce y_hat.
    with open('dict2.txt', 'w') as weights_file:
        json.dump(weights, weights_file)
def predict(path, weights_path='dict2.txt'):
    """Score every post in ``<path>/in.tsv`` and write labels to ``<path>/out.tsv``.

    A post's score is the sum of the stored weights of its token occurrences;
    tokens that are missing from the weight table contribute nothing. The
    label is ``"1"`` when the score is above 0.5 and ``"0"`` otherwise, one
    label per output row.

    NOTE(review): train_model starts its intercept at 2013 and never stores
    it, so thresholding the bare word-weight sum at 0.5 looks like a leftover
    from a binary-classification task - confirm the intended output.

    Args:
        path: Directory containing ``in.tsv``; ``out.tsv`` is created there.
        weights_path: JSON file mapping token to weight. Defaults to
            ``dict2.txt`` in the current working directory.
    """
    with open(weights_path, 'r') as weights_file:
        weights = json.load(weights_file)

    find_words = re.compile(r"[\w']+").findall
    labels = []
    with open(path + "/in.tsv") as in_file:
        for post in in_file:
            score = 0
            for word in find_words(post):
                if word in weights:
                    score += weights[word]
            labels.append("1" if score > 0.5 else "0")

    # newline='' is what the csv module requires of files it writes to;
    # without it, platforms that translate '\n' emit '\r\r\n' row endings.
    with open(path + "/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # Each row is an explicit one-element list. Passing the bare string
        # only worked because every label is a single character.
        tsv_writer.writerows([label] for label in labels)
# One-time setup before the first training run (creates dict.txt):
#     make_dict("train/in.tsv")
train_model(in_path="train/in.tsv", exp_path="train/expected.tsv")
def check_dev(path="dev-0"):
    """Print the share of lines in ``out.tsv`` that equal ``expected.tsv``.

    The two files are compared line by line up to the length of the shorter
    one. Line endings are ignored, so a missing final newline in either file
    does not count as a mismatch.

    Args:
        path: Directory holding ``out.tsv`` and ``expected.tsv``.
            Defaults to ``dev-0``.

    Prints:
        The accuracy as a float in [0, 1], or ``0.0`` when there is nothing
        to compare (previously this case raised ZeroDivisionError).
    """
    total = 0
    matches = 0
    with open(path + "/out.tsv") as out_file, open(path + "/expected.tsv") as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            total += 1
            if out_line.rstrip('\r\n') == exp_line.rstrip('\r\n'):
                matches += 1
    print(matches / total if total else 0.0)
#predict("dev-0")
#predict("test-A")