2020-04-14 14:18:59 +02:00
|
|
|
import csv
|
|
|
|
import re
|
|
|
|
import random
|
|
|
|
import json
|
|
|
|
from math import sqrt
|
|
|
|
|
|
|
|
def make_dict(path):
    """Build the initial word-to-weight table and save it to ``dict.txt``.

    Every line of *path* is cut at its first tab character and only the text
    before the tab is used. That text is split into tokens made of word
    characters and apostrophes. Each distinct token gets one small random
    starting weight the first time it is seen.

    Args:
        path: Path of the UTF-8 encoded, tab-separated training file.

    Returns:
        None. The table is written as JSON to ``dict.txt`` in the current
        working directory, replacing any existing file.
    """
    weights = {}
    token_re = re.compile(r"[\w']+")

    with open(path, encoding="utf-8") as in_file:
        for line in in_file:
            post = line.split('\t')[0]
            for word in token_re.findall(post):
                if word not in weights:
                    # random.random() lies in [0, 1); the modulo folds it into
                    # [0, 0.2) and the shift centres the weight around zero.
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)

    with open('dict.txt', 'w', encoding="utf-8") as file:
        json.dump(weights, file)
|
2020-04-14 14:18:59 +02:00
|
|
|
|
|
|
|
def make_posts_list(in_file):
    """Return the first tab-separated field of every line in *in_file*.

    Args:
        in_file: Path of a UTF-8 encoded, tab-separated text file.

    Returns:
        A list with one string per input line, in file order. A line that
        contains no tab is returned whole, including its trailing newline.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(in_file, encoding="utf-8") as f:
        return [line.split('\t')[0] for line in f]
|
|
|
|
|
|
|
|
def make_exp_list(exp_file):
    """Read one integer label per line from *exp_file*.

    Blank lines at the very end of the file are ignored. A blank or
    non-numeric line anywhere else is an error, because dropping it would
    shift every later label by one position.

    Args:
        exp_file: Path of a text file holding one integer per line.

    Returns:
        A list of ints, in file order.

    Raises:
        ValueError: If a line other than a trailing blank one is not an
            integer. The message names the file and the 1-based line number.
    """
    with open(exp_file, encoding="utf-8") as f:
        lines = f.readlines()

    # Tolerate editors that leave extra empty lines at the end of the file.
    while lines and not lines[-1].strip():
        lines.pop()

    labels = []
    for line_no, text in enumerate(lines, start=1):
        try:
            # int() already ignores surrounding whitespace such as the newline.
            labels.append(int(text))
        except ValueError as err:
            raise ValueError(
                "{}:{}: expected an integer label, got {!r}".format(
                    exp_file, line_no, text.rstrip("\r\n")
                )
            ) from err
    return labels
|
|
|
|
|
|
|
|
def train_model(in_path, exp_path):
    """Fit the word weights with per-example gradient steps and save them.

    The starting weights are loaded from ``dict.txt``. The prediction for a
    post is the bias ``w0`` plus the weight of every token occurrence in it.
    After each example the bias and the weights of its tokens are moved
    against the prediction error, scaled by the learning rate.

    Training runs in full passes over the data, visiting the examples in file
    order. After each pass the mean squared error is printed. Training stops
    as soon as a pass fails to improve on the previous one; the weights from
    that last pass are kept, not rolled back.

    Args:
        in_path: Path of the tab-separated file with the training posts.
        exp_path: Path of the file with one integer label per line.

    Returns:
        None. The trained weights are written as JSON to ``dict2.txt``, with
        the bias stored under the key ``"w0"``.

    Raises:
        ValueError: If the two files yield no training examples.
    """
    with open('dict.txt', 'r', encoding="utf-8") as file:
        weights = json.load(file)

    token_re = re.compile(r"[\w']+")
    # Tokenise once up front; the token lists are the same in every pass.
    # zip() stops at the shorter of the two inputs.
    examples = [
        (token_re.findall(post), label)
        for post, label in zip(make_posts_list(in_path), make_exp_list(exp_path))
    ]
    if not examples:
        raise ValueError(
            "no training examples found in {!r} / {!r}".format(in_path, exp_path)
        )

    w0 = 0.1
    lr = 0.00001
    # Loss of the previous pass; the first pass is compared against 10.
    last_loss = 10

    while True:
        loss_sum = 0
        for words, y in examples:
            y_hat = w0
            for word in words:
                # A token missing from dict.txt starts at a neutral weight.
                y_hat += weights.setdefault(word, 0.0)
            loss_sum += (y_hat - y) ** 2

            # Learning step: a token occurring k times is moved k times.
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in words:
                weights[word] -= delta

        real_loss = loss_sum / len(examples)
        print(real_loss)

        # ">=" rather than ">" so that an exact plateau also ends training
        # instead of looping forever.
        if real_loss >= last_loss:
            break
        last_loss = real_loss

    # NOTE: the bias shares the table with the tokens, so a corpus token that
    # is literally "w0" has its weight replaced by the bias here.
    weights["w0"] = w0
    with open('dict2.txt', 'w', encoding="utf-8") as file:
        json.dump(weights, file)
|
|
|
|
|
|
|
|
def predict(path):
    """Classify every post in ``<path>/in.tsv`` and write ``<path>/out.tsv``.

    The model is loaded from ``dict2.txt`` in the current working directory.
    A post's score is the bias (stored under ``"w0"``) plus the weight of
    each known token occurrence; unknown tokens contribute nothing. A score
    above 0.5 gives the label ``1``, anything else gives ``0``.

    Args:
        path: Directory that holds ``in.tsv`` and receives ``out.tsv``.

    Returns:
        None. One label per input line is written to ``<path>/out.tsv``,
        each terminated by a single newline character.

    Raises:
        KeyError: If ``dict2.txt`` has no ``"w0"`` entry.
    """
    with open('dict2.txt', 'r', encoding="utf-8") as file:
        weights = json.load(file)
    # Take the bias out of the table so that a post containing the literal
    # token "w0" does not add the bias a second time.
    bias = weights.pop("w0")

    token_re = re.compile(r"[\w']+")
    results = []
    with open(path + "/in.tsv", encoding="utf-8") as in_file:
        for in_line in in_file:
            post = in_line.split('\t')[0]
            y = bias
            for word in token_re.findall(post):
                y += weights.get(word, 0.0)
            results.append("1" if y > 0.5 else "0")

    # newline='' hands line endings to the csv writer, and lineterminator
    # makes it emit "\n" instead of its default "\r\n".
    with open(path + "/out.tsv", 'w', encoding="utf-8", newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        # writerow() expects a sequence of fields, so wrap each label.
        tsv_writer.writerows([label] for label in results)
|
|
|
|
|
2020-04-20 16:14:42 +02:00
|
|
|
# Runs at import time: build the random starting weights from the training
# posts and store them in dict.txt. This must happen before train_model(),
# which loads that file.
make_dict("train/in.tsv")
|
|
|
|
# Runs at import time: fit the weights on the training set. Reads dict.txt
# and writes the trained model to dict2.txt.
train_model("train/in.tsv", "train/expected.tsv")
|
2020-04-14 14:18:59 +02:00
|
|
|
|
|
|
|
def check_dev(path="dev-0"):
    """Print and return the accuracy of ``out.tsv`` against ``expected.tsv``.

    The two files are compared line by line with line terminators removed,
    so a missing newline at the end of either file does not count as a
    mismatch. If the files differ in length, only the lines up to the end of
    the shorter one are scored.

    Args:
        path: Directory holding ``out.tsv`` and ``expected.tsv``. Defaults to
            ``"dev-0"``.

    Returns:
        The fraction of compared lines that match, or ``0.0`` when there is
        nothing to compare. The same value is printed.
    """
    counter = 0
    positive = 0
    with open(path + "/out.tsv", encoding="utf-8") as out_file, \
            open(path + "/expected.tsv", encoding="utf-8") as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            counter += 1
            if out_line.rstrip("\r\n") == exp_line.rstrip("\r\n"):
                positive += 1

    # Guard the division: empty files would otherwise raise ZeroDivisionError.
    accuracy = positive / counter if counter else 0.0
    print(accuracy)
    return accuracy
|
|
|
|
|
2020-04-20 16:14:42 +02:00
|
|
|
# Runs at import time: label the posts in dev-0/in.tsv with the model from
# dict2.txt and write the labels to dev-0/out.tsv.
predict("dev-0")
|
|
|
|
# Runs at import time: print the share of lines in dev-0/out.tsv that match
# dev-0/expected.tsv.
check_dev()
|