"""Bag-of-words linear model over text posts.

Pipeline (each step reads/writes files in the current working directory):

1. ``make_dict``   - assign a small random weight to every word, save ``dict.txt``.
2. ``train_model`` - load ``dict.txt``, fit with per-example gradient steps on
   squared error, save ``dict2.txt``.
3. ``predict``     - load ``dict2.txt``, score each post, write ``out.tsv``.
4. ``check_dev``   - compare ``out.tsv`` with ``expected.tsv`` line by line.
"""
import csv
import json
import random
import re
from itertools import islice
from math import sqrt  # noqa: F401  (not used below; kept as an existing file-level import)

# A word is a maximal run of word characters and apostrophes.
_TOKEN_RE = re.compile(r"[\w']+")


def _tokenize(text):
    """Return the list of words found in *text*."""
    return _TOKEN_RE.findall(text)


def make_dict(path):
    """Build the initial vocabulary and save it to ``dict.txt``.

    Every distinct word in the file at *path* gets a random weight drawn as
    ``random.random() % 0.2 - 0.1`` and rounded to two decimals.

    Returns:
        The ``{word: weight}`` mapping that was written to ``dict.txt``.
    """
    weights = {}
    with open(path) as in_file:
        for line in in_file:
            for word in _tokenize(line):
                if word not in weights:
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)
    print("dict maked")
    with open('dict.txt', 'w') as file:
        json.dump(weights, file)
    return weights


def make_posts_list(in_file, limit=1000):
    """Return the first *limit* lines of *in_file* (newlines included).

    The original loop only incremented its counter in an unreachable ``else``
    branch, so the intended cap of 1000 posts was never applied.  The cap is
    now enforced; pass ``limit=None`` to load every line.
    """
    with open(in_file) as f:
        return list(islice(f, limit))


def make_exp_list(exp_file):
    """Return the expected values in *exp_file*, one float per line.

    Raises:
        ValueError: if a line (including a blank one) is not a valid float.
    """
    with open(exp_file) as f:
        # float() ignores surrounding whitespace, including the newline.
        return [float(line) for line in f]


def train_model(in_path, exp_path, epochs=10000, lr=0.0000001, bias=2013):
    """Fit word weights with per-example gradient steps and save ``dict2.txt``.

    The prediction for a post is ``bias`` plus the sum of the weights of its
    words (a word occurring twice counts twice).  After each example the bias
    and each of the post's word weights are moved by ``(y_hat - y) * lr``.
    The mean squared error of every epoch is printed.

    Args:
        in_path: file with one post per line (read through ``make_posts_list``).
        exp_path: file with one expected value per line.
        epochs: number of passes over the training examples.
        lr: learning rate.
        bias: initial value of the bias term.

    Raises:
        ValueError: if there are no training examples.

    NOTE(review): only the word weights are written to ``dict2.txt``; the
    trained bias is discarded and ``predict`` starts its score at 0.  Confirm
    whether that is intended.
    """
    with open('dict.txt', 'r') as file:
        weights = json.load(file)
    posts = make_posts_list(in_path)
    exp = make_exp_list(exp_path)

    # Tokenize once up front instead of re-running the regex every epoch.
    # Labels are truncated to int, exactly as the original code did.
    examples = [(_tokenize(post), int(label)) for post, label in zip(posts, exp)]
    if not examples:
        raise ValueError("no training examples found in %r / %r" % (in_path, exp_path))

    print("Zaczynam")
    for _ in range(epochs):
        loss_sum = 0
        for words, y in examples:
            # Forward pass; words missing from dict.txt start at weight 0.0.
            y_hat = bias
            for word in words:
                y_hat += weights.get(word, 0.0)
            loss_sum += (y_hat - y) ** 2

            # Learning step.
            delta = (y_hat - y) * lr
            bias -= delta
            for word in words:
                weights[word] = weights.get(word, 0.0) - delta
        print(loss_sum / len(examples))

    with open('dict2.txt', 'w') as file:
        json.dump(weights, file)


def predict(path):
    """Classify every post in ``<path>/in.tsv`` and write ``<path>/out.tsv``.

    A post's score is the sum of the ``dict2.txt`` weights of its words
    (unknown words are ignored).  The output has one label per line: ``"1"``
    when the score is above 0.5, otherwise ``"0"``.
    """
    with open('dict2.txt', 'r') as file:
        weights = json.load(file)

    results = []
    with open(path + "/in.tsv") as in_file:
        for in_line in in_file:
            # Progress/debug output kept from the original implementation.
            print("new post" + str(random.randint(0, 10)))
            score = 0
            for word in _tokenize(in_line):
                if word in weights:
                    score += weights[word]
            results.append("1" if score > 0.5 else "0")

    # newline='' lets the csv module control line endings itself.
    with open(path + "/out.tsv", 'wt', newline='') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # Each row is a one-field list; passing the bare string only worked
        # because the labels happen to be a single character.
        tsv_writer.writerows([label] for label in results)


def check_dev(path="dev-0"):
    """Print and return the share of lines in ``out.tsv`` matching ``expected.tsv``.

    Lines are compared after stripping surrounding whitespace so a missing
    trailing newline does not count as a mismatch.  Returns 0.0 when there
    are no lines to compare.
    """
    total = 0
    correct = 0
    with open(path + "/out.tsv") as out_file, open(path + "/expected.tsv") as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            total += 1
            if out_line.strip() == exp_line.strip():
                correct += 1
    accuracy = correct / total if total else 0.0
    print(accuracy)
    return accuracy


if __name__ == "__main__":
    # Pipeline steps; dict.txt must already exist (run make_dict once first).
    # make_dict("train/in.tsv")
    train_model("train/in.tsv", "train/expected.tsv")
    # predict("dev-0")
    # predict("test-A")