"""Bag-of-words linear model trained with per-example gradient updates.

Workflow: ``make_dict`` builds an initial word -> weight mapping from a text
file, ``train_model`` fits those weights (plus a bias) to numeric targets,
``predict`` scores new posts with the stored weights and ``check_dev``
measures how many predicted lines match the expected ones.
"""
import csv
import re
import random
import json
from itertools import islice
from math import sqrt

# A token is a run of word characters and apostrophes.
_WORD_RE = re.compile(r"[\w']+")


def _tokenize(text):
    """Return the word tokens of *text*, in order, duplicates included.

    >>> _tokenize("Hey, you - what are you doing here!?")
    ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
    """
    return _WORD_RE.findall(text)


def make_dict(path, out_path='dict.txt'):
    """Create the initial vocabulary weights from the text file at *path*.

    Every distinct token receives a random initial weight in [-0.1, 0.1],
    rounded to two decimals.  The mapping is dumped as JSON to *out_path*
    and also returned.
    """
    weights = {}
    with open(path, encoding="utf-8") as in_file:
        for line in in_file:
            for word in _tokenize(line):
                if word not in weights:
                    weights[word] = round(random.random() % 0.2 - 0.1, 2)

    print("dict maked")
    with open(out_path, 'w', encoding="utf-8") as out_file:
        json.dump(weights, out_file)
    return weights


def make_posts_list(in_file, limit=1000):
    """Return up to *limit* lines of *in_file* (newlines kept).

    The previous implementation only incremented its counter in an
    unreachable ``else`` branch, so the 1000-line cap never took effect.
    Pass ``limit=None`` to read the whole file.
    """
    with open(in_file, encoding="utf-8") as f:
        return list(islice(f, limit))


def make_exp_list(exp_file):
    """Return the expected values in *exp_file*, one float per line.

    Raises:
        ValueError: if a line (including a blank one) is not a number.
    """
    with open(exp_file, encoding="utf-8") as f:
        # float() ignores the surrounding whitespace / trailing newline.
        return [float(line) for line in f]


def train_model(in_path, exp_path, dict_path='dict.txt', out_path='dict2.txt',
                epochs=10000, lr=0.0000001, w0=2013):
    """Fit the word weights to the targets and store them as JSON.

    Args:
        in_path: text file with one post per line.
        exp_path: file with one numeric target per line.
        dict_path: JSON file with the initial word -> weight mapping
            (as written by ``make_dict``).
        out_path: JSON file that receives the trained word weights.
        epochs: number of passes over the training examples.
        lr: learning rate applied to the prediction error.
        w0: initial bias term.

    Returns:
        ``(w0, weights)`` - the trained bias and the word -> weight dict.

    Raises:
        ValueError: if there are no training examples.
    """
    with open(dict_path, 'r', encoding="utf-8") as file:
        weights = json.load(file)
    posts = make_posts_list(in_path)
    targets = make_exp_list(exp_path)

    # Tokenise once up front; the tokens do not change between epochs.
    # NOTE(review): int() truncates fractional targets - kept from the
    # original, confirm that this is intended.
    examples = [(_tokenize(post), int(target))
                for post, target in zip(posts, targets)]
    if not examples:
        raise ValueError("no training examples to fit")

    print("Zaczynam")
    for _ in range(epochs):
        loss_sum = 0
        for words, y in examples:
            # Prediction: bias plus the weight of every token occurrence.
            # Words missing from the dictionary count as weight 0.
            y_hat = w0
            for word in words:
                y_hat += weights.get(word, 0.0)
            loss_sum += (y_hat - y) ** 2

            # Learning step: move the bias and each occurring word's weight
            # against the error (a repeated word is updated once per
            # occurrence).
            delta = (y_hat - y) * lr
            w0 = w0 - delta
            for word in words:
                weights[word] = weights.get(word, 0.0) - delta

        # Mean squared error over this epoch.
        print(loss_sum / len(examples))

    # NOTE(review): only the word weights are persisted; the trained bias
    # w0 is returned but not written to out_path.
    with open(out_path, 'w', encoding="utf-8") as file:
        json.dump(weights, file)
    return w0, weights


def predict(path, model_path='dict2.txt'):
    """Score every line of ``<path>/in.tsv`` and write ``<path>/out.tsv``.

    A line's score is the sum of the stored weights of its known tokens;
    "1" is written when the score exceeds 0.5, otherwise "0".

    NOTE(review): the bias learned by ``train_model`` is not part of the
    stored model, so the score here is weights-only - confirm that the 0.5
    threshold is what is wanted.

    Returns:
        The list of written labels, in input order.
    """
    with open(model_path, 'r', encoding="utf-8") as file:
        weights = json.load(file)

    results = []
    with open(f"{path}/in.tsv", encoding="utf-8") as in_file:
        for post in in_file:
            score = 0
            for word in _tokenize(post):
                if word in weights:
                    score += weights[word]
            results.append("1" if score > 0.5 else "0")

    # newline='' is what the csv module requires of files it writes to.
    with open(f"{path}/out.tsv", 'wt', newline='', encoding="utf-8") as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        for label in results:
            # Wrap in a list: a bare string would be split into characters.
            tsv_writer.writerow([label])
    return results


def check_dev(out_path="dev-0/out.tsv", exp_path="dev-0/expected.tsv"):
    """Print and return the fraction of lines in *out_path* equal to *exp_path*.

    Lines are compared without their line terminators, so a missing final
    newline does not count as a mismatch.  Returns 0.0 when there is
    nothing to compare.
    """
    total = 0
    matching = 0
    with open(out_path, encoding="utf-8") as out_file, \
            open(exp_path, encoding="utf-8") as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            total += 1
            if out_line.rstrip("\r\n") == exp_line.rstrip("\r\n"):
                matching += 1
    accuracy = matching / total if total else 0.0
    print(accuracy)
    return accuracy


if __name__ == "__main__":
    # Pipeline steps; enable the ones that are needed.
    # make_dict("train/in.tsv")
    train_model("train/in.tsv", "train/expected.tsv")
    # predict("dev-0")
    # predict("test-A")