diff --git a/linear_regression.py b/linear_regression.py
index 4b93296..3741075 100644
--- a/linear_regression.py
+++ b/linear_regression.py
@@ -1,140 +1,234 @@
-import csv
-import re
+#using the NLTK library, we can do a lot of text preprocessing
+import nltk
+from nltk.tokenize import word_tokenize
+#nltk.download('punkt')      #run once: word_tokenize needs the punkt models
+#nltk.download('stopwords')  #run once to fetch the stopword list
+from nltk.corpus import stopwords
 import random
-import json
-from math import sqrt
+import pickle
+#function to split text into words
 
-# Prints ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
-def make_dict(path):
-    dict = {}
-    with open(path) as in_file:
-        for line in in_file:
-            for word in re.findall(r"[\w']+", line):
-                if not word in dict:
-                    weight = round(random.random()%0.2-0.1,2)
-                    dict[word] = weight
-
-    print("dict maked")
-    with open('dict.txt', 'w') as file:
-        json.dump(dict, file)
-    return dict
+def my_tokenize(text):
+    tokens = word_tokenize(text)
+    stop_words = set(stopwords.words('english'))
+    tokens = [w for w in tokens if not w in stop_words]
+    return tokens
 
-def make_posts_list(in_file):
-    posts = []
-    counter = 0
+
+def post_list(in_file):
+    post_list = []
     with open(in_file) as f:
-        for line in f:
-            if counter < 1000:
-                posts.append(line)
-            else:
-                counter +=1
-
-    return posts
+        for line in f:
+            tokens = my_tokenize(line)
+            post_list.append(tokens)
 
-def make_exp_list(exp_file):
+    return post_list
+
+
+def exp_list(in_file):
     exp_list = []
-    with open(exp_file) as f:
-        for exp_line in f:
-            y = exp_line
-            exp_list.append(float(y.split('\n')[0]))
-
+    with open(in_file) as f:
+        for line in f:
+            exp_list.append(float(line))
+
     return exp_list
 
-def train_model(in_path, exp_path):
-    with open('dict.txt', 'r') as file:
-        dict = json.load(file)
-    posts = make_posts_list(in_path)
-    exp = make_exp_list(exp_path)
-    w0 = 2013
-    lr = 0.0000001
-    epchos = 0
+
+def make_dictionary(posts):
+    my_dict = dict()
+    for post in posts:
+        for t in post:
+            if not t in my_dict:
+                my_dict[t] = random.uniform(-0.1, 0.1)  #small random initial weight
+
+    with open('dict.pickle', 'wb') as handle:
+        pickle.dump(my_dict, handle)
+
+
+
+def train(in_file, exp_file):
+    pl = post_list(in_file)
+    print("pl created")
+    el = exp_list(exp_file)
+    print("el created")
+    #make_dictionary(pl)  #run once first to build dict.pickle for these posts
+    with open('dict.pickle', 'rb') as f:
+        dictionary = pickle.load(f)
+    print("dict created")
+    lr = 0.001
+    w0 = 0.1
     loss_sum = 0
-    last_sum = 10
-    loss_counter = 0
-    print("Starting")
-    while epchos < 10000:
-
-        loss_cost = 0
-        for in_line, exp_line in zip(posts, exp):
-            loss_counter+=1
-            #a random example from the training set
-            #print("new post" + str(random.randint(0,10)))
-            post = in_line
-            error_rate = 1
-            y = int(exp_line)
-            #loop_counter = 0
-            #while (error_rate > 0.2 and loop_counter < 10000):
-                #loop_counter +=1
+    loss_sum_counter = 1
+
+
+    while True:
+        for post, y in zip(pl,el):
             y_hat = w0
-            for word in re.findall(r"[\w']+", post):
-                #dict[word] -= (y_hat - y)*lr
-                y_hat += dict[word]
+            for token in post:
+                y_hat += dictionary[token]
             loss = (y_hat - y)**2
             loss_sum += loss
-            #error_rate = (y_hat - y)**2
-            # if loop_counter%1000 == 0:
-            #     print(error_rate)
-            # loss_cost += error_rate
-            # if loss_counter%1000==0:
-            #     print(loss_sum/1000)
-            #     loss_sum = 0
-            #training
+            if loss_sum_counter % 10000 == 0:
+                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
+                loss_sum = 0.0
+            loss_sum_counter += 1
+
+            #updating weights
             delta = (y_hat - y) * lr
-            w0 = w0 - delta
-            for word in re.findall(r"[\w']+", post):
-                dict[word] -= delta
+            w0 -= delta
+            for token in post:
+                dictionary[token] -= delta
+
+
+            if loss_sum_counter > 7000000:
+                break
+        #the break above only exits the inner for loop, so stop the while loop too
+        if loss_sum_counter > 7000000:
+            break
+
+    #We save only the things we need for prediction: the bias and the word weights
+    model = (w0, dictionary)
+    pickle.dump(model, open("model.pkl", "wb"))
+
+train("train/in.tsv", "train/expected.tsv")
+
+# import csv
+# import re
+# import random
+# import json
+# from math import sqrt
+
+# def make_dict(path):
+#     dict = {}
+#     with open(path) as in_file:
+#         for line in in_file:
+#             for word in re.findall(r"[\w']+", line):
+#                 if not word in dict:
+#                     weight = round(random.random()%0.2-0.1,2)
+#                     dict[word] = weight
+
+#     print("dict maked")
+#     with open('dict.txt', 'w') as file:
+#         json.dump(dict, file)
+#     return dict
+
+# def make_posts_list(in_file):
+#     posts = []
+#     counter = 0
+#     with open(in_file) as f:
+#         for line in f:
+#             if counter < 1000:
+#                 posts.append(line)
+#             else:
+#                 counter +=1
+
+#     return posts
+
+# def make_exp_list(exp_file):
+#     exp_list = []
+#     with open(exp_file) as f:
+#         for exp_line in f:
+#             y = exp_line
+#             exp_list.append(float(y.split('\n')[0]))
+
+#     return exp_list
+
+# def train_model(in_path, exp_path):
+#     with open('dict.txt', 'r') as file:
+#         dict = json.load(file)
+#     posts = make_posts_list(in_path)
+#     exp = make_exp_list(exp_path)
+#     w0 = 2013
+#     lr = 0.0000001
+#     epchos = 0
+#     loss_sum = 0
+#     last_sum = 10
+#     loss_counter = 0
+#     print("Starting")
+#     while epchos < 10000:
+
+#         loss_cost = 0
+#         for in_line, exp_line in zip(posts, exp):
+#             loss_counter+=1
+#             #a random example from the training set
+#             #print("new post" + str(random.randint(0,10)))
+#             post = in_line
+#             error_rate = 1
+#             y = int(exp_line)
+#             #loop_counter = 0
+#             #while (error_rate > 0.2 and loop_counter < 10000):
+#                 #loop_counter +=1
+#             y_hat = w0
+#             for word in re.findall(r"[\w']+", post):
+#                 #dict[word] -= (y_hat - y)*lr
+#                 y_hat += dict[word]
+#             loss = (y_hat - y)**2
+#             loss_sum += loss
+#             #error_rate = (y_hat - y)**2
+#             # if loop_counter%1000 == 0:
+#             #     print(error_rate)
+#             # loss_cost += error_rate
+#             # if loss_counter%1000==0:
+#             #     print(loss_sum/1000)
+#             #     loss_sum = 0
+
+#             #training
+#             delta = (y_hat - y) * lr
+#             w0 = w0 - delta
+#             for word in re.findall(r"[\w']+", post):
+#                 dict[word] -= delta
 
-        real_loss = loss_sum/loss_counter
-        print(real_loss)
+#         real_loss = loss_sum/loss_counter
+#         print(real_loss)
 
-        # if real_loss > last_sum:
-        #     break
-        # else:
-        #     last_sum = real_loss
-        last_sum = real_loss
-        loss_sum = 0
-        loss_counter = 0
-        epchos +=1
-    with open('dict2.txt', 'w') as file:
-        json.dump(dict, file)
+#         # if real_loss > last_sum:
+#         #     break
+#         # else:
+#         #     last_sum = real_loss
+#         last_sum = real_loss
+#         loss_sum = 0
+#         loss_counter = 0
+#         epchos +=1
+#     with open('dict2.txt', 'w') as file:
+#         json.dump(dict, file)
 
-def predict(path):
-    results = []
-    with open('dict2.txt', 'r') as file:
-        dict = json.load(file)
+# def predict(path):
+#     results = []
+#     with open('dict2.txt', 'r') as file:
+#         dict = json.load(file)
 
-    with open(path+"/in.tsv") as in_file:
-        for in_line in in_file:
-            print("new post" + str(random.randint(0,10)))
-            post = in_line
-            y=0
-            for word in re.findall(r"[\w']+", post):
-                if word in dict:
-                    y += dict[word]
-            if y > 0.5:
-                results.append("1")
-            else:
-                results.append("0")
+#     with open(path+"/in.tsv") as in_file:
+#         for in_line in in_file:
+#             print("new post" + str(random.randint(0,10)))
+#             post = in_line
+#             y=0
+#             for word in re.findall(r"[\w']+", post):
+#                 if word in dict:
+#                     y += dict[word]
+#             if y > 0.5:
+#                 results.append("1")
+#             else:
+#                 results.append("0")
 
-    with open(path+"/out.tsv", 'wt') as tsvfile:
-        tsv_writer = csv.writer(tsvfile, delimiter='\t')
-        for i in results:
-            tsv_writer.writerow(i)
with open(path+"/out.tsv", 'wt') as tsvfile: +# tsv_writer = csv.writer(tsvfile, delimiter='\t') +# for i in results: +# tsv_writer.writerow(i) -#make_dict("train/in.tsv") -train_model("train/in.tsv", "train/expected.tsv") +# #make_dict("train/in.tsv") +# #train_model("train/in.tsv", "train/expected.tsv") -def check_dev(): - with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file: - counter = 0 - positive = 0 - for out_line, exp_line in zip(out_file, exp_file): - counter+=1 - if out_line == exp_line: - positive += 1 - print(positive/counter) +# def check_dev(): +# with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file: +# counter = 0 +# positive = 0 +# for out_line, exp_line in zip(out_file, exp_file): +# counter+=1 +# if out_line == exp_line: +# positive += 1 +# print(positive/counter) -#predict("dev-0") -#predict("test-A") +# #predict("dev-0") +# #predict("test-A")