# Using the NLTK library, we can do a lot of text preprocessing.
import nltk
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import pickle


# Function to split text into words, dropping English stop words.
def my_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens


def post_list(in_file):
    posts = []
    with open(in_file) as f:
        for line in f:
            posts.append(my_tokenize(line))
    return posts


def exp_list(in_file):
    expected = []
    with open(in_file) as f:
        for line in f:
            expected.append(float(line))
    return expected


def make_dictionary(posts):
    # Give every distinct token a small starting weight. Note that
    # random.randint(-1, 1) * 0.1 yields only -0.1, 0.0 or 0.1.
    my_dict = dict()
    for post in posts:
        for t in post:
            if t not in my_dict:
                my_dict[t] = random.randint(-1, 1) * 0.1
    with open('dict.pickle', 'wb') as handle:
        pickle.dump(my_dict, handle)


def train(in_file, exp_file):
    pl = post_list(in_file)
    print("pl created")
    el = exp_list(exp_file)
    print("el created")
    # make_dictionary(pl)  # run once first, so dict.pickle exists
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")
    lr = 0.001
    w0 = 0.1
    loss_sum = 0.0
    loss_sum_counter = 1
    while True:
        for post, y in zip(pl, el):
            # Prediction is the bias plus the weights of all tokens in the post.
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]
            loss = (y_hat - y)**2
            loss_sum += loss
            if loss_sum_counter % 10000 == 0:
                # Average loss over the last 10,000 updates.
                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
                loss_sum = 0.0
            loss_sum_counter += 1
            # Updating weights (SGD; the gradient's factor of 2 is folded into lr).
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta
        # Stop after roughly 7 million updates (a break here exits the while loop).
        if loss_sum_counter > 7000000:
            break
    # We save only the things we need for prediction.
    model = dictionary
    with open("model.pkl", "wb") as f:
        pickle.dump(model, f)


train("train/in.tsv", "train/expected.tsv")
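
# The script above saves model.pkl but has no prediction step. Below is a
# minimal prediction sketch, not part of the original pipeline: it assumes
# the dev-0/test-A directory layout used by the older code further down,
# reuses my_tokenize() so tokenization matches training, and skips tokens
# that never appeared in the training dictionary. Note that train() does
# not store w0 in model.pkl, so the bias here defaults to 0; to be exact,
# pickle (w0, dictionary) together instead.
def predict(path, w0=0.0):
    with open("model.pkl", "rb") as f:
        dictionary = pickle.load(f)
    with open(path + "/in.tsv") as in_file, open(path + "/out.tsv", "w") as out_file:
        for line in in_file:
            y_hat = w0
            for token in my_tokenize(line):
                if token in dictionary:  # unseen tokens have no learned weight
                    y_hat += dictionary[token]
            out_file.write(str(y_hat) + "\n")

# predict("dev-0")
# predict("test-A")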

# Older version, kept below for reference: regex tokenization, JSON weight
# files, a 0/1 thresholded predict(), and a dev-set accuracy check.

# import csv
# import re
# import random
# import json
# from math import sqrt

# def make_dict(path):
#     dict = {}
#     with open(path) as in_file:
#         for line in in_file:
#             for word in re.findall(r"[\w']+", line):
#                 if word not in dict:
#                     weight = round(random.random() % 0.2 - 0.1, 2)
#                     dict[word] = weight
#     print("dict made")
#     with open('dict.txt', 'w') as file:
#         json.dump(dict, file)
#     return dict

# def make_posts_list(in_file):
#     posts = []
#     counter = 0
#     with open(in_file) as f:
#         for line in f:
#             if counter < 1000:
#                 posts.append(line)
#             else:
#                 counter += 1
#     return posts

# def make_exp_list(exp_file):
#     exp_list = []
#     with open(exp_file) as f:
#         for exp_line in f:
#             y = exp_line
#             exp_list.append(float(y.split('\n')[0]))
#     return exp_list

# def train_model(in_path, exp_path):
#     with open('dict.txt', 'r') as file:
#         dict = json.load(file)
#     posts = make_posts_list(in_path)
#     exp = make_exp_list(exp_path)
#     w0 = 2013
#     lr = 0.0000001
#     epochs = 0
#     loss_sum = 0
#     last_sum = 10
#     loss_counter = 0
#     print("Starting")
#     while epochs < 10000:
#         loss_cost = 0
#         for in_line, exp_line in zip(posts, exp):
#             loss_counter += 1
#             # random example from the training set
#             #print("new post" + str(random.randint(0,10)))
#             post = in_line
#             error_rate = 1
#             y = int(exp_line)
#             #loop_counter = 0
#             #while (error_rate > 0.2 and loop_counter < 10000):
#             #loop_counter += 1
#             y_hat = w0
#             for word in re.findall(r"[\w']+", post):
#                 #dict[word] -= (y_hat - y)*lr
#                 y_hat += dict[word]
#             loss = (y_hat - y)**2
#             loss_sum += loss
#             #error_rate = (y_hat - y)**2
#             # if loop_counter % 1000 == 0:
#             #     print(error_rate)
#             # loss_cost += error_rate
#             # if loss_counter % 1000 == 0:
#             #     print(loss_sum / 1000)
#             #     loss_sum = 0
#             # weight update
#             delta = (y_hat - y) * lr
#             w0 = w0 - delta
#             for word in re.findall(r"[\w']+", post):
#                 dict[word] -= delta
#         real_loss = loss_sum / loss_counter
#         print(real_loss)
#         # if real_loss > last_sum:
#         #     break
#         # else:
#         #     last_sum = real_loss
#         last_sum = real_loss
#         loss_sum = 0
#         loss_counter = 0
#         epochs += 1
#     with open('dict2.txt', 'w') as file:
#         json.dump(dict, file)

# def predict(path):
#     results = []
#     with open('dict2.txt', 'r') as file:
#         dict = json.load(file)
#     with open(path + "/in.tsv") as in_file:
#         for in_line in in_file:
#             print("new post" + str(random.randint(0,10)))
#             post = in_line
#             y = 0
#             for word in re.findall(r"[\w']+", post):
#                 if word in dict:
#                     y += dict[word]
#             if y > 0.5:
#                 results.append("1")
#             else:
#                 results.append("0")
#     with open(path + "/out.tsv", 'wt') as tsvfile:
#         tsv_writer = csv.writer(tsvfile, delimiter='\t')
#         for i in results:
#             tsv_writer.writerow(i)

# #make_dict("train/in.tsv")
# #train_model("train/in.tsv", "train/expected.tsv")

# def check_dev():
#     with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
#         counter = 0
#         positive = 0
#         for out_line, exp_line in zip(out_file, exp_file):
#             counter += 1
#             if out_line == exp_line:
#                 positive += 1
#         print(positive / counter)

# #predict("dev-0")
# #predict("test-A")
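
# The commented-out check_dev() above scores exact line matches, which fits
# the old 0/1 classifier but not the squared-error regression trained above.
# An RMSE check is the matching metric for that model; the helper below is
# a sketch under the same dev-0 file layout, not part of the original code.
from math import sqrt

def check_rmse(path):
    total = 0.0
    count = 0
    with open(path + "/out.tsv") as out_file, open(path + "/expected.tsv") as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            total += (float(out_line) - float(exp_line))**2
            count += 1
    print(sqrt(total / count))

# check_rmse("dev-0")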