# Using the NLTK library we can do a lot of text preprocessing.
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
#nltk.download('punkt')  # word_tokenize needs the punkt tokenizer data
from nltk.corpus import stopwords
import random
import pickle
import csv


# Function to split text into words and drop English stopwords.
def my_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens


# Tokenize every line (post) of the input file.
def post_list(in_file):
    post_list = []
    f = open(in_file, encoding="utf8")
    for i, line in enumerate(f):
        tokens = my_tokenize(line)
        post_list.append(tokens)
        # if i % 1000000 == 0:
        #     name = "posts" + str(i) + ".pickle"
        #     with open(name, 'wb') as handle:
        #         pickle.dump(post_list, handle)
        #     post_list = []
    f.close()
    # with open('posts.pickle', 'wb') as handle:
    #     pickle.dump(post_list, handle)
    return post_list


# Read the expected values (one float per line).
def exp_list(in_file):
    exp_list = []
    with open(in_file, encoding="utf8") as f:
        for line in f:
            exp_list.append(float(line))
    return exp_list


# Despite the name, this returns the tokenized posts concatenated from the
# cached pickle chunks; the word-weight dictionary itself was built once by
# the commented-out code below and is loaded from dict.pickle in train().
def make_dictionary():
    my_dict = dict()
    with open('posts1000000.pickle', 'rb') as f:
        posts = pickle.load(f)
    with open('posts2000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts3000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts4000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts.pickle', 'rb') as f:
        posts += pickle.load(f)
    # with open("allposts", 'wb') as handle:
    #     pickle.dump(posts, handle)
    # for post in posts:
    #     for t in post:
    #         if t not in my_dict:
    #             my_dict[t] = random.randint(-1, 1) * 0.1
    #
    # with open('dict.pickle', 'wb') as handle:
    #     pickle.dump(my_dict, handle)
    return posts


# Train a bag-of-words linear model with SGD: the prediction is the bias w0
# plus the sum of the weights of the tokens in the post.
def train(in_file, exp_file):
    el = exp_list(exp_file)
    print("el created")
    #pl = post_list(in_file)
    # with open('posts.pickle', 'rb') as f:
    #     pl = pickle.load(f)
    pl = make_dictionary()
    print("pl created")
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")
    lr = 0.00000005  # learning rate
    w0 = 2014        # bias, initialized near the expected year
    loss_sum = 0.0
    loss_sum_counter = 1
    while True:
        for post, y in zip(pl, el):
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]
            loss = (y_hat - y) ** 2
            loss_sum += loss
            if loss_sum_counter % 10000 == 0:
                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
                loss_sum = 0.0
            loss_sum_counter += 1
            # Update the weights: gradient step on the squared error
            # (the constant factor 2 is absorbed into the learning rate).
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta
            if loss_sum_counter > 40000000:
                break
        if loss_sum_counter > 40000000:
            break
    # We save only the things we need for prediction.
    model = (dictionary, w0)
    with open("model.pickle", "wb") as f:
        pickle.dump(model, f)


# Predict a value for every post in <path>\in.tsv and write it to <path>\out.tsv.
def predict(path):
    with open('model.pickle', 'rb') as f:
        dictionary, w0 = pickle.load(f)
    pl = post_list(path + "\\in.tsv")
    print("pl created")
    predictions = []
    for post in pl:
        y_hat = w0
        for token in post:
            # Tokens unseen during training simply contribute nothing.
            if token in dictionary:
                y_hat += dictionary[token]
        predictions.append(round(y_hat, 0))
    with open(path + "\\out.tsv", 'wt', newline='') as tsvfile:
        # One prediction per line.
        tsv_writer = csv.writer(tsvfile, delimiter='\n')
        tsv_writer.writerow(predictions)


#train("C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\in.tsv", "C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\expected.tsv")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\dev-0")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\test-A")