# Using the NLTK library, we can do a lot of text preprocessing.
import nltk
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
from nltk.corpus import stopwords
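# Note: word_tokenize also needs the 'punkt' tokenizer data; if it is
# missing, run nltk.download('punkt') once as well.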
import random
import pickle
import time
import os
import csv


# Function to split text into words and drop English stop words.
def my_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens


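# A quick hypothetical check of my_tokenize (assuming the 'punkt' and
# 'stopwords' data are installed). The stop-word comparison is
# case-sensitive, so a capitalised "The" survives while the lowercase
# stop words are dropped:
#
#   my_tokenize("The cat sat on the mat.")
#   -> ['The', 'cat', 'sat', 'mat', '.']
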
def post_list(in_file):
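    """Tokenize every line of in_file; returns one token list per post."""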
    posts = []
    with open(in_file, encoding="utf8") as f:
        for i, line in enumerate(f):
            tokens = my_tokenize(line)
            posts.append(tokens)
            # if i % 1000000 == 0:
            #     name = "posts" + str(i) + ".pickle"
            #     with open(name, 'wb') as handle:
            #         pickle.dump(posts, handle)
            #     posts = []
    # with open('posts.pickle', 'wb') as handle:
    #     pickle.dump(posts, handle)
    return posts


def exp_list(in_file):
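    """Read the expected value (a year) from each line of in_file."""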
    expected = []
    with open(in_file, encoding="utf8") as f:
        for line in f:
            expected.append(float(line))
    return expected


def make_dictionary():
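    """Reload the tokenized posts cached in chunks by post_list().

    Despite the name, the dictionary-building step is commented out
    below; the weights are loaded from dict.pickle in train() instead.
    """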
    my_dict = dict()
    with open('posts1000000.pickle', 'rb') as f:
        posts = pickle.load(f)
    with open('posts2000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts3000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts4000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts.pickle', 'rb') as f:
        posts += pickle.load(f)

    # with open("allposts", 'wb') as handle:
    #     pickle.dump(posts, handle)
    # for post in posts:
    #     for t in post:
    #         if t not in my_dict:
    #             my_dict[t] = random.randint(-1, 1) * 0.1
    #
    # with open('dict.pickle', 'wb') as handle:
    #     pickle.dump(my_dict, handle)

    return posts


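# The model is a bag-of-words linear regression: a prediction is w0 plus
# the weight of every token in the post, and the weights are updated by
# plain SGD on the squared error (lr absorbs the constant factor of 2).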
def train(in_file, exp_file):
    el = exp_list(exp_file)
    print("el created")
    # pl = post_list(in_file)
    print("pl created")
    # with open('posts.pickle', 'rb') as f:
    #     pl = pickle.load(f)
    pl = make_dictionary()
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")

    lr = 0.00000005
    w0 = 2014  # initial bias, roughly in the range of the expected years
    loss_sum = 0
    loss_sum_counter = 1

    while True:
        for post, y in zip(pl, el):
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]

            loss = (y_hat - y) ** 2
            loss_sum += loss

            # Print the average loss over the last 10000 examples.
            if loss_sum_counter % 10000 == 0:
                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
                loss_sum = 0.0
            loss_sum_counter += 1

            # Updating weights.
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta

            if loss_sum_counter > 40000000:
                break
        # Repeated so the outer loop stops too once the limit is reached.
        if loss_sum_counter > 40000000:
            break

    # We save only the things we need for prediction.
    model = (dictionary, w0)
    with open("model.pickle", "wb") as handle:
        pickle.dump(model, handle)


def predict(path):
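    """Load model.pickle and write one prediction per line of in.tsv into out.tsv under path."""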
    with open('model.pickle', 'rb') as f:
        dictionary, w0 = pickle.load(f)
    pl = post_list(path + "\\in.tsv")
    print("pl created")

    predictions = []
    for post in pl:
        y_hat = w0
        # Tokens never seen in training have no weight and are skipped,
        # so no KeyError can occur here.
        for token in post:
            if token in dictionary:
                y_hat += dictionary[token]
        predictions.append(y_hat)

    with open(path + "\\out.tsv", 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\t')
        # for i in predictions:
        #     tsv_writer.writerow([i])
        # One (negated) prediction per row.
        tsv_writer.writerows(map(lambda x: [-x], predictions))


# train("C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\in.tsv", "C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\expected.tsv")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\dev-0")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\test-A")
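
# A minimal sketch of scoring a single post with the saved model
# (hypothetical usage; assumes model.pickle exists next to this script):
#
#   with open('model.pickle', 'rb') as f:
#       dictionary, w0 = pickle.load(f)
#   tokens = my_tokenize("some reddit post text")
#   year = w0 + sum(dictionary.get(t, 0.0) for t in tokens)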