guess-reddit-date/linear_regression.py

# Using the NLTK library we can do a lot of text preprocessing.
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import pickle
import time
import os
import csv

# Split the text into word tokens and drop English stopwords.
def my_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens
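
# A quick sanity check for my_tokenize (illustrative only; the exact tokens
# depend on the installed NLTK models, and stopword filtering is
# case-sensitive because tokens are never lowercased):
#   my_tokenize("the cat sat on the mat")  ->  ['cat', 'sat', 'mat']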
def post_list(in_file):
    post_list = []
    with open(in_file, encoding="utf8") as f:
        for i, line in enumerate(f):
            tokens = my_tokenize(line)
            post_list.append(tokens)
            # if i % 1000000 == 0:
            #     name = "posts" + str(i) + ".pickle"
            #     with open(name, 'wb') as handle:
            #         pickle.dump(post_list, handle)
            #     post_list = []
    # with open('posts.pickle', 'wb') as handle:
    #     pickle.dump(post_list, handle)
    return post_list
def exp_list(in_file):
    exp_list = []
    with open(in_file, encoding="utf8") as f:
        for line in f:
            exp_list.append(float(line))
    return exp_list
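
# Each line of the expected file is parsed as a single float target; for
# guess-reddit-date this is presumably the posting date expressed as a year,
# which would also explain why w0 is initialised to 2014 in train().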
def make_dictionary():
    # Despite the name, this currently just reloads and concatenates the
    # pickled post chunks; the original weight-initialisation code is kept
    # commented out below.
    my_dict = dict()
    with open('posts1000000.pickle', 'rb') as f:
        posts = pickle.load(f)
    with open('posts2000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts3000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts4000000.pickle', 'rb') as f:
        posts += pickle.load(f)
    with open('posts.pickle', 'rb') as f:
        posts += pickle.load(f)
    # with open("allposts", 'wb') as handle:
    #     pickle.dump(posts, handle)
    # for post in posts:
    #     for t in post:
    #         if t not in my_dict:
    #             my_dict[t] = random.randint(-1, 1) * 0.1
    #
    # with open('dict.pickle', 'wb') as handle:
    #     pickle.dump(my_dict, handle)
    return posts
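
# The model below is a bag-of-words linear regression: for a given post,
# y_hat = w0 + the sum of the weights of its tokens, fitted with plain SGD on
# the squared loss (y_hat - y)**2. The exact gradient for each active weight
# is 2 * (y_hat - y); the constant factor 2 is simply absorbed into the tiny
# learning rate lr.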
def train(in_file, exp_file):
    el = exp_list(exp_file)
    print("el created")
    #pl = post_list(in_file)
    print("pl created")
    # with open('posts.pickle', 'rb') as f:
    #     pl = pickle.load(f)
    pl = make_dictionary()
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")
    lr = 0.00000005
    w0 = 2014
    loss_sum = 0
    loss_sum_counter = 1
    while True:
        for post, y in zip(pl, el):
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]
            loss = (y_hat - y) ** 2
            loss_sum += loss
            # report the running average loss every 10000 posts
            if loss_sum_counter % 10000 == 0:
                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
                loss_sum = 0.0
            loss_sum_counter += 1
            # updating the weights
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta
            if loss_sum_counter > 40000000:
                break
        if loss_sum_counter > 40000000:
            break
    # we save only the things we need for prediction
    model = (dictionary, w0)
    with open("model.pickle", "wb") as handle:
        pickle.dump(model, handle)
def predict(path):
    with open('model.pickle', 'rb') as f:
        dictionary, w0 = pickle.load(f)
    pl = post_list(os.path.join(path, "in.tsv"))
    print("pl created")
    predictions = []
    for post in pl:
        y_hat = w0
        for token in post:
            # tokens never seen during training simply contribute nothing
            if token in dictionary:
                y_hat += dictionary[token]
        predictions.append(round(y_hat, 0))
    # write each prediction on its own line (newline used as the csv delimiter)
    with open(os.path.join(path, "out.tsv"), 'wt') as tsvfile:
        tsv_writer = csv.writer(tsvfile, delimiter='\n')
        tsv_writer.writerow(predictions)
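
# out.tsv gets one rounded prediction per line, matching in.tsv line for
# line, which appears to be the layout the challenge's evaluation expects.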
#train("C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\in.tsv", "C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\expected.tsv")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\dev-0")
predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\test-A")