commit cd273579b2
parent bd02ae1b3c

dev-0/out.tsv (200000 changed lines)
File diff suppressed because it is too large
@@ -5,7 +5,10 @@ from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 import random
 import pickle
+import time
 #function to split text into word
+import os
+import csv
 
 def my_tokenize(text):
     tokens = word_tokenize(text)
@@ -16,46 +19,69 @@ def my_tokenize(text):
 
 def post_list(in_file):
     post_list = []
-    with open(in_file) as f:
-        for line in f:
+    f = open(in_file, encoding="utf8")
+    for i, line in enumerate(f):
         tokens = my_tokenize(line)
         post_list.append(tokens)
+        # if i%1000000 == 0:
+        #     name = "posts" + str(i) + ".pickle"
+        #     with open(name, 'wb') as handle:
+        #         pickle.dump(post_list, handle)
+        #         post_list = []
+    f.close()
+    # with open('posts.pickle', 'wb') as handle:
+    #     pickle.dump(post_list, handle)
     return post_list
 
 
 def exp_list(in_file):
     exp_list = []
-    with open(in_file) as f:
+    with open(in_file, encoding="utf8") as f:
         for line in f:
             exp_list.append(float(line))
 
     return exp_list
 
 
-def make_dictionary(posts):
+def make_dictionary():
     my_dict = dict()
-    for post in posts:
-        for t in post:
-            if not t in my_dict:
-                my_dict[t] = random.randint(-1,1)*0.1
-
-    with open('dict.pickle', 'wb') as handle:
-        pickle.dump(my_dict, handle)
+    with open('posts1000000.pickle', 'rb') as f:
+        posts = pickle.load(f)
+    with open('posts2000000.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+    with open('posts3000000.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+    with open('posts4000000.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+    with open('posts.pickle', 'rb') as f:
+        posts += (pickle.load(f))
+
+    # with open("allposts", 'wb') as handle:
+    #     pickle.dump(posts, handle)
+    # for post in posts:
+    #     for t in post:
+    #         if not t in my_dict:
+    #             my_dict[t] = random.randint(-1,1)*0.1
+    #
+    # with open('dict.pickle', 'wb') as handle:
+    #     pickle.dump(my_dict, handle)
+
+    return posts
 
 
 def train(in_file, exp_file):
-    pl = post_list(in_file)
-    print("pl created")
     el = exp_list(exp_file)
     print("el created")
-    #make_dictionary(pl)
+    #pl = post_list(in_file)
+    print("pl created")
+    # with open('posts.pickle', 'rb') as f:
+    #     pl = pickle.load(f)
+    pl = make_dictionary()
     with open('dict.pickle', 'rb') as f:
         dictionary = pickle.load(f)
     print("dict created")
-    lr = 0.001
-    w0 = 0.1
+    lr = 0.00000005
+    w0 = 2014
     loss_sum = 0
     loss_sum_counter = 1
 
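Aside on the hunk above: after this change make_dictionary() no longer initialises any weights; it just reloads and concatenates the pickled token lists and returns them, while train() still reads the weights from a pre-existing dict.pickle. Judging by the lines this commit comments out, that file holds one random starting weight per distinct token. A minimal sketch of how such a file could be (re)built, assuming dict.pickle's layout from the deleted code (the helper name build_dict_pickle is illustrative, not from the repo):

import pickle
import random

def build_dict_pickle(posts, path='dict.pickle'):
    # One starting weight in {-0.1, 0.0, 0.1} per distinct token,
    # mirroring the initialisation this commit comments out.
    my_dict = dict()
    for post in posts:
        for t in post:
            if t not in my_dict:
                my_dict[t] = random.randint(-1, 1) * 0.1
    with open(path, 'wb') as handle:
        pickle.dump(my_dict, handle)
    return my_dict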
@@ -80,151 +106,41 @@ def train(in_file, exp_file):
             dictionary[token] -= delta
 
 
-        if loss_sum_counter > 7000000:
+        if loss_sum_counter > 40000000:
+            break
+        if loss_sum_counter > 40000000:
             break
 
     #We save only things we need for prediction
-    model = (dictionary)
-    pickle.dump(model, open("model.pkl", "wb"))
+    model = (dictionary, w0)
+    pickle.dump(model, open("model.pickle", "wb"))
 
-train("train/in.tsv", "train/expected.tsv")
-
-# import csv
-# import re
-# import random
-# import json
-# from math import sqrt
-
-# def make_dict(path):
-#     dict = {}
-#     with open(path) as in_file:
-#         for line in in_file:
-#             for word in re.findall(r"[\w']+", line):
-#                 if not word in dict:
-#                     weight = round(random.random()%0.2-0.1,2)
-#                     dict[word] = weight
-
-#     print("dict maked")
-#     with open('dict.txt', 'w') as file:
-#         json.dump(dict, file)
-#     return dict
-
-# def make_posts_list(in_file):
-#     posts = []
-#     counter = 0
-#     with open(in_file) as f:
-#         for line in f:
-#             if counter < 1000:
-#                 posts.append(line)
-#             else:
-#                 counter +=1
-
-#     return posts
-
-# def make_exp_list(exp_file):
-#     exp_list = []
-#     with open(exp_file) as f:
-#         for exp_line in f:
-#             y = exp_line
-#             exp_list.append(float(y.split('\n')[0]))
-
-#     return exp_list
-
-# def train_model(in_path, exp_path):
-#     with open('dict.txt', 'r') as file:
-#         dict = json.load(file)
-#     posts = make_posts_list(in_path)
-#     exp = make_exp_list(exp_path)
-#     w0 = 2013
-#     lr = 0.0000001
-#     epchos = 0
-#     loss_sum = 0
-#     last_sum = 10
-#     loss_counter = 0
-#     print("Zaczynam")
-#     while epchos < 10000:
-
-#         loss_cost = 0
-#         for in_line, exp_line in zip(posts, exp):
-#             loss_counter+=1
-#             # a random example from the training set
-#             #print("new post" + str(random.randint(0,10)))
-#             post = in_line
-#             error_rate = 1
-#             y = int(exp_line)
-#             #loop_counter = 0
-#             #while (error_rate > 0.2 and loop_counter < 10000):
-#                 #loop_counter +=1
-#             y_hat = w0
-#             for word in re.findall(r"[\w']+", post):
-#                 #dict[word] -= (y_hat - y)*lr
-#                 y_hat += dict[word]
-#             loss = (y_hat - y)**2
-#             loss_sum += loss
-#             #error_rate = (y_hat - y)**2
-#             # if loop_counter%1000 == 0:
-#             #     print(error_rate)
-#             #     loss_cost += error_rate
-#             # if loss_counter%1000==0:
-#             #     print(loss_sum/1000)
-#             #     loss_sum = 0
-
-#             # learning step
-#             delta = (y_hat - y) * lr
-#             w0 = w0 - delta
-#             for word in re.findall(r"[\w']+", post):
-#                 dict[word] -= delta
-
-#     real_loss = loss_sum/loss_counter
-#     print(real_loss)
-#     # if real_loss > last_sum:
-#     #     break
-#     # else:
-#     #     last_sum = real_loss
-#     last_sum = real_loss
-#     loss_sum = 0
-#     loss_counter = 0
-#     epchos +=1
-#     with open('dict2.txt', 'w') as file:
-#         json.dump(dict, file)
-
-# def predict(path):
-#     results = []
-#     with open('dict2.txt', 'r') as file:
-#         dict = json.load(file)
-
-#     with open(path+"/in.tsv") as in_file:
-#         for in_line in in_file:
-#             print("new post" + str(random.randint(0,10)))
-#             post = in_line
-#             y=0
-#             for word in re.findall(r"[\w']+", post):
-#                 if word in dict:
-#                     y += dict[word]
-#             if y > 0.5:
-#                 results.append("1")
-#             else:
-#                 results.append("0")
-
-#     with open(path+"/out.tsv", 'wt') as tsvfile:
-#         tsv_writer = csv.writer(tsvfile, delimiter='\t')
-#         for i in results:
-#             tsv_writer.writerow(i)
-
-# #make_dict("train/in.tsv")
-# #train_model("train/in.tsv", "train/expected.tsv")
-
-# def check_dev():
-#     with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
-#         counter = 0
-#         positive = 0
-#         for out_line, exp_line in zip(out_file, exp_file):
-#             counter+=1
-#             if out_line == exp_line:
-#                 positive += 1
-#     print(positive/counter)
-
-# #predict("dev-0")
-# #predict("test-A")
+
+def predict(path):
+    with open('model.pickle', 'rb') as f:
+        dictionary, w0 = pickle.load(f)
+    pl = post_list(path+"\\in.tsv")
+    print("pl created")
+    exp_list = []
+    for post in pl:
+        y_hat = w0
+        for token in post:
+            try:
+                if token in dictionary:
+                    y_hat += dictionary[token]
+            except KeyError:
+                print("blad")
+        exp_list.append(y_hat)
+
+    with open(path+"\\out.tsv", 'wt') as tsvfile:
+        tsv_writer = csv.writer(tsvfile, delimiter='\t')
+        # for i in exp_list:
+        #     tsv_writer.writerow(i)
+        tsv_writer.writerows(map(lambda x: [-x], exp_list))
+
+#train("C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\in.tsv", "C:\\Artur\\repos\\UAM\\guess-reddit-date\\train\\expected.tsv")
+predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\dev-0")
+predict("C:\\Artur\\repos\\UAM\\guess-reddit-date\\test-A")
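The training loop itself sits in the lines the third hunk elides; what is visible (dictionary[token] -= delta, the break threshold, lr and w0) matches the SGD update in the deleted commented-out train_model: predict y_hat = w0 plus the sum of per-token weights, then move w0 and every token weight by delta = (y_hat - y) * lr. A self-contained sketch of one such pass, under those assumptions (the function name sgd_pass and the toy data are illustrative, not from the commit):

import random

def sgd_pass(posts, years, weights, w0, lr=0.00000005):
    # One pass over the data: y_hat = w0 + sum(weights[token]);
    # w0 and each token weight move by delta = (y_hat - y) * lr.
    loss_sum = 0.0
    for tokens, y in zip(posts, years):
        for t in tokens:
            weights.setdefault(t, random.randint(-1, 1) * 0.1)
        y_hat = w0 + sum(weights[t] for t in tokens)
        delta = (y_hat - y) * lr
        w0 -= delta
        for t in tokens:
            weights[t] -= delta
        loss_sum += (y_hat - y) ** 2
    return w0, loss_sum / len(posts)

# Toy usage (illustrative data, not the repo's train/in.tsv):
posts = [["hello", "world"], ["old", "reddit", "post"]]
years = [2014.0, 2009.0]
weights, w0 = {}, 2014.0
w0, mse = sgd_pass(posts, years, weights, w0)
print(w0, mse)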
test-A/out.tsv (200000 changed lines)
File diff suppressed because it is too large
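Both out.tsv diffs are suppressed, so a quick way to sanity-check the 200000 written predictions is to score them against the gold labels, for example with an RMSE over dev-0 (the paths follow the repo layout seen above; the metric choice and the script itself are a sketch, not part of the commit):

import math

def rmse(out_path="dev-0/out.tsv", exp_path="dev-0/expected.tsv"):
    # Root-mean-square error between predicted and expected values.
    with open(out_path) as out_file, open(exp_path) as exp_file:
        pairs = [(float(o), float(e)) for o, e in zip(out_file, exp_file)]
    return math.sqrt(sum((o - e) ** 2 for o, e in pairs) / len(pairs))

print(rmse())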