new solution

This commit is contained in:
dylodylo 2020-05-02 22:45:56 +02:00
parent 5c9327ab4b
commit 4e8abfc83d

View File

@ -1,140 +1,230 @@
import csv #using the NLTK library, we can do a lot of text preprocessing
import re import nltk
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
from nltk.corpus import stopwords
import random import random
import json import pickle
from math import sqrt #function to split text into word tokens, dropping English stop words
# Prints ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here'] def my_tokenize(text):
def make_dict(path): tokens = word_tokenize(text)
dict = {} stop_words = set(stopwords.words('english'))
with open(path) as in_file: tokens = [w for w in tokens if not w in stop_words]
for line in in_file: return tokens
for word in re.findall(r"[\w']+", line):
if not word in dict:
weight = round(random.random()%0.2-0.1,2)
dict[word] = weight
print("dict maked")
with open('dict.txt', 'w') as file:
json.dump(dict, file)
return dict
def make_posts_list(in_file):
posts = [] def post_list(in_file):
counter = 0 post_list = []
with open(in_file) as f: with open(in_file) as f:
for line in f: for line in f:
if counter < 1000: tokens = my_tokenize(line)
posts.append(line) post_list.append(tokens)
else:
counter +=1
return posts
def make_exp_list(exp_file): return post_list
def exp_list(in_file):
exp_list = [] exp_list = []
with open(exp_file) as f: with open(in_file) as f:
for exp_line in f: for line in f:
y = exp_line exp_list.append(float(line))
exp_list.append(float(y.split('\n')[0]))
return exp_list return exp_list
def train_model(in_path, exp_path):
with open('dict.txt', 'r') as file: def make_dictionary(posts):
dict = json.load(file) my_dict = dict()
posts = make_posts_list(in_path) for post in posts:
exp = make_exp_list(exp_path) for t in post:
w0 = 2013 if not t in my_dict:
lr = 0.0000001 my_dict[t] = random.randint(-1,1)*0.1
epchos = 0
with open('dict.pickle', 'wb') as handle:
pickle.dump(my_dict, handle)
def train(in_file, exp_file):
pl = post_list(in_file)
print("pl created")
el = exp_list(exp_file)
print("el created")
#make_dictionary(pl)
with open('dict.pickle', 'rb') as f:
dictionary = pickle.load(f)
print("dict created")
lr = 0.001
w0 = 0.1
loss_sum = 0 loss_sum = 0
last_sum = 10 loss_sum_counter = 1
loss_counter = 0
print("Zaczynam")
while epchos < 10000: while True:
for post, y in zip(pl,el):
loss_cost = 0
for in_line, exp_line in zip(posts, exp):
loss_counter+=1
#losowy przykład ze zbioru uczącego
#print("new post" + str(random.randint(0,10)))
post = in_line
error_rate = 1
y = int(exp_line)
#loop_counter = 0
#while (error_rate > 0.2 and loop_counter < 10000):
#loop_counter +=1
y_hat = w0 y_hat = w0
for word in re.findall(r"[\w']+", post): for token in post:
#dict[word] -= (y_hat - y)*lr y_hat += dictionary[token]
y_hat += dict[word]
loss = (y_hat - y)**2 loss = (y_hat - y)**2
loss_sum += loss loss_sum += loss
#error_rate = (y_hat - y)**2
# if loop_counter%1000 == 0:
# print(error_rate)
# loss_cost += error_rate
# if loss_counter%1000==0:
# print(loss_sum/1000)
# loss_sum = 0
#uczenie if loss_sum_counter % 10000 == 0:
print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
loss_sum = 0.0
loss_sum_counter += 1
#updating weights
delta = (y_hat - y) * lr delta = (y_hat - y) * lr
w0 = w0 - delta w0 -= delta
for word in re.findall(r"[\w']+", post): for token in post:
dict[word] -= delta dictionary[token] -= delta
if loss_sum_counter > 7000000:
break
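#We save only the things we need for prediction.
#NOTE(review): only the word-weight dictionary is pickled below; the bias w0 that is added to every y_hat during training is not saved - confirm the predictor does not need it.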
model = (dictionary)
pickle.dump(model, open("model.pkl", "wb"))
train("train/in.tsv", "train/expected.tsv")
# import csv
# import re
# import random
# import json
# from math import sqrt
# def make_dict(path):
# dict = {}
# with open(path) as in_file:
# for line in in_file:
# for word in re.findall(r"[\w']+", line):
# if not word in dict:
# weight = round(random.random()%0.2-0.1,2)
# dict[word] = weight
# print("dict maked")
# with open('dict.txt', 'w') as file:
# json.dump(dict, file)
# return dict
# def make_posts_list(in_file):
# posts = []
# counter = 0
# with open(in_file) as f:
# for line in f:
# if counter < 1000:
# posts.append(line)
# else:
# counter +=1
# return posts
# def make_exp_list(exp_file):
# exp_list = []
# with open(exp_file) as f:
# for exp_line in f:
# y = exp_line
# exp_list.append(float(y.split('\n')[0]))
# return exp_list
# def train_model(in_path, exp_path):
# with open('dict.txt', 'r') as file:
# dict = json.load(file)
# posts = make_posts_list(in_path)
# exp = make_exp_list(exp_path)
# w0 = 2013
# lr = 0.0000001
# epchos = 0
# loss_sum = 0
# last_sum = 10
# loss_counter = 0
# print("Zaczynam")
# while epchos < 10000:
# loss_cost = 0
# for in_line, exp_line in zip(posts, exp):
# loss_counter+=1
# #losowy przykład ze zbioru uczącego
# #print("new post" + str(random.randint(0,10)))
# post = in_line
# error_rate = 1
# y = int(exp_line)
# #loop_counter = 0
# #while (error_rate > 0.2 and loop_counter < 10000):
# #loop_counter +=1
# y_hat = w0
# for word in re.findall(r"[\w']+", post):
# #dict[word] -= (y_hat - y)*lr
# y_hat += dict[word]
# loss = (y_hat - y)**2
# loss_sum += loss
# #error_rate = (y_hat - y)**2
# # if loop_counter%1000 == 0:
# # print(error_rate)
# # loss_cost += error_rate
# # if loss_counter%1000==0:
# # print(loss_sum/1000)
# # loss_sum = 0
# #uczenie
# delta = (y_hat - y) * lr
# w0 = w0 - delta
# for word in re.findall(r"[\w']+", post):
# dict[word] -= delta
real_loss = loss_sum/loss_counter # real_loss = loss_sum/loss_counter
print(real_loss) # print(real_loss)
# if real_loss > last_sum: # # if real_loss > last_sum:
# break # # break
# else: # # else:
# last_sum = real_loss # # last_sum = real_loss
last_sum = real_loss # last_sum = real_loss
loss_sum = 0 # loss_sum = 0
loss_counter = 0 # loss_counter = 0
epchos +=1 # epchos +=1
with open('dict2.txt', 'w') as file: # with open('dict2.txt', 'w') as file:
json.dump(dict, file) # json.dump(dict, file)
def predict(path): # def predict(path):
results = [] # results = []
with open('dict2.txt', 'r') as file: # with open('dict2.txt', 'r') as file:
dict = json.load(file) # dict = json.load(file)
with open(path+"/in.tsv") as in_file: # with open(path+"/in.tsv") as in_file:
for in_line in in_file: # for in_line in in_file:
print("new post" + str(random.randint(0,10))) # print("new post" + str(random.randint(0,10)))
post = in_line # post = in_line
y=0 # y=0
for word in re.findall(r"[\w']+", post): # for word in re.findall(r"[\w']+", post):
if word in dict: # if word in dict:
y += dict[word] # y += dict[word]
if y > 0.5: # if y > 0.5:
results.append("1") # results.append("1")
else: # else:
results.append("0") # results.append("0")
with open(path+"/out.tsv", 'wt') as tsvfile: # with open(path+"/out.tsv", 'wt') as tsvfile:
tsv_writer = csv.writer(tsvfile, delimiter='\t') # tsv_writer = csv.writer(tsvfile, delimiter='\t')
for i in results: # for i in results:
tsv_writer.writerow(i) # tsv_writer.writerow(i)
#make_dict("train/in.tsv") # #make_dict("train/in.tsv")
train_model("train/in.tsv", "train/expected.tsv") # #train_model("train/in.tsv", "train/expected.tsv")
def check_dev(): # def check_dev():
with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file: # with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
counter = 0 # counter = 0
positive = 0 # positive = 0
for out_line, exp_line in zip(out_file, exp_file): # for out_line, exp_line in zip(out_file, exp_file):
counter+=1 # counter+=1
if out_line == exp_line: # if out_line == exp_line:
positive += 1 # positive += 1
print(positive/counter) # print(positive/counter)
#predict("dev-0") # #predict("dev-0")
#predict("test-A") # #predict("test-A")