# Using the NLTK library, we can do a lot of text preprocessing.
import nltk
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import pickle


# Function to split text into words, dropping English stop words.
def my_tokenize(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    return tokens


def post_list(in_file):
    posts = []
    with open(in_file) as f:
        for line in f:
            posts.append(my_tokenize(line))
    return posts


def exp_list(in_file):
    expected = []
    with open(in_file) as f:
        for line in f:
            expected.append(float(line))
    return expected


def make_dictionary(posts):
    # Give every distinct token a small starting weight. Note that
    # random.randint(-1, 1) * 0.1 yields only -0.1, 0.0 or 0.1.
    my_dict = dict()
    for post in posts:
        for t in post:
            if t not in my_dict:
                my_dict[t] = random.randint(-1, 1) * 0.1
    with open('dict.pickle', 'wb') as handle:
        pickle.dump(my_dict, handle)


def train(in_file, exp_file):
    pl = post_list(in_file)
    print("pl created")
    el = exp_list(exp_file)
    print("el created")
    # make_dictionary(pl)  # run once first, so dict.pickle exists
    with open('dict.pickle', 'rb') as f:
        dictionary = pickle.load(f)
    print("dict created")
    lr = 0.001
    w0 = 0.1
    loss_sum = 0.0
    loss_sum_counter = 1
    while True:
        for post, y in zip(pl, el):
            # Prediction is the bias plus the weights of all tokens in the post.
            y_hat = w0
            for token in post:
                y_hat += dictionary[token]
            loss = (y_hat - y)**2
            loss_sum += loss
            if loss_sum_counter % 10000 == 0:
                # Average loss over the last 10,000 updates.
                print(str(loss_sum_counter) + " " + str(loss_sum / 10000))
                loss_sum = 0.0
            loss_sum_counter += 1
            # Updating weights (SGD; the gradient's factor of 2 is folded into lr).
            delta = (y_hat - y) * lr
            w0 -= delta
            for token in post:
                dictionary[token] -= delta
        # Stop after roughly 7 million updates (a break here exits the while loop).
        if loss_sum_counter > 7000000:
            break
    # We save only the things we need for prediction.
    model = dictionary
    with open("model.pkl", "wb") as f:
        pickle.dump(model, f)


train("train/in.tsv", "train/expected.tsv")
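
# The script above saves model.pkl but has no prediction step. Below is a
# minimal prediction sketch, not part of the original pipeline: it assumes
# the dev-0/test-A directory layout used by the older code further down,
# reuses my_tokenize() so tokenization matches training, and skips tokens
# that never appeared in the training dictionary. Note that train() does
# not store w0 in model.pkl, so the bias here defaults to 0; to be exact,
# pickle (w0, dictionary) together instead.
def predict(path, w0=0.0):
    with open("model.pkl", "rb") as f:
        dictionary = pickle.load(f)
    with open(path + "/in.tsv") as in_file, open(path + "/out.tsv", "w") as out_file:
        for line in in_file:
            y_hat = w0
            for token in my_tokenize(line):
                if token in dictionary:  # unseen tokens have no learned weight
                    y_hat += dictionary[token]
            out_file.write(str(y_hat) + "\n")

# predict("dev-0")
# predict("test-A")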

# Older version, kept below for reference: regex tokenization, JSON weight
# files, a 0/1 thresholded predict(), and a dev-set accuracy check.

# import csv
# import re
# import random
# import json
# from math import sqrt

# def make_dict(path):
#     dict = {}
#     with open(path) as in_file:
#         for line in in_file:
#             for word in re.findall(r"[\w']+", line):
#                 if word not in dict:
#                     weight = round(random.random() % 0.2 - 0.1, 2)
#                     dict[word] = weight
#     print("dict made")
#     with open('dict.txt', 'w') as file:
#         json.dump(dict, file)
#     return dict

# def make_posts_list(in_file):
#     posts = []
#     counter = 0
#     with open(in_file) as f:
#         for line in f:
#             if counter < 1000:
#                 posts.append(line)
#             else:
#                 counter += 1
#     return posts

# def make_exp_list(exp_file):
#     exp_list = []
#     with open(exp_file) as f:
#         for exp_line in f:
#             y = exp_line
#             exp_list.append(float(y.split('\n')[0]))
#     return exp_list

# def train_model(in_path, exp_path):
#     with open('dict.txt', 'r') as file:
#         dict = json.load(file)
#     posts = make_posts_list(in_path)
#     exp = make_exp_list(exp_path)
#     w0 = 2013
#     lr = 0.0000001
#     epochs = 0
#     loss_sum = 0
#     last_sum = 10
#     loss_counter = 0
#     print("Starting")
#     while epochs < 10000:
#         loss_cost = 0
#         for in_line, exp_line in zip(posts, exp):
#             loss_counter += 1
#             # random example from the training set
#             #print("new post" + str(random.randint(0,10)))
#             post = in_line
#             error_rate = 1
#             y = int(exp_line)
#             #loop_counter = 0
#             #while (error_rate > 0.2 and loop_counter < 10000):
#             #loop_counter += 1
#             y_hat = w0
#             for word in re.findall(r"[\w']+", post):
#                 #dict[word] -= (y_hat - y)*lr
#                 y_hat += dict[word]
#             loss = (y_hat - y)**2
#             loss_sum += loss
#             #error_rate = (y_hat - y)**2
#             # if loop_counter % 1000 == 0:
#             #     print(error_rate)
#             # loss_cost += error_rate
#             # if loss_counter % 1000 == 0:
#             #     print(loss_sum / 1000)
#             #     loss_sum = 0
#             # weight update
#             delta = (y_hat - y) * lr
#             w0 = w0 - delta
#             for word in re.findall(r"[\w']+", post):
#                 dict[word] -= delta
#         real_loss = loss_sum / loss_counter
#         print(real_loss)
#         # if real_loss > last_sum:
#         #     break
#         # else:
#         #     last_sum = real_loss
#         last_sum = real_loss
#         loss_sum = 0
#         loss_counter = 0
#         epochs += 1
#     with open('dict2.txt', 'w') as file:
#         json.dump(dict, file)

# def predict(path):
#     results = []
#     with open('dict2.txt', 'r') as file:
#         dict = json.load(file)
#     with open(path + "/in.tsv") as in_file:
#         for in_line in in_file:
#             print("new post" + str(random.randint(0,10)))
#             post = in_line
#             y = 0
#             for word in re.findall(r"[\w']+", post):
#                 if word in dict:
#                     y += dict[word]
#             if y > 0.5:
#                 results.append("1")
#             else:
#                 results.append("0")
#     with open(path + "/out.tsv", 'wt') as tsvfile:
#         tsv_writer = csv.writer(tsvfile, delimiter='\t')
#         for i in results:
#             tsv_writer.writerow(i)

# #make_dict("train/in.tsv")
# #train_model("train/in.tsv", "train/expected.tsv")

# def check_dev():
#     with open("dev-0/out.tsv") as out_file, open("dev-0/expected.tsv") as exp_file:
#         counter = 0
#         positive = 0
#         for out_line, exp_line in zip(out_file, exp_file):
#             counter += 1
#             if out_line == exp_line:
#                 positive += 1
#         print(positive / counter)

# #predict("dev-0")
# #predict("test-A")
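
# The commented-out check_dev() above scores exact line matches, which fits
# the old 0/1 classifier but not the squared-error regression trained above.
# An RMSE check is the matching metric for that model; the helper below is
# a sketch under the same dev-0 file layout, not part of the original code.
from math import sqrt

def check_rmse(path):
    total = 0.0
    count = 0
    with open(path + "/out.tsv") as out_file, open(path + "/expected.tsv") as exp_file:
        for out_line, exp_line in zip(out_file, exp_file):
            total += (float(out_line) - float(exp_line))**2
            count += 1
    print(sqrt(total / count))

# check_rmse("dev-0")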