import pickle
import re


def calculate_words(linetxt):
    """Count occurrences of each space-separated token in *linetxt*.

    The text is split on single space characters, so consecutive spaces
    produce empty-string tokens.

    Returns a dict mapping token -> number of occurrences. The entry for
    the empty string ``''`` is always forced to 1, regardless of how many
    empty tokens the split produced.
    """
    counts = {}
    for piece in linetxt.split(' '):
        counts[piece] = counts.get(piece, 0) + 1
        # Pin the empty-token entry to 1 after every token (this also
        # resets it when the current piece is itself the empty string).
        counts[''] = 1
    return counts

def tokenize_list(string_input):
    """Clean raw text into a space-separated string of lowercase words.

    Despite the name, the result is a single string, not a list. The
    caller is expected to lowercase the text beforehand: every character
    outside ``a-z`` is turned into a space.

    The substitutions below are applied strictly in order; later rules
    operate on the output of earlier ones.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        # Strip scheme://host/path style URLs entirely.
        (r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ''),
        # Literal backslash followed by one or more 'n' characters.
        (r'\\n+', " "),
        # Anything left that starts with "http" up to the next whitespace.
        (r'http\S+', " "),
        # A single lowercase letter wrapped in slashes, e.g. "/r/".
        (r'\/[a-z]\/', " "),
        # Every character that is not a lowercase ASCII letter.
        (r'[^a-z]', " "),
        # Collapse whitespace runs of two or more into one space.
        (r'\s{2,}', " "),
        # Words of 1-3 characters followed by a non-word character, either
        # preceded by one or at the start of the text. Matches do not
        # overlap, so a short word directly after a removed one (or at the
        # very end of the text) is left in place.
        (r'\W\w{1,3}\W|\A\w{1,3}\W', " "),
        # Drop a single leading whitespace character.
        (r'^\s', ""),
    )

    # Replace the two-character sequence backslash + 'n' with a space first.
    cleaned = string_input.replace('\\n', ' ')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def prediction(input,output):
    """Label every line of a TSV file with 0 or 1 using the pickled model.

    Parameters:
        input:  path of a UTF-8 TSV file; each line must contain exactly
                two tab-separated fields (text, timestamp). Only the text
                field is used.
        output: path of the file to write; it receives one line per input
                line, "1" when the score exceeds 0.5 and "0" otherwise.

    The model is read from ``model_linear_reg.pkl`` in the current working
    directory and must unpickle to a ``(weights, word, vocabulary)`` triple:
    ``weights[0]`` is used as the starting score, ``word[token]`` is the
    index into ``weights`` for a token, and ``vocabulary`` is the container
    used to test whether a token is known.

    Raises:
        ValueError: if an input line does not split into exactly two
            tab-separated fields.
        OSError: if the model, input or output file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load a model file
    # that comes from a trusted source.
    with open('model_linear_reg.pkl', 'rb') as model_f:
        weights, word, vocabulary = pickle.load(model_f)

    # Both files are managed by `with` so they are closed even when a
    # malformed line raises part-way through. The output file is opened
    # first, as before, so it is created/truncated before the input is read.
    with open(output, 'w') as output_f, \
            open(input, encoding='utf-8') as input_f:
        for line in input_f:
            text, _timestamp = line.rstrip('\n').split('\t')
            cleaned = tokenize_list(text.lower())
            line_vocabulary = calculate_words(cleaned)

            y_hat = weights[0]
            # NOTE(review): this iterates over every token occurrence, so a
            # token appearing k times contributes weight * k on each of its
            # k occurrences. Kept as-is; confirm it matches how the model
            # was trained before changing it.
            for token in cleaned.split(' '):
                if token in vocabulary:
                    y_hat += weights[word[token]] * line_vocabulary[token]

            output_f.write("1\n" if y_hat > 0.5 else "0\n")


def main():
    """Run the classifier on the dev-0 and test-A datasets.

    Each call reads ``<dataset>/in.tsv`` and writes ``<dataset>/out.tsv``,
    with paths relative to the current working directory.
    """
    prediction("dev-0/in.tsv","dev-0/out.tsv")
    prediction("test-A/in.tsv","test-A/out.tsv")


# Only run when executed as a script, so importing this module (e.g. to
# reuse or test the helper functions) does not trigger model loading and
# file I/O.
if __name__ == "__main__":
    main()