#!/usr/bin/python3
"""Build a term dictionary and index from labeled documents read on stdin.

Input: tab-separated lines of the form "<label>\t<document>".
Output: dictionary.pkl, a pickled tuple of
(dictionary, documents, document_terms, word_to_index_mapping, train_y).
"""
import sys
import pickle

from tokenizator import tokenize

# Process at most this many input documents.
MAX_DOCUMENTS = 15000


def main():
    dictionary = set()    # all distinct terms seen across documents
    train_y = []          # integer labels, one per document
    documents = []        # raw document texts
    document_terms = {}   # document index -> list of its terms
    counter = 0
    for line in sys.stdin:
        if counter > MAX_DOCUMENTS:
            break
        fields = line.strip().split('\t')
        if len(fields) < 2:
            continue  # skip malformed lines
        label = fields[0].strip()
        document = fields[1]
        documents.append(document)
        train_y.append(int(label))
        terms = tokenize(document)
        document_terms[counter] = terms
        dictionary.update(terms)
        counter += 1

    # Assign each distinct term a 1-based index (index 0 is left unused).
    word_to_index_mapping = {w: i for i, w in enumerate(dictionary, start=1)}

    model = (dictionary, documents, document_terms, word_to_index_mapping, train_y)
    # Use a context manager so the file handle is closed after writing.
    with open('dictionary.pkl', 'wb') as f:
        pickle.dump(model, f)


if __name__ == '__main__':
    main()
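
# A minimal sketch of how a downstream script might consume dictionary.pkl.
# The unpacking mirrors the tuple built above; the consumer script itself is
# an assumption, not part of this repo:
#
#   import pickle
#
#   with open('dictionary.pkl', 'rb') as f:
#       (dictionary, documents, document_terms,
#        word_to_index_mapping, train_y) = pickle.load(f)
#
#   # e.g. map the first document's terms to their 1-based indices:
#   first_doc_indices = [word_to_index_mapping[t] for t in document_terms[0]]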