#!/usr/bin/python3
"""Build a term dictionary and index from labeled documents read on stdin.

Input: tab-separated lines of the form "<label>\t<document>".
Output: dictionary.pkl, a pickled tuple of
(dictionary, documents, document_terms, word_to_index_mapping, train_y).
"""
import sys
import pickle

from tokenizator import tokenize

# Process at most this many input documents.
MAX_DOCUMENTS = 15000


def main():
    dictionary = set()    # all distinct terms seen across documents
    train_y = []          # integer labels, one per document
    documents = []        # raw document texts
    document_terms = {}   # document index -> list of its terms
    counter = 0
    for line in sys.stdin:
        if counter > MAX_DOCUMENTS:
            break
        fields = line.strip().split('\t')
        if len(fields) < 2:
            continue  # skip malformed lines
        label = fields[0].strip()
        document = fields[1]
        documents.append(document)
        train_y.append(int(label))
        terms = tokenize(document)
        document_terms[counter] = terms
        dictionary.update(terms)
        counter += 1

    # Assign each distinct term a 1-based index (index 0 is left unused).
    word_to_index_mapping = {w: i for i, w in enumerate(dictionary, start=1)}

    model = (dictionary, documents, document_terms, word_to_index_mapping, train_y)
    # Use a context manager so the file handle is closed after writing.
    with open('dictionary.pkl', 'wb') as f:
        pickle.dump(model, f)


if __name__ == '__main__':
    main()
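
# A minimal sketch of how a downstream script might consume dictionary.pkl.
# The unpacking mirrors the tuple built above; the consumer script itself is
# an assumption, not part of this repo:
#
#   import pickle
#
#   with open('dictionary.pkl', 'rb') as f:
#       (dictionary, documents, document_terms,
#        word_to_index_mapping, train_y) = pickle.load(f)
#
#   # e.g. map the first document's terms to their 1-based indices:
#   first_doc_indices = [word_to_index_mapping[t] for t in document_terms[0]]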