"""Build a torchtext vocabulary from the training corpus and pickle it.

Running this module reads ``train/in.tsv.xz``, feeds the per-line output of
``scripts.get_words_from_line`` to ``build_vocab_from_iterator`` and writes
the resulting vocabulary object to ``vocab.pickle`` in the working directory.
"""
from itertools import islice
import lzma
import pickle
import sys

import regex as re
from torchtext.vocab import build_vocab_from_iterator

import scripts


def get_word_lines_from_file(file_name, max_lines=None):
    """Yield the tokenised form of each line of an xz-compressed text file.

    Args:
        file_name: Path of the ``.xz`` file to read; it is decoded as UTF-8.
        max_lines: Optional cap on the number of lines read, useful for quick
            debugging runs. ``None`` (the default) reads the whole file.

    Yields:
        Whatever ``scripts.get_words_from_line`` returns for each decoded
        line (presumably an iterable of tokens, since the result is consumed
        by ``build_vocab_from_iterator`` -- confirm against ``scripts``).

    Raises:
        UnicodeDecodeError: If the decompressed data is not valid UTF-8.
    """
    # Decode once at the I/O boundary instead of calling bytes.decode() per
    # line. newline="\n" disables universal-newline translation, so lines are
    # split on "\n" only and any "\r" is passed through untouched -- exactly
    # what iterating the binary stream and decoding each line produced.
    with lzma.open(file_name, "rt", encoding="utf-8", newline="\n") as fh:
        # islice(fh, None) iterates the entire file.
        for line in islice(fh, max_lines):
            yield scripts.get_words_from_line(line)


vocab_size = scripts.vocab_size

# NOTE(review): the only special token is the empty string. This may be a
# placeholder (e.g. an angle-bracketed tag) that was lost somewhere upstream --
# confirm before relying on it.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file("train/in.tsv.xz"),
    max_tokens=vocab_size,
    specials=[""],
)

# Persist the vocabulary so later stages can load it without re-reading the
# corpus.
with open("vocab.pickle", "wb") as handle:
    pickle.dump(vocab, handle)