from torchtext.vocab import build_vocab_from_iterator
import lzma
import pickle


def get_words_from_line(line):
    """Tokenize a line on spaces, wrapping it in sentence-boundary markers."""
    line = line.rstrip()
    yield '<s>'
    for t in line.split(' '):
        yield t
    yield '</s>'


# Only the first n_size lines of the training file are used to build the vocabulary.
n_size = 100000


def get_word_lines_from_file(file_name):
    """Yield one token generator per line of the xz-compressed training file."""
    with lzma.open(file_name, 'r') as fh:
        n = 0
        for line in fh:
            n += 1
            yield get_words_from_line(line.decode('utf-8'))
            if n == n_size:
                break


vocab_size = 20000

vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

with open('vocabulary.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Sanity check: index assigned to a frequent in-vocabulary word.
print(vocab['human'])
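

# Usage sketch: loading the pickled vocabulary back, e.g. in a separate training
# script. This assumes 'vocabulary.pickle' was produced by the script above and
# that torchtext is importable in the loading environment; the lookup tokens
# below are purely illustrative.

import pickle

with open('vocabulary.pickle', 'rb') as handle:
    loaded_vocab = pickle.load(handle)

# Map out-of-vocabulary words to the '<unk>' index instead of raising an error.
loaded_vocab.set_default_index(loaded_vocab['<unk>'])

print(len(loaded_vocab))                    # vocabulary size (at most vocab_size)
print(loaded_vocab['human'])                # index of an in-vocabulary word
print(loaded_vocab['xyzzy-not-in-vocab'])   # falls back to the '<unk>' index
print(loaded_vocab.lookup_token(0))         # token at index 0 (should be '<unk>')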