"""Build a fixed-size word vocabulary from an xz-compressed corpus.

The script tokenizes every line of ``train/in.tsv.xz``, feeds the token
streams to ``torchtext``'s ``build_vocab_from_iterator`` and pickles the
resulting vocabulary object to ``vocab.pickle``.
"""
from itertools import islice
import lzma
import pickle
import sys

import regex as re
from torchtext.vocab import build_vocab_from_iterator

# A token is either a run of letters / ASCII digits / asterisks, or a run of
# punctuation characters. ``\p{L}`` and ``\p{P}`` are Unicode property
# classes, which is why the third-party ``regex`` module is used under the
# name ``re``. Compiled once here instead of on every line.
_TOKEN_RE = re.compile(r'[\p{L}0-9\*]+|\p{P}+')


def get_words_from_line(line):
    """Yield the lower-cased tokens of one text line.

    Trailing whitespace is stripped first. The token sequence is wrapped in
    an empty-string token at the start and another at the end.

    Args:
        line: A single line of text (``str``).

    Yields:
        ``''``, then every regex match lower-cased, then ``''`` again.
    """
    line = line.rstrip()
    # NOTE(review): the empty-string markers here and in ``specials`` below
    # look like begin/end-of-sentence and unknown-word tokens whose
    # angle-bracket names were stripped somewhere -- confirm against the
    # original script before relying on them.
    yield ''
    for match in _TOKEN_RE.finditer(line):
        yield match.group(0).lower()
    yield ''


def get_word_lines_from_file(file_name):
    """Yield one token generator per line of an xz-compressed text file.

    Args:
        file_name: Path to an ``.xz`` file containing UTF-8 text.

    Yields:
        For each line, the generator returned by ``get_words_from_line``.
    """
    # Opened in binary mode, so each line arrives as ``bytes`` and is decoded
    # explicitly as UTF-8 before tokenization.
    with lzma.open(file_name, 'r') as fh:
        for raw_line in fh:
            yield get_words_from_line(raw_line.decode("utf-8"))


# Upper bound on the number of vocabulary entries, passed as ``max_tokens``.
vocab_size = 20000

vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=[''])

# Persist the vocabulary object so later steps can load it without
# re-reading the corpus.
with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)