from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import pickle


def get_words_from_line(line):
    """Yield the tokens of one line, wrapped in sentence-boundary markers."""
    line = line.rstrip()
    yield '<s>'
    for t in line.split(' '):
        yield t
    yield '</s>'


# Number of lines of the training file used to build the vocabulary.
n_size = 100000


def get_word_lines_from_file(file_name):
    """Yield a token generator for each of the first n_size lines of an xz-compressed file."""
    with lzma.open(file_name, 'r') as fh:
        n = 0
        for line in fh:
            n += 1
            yield get_words_from_line(line.decode('utf-8'))
            if n == n_size:
                break


# Keep at most vocab_size tokens (the most frequent ones).
vocab_size = 20000

vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

# Serialize the vocabulary so it can be reloaded later without rebuilding it.
with open('vocabulary.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Example lookup: index of the token 'human'.
vocab['human']
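
# --- Usage sketch (illustrative addition, not part of the original script) ---
# A minimal check, assuming the pickle above was written successfully, that the
# saved vocabulary can be loaded again.  torchtext must be importable when
# unpickling, because vocabulary.pickle stores a torchtext.vocab.Vocab object.
with open('vocabulary.pickle', 'rb') as handle:
    reloaded_vocab = pickle.load(handle)

print(len(reloaded_vocab))       # number of tokens kept (at most vocab_size)
print(reloaded_vocab['human'])   # index of an in-vocabulary token
# Out-of-vocabulary lookups raise a RuntimeError unless a default index is set:
reloaded_vocab.set_default_index(reloaded_vocab['<unk>'])
print(reloaded_vocab['zzzunseenzzz'])   # now resolves to the <unk> index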