Delete 'create_vocab.py'
This commit is contained in:
parent
4b3fb1c333
commit
04decdd5ba
@ -1,30 +0,0 @@
|
|||||||
from itertools import islice
|
|
||||||
import regex as re
|
|
||||||
import sys
|
|
||||||
from torchtext.vocab import build_vocab_from_iterator
|
|
||||||
import lzma
|
|
||||||
import scripts
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_word_lines_from_file(file_name, max_lines=None):
    """Yield the tokenised form of each line of an xz-compressed text file.

    Args:
        file_name: Path of the ``.xz`` file to read.
        max_lines: Optional cap on how many lines are read from the start
            of the file. ``None`` (the default) reads the whole file, which
            is the behaviour callers got before this parameter existed.

    Yields:
        Whatever ``scripts.get_words_from_line`` returns for each decoded
        line (presumably the line's word tokens -- confirm in ``scripts``).

    Raises:
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    # Open in binary mode and decode each line individually, so lines are
    # split on b"\n" only and keep their trailing newline.
    with lzma.open(file_name, 'rb') as fh:
        # islice(fh, None) iterates the whole file; an integer stops early.
        for raw_line in islice(fh, max_lines):
            yield scripts.get_words_from_line(raw_line.decode("utf-8"))
|
|
||||||
|
|
||||||
|
|
||||||
import pickle

# The vocabulary size is read from the project's ``scripts`` module.
vocab_size = scripts.vocab_size

# Build the vocabulary by streaming tokenised lines from the compressed
# training input; '<unk>' is registered as a special token.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'],
)

# Persist the built vocabulary object to disk.
with open("vocab.pickle", 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)
|
|
Loading…
Reference in New Issue
Block a user