Delete 'create_vocab.py'

This commit is contained in:
Mikołaj Pokrywka 2023-05-31 20:31:14 +02:00
parent 4b3fb1c333
commit 04decdd5ba
1 changed files with 0 additions and 30 deletions

View File

@@ -1,30 +0,0 @@
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import scripts
def get_word_lines_from_file(file_name, max_lines=None):
    """Yield the words of each line of an xz-compressed UTF-8 text file.

    Args:
        file_name: Path of the ``.xz`` archive to read.
        max_lines: Optional cap on how many lines are read from the start
            of the file (useful for quick experiments on a subset).
            ``None`` (the default) reads the whole file.

    Yields:
        The result of ``scripts.get_words_from_line`` for each decoded
        line, in file order.
    """
    with lzma.open(file_name, 'r') as fh:
        # islice(fh, None) walks the entire file, so the default keeps the
        # read-everything behaviour while allowing an explicit line limit.
        for raw_line in islice(fh, max_lines):
            # The archive is opened in binary mode; decode each line here.
            yield scripts.get_words_from_line(raw_line.decode("utf-8"))
import pickle

# Vocabulary size limit, taken from the project's scripts module.
vocab_size = scripts.vocab_size

# Build the vocabulary from the word lists streamed out of the training
# archive; '<unk>' is registered as the only special token.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    specials=['<unk>'],
    max_tokens=vocab_size,
)

# Persist the vocabulary object so later steps can load it from disk.
with open("vocab.pickle", 'wb') as pickle_file:
    pickle.dump(vocab, pickle_file)