From 04decdd5bab9bcb4dddabb8163205c291ffa5af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pokrywka?= Date: Wed, 31 May 2023 20:31:14 +0200 Subject: [PATCH] Delete 'create_vocab.py' --- create_vocab.py | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 create_vocab.py diff --git a/create_vocab.py b/create_vocab.py deleted file mode 100644 index 0942d12..0000000 --- a/create_vocab.py +++ /dev/null @@ -1,30 +0,0 @@ -from itertools import islice -import regex as re -import sys -from torchtext.vocab import build_vocab_from_iterator -import lzma -import scripts - - - -def get_word_lines_from_file(file_name): - counter=0 - with lzma.open(file_name, 'r') as fh: - for line in fh: - counter+=1 - # if counter == 10000: - # break - line = line.decode("utf-8") - yield scripts.get_words_from_line(line) - - -vocab_size = scripts.vocab_size - -vocab = build_vocab_from_iterator( - get_word_lines_from_file('train/in.tsv.xz'), - max_tokens = vocab_size, - specials = ['']) - -import pickle -with open("vocab.pickle", 'wb') as handle: - pickle.dump(vocab, handle) \ No newline at end of file