challenging-america-word-ga.../bigram-neural/train.py

from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import pickle
def get_words_from_line(line):
    """Yield the tokens of a line, wrapped in <s>/</s> sentence markers."""
    line = line.rstrip()
    yield '<s>'
    for t in line.split(' '):
        yield t
    yield '</s>'


# Number of lines to read from the training corpus when building the vocabulary.
n_size = 100000
def get_word_lines_from_file(file_name):
    """Yield the token stream of each of the first n_size lines of an xz-compressed file."""
    with lzma.open(file_name, 'r') as fh:
        n = 0
        for line in fh:
            n += 1
            yield get_words_from_line(line.decode('utf-8'))
            if n == n_size:
                break
# Keep the 20,000 most frequent tokens; everything rarer maps to '<unk>'.
vocab_size = 20000

vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])
# Make out-of-vocabulary lookups fall back to '<unk>' instead of raising.
vocab.set_default_index(vocab['<unk>'])

with open('vocabulary.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Quick sanity check: the id assigned to an ordinary in-vocabulary word.
print(vocab['human'])
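
# --- Illustrative sketch (not part of the original training script) ----------
# A minimal check of how the pickled vocabulary could be reloaded and used to
# numericalize a tokenized line. The sample sentence and the idea of reloading
# vocabulary.pickle here are assumptions for illustration only; downstream
# bigram training would presumably do something similar in its own file.
with open('vocabulary.pickle', 'rb') as handle:
    reloaded_vocab = pickle.load(handle)

sample_tokens = list(get_words_from_line('the quick brown fox'))
sample_ids = reloaded_vocab.lookup_indices(sample_tokens)
print(list(zip(sample_tokens, sample_ids)))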