wmt-2017-cs-en/lang.py
from nltk.tokenize import RegexpTokenizer

# Special token indices marking the start and end of a sequence
SOS_token = 0
EOS_token = 1

# Simple word tokenizer: splits on runs of word characters
tokenizer = RegexpTokenizer(r'\w+')


class Lang:
    """Vocabulary for one language: maps words to indices and back."""

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        # Tokenize the sentence and register every word in the vocabulary
        for word in tokenizer.tokenize(sentence):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            # New word: assign the next free index and start its count
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            # Known word: just increment its frequency count
            self.word2count[word] += 1
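

# A minimal usage sketch (not part of the original file): build a vocabulary
# from a couple of sentences and inspect the resulting mappings. The example
# sentences are hypothetical and not drawn from the WMT 2017 cs-en corpus.
if __name__ == "__main__":
    eng = Lang("en")
    eng.addSentence("the cat sat on the mat")
    eng.addSentence("the dog chased the cat")
    print(eng.n_words)            # vocabulary size, including SOS and EOS
    print(eng.word2index["cat"])  # index assigned to "cat"
    print(eng.word2count["the"])  # how many times "the" was seen
    print(eng.index2word[2])      # first real word added ("the")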