# Vocabulary helpers for seq2seq text preprocessing.
from nltk.tokenize import RegexpTokenizer
# Reserved token indices shared by every Lang vocabulary.
SOS_token = 0  # start-of-sentence marker
EOS_token = 1  # end-of-sentence marker

# Word-level tokenizer: keeps runs of word characters, drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
class Lang:
    """Vocabulary for one language: bidirectional word/index mapping.

    Indices 0 and 1 are reserved for the SOS and EOS markers; real words
    are assigned indices starting at 2, in order of first appearance.
    """

    def __init__(self, name):
        """Create an empty vocabulary seeded with the SOS/EOS entries."""
        self.name = name
        self.word2index = {}  # word -> assigned integer index
        self.word2count = {}  # word -> number of occurrences seen
        self.index2word = {0: "SOS", 1: "EOS"}  # inverse of word2index
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        """Tokenize *sentence* and register each resulting token."""
        tokens = tokenizer.tokenize(sentence)
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Register *word*, assigning it a fresh index on first sight."""
        if word in self.word2index:
            # Already known: just bump its frequency.
            self.word2count[word] += 1
            return
        index = self.n_words
        self.word2index[word] = index
        self.word2count[word] = 1
        self.index2word[index] = word
        self.n_words = index + 1