from nltk.tokenize import RegexpTokenizer
SOS_token = 2
|
|
|
|
PAD_token = 0
class Lang:
    """Vocabulary for one language: bidirectional word <-> index mapping.

    Indices 0, 1 and 2 are reserved for the PAD, UNK and SOS special
    tokens; every word added afterwards receives the next free index.
    """

    def __init__(self, name):
        """Create an empty vocabulary.

        Args:
            name: Human-readable identifier for this language.
        """
        self.name = name
        # word -> integer index
        self.word2index = {}
        # word -> number of occurrences seen via addWord
        self.word2count = {}
        # index -> word, pre-seeded with the three special tokens
        self.index2word = {0: "PAD", 1: "UNK", 2: "SOS"}
        # BUG FIX: this was 2, but indices 0-2 are already taken by the
        # special tokens above. With n_words == 2 the first real word would
        # be assigned index 2, overwriting "SOS" in index2word and colliding
        # with SOS_token in word2index. The next free index is 3.
        self.n_words = 3

    def addSentence(self, sentence):
        """Register every whitespace-separated token of *sentence*."""
        for word in sentence.split():
            self.addWord(word)

    def addWord(self, word):
        """Add *word* to the vocabulary, or bump its count if already known."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1