diff --git a/scripts.py b/scripts.py
new file mode 100644
index 0000000..48387fe
--- /dev/null
+++ b/scripts.py
@@ -0,0 +1,15 @@
+import regex as re
+
+
+def get_words_from_line(line):
+    """Yield lowercased tokens from a line: runs of letters, digits and
+    '*' form one token class, runs of punctuation the other."""
+    line = line.strip()
+    # Do not strip punctuation before matching: the \p{P}+ branch emits
+    # punctuation tokens, and str.translate with string.punctuation
+    # would also remove the '*' kept by the character class below.
+    for m in re.finditer(r'[\p{L}0-9*]+|\p{P}+', line):
+        yield m.group(0).lower()
+
+
+vocab_size = 60000
+learning_rate = 0.0001
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..e84c271
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,50 @@
+import regex as re
+import string
+import torch
+from torch import nn
+
+
+def get_words_from_line(line):
+    """Yield lowercased word tokens from a line, with an empty string
+    marking the start and the end of the line."""
+    line = line.strip().lower()
+    line = line.translate(str.maketrans('', '', string.punctuation))
+    yield ''
+    for m in re.finditer(r'\p{L}+', line):
+        yield m.group(0)
+    yield ''
+
+
+vocab_size = 32000
+learning_rate = 0.0001
+embed_size = 100
+device = 'cuda'
+
+
+class LanguageModel(nn.Module):
+    def __init__(self, vocabulary_size, embedding_size):
+        super().__init__()
+        self.embeddings = nn.Embedding(vocabulary_size, embedding_size)
+        # The first layer takes five embedding-sized vectors: one summed
+        # context vector plus four individual ones (see forward()).
+        self.linear_first_layer = nn.Linear(embedding_size * 5, embedding_size * 3)
+        self.relu = nn.ReLU()
+        self.linear = nn.Linear(embedding_size * 3, vocabulary_size)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x_in):
+        # x_in: ten LongTensors of token indices, one per context position.
+        embeddings = [self.embeddings(x) for x in x_in]
+
+        # Sum the first six context embeddings into a single vector.
+        summed = embeddings[0]
+        for t in embeddings[1:6]:
+            summed = torch.add(summed, t)
+
+        # Concatenate it with the remaining four embeddings, giving a
+        # (batch, embedding_size * 5) input for the first layer.
+        to_concat = [summed] + embeddings[6:]
+
+        hidden = self.relu(self.linear_first_layer(torch.cat(to_concat, dim=1)))
+        logits = self.linear(hidden)
+        return self.softmax(logits)
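
A note on the tokenizer in scripts.py: the pattern keeps '*' inside the word class and emits runs of punctuation as their own tokens. A minimal check of the behaviour, with a made-up input line:

    from scripts import get_words_from_line

    # Hypothetical sample line, chosen to exercise both branches.
    print(list(get_words_from_line('The quick *, brown fox!')))
    # ['the', 'quick', '*', ',', 'brown', 'fox', '!']

The '*' matches the first alternative of the pattern, so it comes out as its own token rather than being merged into the following comma.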
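
The forward pass of LanguageModel expects exactly ten context tokens per example: the first six embeddings are summed into one vector and concatenated with the remaining four, matching the embedding_size * 5 input of the first linear layer. A minimal sketch of a forward pass with made-up token indices (run on CPU for simplicity; the diff's device = 'cuda' is not used here):

    import torch
    from utils import LanguageModel, vocab_size, embed_size

    model = LanguageModel(vocab_size, embed_size)

    # Ten context positions, batch of 4; the indices are made up.
    context = [torch.randint(0, vocab_size, (4,)) for _ in range(10)]
    probs = model(context)
    print(probs.shape)  # torch.Size([4, 32000]); each row sums to 1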
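
The diff defines learning_rate but no training loop. One possible training step could look like the sketch below; Adam and NLLLoss are my assumptions, not taken from the diff. Since the model already applies Softmax, the loss is computed on the log of its output:

    import torch
    from torch import nn
    from utils import LanguageModel, vocab_size, embed_size, learning_rate

    model = LanguageModel(vocab_size, embed_size)
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # One made-up batch: ten context positions and a target word per example.
    context = [torch.randint(0, vocab_size, (4,)) for _ in range(10)]
    target = torch.randint(0, vocab_size, (4,))

    optimizer.zero_grad()
    probs = model(context)                      # probabilities from the final Softmax
    loss = criterion(torch.log(probs), target)  # NLLLoss expects log-probabilities
    loss.backward()
    optimizer.step()

A numerically safer variant would drop the Softmax from the model, return raw logits, and use nn.CrossEntropyLoss instead.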