forgot to add files
parent 04decdd5ba
commit f8bec5bc13
16 scripts.py Normal file
@@ -0,0 +1,16 @@
import regex as re
import string


def get_words_from_line(line):
    # Normalize the line: trim surrounding whitespace and drop all punctuation.
    line = line.strip()
    # line = line.lower()
    line = line.translate(str.maketrans('', '', string.punctuation))
    # yield '<s>'
    # Tokens are runs of letters/digits/asterisks or runs of punctuation,
    # lower-cased as they are yielded.
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    # yield '</s>'


vocab_size = 60000
learning_rate = 0.0001
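A minimal usage sketch, not part of the commit (the input string is invented): because punctuation is stripped before matching, only word and digit runs survive, yielded lower-cased.

from scripts import get_words_from_line

print(list(get_words_from_line("Hello, World - 3 stars")))
# ['hello', 'world', '3', 'stars']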
70 utils.py Normal file
@@ -0,0 +1,70 @@
import regex as re
import string
import itertools
import lzma
import pickle

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset

import scripts


def get_words_from_line(line):
    # Normalize the line and strip punctuation before tokenizing.
    line = line.strip()
    line = line.lower()
    line = line.translate(str.maketrans('', '', string.punctuation))
    # Wrap each line in sentence-boundary markers.
    yield '<s>'
    for m in re.finditer(r'\p{L}+', line):
        yield m.group(0)
    yield '</s>'


vocab_size = 32000
learning_rate = 0.0001
embed_size = 100
device = 'cuda'


class LanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(LanguageModel, self).__init__()
        self.embeddings = nn.Embedding(vocabulary_size, embedding_size)
        # Hidden layer: 5 concatenated embedding vectors -> 3 embedding widths.
        self.linear_first_layer = nn.Linear(embedding_size * 5, embedding_size * 3)
        # Output layer: hidden representation -> vocabulary-sized output.
        self.linear = nn.Linear(embedding_size * 3, vocabulary_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x_in):
        # Embed each context position; x_in is a sequence of index tensors.
        embeddings = [self.embeddings(x) for x in x_in]

        # Sum the first six embeddings into a single vector ...
        first = embeddings[0]
        to_sum = embeddings[1:6]
        for t in to_sum:
            first = torch.add(first, t)

        # ... and concatenate it with the remaining embeddings.
        to_concat = embeddings[6:]
        to_concat.insert(0, first)

        first_layer = self.linear_first_layer(torch.cat(to_concat, dim=1))
        after_relu = self.relu(first_layer)
        logits = self.linear(after_relu)

        # Turn the output into a probability distribution over the vocabulary.
        y = self.softmax(logits)
        return y
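A hypothetical smoke test, not from the commit: the forward pass sums the embeddings of positions 0-5 and concatenates the result with positions 6-9, so x_in must hold 10 index tensors to match the embedding_size*5 input of linear_first_layer. The batch size of 4 is arbitrary, and the sketch assumes a CUDA device is available (swap device for 'cpu' otherwise).

import torch
from utils import LanguageModel, vocab_size, embed_size, device

model = LanguageModel(vocab_size, embed_size).to(device)
batch = 4
# 10 context positions, each a tensor of token indices for the whole batch.
x_in = [torch.randint(0, vocab_size, (batch,), device=device) for _ in range(10)]
probs = model(x_in)
print(probs.shape)  # torch.Size([4, 32000])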