from torch import nn
import torch
from torch.utils.data import IterableDataset, DataLoader
import itertools
import lzma
import regex as re
import pickle


def look_ahead_iterator(gen):
    # Slide a three-item window over the token stream, yielding
    # (previous, current, next) trigram tuples.
    prev_item = None
    current_item = None
    for next_item in gen:
        if prev_item is not None and current_item is not None:
            yield (prev_item, current_item, next_item)
        prev_item = current_item
        current_item = next_item


def get_words_from_line(line):
    # Tokenize a line into lowercased words and punctuation,
    # wrapped in sentence-boundary markers.
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'


def get_word_lines_from_file(file_name):
    # Only the first 100,000 lines of the xz-compressed corpus are used;
    # each line is yielded as a generator of its tokens.
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            if counter == 100000:
                break
            yield get_words_from_line(line.decode('utf-8'))


class Trigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = self.load_vocab()
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def load_vocab(self):
        with open('vocab.pickle', 'rb') as handle:
            return pickle.load(handle)

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t]
             for t in itertools.chain.from_iterable(
                 get_word_lines_from_file(self.text_file))))


vocab_size = 20000
embed_size = 100
train_dataset = Trigrams('train/in.tsv.xz', vocab_size)


# === training ===

class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embeddings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size * 2, vocabulary_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # Predict the middle word of a trigram: embed the two surrounding
        # words, concatenate the embeddings, project to the vocabulary.
        emb_1 = self.embeddings(x[0])
        emb_2 = self.embeddings(x[1])
        concated = self.linear(torch.cat((emb_1, emb_2), dim=1))
        return self.softmax(concated)


device = 'cpu'
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
vocab = train_dataset.vocab

# Quick sanity check that the vocabulary resolves tokens to indices:
ixs = torch.tensor(vocab.forward(['pies']))

data = DataLoader(train_dataset, batch_size=5000)
optimizer = torch.optim.Adam(model.parameters())
# NLLLoss over torch.log(softmax) reproduces cross-entropy; LogSoftmax
# (or CrossEntropyLoss on raw logits) would be more numerically stable.
criterion = torch.nn.NLLLoss()

model.train()
step = 0
for x, y, z in data:
    x = x.to(device)
    y = y.to(device)
    z = z.to(device)
    optimizer.zero_grad()
    # The middle word y is predicted from its left (x) and right (z)
    # neighbours.
    ypredicted = model([x, z])
    loss = criterion(torch.log(ypredicted), y)
    if step % 100 == 0:
        print(step, loss)
        torch.save(model.state_dict(), f'model1_{step}.bin')
    step += 1
    loss.backward()
    optimizer.step()

torch.save(model.state_dict(), 'model_tri1.bin')
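

# --- vocabulary-building sketch (added illustration, not in the original
# script) ---
# The Trigrams dataset expects vocab.pickle on disk, but nothing above
# creates it. A minimal sketch of one way the file could be produced with
# torchtext, assuming the vocabulary is counted over the same training
# corpus; it would be called once, before training.

def build_and_save_vocab(file_name='train/in.tsv.xz'):
    from torchtext.vocab import build_vocab_from_iterator
    vocab = build_vocab_from_iterator(
        get_word_lines_from_file(file_name),
        max_tokens=vocab_size,
        specials=['<unk>'])
    with open('vocab.pickle', 'wb') as handle:
        pickle.dump(vocab, handle)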
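

# --- inference sketch (added illustration, not in the original script) ---
# How the trained model could be queried: given a left and a right context
# word, rank candidate middle words by probability. 'pies' and 'jest' are
# just example tokens taken from the sanity check above.

model.load_state_dict(torch.load('model_tri1.bin'))
model.eval()
with torch.no_grad():
    left = torch.tensor(vocab.forward(['pies'])).to(device)
    right = torch.tensor(vocab.forward(['jest'])).to(device)
    probs = model([left, right])            # shape: (1, vocab_size)
    top = torch.topk(probs[0], 10)
    for p, ix in zip(top.values, top.indices):
        print(vocab.lookup_token(int(ix)), float(p))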