commit cef5c90195
Author: Mikolaj
Date:   2023-05-09 21:44:00 +02:00
22 changed files with 136333 additions and 0 deletions

.gitignore (new file, 8 lines)

@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

create_vocab.py (new file, 30 lines)

@@ -0,0 +1,30 @@
from itertools import islice
import regex as re
import sys
import lzma
import pickle

from torchtext.vocab import build_vocab_from_iterator

import scripts


def get_word_lines_from_file(file_name):
    # Stream the xz-compressed training file line by line,
    # tokenized with scripts.get_words_from_line.
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 10000:
            #     break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


vocab_size = scripts.vocab_size

# Build a torchtext vocabulary over the training corpus, capped at vocab_size tokens.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)
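As a quick sanity check, the pickled vocabulary can be loaded back and queried with the same torchtext calls used in inference.py below; a minimal sketch, assuming the vocab.pickle written above:

import pickle

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)

# Unseen words fall back to the '<unk>' index, as in inference.py.
vocab.set_default_index(vocab['<unk>'])

ids = vocab.forward(['the', 'word'])      # tokens -> integer ids
print(ids)
print(vocab.lookup_tokens(ids))           # ids -> tokens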

dev-0/expected.tsv (new file, 10519 lines)
Diff suppressed because it is too large.

dev-0/hate-speech-info.tsv (new file, 10519 lines)
Diff suppressed because it is too large.

dev-0/in.tsv.xz (new binary file)
Binary file not shown.

6 more file diffs suppressed because they are too large (file names not shown in this view).

gonito.yaml (new file, 14 lines)

@@ -0,0 +1,14 @@
description: Class 8
tags:
  - trigram
  - neural-network
  - hidden-layer
  - hyperparameters
params:
  epochs: 1,2,3,4
  learning-rate: 0.0001,0.00001
  batch-size: 6400,12800
  training-set: 100000-lines
links:
  - title: "Git WMI"
    url: "https://git.wmi.amu.edu.pl/s444463/neural_word_gap"
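Note: create_vocab.py, inference.py and train.py below all import a local scripts module (scripts.vocab_size, scripts.learning_rate, scripts.get_words_from_line) that is not part of this commit view. A hypothetical minimal stand-in, consistent only with how those names are used here; the committed module may look different:

# scripts.py -- hypothetical stand-in, not the committed file
import regex as re

vocab_size = 20000        # assumed value; only the name is implied by the code in this commit
learning_rate = 0.0001    # one of the values listed in gonito.yaml above

def get_words_from_line(line):
    # naive tokenizer: lower-cased runs of letters
    return (m.group(0).lower() for m in re.finditer(r'\p{L}+', line))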

inference.py (new file, 107 lines)

@@ -0,0 +1,107 @@
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size*2, vocabulary_size)
        self.linear_first_layer = nn.Linear(embedding_size*2, embedding_size*2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        # x = [left-context ids, right-context ids]; predict the word in between
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])
        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)
        y = self.softmax(concated)
        return y


vocab_size = scripts.vocab_size
embed_size = 100
device = 'cuda'

model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('batch_model_epoch_0.bin'))
model.eval()

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])

step = 0
with lzma.open('dev-0/in.tsv.xz', 'rb') as file:
    for line in file:
        line = line.decode('utf-8')
        line = line.rstrip()
        # line = line.lower()
        line = line.replace("\\\\n", ' ')
        line_splitted = line.split('\t')[-2:]
        # last word of the left context and first word of the right context
        prev = list(scripts.get_words_from_line(line_splitted[0]))[-1]
        next = list(scripts.get_words_from_line(line_splitted[1]))[0]
        # prev = line[0].split(' ')[-1]
        # next = line[1].split(' ')[0]
        x = torch.tensor(vocab.forward([prev]))
        z = torch.tensor(vocab.forward([next]))
        x = x.to(device)
        z = z.to(device)
        ypredicted = model([x, z])
        try:
            top = torch.topk(ypredicted[0], 128)
        except:
            print(ypredicted[0])
            raise Exception('aa')
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        top_words = vocab.lookup_tokens(top_indices)
        string_to_print = ''
        sum_probs = 0
        for w, p in zip(top_words, top_probs):
            if '<unk>' in w:
                continue
            if re.search(r'\p{L}+', w):
                string_to_print += f"{w}:{p} "
                sum_probs += p
        if string_to_print == '':
            print(f"the:0.2 a:0.3 :0.5")
            continue
        unknow_prob = 1 - sum_probs
        string_to_print += f":{unknow_prob}"
        print(string_to_print)
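Each printed line follows the gap-filling output format used above: space-separated word:probability pairs plus a trailing :p entry carrying the remaining probability mass. A small sketch of reading such a line back; parse_prediction_line is a hypothetical helper, not part of the commit:

def parse_prediction_line(line):
    # Split "the:0.2 a:0.3 :0.5" into (word, probability) pairs;
    # the entry with an empty word holds the leftover probability mass.
    pairs = []
    for item in line.strip().split(' '):
        word, _, prob = item.rpartition(':')
        pairs.append((word, float(prob)))
    return pairs

print(parse_prediction_line("the:0.2 a:0.3 :0.5"))
# -> [('the', 0.2), ('a', 0.3), ('', 0.5)]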

test-A/hate-speech-info.tsv (new file, 7414 lines)
Diff suppressed because it is too large.

test-A/in.tsv.xz (new binary file)
Binary file not shown.

6 more file diffs suppressed because they are too large (file names not shown in this view).

train.py (new file, 124 lines)

@@ -0,0 +1,124 @@
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts


def look_ahead_iterator(gen):
    # Turn a flat token stream into (previous, current, next) trigrams.
    prev = None
    current = None
    next = None
    for next in gen:
        if prev is not None and current is not None:
            yield (prev, current, next)
        prev = current
        current = next


def get_word_lines_from_file(file_name):
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            if counter == 100000:
                break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


class Trigrams(IterableDataset):
    def load_vocab(self):
        with open("vocab.pickle", 'rb') as handle:
            vocab = pickle.load(handle)
        return vocab

    def __init__(self, text_file, vocabulary_size):
        self.vocab = self.load_vocab()
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))


vocab_size = scripts.vocab_size
train_dataset = Trigrams('train/in.tsv.xz', vocab_size)

# === training
from torch import nn
import torch
from torch.utils.data import DataLoader

embed_size = 100


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size*2, vocabulary_size)
        self.linear_first_layer = nn.Linear(embedding_size*2, embedding_size*2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        # x = [left-context ids, right-context ids]; predict the word in between
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])
        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)
        y = self.softmax(concated)
        return y


model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size)
vocab = train_dataset.vocab
device = 'cuda'
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=12800)
optimizer = torch.optim.Adam(model.parameters(), lr=scripts.learning_rate)
criterion = torch.nn.NLLLoss()

model.train()
step = 0
epochs = 4
for i in range(epochs):
    for x, y, z in data:
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        optimizer.zero_grad()
        # Predict the middle word y from its neighbours (x, z).
        ypredicted = model([x, z])
        loss = criterion(torch.log(ypredicted), y)
        if step % 2000 == 0:
            print(step, loss)
            # torch.save(model.state_dict(), f'model1_{step}.bin')
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), f'batch_model_epoch_{i}.bin')
    print(step, loss, f'model_epoch_{i}.bin')

torch.save(model.state_dict(), 'model_tri1.bin')
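For intuition, look_ahead_iterator turns a flat stream of token ids into (previous, current, next) triples, and training uses the outer two words as input and the middle one as the target. A standalone toy run, with arbitrary ids:

def look_ahead_iterator(gen):
    # same helper as in train.py above
    prev, current, nxt = None, None, None
    for nxt in gen:
        if prev is not None and current is not None:
            yield (prev, current, nxt)
        prev = current
        current = nxt

print(list(look_ahead_iterator([10, 11, 12, 13])))
# -> [(10, 11, 12), (11, 12, 13)]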