Compare commits

...

No commits in common. "master" and "zad_8" have entirely different histories.

24 changed files with 107873 additions and 18098 deletions

README.md

@@ -1,9 +0,0 @@
Challenging America word-gap prediction
========================================

Guess a word in a gap.

Evaluation metric
-----------------

LikelihoodHashed is the metric.

config.txt

@@ -1 +0,0 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv

30
create_vocab.py Normal file

@@ -0,0 +1,30 @@
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import scripts


def get_word_lines_from_file(file_name):
    # Stream the xz-compressed training corpus line by line as word lists.
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 10000:
            #     break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


vocab_size = scripts.vocab_size

# Build the vocabulary over the training set and pickle it for train.py / inference.py.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

import pickle

with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)
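
create_vocab.py, train.py and inference.py all import a small scripts helper module whose contents are not visible in this diff; only the names scripts.vocab_size, scripts.learning_rate and scripts.get_words_from_line can be inferred from how they are used. A minimal sketch of such a module follows; the concrete values and the tokenization rule are assumptions, not the repository's actual settings:

# scripts.py -- hypothetical sketch; only the names used by the other files are known from this diff.
import regex as re

vocab_size = 20000        # assumed value; the real setting is not shown here
learning_rate = 0.0001    # assumed; gonito.yaml lists 0.0001 and 0.00001


def get_words_from_line(line):
    # Assumed tokenization, loosely mirroring get_words() from the removed run.py.
    for m in re.finditer(r'[\p{L}0-9\-*]+', line.rstrip()):
        yield m.group(0)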

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

BIN
geval

Binary file not shown.

14
gonito.yaml Normal file

@@ -0,0 +1,14 @@
description: Class 8
tags:
  - trigram
  - neural-network
  - hidden-layer
  - hyperparameters
params:
  epochs: 1,2,3,4
  learning-rate: 0.0001,0.00001
  batch-size: 6400,12800
  training-set: 100000-lines
links:
  - title: "Git WMI"
    url: "https://git.wmi.amu.edu.pl/s444463/neural_word_gap"

in-header.tsv

@@ -1 +0,0 @@
FileId Year LeftContext RightContext

107
inference.py Normal file

@@ -0,0 +1,107 @@
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size * 2, vocabulary_size)
        self.linear_first_layer = nn.Linear(embedding_size * 2, embedding_size * 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        # x[0] holds the word before the gap, x[1] the word after it.
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])
        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)
        y = self.softmax(concated)
        return y


vocab_size = scripts.vocab_size
embed_size = 100
device = 'cuda'

model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('batch_model_epoch_0.bin'))
model.eval()

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])

step = 0
with lzma.open('dev-0/in.tsv.xz', 'rb') as file:
    for line in file:
        line = line.decode('utf-8')
        line = line.rstrip()
        # line = line.lower()
        line = line.replace("\\\\n", ' ')
        line_splitted = line.split('\t')[-2:]
        prev = list(scripts.get_words_from_line(line_splitted[0]))[-1]
        next = list(scripts.get_words_from_line(line_splitted[1]))[0]
        # prev = line[0].split(' ')[-1]
        # next = line[1].split(' ')[0]
        x = torch.tensor(vocab.forward([prev]))
        z = torch.tensor(vocab.forward([next]))
        x = x.to(device)
        z = z.to(device)
        ypredicted = model([x, z])
        try:
            top = torch.topk(ypredicted[0], 128)
        except Exception:
            print(ypredicted[0])
            raise
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        top_words = vocab.lookup_tokens(top_indices)
        string_to_print = ''
        sum_probs = 0
        for w, p in zip(top_words, top_probs):
            if '<unk>' in w:
                continue
            if re.search(r'\p{L}+', w):
                string_to_print += f"{w}:{p} "
                sum_probs += p
        if string_to_print == '':
            print("the:0.2 a:0.3 :0.5")
            continue
        unknown_prob = 1 - sum_probs
        string_to_print += f":{unknown_prob}"
        print(string_to_print)
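
For reference, each printed line follows the word:probability format expected by geval, with a trailing ":mass" entry holding the probability left over for unlisted words. A tiny self-contained sketch of that formatting step; format_prediction and the example words are illustrative only, not part of the repository:

def format_prediction(top_words, top_probs):
    # Assemble "<word>:<prob> ... :<leftover mass>", skipping the <unk> token.
    parts, total = [], 0.0
    for w, p in zip(top_words, top_probs):
        if w == '<unk>':
            continue
        parts.append(f"{w}:{p}")
        total += p
    parts.append(f":{max(0.0, 1.0 - total)}")
    return ' '.join(parts)


print(format_prediction(['the', 'a', 'of'], [0.2, 0.1, 0.05]))
# -> the:0.2 a:0.1 of:0.05 :0.65 (up to floating-point rounding)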

out-header.tsv

@@ -1 +0,0 @@
Word

153
run.py

@@ -1,153 +0,0 @@
import lzma
import matplotlib.pyplot as plt
from math import log
from collections import OrderedDict
from collections import Counter
import regex as re
from itertools import islice


def freq_list(g, top=None):
    c = Counter(g)
    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)
    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def get_words(t):
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def ngrams(iter, size):
    # Sliding window of `size` consecutive items.
    ngram = []
    for item in iter:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


PREFIX_TRAIN = 'train'
words = []
counter_lines = 0
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        t_line = t_line.decode("utf-8")
        t_line = t_line.rstrip()
        e_line = e_line.rstrip()
        t_line_splitted_by_tab = t_line.split('\t')
        t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
        words += t_line_cleared.split()
        counter_lines += 1
        if counter_lines > 90000:
            break

# lzmaFile = lzma.open('dev-0/in.tsv.xz', 'rb')
# content = lzmaFile.read().decode("utf-8")
# words = get_words(trainset)

ngrams_ = ngrams(words, 2)


def create_probabilities_bigrams(w_c, b_c):
    # For each bigram seen more than twice, store its count relative to each of its two words.
    probabilities_bigrams = {}
    for bigram, bigram_amount in b_c.items():
        if bigram_amount <= 2:
            continue
        p_word_before = bigram_amount / w_c[bigram[0]]
        p_word_after = bigram_amount / w_c[bigram[1]]
        probabilities_bigrams[bigram] = (p_word_before, p_word_after)
    return probabilities_bigrams


words_c = Counter(words)
word_ = ''
bigram_c = Counter(ngrams_)
ngrams_ = ''
probabilities = create_probabilities_bigrams(words_c, bigram_c)
items = probabilities.items()
probabilities = OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))
items = ''
# sorted_by_freq = freq_list(ngrams)

PREFIX_VALID = 'test-A'


def count_probabilities(w_b, w_a, probs, w_c, b_c):
    # Score candidate gap words from bigrams that start with w_b (word before the gap)
    # or end with w_a (word after the gap).
    results_before = {}
    results_after = {}
    for bigram, probses in probs.items():
        if len(results_before) > 20 or len(results_after) > 20:
            break
        if w_b == bigram[0]:
            results_before[bigram] = probses[0]
        if w_a == bigram[1]:
            results_after[bigram] = probses[1]
    a = 1
    best_ = {}
    for bigram, probses in results_before.items():
        for bigram_2, probses_2 in results_after.items():
            best_[bigram[1]] = probses * probses_2
    for bigram, probses in results_after.items():
        for bigram_2, probses_2 in results_before.items():
            if bigram[0] in best_:
                if probses * probses_2 < probses_2:
                    continue
            best_[bigram[0]] = probses * probses_2
    items = best_.items()
    return OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))


with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as train:
    for t_line in train:
        t_line = t_line.decode("utf-8")
        t_line = t_line.rstrip()
        t_line = t_line.replace('\\n', ' ')
        t_line_splitted_by_tab = t_line.split('\t')
        words_pre = t_line_splitted_by_tab[-2].split()
        words_po = t_line_splitted_by_tab[-1].split()
        w_pre = words_pre[-1]
        w_po = words_po[0]
        probs_ordered = count_probabilities(w_pre, w_po, probabilities, words_c, bigram_c)
        if len(probs_ordered) == 0:
            print("the:0.5 a:0.3 :0.2")
            continue
        result_string = ''
        counter_ = 0
        for word_, p in probs_ordered.items():
            if counter_ > 4:
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                word_cleared = re_.group(0)
                result_string += f"{word_cleared}:{str(p)} "
            else:
                if result_string == '':
                    result_string = "the:0.5 a:0.3 "
                continue
            counter_ += 1
        result_string += ':0.1'
        print(result_string)

a = 1
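
The deleted run.py was the earlier, purely count-based baseline: a candidate word for the gap is scored by how often it follows the word to the left of the gap, multiplied by how often it precedes the word to the right. A minimal illustration of that scoring rule on a toy corpus; the corpus and gap_score are made up for the example, and unlike run.py it does not restrict itself to bigrams seen more than twice:

from collections import Counter

# Toy corpus; the real baseline counted bigrams over train/in.tsv.xz plus expected.tsv.
tokens = "the cat sat on the mat the cat ran".split()
unigrams = Counter(tokens)
bigrams = Counter(zip(tokens, tokens[1:]))


def gap_score(left, candidate, right):
    # count(left, candidate)/count(left) times count(candidate, right)/count(right),
    # i.e. run.py's p_word_before and p_word_after multiplied together.
    p_after_left = bigrams[(left, candidate)] / unigrams[left]
    p_before_right = bigrams[(candidate, right)] / unigrams[right]
    return p_after_left * p_before_right


print(gap_score("the", "cat", "sat"))   # 0.666... on this toy corpus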

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

124
train.py Normal file

@@ -0,0 +1,124 @@
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts


def look_ahead_iterator(gen):
    # Turn a stream of word ids into (previous, current, next) trigrams.
    prev = None
    current = None
    next = None
    for next in gen:
        if prev is not None and current is not None:
            yield (prev, current, next)
        prev = current
        current = next


def get_word_lines_from_file(file_name):
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            if counter == 100000:
                break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


class Trigrams(IterableDataset):
    def load_vocab(self):
        with open("vocab.pickle", 'rb') as handle:
            vocab = pickle.load(handle)
        return vocab

    def __init__(self, text_file, vocabulary_size):
        self.vocab = self.load_vocab()
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))


vocab_size = scripts.vocab_size
train_dataset = Trigrams('train/in.tsv.xz', vocab_size)

# === training
from torch import nn
import torch
from torch.utils.data import DataLoader

embed_size = 100


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size * 2, vocabulary_size)
        self.linear_first_layer = nn.Linear(embedding_size * 2, embedding_size * 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        # x[0] holds the word before the gap, x[1] the word after it.
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])
        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)
        y = self.softmax(concated)
        return y


model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size)
vocab = train_dataset.vocab
device = 'cuda'
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=12800)
optimizer = torch.optim.Adam(model.parameters(), lr=scripts.learning_rate)
criterion = torch.nn.NLLLoss()

model.train()
step = 0
epochs = 4
for i in range(epochs):
    for x, y, z in data:
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        optimizer.zero_grad()
        # Predict the middle word y from its left (x) and right (z) neighbours.
        ypredicted = model([x, z])
        loss = criterion(torch.log(ypredicted), y)
        if step % 2000 == 0:
            print(step, loss)
            # torch.save(model.state_dict(), f'model1_{step}.bin')
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), f'batch_model_epoch_{i}.bin')
    print(step, loss, f'model_epoch_{i}.bin')

torch.save(model.state_dict(), 'model_tri1.bin')
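
One implementation detail worth flagging: the network ends in nn.Softmax and the training loop then applies torch.log before NLLLoss, which can underflow for very small probabilities. A numerically safer but otherwise equivalent variant, shown here only as a sketch and not as what this branch actually does, ends in LogSoftmax instead:

import torch
from torch import nn

# Sketch only: same two-embedding trigram head, but ending in LogSoftmax so that
# NLLLoss can consume the output directly, without an explicit torch.log on probabilities.
class StableTrigramHead(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.hidden = nn.Linear(embedding_size * 2, embedding_size * 2)
        self.relu = nn.ReLU()
        self.out = nn.Linear(embedding_size * 2, vocabulary_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        h = torch.cat((self.embedings(x[0]), self.embedings(x[1])), dim=1)
        return self.log_softmax(self.out(self.relu(self.hidden(h))))

# criterion = nn.NLLLoss(); loss = criterion(model([x, z]), y)  # no torch.log needed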