Compare commits
No commits in common. "2aafd886e6954d484b84d21293b59c8329c76648" and "master" have entirely different histories.
2aafd886e6 ... master
README.md (new file, 9 lines)
@@ -0,0 +1,9 @@
Challenging America word-gap prediction
===================================

Guess a word in a gap.

Evaluation metric
-----------------

LikelihoodHashed is the metric
config.txt (new file, 1 line)
@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
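The config above points the evaluator (presumably the geval tool used by Gonito) at the column headers added in this commit (in-header.tsv: FileId, Year, LeftContext, RightContext; out-header.tsv: Word). As a minimal sketch of the per-gap output line that inference.py and run.py in this diff print, with invented words and probabilities, the trailing bare `:prob` entry carries the leftover probability mass:

# Illustrative only: candidate words and probabilities are made up.
candidates = {'the': 0.5, 'a': 0.25, 'first': 0.125}
leftover = 1.0 - sum(candidates.values())           # mass reserved for "any other word"
line = ' '.join(f'{w}:{p}' for w, p in candidates.items()) + f' :{leftover}'
print(line)                                          # the:0.5 a:0.25 first:0.125 :0.125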
Deleted file (30 lines)
@@ -1,30 +0,0 @@
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import scripts


# Stream tokenised lines from the xz-compressed training file.
def get_word_lines_from_file(file_name):
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 10000:
            #     break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


vocab_size = scripts.vocab_size

# Build the torchtext vocabulary (most frequent vocab_size tokens, '<unk>' as special).
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens = vocab_size,
    specials = ['<unk>'])

# Persist the vocabulary for train.py and inference.py.
import pickle
with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)
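The deleted script above only builds and pickles the vocabulary. As a hedged sketch (the path and the example token are assumptions), this is how such a pickled torchtext vocab is read back and queried, mirroring what inference.py and train.py in this diff do:

import pickle
import torch

with open("vocab.pickle", 'rb') as handle:           # file written by the script above
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])              # unseen words fall back to '<unk>'

ids = vocab.forward(['america'])                     # word -> index ('america' is illustrative)
tokens = vocab.lookup_tokens(ids)                    # index -> word
x = torch.tensor(ids)                                # ready for an nn.Embedding lookup
print(ids, tokens, x)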
6 file diffs suppressed because they are too large.
dev-0/out.tsv (new file, 10519 lines)
File diff suppressed because it is too large.
gonito.yaml (deleted, 14 lines)
@@ -1,14 +0,0 @@
description: Zajęcia 8  # Polish: "Class 8"
tags:
    - trigram
    - neural-network
    - hidden-layer
    - hiperparameters
params:
    epochs: 1,2,3,4
    learning-rate: 0.0001,0.00001
    batch-size: 6400,12800
    training-set: 100000-lines
links:
    - title: "Git WMI"
      url: "https://git.wmi.amu.edu.pl/s444463/neural_word_gap"
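The params block above records the hyperparameter values that were tried. A small illustrative sketch (not code from this repo) of the grid those comma-separated values imply:

from itertools import product

epochs = [1, 2, 3, 4]
learning_rates = [0.0001, 0.00001]
batch_sizes = [6400, 12800]

for e, lr, bs in product(epochs, learning_rates, batch_sizes):
    print(f"epochs={e} learning-rate={lr} batch-size={bs}")
# 4 * 2 * 2 = 16 combinations in total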
in-header.tsv (new file, 1 line)
@@ -0,0 +1 @@
FileId Year LeftContext RightContext
inference.py (deleted, 107 lines)
@@ -1,107 +0,0 @@
from torch import nn
import torch


from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


# Trigram gap-filling model: embed the word before and the word after the gap,
# concatenate the two embeddings and predict a distribution over the vocabulary.
class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size*2, vocabulary_size)

        self.linear_first_layer = nn.Linear(embedding_size*2, embedding_size*2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])

        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)

        y = self.softmax(concated)

        return y


vocab_size = scripts.vocab_size
embed_size = 100
device = 'cuda'

model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)

model.load_state_dict(torch.load('batch_model_epoch_0.bin'))
model.eval()

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])


step = 0


with lzma.open('dev-0/in.tsv.xz', 'rb') as file:
    for line in file:
        line = line.decode('utf-8')
        line = line.rstrip()
        # line = line.lower()
        line = line.replace("\\\\n", ' ')

        # Keep only LeftContext and RightContext (last two tab-separated columns).
        line_splitted = line.split('\t')[-2:]

        prev = list(scripts.get_words_from_line(line_splitted[0]))[-1]
        next = list(scripts.get_words_from_line(line_splitted[1]))[0]

        # prev = line[0].split(' ')[-1]
        # next = line[1].split(' ')[0]

        x = torch.tensor(vocab.forward([prev]))
        z = torch.tensor(vocab.forward([next]))
        x = x.to(device)
        z = z.to(device)
        ypredicted = model([x, z])

        try:
            top = torch.topk(ypredicted[0], 128)
        except:
            print(ypredicted[0])
            raise Exception('aa')
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        top_words = vocab.lookup_tokens(top_indices)

        # Emit "word:prob" pairs; the remaining probability mass goes to the bare ":" entry.
        string_to_print = ''
        sum_probs = 0

        for w, p in zip(top_words, top_probs):
            if '<unk>' in w:
                continue
            if re.search(r'\p{L}+', w):
                string_to_print += f"{w}:{p} "
                sum_probs += p
        if string_to_print == '':
            print(f"the:0.2 a:0.3 :0.5")
            continue
        unknow_prob = 1 - sum_probs
        string_to_print += f":{unknow_prob}"

        print(string_to_print)
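As a quick, self-contained check of the architecture deleted above (toy sizes; the real vocab_size comes from scripts.vocab_size), the tensor shapes flow like this: the embeddings of the word before and the word after the gap are concatenated, passed through a hidden layer with ReLU, and projected to a softmax over the vocabulary:

import torch
from torch import nn

vocab_size, embed_size, batch = 1000, 100, 4         # illustrative sizes, not the real ones
embeddings = nn.Embedding(vocab_size, embed_size)
hidden = nn.Linear(embed_size * 2, embed_size * 2)
out = nn.Linear(embed_size * 2, vocab_size)

prev_ids = torch.randint(0, vocab_size, (batch,))    # word to the left of the gap
next_ids = torch.randint(0, vocab_size, (batch,))    # word to the right of the gap

h = torch.cat((embeddings(prev_ids), embeddings(next_ids)), dim=1)   # (batch, 200)
probs = torch.softmax(out(torch.relu(hidden(h))), dim=1)             # (batch, vocab_size)
print(probs.shape, probs.sum(dim=1))                 # each row sums to ~1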
out-header.tsv (new file, 1 line)
@@ -0,0 +1 @@
Word
run.py (new file, 153 lines)
@@ -0,0 +1,153 @@
import lzma
import matplotlib.pyplot as plt
from math import log
from collections import OrderedDict
from collections import Counter
import regex as re
from itertools import islice


def freq_list(g, top=None):
    c = Counter(g)

    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)

    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def get_words(t):
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def ngrams(iter, size):
    ngram = []
    for item in iter:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


# Read the training contexts, re-insert the expected word between LeftContext and
# RightContext, and collect the token stream (first 90000 lines only).
PREFIX_TRAIN = 'train'
words = []

counter_lines = 0
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        t_line = t_line.decode("utf-8")

        t_line = t_line.rstrip()
        e_line = e_line.rstrip()

        t_line_splitted_by_tab = t_line.split('\t')

        t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]

        words += t_line_cleared.split()

        counter_lines += 1
        if counter_lines > 90000:
            break

# lzmaFile = lzma.open('dev-0/in.tsv.xz', 'rb')

# content = lzmaFile.read().decode("utf-8")
# words = get_words(trainset)

ngrams_ = ngrams(words, 2)


# For every bigram seen more than twice, store P(second word | first word) and
# P(first word | second word).
def create_probabilities_bigrams(w_c, b_c):
    probabilities_bigrams = {}
    for bigram, bigram_amount in b_c.items():
        if bigram_amount <= 2:
            continue
        p_word_before = bigram_amount / w_c[bigram[0]]
        p_word_after = bigram_amount / w_c[bigram[1]]
        probabilities_bigrams[bigram] = (p_word_before, p_word_after)

    return probabilities_bigrams


words_c = Counter(words)
word_ = ''
bigram_c = Counter(ngrams_)
ngrams_ = ''
probabilities = create_probabilities_bigrams(words_c, bigram_c)


items = probabilities.items()
probabilities = OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))
items = ''
# sorted_by_freq = freq_list(ngrams)

PREFIX_VALID = 'test-A'


# Score gap candidates: combine P(candidate | word before the gap) with
# P(candidate | word after the gap) and return candidates sorted by that product.
def count_probabilities(w_b, w_a, probs, w_c, b_c):
    results_before = {}
    results_after = {}
    for bigram, probses in probs.items():
        if len(results_before) > 20 or len(results_after) > 20:
            break
        if w_b == bigram[0]:
            results_before[bigram] = probses[0]
        if w_a == bigram[1]:
            results_after[bigram] = probses[1]
    a = 1
    best_ = {}

    for bigram, probses in results_before.items():
        for bigram_2, probses_2 in results_after.items():
            best_[bigram[1]] = probses * probses_2

    for bigram, probses in results_after.items():
        for bigram_2, probses_2 in results_before.items():
            if bigram[0] in best_:
                if probses * probses_2 < probses_2:
                    continue
            best_[bigram[0]] = probses * probses_2

    items = best_.items()
    return OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))


with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as train:
    for t_line in train:
        t_line = t_line.decode("utf-8")

        t_line = t_line.rstrip()
        t_line = t_line.replace('\\n', ' ')

        t_line_splitted_by_tab = t_line.split('\t')

        words_pre = t_line_splitted_by_tab[-2].split()

        words_po = t_line_splitted_by_tab[-1].split()

        w_pre = words_pre[-1]
        w_po = words_po[0]

        probs_ordered = count_probabilities(w_pre, w_po, probabilities, words_c, bigram_c)
        if len(probs_ordered) == 0:
            print(f"the:0.5 a:0.3 :0.2")
            continue
        result_string = ''
        counter_ = 0
        # Print the top five candidates as "word:prob" pairs, reserving 0.1 for anything else.
        for word_, p in probs_ordered.items():
            if counter_ > 4:
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                word_cleared = re_.group(0)
                result_string += f"{word_cleared}:{str(p)} "

            else:
                if result_string == '':
                    result_string = f"the:0.5 a:0.3 "
                continue

            counter_ += 1
        result_string += ':0.1'
        print(result_string)
        a = 1
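A toy walk-through of the scoring idea in count_probabilities above: a candidate for the gap is scored by multiplying P(candidate | word before the gap) with P(candidate | word after the gap), both estimated from bigram counts. The counts and words below are made up:

from collections import Counter

words_c = Counter({'the': 10, 'united': 4, 'states': 5, 'old': 3})
bigram_c = Counter({('the', 'united'): 4, ('united', 'states'): 4, ('the', 'old'): 2})

w_before, w_after = 'the', 'states'                  # context: "... the ___ states ..."
scores = {}
for (w1, w2), n in bigram_c.items():
    if w1 != w_before:
        continue
    p_follows_left = n / words_c[w1]                 # P(w2 | 'the')
    m = bigram_c[(w2, w_after)]
    if m:
        p_precedes_right = m / words_c[w_after]      # P(w2 | 'states')
        scores[w2] = p_follows_left * p_precedes_right
print(scores)                                        # {'united': 0.32...} = (4/10) * (4/5)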
6 file diffs suppressed because they are too large.
test-A/out.tsv (new file, 7414 lines)
File diff suppressed because it is too large.
train.py (deleted, 124 lines)
@@ -1,124 +0,0 @@
from torch import nn
import torch


from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts


# Turn a flat token-id stream into overlapping (previous, current, next) trigram triples.
def look_ahead_iterator(gen):
    prev = None
    current = None
    next = None
    for next in gen:
        if prev is not None and current is not None:
            yield (prev, current, next)
        prev = current
        current = next


def get_word_lines_from_file(file_name):
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            if counter == 100000:
                break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


class Trigrams(IterableDataset):
    def load_vocab(self):
        with open("vocab.pickle", 'rb') as handle:
            vocab = pickle.load(handle)
        return vocab

    def __init__(self, text_file, vocabulary_size):
        self.vocab = self.load_vocab()
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))


vocab_size = scripts.vocab_size

train_dataset = Trigrams('train/in.tsv.xz', vocab_size)


# === training
from torch import nn
import torch
from torch.utils.data import DataLoader
embed_size = 100

class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size*2, vocabulary_size)

        self.linear_first_layer = nn.Linear(embedding_size*2, embedding_size*2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])

        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)

        y = self.softmax(concated)

        return y


model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size)

vocab = train_dataset.vocab


device = 'cuda'
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=12800)
optimizer = torch.optim.Adam(model.parameters(), lr=scripts.learning_rate)
criterion = torch.nn.NLLLoss()

# Train for a fixed number of epochs; x and z (the outer words) are the inputs,
# y (the middle word) is the target.
model.train()
step = 0
epochs = 4
for i in range(epochs):
    for x, y, z in data:
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        optimizer.zero_grad()
        ypredicted = model([x, z])
        loss = criterion(torch.log(ypredicted), y)
        if step % 2000 == 0:
            print(step, loss)
            # torch.save(model.state_dict(), f'model1_{step}.bin')
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), f'batch_model_epoch_{i}.bin')
    print(step, loss, f'model_epoch_{i}.bin')
torch.save(model.state_dict(), 'model_tri1.bin')
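To make the data pipeline of the deleted train.py concrete, here is look_ahead_iterator from above run on a tiny made-up token stream: it yields (previous, middle, next) triples, where the middle word is the training target y and the outer two are the inputs [x, z]:

def look_ahead_iterator(gen):
    prev = None
    current = None
    for next_ in gen:
        if prev is not None and current is not None:
            yield (prev, current, next_)
        prev = current
        current = next_

print(list(look_ahead_iterator(iter(['guess', 'a', 'word', 'in', 'a', 'gap']))))
# [('guess', 'a', 'word'), ('a', 'word', 'in'), ('word', 'in', 'a'), ('in', 'a', 'gap')]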