Compare commits

...

No commits in common. "2aafd886e6954d484b84d21293b59c8329c76648" and "master" have entirely different histories.

24 changed files with 18098 additions and 107873 deletions

README.md Normal file (9 lines)

@@ -0,0 +1,9 @@
Challenging America word-gap prediction
===================================
Guess a word in a gap.
Evaluation metric
-----------------
LikelihoodHashed is the metric.
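
Each line written to out.tsv is a space-separated list of word:probability guesses for the gap, with a final bare :p entry holding the leftover probability mass (this is the format the scripts later in this diff print). A minimal sketch of a helper that builds such a line (the helper itself is illustrative, not a file in the repository):

def format_prediction(top_words, top_probs):
    # top_words / top_probs: a few candidate gap words with their probabilities.
    pairs = ' '.join(f"{w}:{p:.4f}" for w, p in zip(top_words, top_probs))
    rest = 1.0 - sum(top_probs)  # probability mass left for every other word
    return f"{pairs} :{rest:.4f}"

# format_prediction(['the', 'a'], [0.3, 0.2]) -> 'the:0.3000 a:0.2000 :0.5000'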

config.txt Normal file (1 line)

@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
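
config.txt configures how GEval scores the submission. As a rough illustration of what a likelihood/perplexity-style metric over such prediction lines computes (a simplified sketch of the idea, not GEval's actual implementation), the score of a line depends on the probability it assigns to the true gap word, with the bare :p entry acting as the fallback for words that are not listed explicitly:

import math

def line_logprob(prediction_line, true_word):
    # prediction_line: e.g. "the:0.3 a:0.2 :0.5"; the entry with an empty word
    # is the probability mass reserved for every word not listed explicitly.
    probs, fallback = {}, 0.0
    for entry in prediction_line.split():
        word, _, p = entry.rpartition(':')
        if word:
            probs[word] = float(p)
        else:
            fallback = float(p)
    return math.log(probs.get(true_word, fallback) or 1e-12)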


@@ -1,30 +0,0 @@
from itertools import islice
import regex as re
import sys
import lzma
import pickle

from torchtext.vocab import build_vocab_from_iterator

import scripts


def get_word_lines_from_file(file_name):
    # Yield the token list of every line of an xz-compressed TSV file.
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 10000:
            #     break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


vocab_size = scripts.vocab_size

# Build the vocabulary from the training corpus, capped at vocab_size tokens;
# '<unk>' is reserved for out-of-vocabulary words.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

# Persist the vocabulary so train.py and the prediction script can reuse it.
with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)
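
The scripts module imported here (and again in train.py and the prediction script below) is not shown in this diff; it supplies get_words_from_line, vocab_size and learning_rate. A plausible minimal reconstruction, offered purely as an assumption about its contents:

# scripts.py: hypothetical reconstruction, not a file shown in this diff.
import regex as re

vocab_size = 20000        # assumed value; the real one is not visible here
learning_rate = 0.0001    # assumed; matches the first value in the deleted experiment metadata below

def get_words_from_line(line):
    # Tokenize a line into word-like chunks, mirroring get_words() in run.py below.
    for m in re.finditer(r'[\p{L}0-9\-\*]+', line):
        yield m.group(0)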

File diff suppressed because it is too large (6 files)

dev-0/out.tsv Normal file (10519 lines)

File diff suppressed because it is too large

geval Executable file

Binary file not shown.


@@ -1,14 +0,0 @@
description: Class 8
tags:
  - trigram
  - neural-network
  - hidden-layer
  - hyperparameters
params:
  epochs: 1,2,3,4
  learning-rate: 0.0001,0.00001
  batch-size: 6400,12800
  training-set: 100000-lines
links:
  - title: "Git WMI"
    url: "https://git.wmi.amu.edu.pl/s444463/neural_word_gap"

in-header.tsv Normal file (1 line)

@@ -0,0 +1 @@
FileId Year LeftContext RightContext
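
Each input line is tab-separated with these four columns; both prediction scripts in this diff keep only the last two fields, i.e. the text to the left and to the right of the gap. A tiny illustrative parser (not a file from the repository):

def split_contexts(tsv_line):
    # Keep only the text on either side of the gap, mirroring the
    # line.split('\t')[-2:] used by the prediction scripts below.
    fields = tsv_line.rstrip('\n').split('\t')
    left_context, right_context = fields[-2], fields[-1]
    return left_context, right_context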


@@ -1,107 +0,0 @@
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size * 2, vocabulary_size)
        self.linear_first_layer = nn.Linear(embedding_size * 2, embedding_size * 2)
        self.relu = nn.ReLU()
        # NOTE: nn.Softmax() without an explicit dim relies on PyTorch's
        # deprecated implicit-dim behaviour (dim=1 for 2-D input).
        self.softmax = nn.Softmax()
        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        # x = [indices of the word before the gap, indices of the word after it]
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])
        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)
        y = self.softmax(concated)
        return y


vocab_size = scripts.vocab_size
embed_size = 100
device = 'cuda'

# Load the model trained by train.py and the vocabulary built earlier.
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('batch_model_epoch_0.bin'))
model.eval()

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])

step = 0
with lzma.open('dev-0/in.tsv.xz', 'rb') as file:
    for line in file:
        line = line.decode('utf-8')
        line = line.rstrip()
        # line = line.lower()
        line = line.replace("\\\\n", ' ')
        # Keep only the left and right context of the gap.
        line_splitted = line.split('\t')[-2:]
        prev = list(scripts.get_words_from_line(line_splitted[0]))[-1]
        next = list(scripts.get_words_from_line(line_splitted[1]))[0]
        # prev = line[0].split(' ')[-1]
        # next = line[1].split(' ')[0]
        x = torch.tensor(vocab.forward([prev]))
        z = torch.tensor(vocab.forward([next]))
        x = x.to(device)
        z = z.to(device)
        ypredicted = model([x, z])
        try:
            top = torch.topk(ypredicted[0], 128)
        except:
            print(ypredicted[0])
            raise Exception('aa')
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        top_words = vocab.lookup_tokens(top_indices)
        string_to_print = ''
        sum_probs = 0
        for w, p in zip(top_words, top_probs):
            if '<unk>' in w:
                continue
            if re.search(r'\p{L}+', w):
                string_to_print += f"{w}:{p} "
                sum_probs += p
        if string_to_print == '':
            print("the:0.2 a:0.3 :0.5")
            continue
        # Assign the remaining probability mass to the bare ':' entry.
        unknow_prob = 1 - sum_probs
        string_to_print += f":{unknow_prob}"
        print(string_to_print)
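
One detail the loop above leaves out is disabling autograd: inference does not need gradients, so a small refinement (hypothetical, not in the original script) would wrap the forward pass and the top-k selection so no computation graph is built per line:

# Hypothetical refinement of the loop above, not the repository's code.
with torch.no_grad():
    ypredicted = model([x, z])
    top = torch.topk(ypredicted[0], 128)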

out-header.tsv Normal file (1 line)

@@ -0,0 +1 @@
Word

run.py Normal file (153 lines)

@@ -0,0 +1,153 @@
import lzma
import matplotlib.pyplot as plt
from math import log
from collections import OrderedDict
from collections import Counter
import regex as re
from itertools import islice


def freq_list(g, top=None):
    c = Counter(g)
    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)
    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def get_words(t):
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def ngrams(iter, size):
    # Slide a window of `size` items over the iterable and yield tuples.
    ngram = []
    for item in iter:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


PREFIX_TRAIN = 'train'
words = []

counter_lines = 0
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        t_line = t_line.decode("utf-8")
        t_line = t_line.rstrip()
        e_line = e_line.rstrip()
        t_line_splitted_by_tab = t_line.split('\t')
        # Put the expected gap word back between its left and right context.
        t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
        words += t_line_cleared.split()
        counter_lines += 1
        if counter_lines > 90000:
            break

# lzmaFile = lzma.open('dev-0/in.tsv.xz', 'rb')
# content = lzmaFile.read().decode("utf-8")
# words = get_words(trainset)

ngrams_ = ngrams(words, 2)


def create_probabilities_bigrams(w_c, b_c):
    # For every bigram seen more than twice, store the pair
    # (P(second | first), P(first | second)) estimated from counts.
    probabilities_bigrams = {}
    for bigram, bigram_amount in b_c.items():
        if bigram_amount <= 2:
            continue
        p_word_before = bigram_amount / w_c[bigram[0]]
        p_word_after = bigram_amount / w_c[bigram[1]]
        probabilities_bigrams[bigram] = (p_word_before, p_word_after)
    return probabilities_bigrams


words_c = Counter(words)
word_ = ''
bigram_c = Counter(ngrams_)
ngrams_ = ''  # drop the exhausted generator reference
probabilities = create_probabilities_bigrams(words_c, bigram_c)
items = probabilities.items()
probabilities = OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))
items = ''
# sorted_by_freq = freq_list(ngrams)

PREFIX_VALID = 'test-A'


def count_probabilities(w_b, w_a, probs, w_c, b_c):
    # Collect bigrams that start with the word before the gap (w_b) or end
    # with the word after it (w_a), then combine them into candidate scores.
    results_before = {}
    results_after = {}
    for bigram, probses in probs.items():
        if len(results_before) > 20 or len(results_after) > 20:
            break
        if w_b == bigram[0]:
            results_before[bigram] = probses[0]
        if w_a == bigram[1]:
            results_after[bigram] = probses[1]
    a = 1
    best_ = {}
    for bigram, probses in results_before.items():
        for bigram_2, probses_2 in results_after.items():
            best_[bigram[1]] = probses * probses_2
    for bigram, probses in results_after.items():
        for bigram_2, probses_2 in results_before.items():
            if bigram[0] in best_:
                if probses * probses_2 < probses_2:
                    continue
            best_[bigram[0]] = probses * probses_2
    items = best_.items()
    return OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))


with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as train:
    for t_line in train:
        t_line = t_line.decode("utf-8")
        t_line = t_line.rstrip()
        t_line = t_line.replace('\\n', ' ')
        t_line_splitted_by_tab = t_line.split('\t')
        words_pre = t_line_splitted_by_tab[-2].split()
        words_po = t_line_splitted_by_tab[-1].split()
        w_pre = words_pre[-1]
        w_po = words_po[0]
        probs_ordered = count_probabilities(w_pre, w_po, probabilities, words_c, bigram_c)
        if len(probs_ordered) == 0:
            print("the:0.5 a:0.3 :0.2")
            continue
        result_string = ''
        counter_ = 0
        for word_, p in probs_ordered.items():
            if counter_ > 4:
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                word_cleared = re_.group(0)
                result_string += f"{word_cleared}:{str(p)} "
            else:
                if result_string == '':
                    result_string = "the:0.5 a:0.3 "
                continue
            counter_ += 1
        result_string += ':0.1'
        print(result_string)
a = 1  # no-op
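
In effect, count_probabilities scores a candidate gap word w by P(w | previous word) * P(next word | w), both estimated from bigram counts, and keeps the best few candidates. A more direct sketch of that scoring rule (an illustration of the idea, not a drop-in replacement for the function above):

def score_candidates(w_pre, w_po, bigram_c, words_c, top=5):
    # Score every word seen after w_pre by P(word | w_pre) * P(w_po | word),
    # estimated from the raw unigram and bigram counts built above.
    scores = {}
    for (first, second), count in bigram_c.items():
        if first == w_pre:
            follow = count / words_c[w_pre]                            # P(second | w_pre)
            lead = bigram_c.get((second, w_po), 0) / words_c[second]   # P(w_po | second)
            if follow * lead > 0:
                scores[second] = follow * lead
    return sorted(scores.items(), key=lambda kv: -kv[1])[:top]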

File diff suppressed because it is too large (6 files)

test-A/out.tsv Normal file (7414 lines)

File diff suppressed because it is too large

train.py (124 lines)

@@ -1,124 +0,0 @@
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts


def look_ahead_iterator(gen):
    # Turn a stream of tokens into (previous, current, next) trigrams.
    prev = None
    current = None
    next = None
    for next in gen:
        if prev is not None and current is not None:
            yield (prev, current, next)
        prev = current
        current = next


def get_word_lines_from_file(file_name):
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            if counter == 100000:
                break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


class Trigrams(IterableDataset):
    def load_vocab(self):
        with open("vocab.pickle", 'rb') as handle:
            vocab = pickle.load(handle)
        return vocab

    def __init__(self, text_file, vocabulary_size):
        self.vocab = self.load_vocab()
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))


vocab_size = scripts.vocab_size
train_dataset = Trigrams('train/in.tsv.xz', vocab_size)

# === training
from torch import nn
import torch
from torch.utils.data import DataLoader

embed_size = 100


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size * 2, vocabulary_size)
        self.linear_first_layer = nn.Linear(embedding_size * 2, embedding_size * 2)
        self.relu = nn.ReLU()
        # NOTE: nn.Softmax() without an explicit dim relies on PyTorch's
        # deprecated implicit-dim behaviour (dim=1 for 2-D input).
        self.softmax = nn.Softmax()
        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        # x is a pair of index batches: the word before and the word after the gap.
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])
        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)
        y = self.softmax(concated)
        return y


model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size)
vocab = train_dataset.vocab
device = 'cuda'
# The CPU model above is immediately replaced by a fresh instance on the GPU.
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=12800)
optimizer = torch.optim.Adam(model.parameters(), lr=scripts.learning_rate)
criterion = torch.nn.NLLLoss()

model.train()
step = 0
epochs = 4
for i in range(epochs):
    for x, y, z in data:
        # x = previous word, y = gap word (target), z = next word
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        optimizer.zero_grad()
        ypredicted = model([x, z])
        loss = criterion(torch.log(ypredicted), y)
        if step % 2000 == 0:
            print(step, loss)
            # torch.save(model.state_dict(), f'model1_{step}.bin')
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), f'batch_model_epoch_{i}.bin')
    print(step, loss, f'model_epoch_{i}.bin')

torch.save(model.state_dict(), 'model_tri1.bin')
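
A side note on the loss above (an observation, not a change to the code in this diff): composing Softmax in forward() with torch.log and NLLLoss can underflow once the model becomes confident. The numerically safer PyTorch idiom is to train on raw logits and let CrossEntropyLoss apply log-softmax internally, for example:

import torch
from torch import nn

# Hedged sketch, not the repository's code: a fake batch of 4 examples over a
# 10-word vocabulary, trained directly on unnormalized scores (logits).
logits = torch.randn(4, 10)
targets = torch.tensor([1, 0, 3, 7])
criterion = nn.CrossEntropyLoss()   # equivalent to LogSoftmax followed by NLLLoss
loss = criterion(logits, targets)
print(loss.item())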