Compare commits

...

No commits in common. "zad_10" and "master" have entirely different histories.

13 changed files with 18098 additions and 18398 deletions

README.md Normal file

@@ -0,0 +1,9 @@
Challenging America word-gap prediction
===================================
Guess a word in a gap.
Evaluation metric
-----------------
LikelihoodHashed is the evaluation metric.
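
Each line of a submitted out.tsv is a probability distribution over candidate gap words, written as space-separated `word:probability` pairs plus a bare `:probability` entry carrying the leftover mass for all other words. An illustrative line (values made up):

the:0.2 a:0.3 :0.5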

config.txt Normal file

@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
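
These flags select the metric GEval reports, the number of decimal places, and the header files that describe the input and output TSV columns. Assuming GEval's usual command-line interface, a solution would presumably be scored from the challenge directory with something like:

./geval -t dev-0   # hypothetical invocation; -t names the test directory to score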

File diff suppressed because it is too large.


@@ -1,104 +0,0 @@
import os
import pickle

import numpy as np
import torch
from torch import nn

import utils

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = 'cuda'

vocab_size = utils.vocab_size

# Load the vocabulary built from the training data; unknown words map to '<unk>'.
with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])


class Model(nn.Module):
    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.lstm_size = 150
        self.embedding_dim = 200
        self.num_layers = 1
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
            # dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size * 2, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device),
                torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device))


model = Model(vocab_size=vocab_size).to(device)
model.load_state_dict(torch.load('lstm_step_10000.bin'))
model.eval()


def predict(model, text_splitted):
    """Sample one word from the model's top-10 next-word candidates for the given context."""
    model.eval()
    words = text_splitted
    x = torch.tensor([[vocab[w] for w in words]]).to(device)
    # x has shape (1, len(words)), so x.size()[0] is the batch dimension here.
    state_h, state_c = model.init_state(x.size()[0])
    y_pred, (state_h, state_c) = model(x, (state_h, state_c))
    last_word_logits = y_pred[0][-1]
    p = torch.nn.functional.softmax(last_word_logits, dim=0)
    top = torch.topk(p, 10)
    top_indices = top.indices.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    if '<unk>' in top_words:
        top_words.remove('<unk>')
    return np.random.choice(top_words)


prompts = [
    'These, and a thousand other means, by which the wealth of a nation may be greatly increase',
    'Pants, coat and vest of the latest styles, are provided. Whenever the fires need coaling,',
    'Mr. Deddrick intends to clothe it and\ngive it as nearly as possible a likeness'
]

# Each draw conditions on the original prompt only, so the "answer" is ten
# independently sampled next-word candidates rather than a continuation.
for p in prompts:
    answer = ''
    for i in range(10):
        answer += predict(model, p.split()) + ' '
    print('Prompt: ', p)
    print('Answer: ', answer)

# Prompt: These, and a thousand other means, by which the wealth of a nation may be greatly increase
# Answer: as the of as and to in to for in
# Prompt: Pants, coat and vest of the latest styles, are provided. Whenever the fires need coaling,
# Answer: in that The a the of the to the for
# Prompt: Mr. Deddrick intends to clothe it and
# give it as nearly as possible a likeness
# Answer: and of\nthe for man in of\nthe and of man of

geval Executable file

Binary file not shown.

in-header.tsv Normal file

@@ -0,0 +1 @@
FileId Year LeftContext RightContext
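
Each row of in.tsv carries these four tab-separated fields; LeftContext and RightContext are the text on either side of the gap, and the matching row of expected.tsv holds the gap word itself. A hypothetical example row, with tabs written as \t and contents invented for illustration:

file123.txt\t1867\tPants, coat and vest of the latest styles, are provided. Whenever the fires need coaling,\tthe engineer attends to it at once.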


@@ -1,118 +0,0 @@
import lzma
import os
import pickle
import string

import torch
from torch import nn

import utils

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = 'cuda'

vocab_size = utils.vocab_size

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])


class Model(nn.Module):
    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.lstm_size = 150
        self.embedding_dim = 200
        self.num_layers = 1
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
            # dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size * 2, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device),
                torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device))


model = Model(vocab_size=vocab_size).to(device)
model.load_state_dict(torch.load('lstm_step_10000.bin'))
model.eval()


def predict(model, text_splitted):
    """Return the model's top-64 next-word candidates and their probabilities."""
    model.eval()
    words = text_splitted
    x = torch.tensor([[vocab[w] for w in words]]).to(device)
    state_h, state_c = model.init_state(x.size()[0])
    y_pred, (state_h, state_c) = model(x, (state_h, state_c))
    last_word_logits = y_pred[0][-1]
    p = torch.nn.functional.softmax(last_word_logits, dim=0)
    top = torch.topk(p, 64)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    return top_words, top_probs


# Predict a distribution for every test line from the left context of the gap.
inference_result = []
with lzma.open('test-A/in.tsv.xz', 'r') as file:
    for line in file:
        line = line.decode("utf-8")
        line = line.rstrip()
        line = line.translate(str.maketrans('', '', string.punctuation))
        line_splitted_by_tab = line.split('\t')
        left_context = line_splitted_by_tab[-2]
        left_context_splitted = list(utils.get_words_from_line(left_context))
        top_words, top_probs = predict(model, left_context_splitted)
        string_to_print = ''
        sum_probs = 0
        for w, p in zip(top_words, top_probs):
            if '<unk>' in w:
                continue
            string_to_print += f"{w}:{p} "
            sum_probs += p
        if string_to_print == '':
            # Fallback distribution when only '<unk>' was predicted.
            inference_result.append("the:0.2 a:0.3 :0.5")
            continue
        # The remaining probability mass is assigned to the unknown-word bucket.
        unknown_prob = 1 - sum_probs
        string_to_print += f":{unknown_prob}"
        inference_result.append(string_to_print)

with open('test-A/out.tsv', 'w') as f:
    for line in inference_result:
        f.write(line + '\n')

print('All done')

lstm.py

@@ -1,189 +0,0 @@
import lzma
import os
import pickle

import torch
from torch import nn, optim
from torch.utils.data import DataLoader, IterableDataset

import utils

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = 'cuda'

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])


def get_word_lines_from_file(file_name):
    """Yield (input, target) windows of seq_len token ids, with the target shifted by one word."""
    counter = 0
    seq_len = 10
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 100000:
            #     break
            line = line.decode("utf-8")
            line_splitted = utils.get_words_from_line(line)
            vocab_line = [vocab[t] for t in line_splitted]
            for i in range(len(vocab_line) - seq_len):
                yield torch.tensor(vocab_line[i:i + seq_len]), torch.tensor(vocab_line[i + 1:i + seq_len + 1])


class Grams_10(IterableDataset):
    def __init__(self, text_file, vocab):
        self.vocab = vocab
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.text_file = text_file

    def __iter__(self):
        return get_word_lines_from_file(self.text_file)


vocab_size = utils.vocab_size
train_dataset = Grams_10('train/in.tsv.xz', vocab)
BATCH_SIZE = 1024


class Model(nn.Module):
    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.lstm_size = 150
        self.embedding_dim = 200
        self.num_layers = 1
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
            # dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size * 2, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device),
                torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device))


def train(dataloader, model, max_epochs):
    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(max_epochs):
        step = 0
        for batch_i, (x, y) in enumerate(dataloader):
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            y_pred, (state_h, state_c) = model(x)
            # CrossEntropyLoss expects (batch, vocab, seq_len), hence the transpose.
            loss = criterion(y_pred.transpose(1, 2), y)
            loss.backward()
            optimizer.step()
            step += 1
            if step % 500 == 0:
                print({'epoch': epoch, 'step': step, 'loss': loss.item()})
            if step % 5000 == 0:
                # Checkpoint every 5000 steps.
                print({'epoch': epoch, 'step': step, 'loss': loss.item()})
                torch.save(model.state_dict(), f'lstm_step_{step}.bin')
        torch.save(model.state_dict(), f'lstm_epoch_{epoch}.bin')


print('Starting training')
model = Model(vocab_size=vocab_size).to(device)
dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE)
train(dataset, model, 1)
torch.save(model.state_dict(), 'lstm.bin')
# Dev-set prediction kept commented out; a separate inference script performs this step.
# def predict(model, text_splitted):
#     model.eval()
#     words = text_splitted
#     x = torch.tensor([[vocab[w] for w in words]]).to(device)
#     state_h, state_c = model.init_state(x.size()[0])
#     y_pred, (state_h, state_c) = model(x, (state_h, state_c))
#     last_word_logits = y_pred[0][-1]
#     p = torch.nn.functional.softmax(last_word_logits, dim=0)
#     top = torch.topk(p, 64)
#     top_indices = top.indices.tolist()
#     top_probs = top.values.tolist()
#     top_words = vocab.lookup_tokens(top_indices)
#     return top_words, top_probs
#
# print('Starting prediction')
# inference_result = []
# with lzma.open('dev-0/in.tsv.xz', 'r') as file:
#     for line in file:
#         line = line.decode("utf-8")
#         line = line.rstrip()
#         line = line.translate(str.maketrans('', '', string.punctuation))
#         line_splitted_by_tab = line.split('\t')
#         left_context = line_splitted_by_tab[-2]
#         left_context_splitted = list(utils.get_words_from_line(left_context))
#         top_words, top_probs = predict(model, left_context_splitted)
#         string_to_print = ''
#         sum_probs = 0
#         for w, p in zip(top_words, top_probs):
#             if '<unk>' in w:
#                 continue
#             string_to_print += f"{w}:{p} "
#             sum_probs += p
#         if string_to_print == '':
#             inference_result.append("the:0.2 a:0.3 :0.5")
#             continue
#         unknown_prob = 1 - sum_probs
#         string_to_print += f":{unknown_prob}"
#         inference_result.append(string_to_print)
# with open('dev-0/out.tsv', 'w') as f:
#     for line in inference_result:
#         f.write(line + '\n')

print('All done')
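
For orientation, a minimal shape check of the Model trained above; the batch size and sequence length simply mirror BATCH_SIZE and seq_len from lstm.py, and the snippet assumes Model, vocab_size and device are already defined as in that script:

import torch

m = Model(vocab_size=vocab_size).to(device)
x = torch.randint(0, vocab_size, (1024, 10)).to(device)  # (batch, seq_len) token ids
logits, (h, c) = m(x)
print(logits.shape)  # torch.Size([1024, 10, 20000]) - one distribution over the vocabulary per position
print(h.shape)       # torch.Size([2, 1024, 150])    - 2 = num_layers * num_directions (bidirectional)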

out-header.tsv Normal file

@@ -0,0 +1 @@
Word

run.py Normal file

@@ -0,0 +1,153 @@
import lzma
from collections import Counter, OrderedDict

import regex as re


def freq_list(g, top=None):
    c = Counter(g)
    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)
    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def get_words(t):
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def ngrams(iter, size):
    """Slide a window of `size` items over the iterable and yield tuples."""
    ngram = []
    for item in iter:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


# Rebuild the full training text as left context + gap word + right context.
PREFIX_TRAIN = 'train'
words = []
counter_lines = 0
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        t_line = t_line.decode("utf-8")
        t_line = t_line.rstrip()
        e_line = e_line.rstrip()
        t_line_splitted_by_tab = t_line.split('\t')
        t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
        words += t_line_cleared.split()
        counter_lines += 1
        if counter_lines > 90000:
            break

ngrams_ = ngrams(words, 2)


def create_probabilities_bigrams(w_c, b_c):
    """For each bigram seen more than twice, store P(bigram | first word) and P(bigram | second word)."""
    probabilities_bigrams = {}
    for bigram, bigram_amount in b_c.items():
        if bigram_amount <= 2:
            continue
        p_word_before = bigram_amount / w_c[bigram[0]]
        p_word_after = bigram_amount / w_c[bigram[1]]
        probabilities_bigrams[bigram] = (p_word_before, p_word_after)
    return probabilities_bigrams


words_c = Counter(words)
bigram_c = Counter(ngrams_)
ngrams_ = ''  # free memory
probabilities = create_probabilities_bigrams(words_c, bigram_c)
items = probabilities.items()
probabilities = OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))
items = ''

PREFIX_VALID = 'test-A'


def count_probabilities(w_b, w_a, probs, w_c, b_c):
    """Score gap candidates from bigrams that end with the word before the gap or start with the word after it."""
    results_before = {}
    results_after = {}
    for bigram, probses in probs.items():
        if len(results_before) > 20 or len(results_after) > 20:
            break
        if w_b == bigram[0]:
            results_before[bigram] = probses[0]
        if w_a == bigram[1]:
            results_after[bigram] = probses[1]
    best_ = {}
    for bigram, probses in results_before.items():
        for bigram_2, probses_2 in results_after.items():
            best_[bigram[1]] = probses * probses_2
    for bigram, probses in results_after.items():
        for bigram_2, probses_2 in results_before.items():
            if bigram[0] in best_:
                if probses * probses_2 < probses_2:
                    continue
            best_[bigram[0]] = probses * probses_2
    items = best_.items()
    return OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))


with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as train:
    for t_line in train:
        t_line = t_line.decode("utf-8")
        t_line = t_line.rstrip()
        t_line = t_line.replace('\\n', ' ')
        t_line_splitted_by_tab = t_line.split('\t')
        words_pre = t_line_splitted_by_tab[-2].split()
        words_po = t_line_splitted_by_tab[-1].split()
        w_pre = words_pre[-1]
        w_po = words_po[0]
        probs_ordered = count_probabilities(w_pre, w_po, probabilities, words_c, bigram_c)
        if len(probs_ordered) == 0:
            # No matching bigrams: fall back to a fixed distribution.
            print("the:0.5 a:0.3 :0.2")
            continue
        result_string = ''
        counter_ = 0
        for word_, p in probs_ordered.items():
            if counter_ > 4:
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                word_cleared = re_.group(0)
                result_string += f"{word_cleared}:{str(p)} "
            else:
                if result_string == '':
                    result_string = "the:0.5 a:0.3 "
                continue
            counter_ += 1
        result_string += ':0.1'
        print(result_string)
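
A small worked example of the two conditional probabilities that create_probabilities_bigrams stores for each bigram; the counts here are hypothetical:

from collections import Counter

# Hypothetical counts for illustration only.
words_c = Counter({'the': 100, 'cat': 20})
bigram_c = Counter({('the', 'cat'): 10})

for bigram, n in bigram_c.items():
    p_word_before = n / words_c[bigram[0]]  # P(bigram | first word)  = 10 / 100 = 0.1
    p_word_after = n / words_c[bigram[1]]   # P(bigram | second word) = 10 / 20  = 0.5
    print(bigram, p_word_before, p_word_after)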

File diff suppressed because it is too large.


@ -1,25 +0,0 @@
def get_words_from_line(line):
    """Yield the whitespace-separated tokens of a line."""
    line = line.strip()
    # yield '<s>'
    for m in line.split():
        yield m
    # yield '</s>'


# Shared constants used by the training and inference scripts.
vocab_size = 20000
device = 'cuda'


@@ -1,29 +0,0 @@
import lzma
import pickle

from torchtext.vocab import build_vocab_from_iterator

import utils


def get_word_lines_from_file(file_name):
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 4000:
            #     break
            line = line.decode("utf-8")
            yield utils.get_words_from_line(line)


# Build a vocabulary of the vocab_size most frequent tokens from the training data.
vocab_size = utils.vocab_size
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>', '<empty>'])

with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)