Compare commits
No commits in common. "zad_10" and "master" have entirely different histories.
README.md (Normal file, 9 lines)
@@ -0,0 +1,9 @@
Challenging America word-gap prediction
===================================

Guess a word in a gap.

Evaluation metric
-----------------

LikelihoodHashed is the metric
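
For context (an illustration, not part of the diff): each output line encodes a probability distribution over gap words as space-separated word:probability pairs, with a trailing ":p" entry for the leftover mass; this is the format the inference scripts below emit. A minimal sketch with hypothetical predictions:

# Sketch: format one out.tsv line from (word, probability) pairs;
# the trailing ':p' entry carries the unassigned probability mass.
preds = [('the', 0.5), ('a', 0.25)]  # hypothetical model output
line = ' '.join(f'{w}:{p}' for w, p in preds) + f' :{1 - sum(p for _, p in preds)}'
print(line)  # the:0.5 a:0.25 :0.25
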
config.txt (Normal file, 1 line)
@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
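
Presumably the configuration for the GEval scorer: it selects the evaluation metric and output precision and points at the one-line header files (in-header.tsv, out-header.tsv) added below.
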
dev-0/out.tsv (21038 lines)
File diff suppressed because it is too large
generator.py (104 lines)
@@ -1,104 +0,0 @@
import os
import pickle

import numpy as np
import torch
from torch import nn

import utils

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = 'cuda'

vocab_size = utils.vocab_size

# Load the torchtext vocabulary pickled by the vocabulary-building script
# at the end of this diff.
with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])


class Model(nn.Module):
    """Bidirectional LSTM language model: embedding -> LSTM -> linear head."""

    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.lstm_size = 150
        self.embedding_dim = 200
        self.num_layers = 1

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
            # dropout=0.2,
        )
        # Both directions are concatenated, hence the doubled input width.
        self.fc = nn.Linear(self.lstm_size * 2, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        # Note: callers pass the batch dimension here (x.size()[0]).
        return (torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device),
                torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device))


model = Model(vocab_size=vocab_size).to(device)
model.load_state_dict(torch.load('lstm_step_10000.bin'))
model.eval()


def predict(model, text_splitted):
    model.eval()
    words = text_splitted

    x = torch.tensor([[vocab[w] for w in words]]).to(device)
    state_h, state_c = model.init_state(x.size()[0])
    y_pred, (state_h, state_c) = model(x, (state_h, state_c))

    # Distribution over the vocabulary for the token following the prompt.
    last_word_logits = y_pred[0][-1]
    p = torch.nn.functional.softmax(last_word_logits, dim=0)

    top = torch.topk(p, 10)
    top_indices = top.indices.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    if '<unk>' in top_words:
        top_words.remove('<unk>')

    # Uniform draw from the remaining top-10 candidates.
    return np.random.choice(top_words)


prompts = [
    'These, and a thousand other means, by which the wealth of a nation may be greatly increase',
    'Pants, coat and vest of the latest styles, are provided. Whenever the fires need coaling,',
    'Mr. Deddrick intends to clothe it and\ngive it as nearly as possible a likeness'
]
for p in prompts:
    answer = ''
    for i in range(10):
        answer += predict(model, p.split()) + ' '
    print('Prompt: ', p)
    print('Answer: ', answer)

# Prompt: These, and a thousand other means, by which the wealth of a nation may be greatly increase
# Answer: as the of as and to in to for in
# Prompt: Pants, coat and vest of the latest styles, are provided. Whenever the fires need coaling,
# Answer: in that The a the of the to the for
# Prompt: Mr. Deddrick intends to clothe it and
# give it as nearly as possible a likeness
# Answer: and of\nthe for man in of\nthe and of man of
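Note that predict() draws uniformly at random (np.random.choice without weights) from the top-10 candidates instead of sampling in proportion to their probabilities, which is consistent with the stop-word answers recorded in the comments above.
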
in-header.tsv (Normal file, 1 line)
@@ -0,0 +1 @@
FileId Year LeftContext RightContext
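
Each data row follows this tab-separated header; the scripts in this diff split on tabs and take the second-to-last field (LeftContext) and the last field (RightContext) as the text on either side of the gap.
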
inference.py (118 lines)
@@ -1,118 +0,0 @@
import lzma
import os
import pickle
import string

import torch
from torch import nn

import utils

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = 'cuda'

vocab_size = utils.vocab_size

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])


class Model(nn.Module):
    # Same architecture as in generator.py and lstm.py.
    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.lstm_size = 150
        self.embedding_dim = 200
        self.num_layers = 1

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
            # dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size * 2, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device),
                torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device))


model = Model(vocab_size=vocab_size).to(device)
model.load_state_dict(torch.load('lstm_step_10000.bin'))
model.eval()


def predict(model, text_splitted):
    """Return the top-64 candidate gap words and their softmax probabilities."""
    model.eval()
    words = text_splitted

    x = torch.tensor([[vocab[w] for w in words]]).to(device)
    state_h, state_c = model.init_state(x.size()[0])
    y_pred, (state_h, state_c) = model(x, (state_h, state_c))

    last_word_logits = y_pred[0][-1]
    p = torch.nn.functional.softmax(last_word_logits, dim=0)

    top = torch.topk(p, 64)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    return top_words, top_probs


inference_result = []
with lzma.open('test-A/in.tsv.xz', 'r') as file:
    for line in file:
        line = line.decode("utf-8")
        line = line.rstrip()
        line = line.translate(str.maketrans('', '', string.punctuation))
        line_splitted_by_tab = line.split('\t')
        left_context = line_splitted_by_tab[-2]

        left_context_splitted = list(utils.get_words_from_line(left_context))

        top_words, top_probs = predict(model, left_context_splitted)

        string_to_print = ''

        sum_probs = 0
        for w, p in zip(top_words, top_probs):
            if '<unk>' in w:  # skip the OOV token
                continue
            string_to_print += f"{w}:{p} "
            sum_probs += p

        if string_to_print == '':
            # Fallback distribution when every candidate was filtered out.
            inference_result.append("the:0.2 a:0.3 :0.5")
            continue
        # Leftover probability mass goes to the empty "unknown" entry.
        unknown_prob = 1 - sum_probs
        string_to_print += f":{unknown_prob}"

        inference_result.append(string_to_print)

with open('test-A/out.tsv', 'w') as f:
    for line in inference_result:
        f.write(line + '\n')
print('All done')
lstm.py (189 lines)
@@ -1,189 +0,0 @@
import lzma
import os
import pickle
import string  # used only by the commented-out inference below

import torch
from torch import nn, optim
from torch.utils.data import DataLoader, IterableDataset

import utils

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = 'cuda'

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])


def get_word_lines_from_file(file_name):
    """Yield (input, target) windows of seq_len tokens, target shifted by one."""
    counter = 0
    seq_len = 10
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 100000:
            #     break
            line = line.decode("utf-8")

            line_splitted = utils.get_words_from_line(line)

            vocab_line = [vocab[t] for t in line_splitted]

            for i in range(len(vocab_line) - seq_len):
                yield torch.tensor(vocab_line[i:i + seq_len]), torch.tensor(vocab_line[i + 1:i + seq_len + 1])


class Grams_10(IterableDataset):

    def __init__(self, text_file, vocab):
        self.vocab = vocab
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.text_file = text_file

    def __iter__(self):
        return get_word_lines_from_file(self.text_file)


vocab_size = utils.vocab_size

train_dataset = Grams_10('train/in.tsv.xz', vocab)

BATCH_SIZE = 1024


class Model(nn.Module):
    # Same architecture that generator.py and inference.py load at inference time.
    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.lstm_size = 150
        self.embedding_dim = 200
        self.num_layers = 1

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
            # dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size * 2, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device),
                torch.zeros(self.num_layers * 2, sequence_length, self.lstm_size).to(device))


def train(dataloader, model, max_epochs):
    model.train()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(max_epochs):
        step = 0
        for batch_i, (x, y) in enumerate(dataloader):
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x)
            # CrossEntropyLoss expects (batch, classes, seq), hence the transpose.
            loss = criterion(y_pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()

            step += 1
            if step % 500 == 0:
                print({'epoch': epoch, 'step': step, 'loss': loss.item()})
            if step % 5000 == 0:
                print({'epoch': epoch, 'step': step, 'loss': loss.item()})
                torch.save(model.state_dict(), f'lstm_step_{step}.bin')
        torch.save(model.state_dict(), f'lstm_epoch_{epoch}.bin')


print('Starting training')
model = Model(vocab_size=vocab_size).to(device)

dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE)
train(dataset, model, 1)
torch.save(model.state_dict(), 'lstm.bin')


# Commented-out dev-0 inference (same flow as inference.py, writing dev-0/out.tsv):
# def predict(model, text_splitted):
#     model.eval()
#     words = text_splitted
#
#     x = torch.tensor([[vocab[w] for w in words]]).to(device)
#     state_h, state_c = model.init_state(x.size()[0])
#
#     y_pred, (state_h, state_c) = model(x, (state_h, state_c))
#
#     last_word_logits = y_pred[0][-1]
#     p = torch.nn.functional.softmax(last_word_logits, dim=0)
#
#     top = torch.topk(p, 64)
#     top_indices = top.indices.tolist()
#     top_probs = top.values.tolist()
#     top_words = vocab.lookup_tokens(top_indices)
#     return top_words, top_probs
#
# print('Starting prediction')
# inference_result = []
# with lzma.open('dev-0/in.tsv.xz', 'r') as file:
#     for line in file:
#         line = line.decode("utf-8")
#         line = line.rstrip()
#         line = line.translate(str.maketrans('', '', string.punctuation))
#         line_splitted_by_tab = line.split('\t')
#         left_context = line_splitted_by_tab[-2]
#
#         left_context_splitted = list(utils.get_words_from_line(left_context))
#
#         top_words, top_probs = predict(model, left_context_splitted)
#
#         string_to_print = ''
#
#         sum_probs = 0
#         for w, p in zip(top_words, top_probs):
#             if '<unk>' in w:
#                 continue
#             string_to_print += f"{w}:{p} "
#             sum_probs += p
#
#         if string_to_print == '':
#             inference_result.append("the:0.2 a:0.3 :0.5")
#             continue
#         unknown_prob = 1 - sum_probs
#         string_to_print += f":{unknown_prob}"
#
#         inference_result.append(string_to_print)
#
# with open('dev-0/out.tsv', 'w') as f:
#     for line in inference_result:
#         f.write(line + '\n')
print('All done')
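For illustration (toy token ids, not repo data), the (input, target) pairs that get_word_lines_from_file yields are sliding windows offset by one token:

# Toy trace of the windowing in lstm.py, with seq_len = 3
# and made-up token ids for readability.
vocab_line = [5, 8, 2, 9, 4]
seq_len = 3
for i in range(len(vocab_line) - seq_len):
    x = vocab_line[i:i + seq_len]          # input window
    y = vocab_line[i + 1:i + seq_len + 1]  # same window shifted right by one
    print(x, y)
# [5, 8, 2] [8, 2, 9]
# [8, 2, 9] [2, 9, 4]
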
out-header.tsv (Normal file, 1 line)
@@ -0,0 +1 @@
Word
run.py (Normal file, 153 lines)
@@ -0,0 +1,153 @@
import lzma
from collections import Counter, OrderedDict

import regex as re


def freq_list(g, top=None):
    c = Counter(g)

    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)

    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def get_words(t):
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def ngrams(iterable, size):
    """Yield consecutive size-grams from an iterable of tokens."""
    ngram = []
    for item in iterable:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


PREFIX_TRAIN = 'train'
words = []

# Rebuild full training sentences: left context + expected gap word + right context.
counter_lines = 0
with lzma.open(f'{PREFIX_TRAIN}/in.tsv.xz', 'r') as train, open(f'{PREFIX_TRAIN}/expected.tsv', 'r') as expected:
    for t_line, e_line in zip(train, expected):
        t_line = t_line.decode("utf-8")

        t_line = t_line.rstrip()
        e_line = e_line.rstrip()

        t_line_splitted_by_tab = t_line.split('\t')

        t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]

        words += t_line_cleared.split()

        counter_lines += 1
        if counter_lines > 90000:
            break

ngrams_ = ngrams(words, 2)


def create_probabilities_bigrams(w_c, b_c):
    """For each bigram seen more than twice, store P(w2|w1) and P(w1|w2)."""
    probabilities_bigrams = {}
    for bigram, bigram_amount in b_c.items():
        if bigram_amount <= 2:
            continue
        p_word_before = bigram_amount / w_c[bigram[0]]
        p_word_after = bigram_amount / w_c[bigram[1]]
        probabilities_bigrams[bigram] = (p_word_before, p_word_after)

    return probabilities_bigrams


words_c = Counter(words)
bigram_c = Counter(ngrams_)
ngrams_ = ''  # free memory
probabilities = create_probabilities_bigrams(words_c, bigram_c)

items = probabilities.items()
probabilities = OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))
items = ''  # free memory

PREFIX_VALID = 'test-A'


def count_probabilities(w_b, w_a, probs, w_c, b_c):
    """Score gap candidates from the word before (w_b) and after (w_a) the gap."""
    results_before = {}
    results_after = {}
    for bigram, probses in probs.items():
        if len(results_before) > 20 or len(results_after) > 20:
            break
        if w_b == bigram[0]:
            results_before[bigram] = probses[0]
        if w_a == bigram[1]:
            results_after[bigram] = probses[1]

    best_ = {}

    for bigram, probses in results_before.items():
        for bigram_2, probses_2 in results_after.items():
            best_[bigram[1]] = probses * probses_2

    for bigram, probses in results_after.items():
        for bigram_2, probses_2 in results_before.items():
            if bigram[0] in best_:
                if probses * probses_2 < probses_2:
                    continue
            best_[bigram[0]] = probses * probses_2

    items = best_.items()
    return OrderedDict(sorted(items, key=lambda t: t[1], reverse=True))


with lzma.open(f'{PREFIX_VALID}/in.tsv.xz', 'r') as train:
    for t_line in train:
        t_line = t_line.decode("utf-8")

        t_line = t_line.rstrip()
        t_line = t_line.replace('\\n', ' ')  # the corpus stores newlines as literal "\n"

        t_line_splitted_by_tab = t_line.split('\t')

        words_pre = t_line_splitted_by_tab[-2].split()
        words_po = t_line_splitted_by_tab[-1].split()

        w_pre = words_pre[-1]
        w_po = words_po[0]

        probs_ordered = count_probabilities(w_pre, w_po, probabilities, words_c, bigram_c)
        if len(probs_ordered) == 0:
            print("the:0.5 a:0.3 :0.2")
            continue
        result_string = ''
        counter_ = 0
        for word_, p in probs_ordered.items():
            if counter_ > 4:
                break
            re_ = re.search(r'\p{L}+', word_)
            if re_:
                word_cleared = re_.group(0)
                result_string += f"{word_cleared}:{str(p)} "
            else:
                if result_string == '':
                    result_string = "the:0.5 a:0.3 "
                continue

            counter_ += 1
        result_string += ':0.1'
        print(result_string)
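
A quick worked example (made-up counts, not repo data) of the two conditional probabilities that create_probabilities_bigrams stores per bigram:

from collections import Counter

# Toy corpus: ('new', 'york') occurs 3 times; 'new' occurs 4 times, 'york' 3 times.
words = ['new', 'york', 'new', 'york', 'new', 'jersey', 'new', 'york']
unigrams = Counter(words)
bigrams = Counter(zip(words, words[1:]))

bigram = ('new', 'york')
p_word_before = bigrams[bigram] / unigrams[bigram[0]]  # 3/4: how often 'new' is followed by 'york'
p_word_after = bigrams[bigram] / unigrams[bigram[1]]   # 3/3: how often 'york' is preceded by 'new'
print(bigram, (p_word_before, p_word_after))  # ('new', 'york') (0.75, 1.0)
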
test-A/out.tsv (14828 lines)
File diff suppressed because it is too large
utils.py (25 lines)
@@ -1,25 +0,0 @@
# Shared tokenizer and constants used by the other scripts in this diff.


def get_words_from_line(line):
    """Whitespace-tokenize a single line."""
    line = line.strip()
    # yield '<s>'
    for m in line.split():
        yield m
    # yield '</s>'


vocab_size = 20000
device = 'cuda'
(unnamed file, 29 lines)
@@ -1,29 +0,0 @@
import lzma
import pickle

from torchtext.vocab import build_vocab_from_iterator

import utils


def get_word_lines_from_file(file_name):
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 4000:
            #     break
            line = line.decode("utf-8")
            yield utils.get_words_from_line(line)


vocab_size = utils.vocab_size

# Build a torchtext vocabulary over the training corpus, capped at vocab_size tokens.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>', '<empty>'])

with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)
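The vocab.pickle written here is the same vocabulary that generator.py, inference.py, and lstm.py load at startup.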