retro-gap/full_pipeline.ipynb

import re
import numpy as np
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with open("stopwords.txt", "r+") as f:
  stop_words = f.read().split("\n")
def clean_text(text):
    split = text.lower().split(" ")

    # removing punctuation
    clean = []
    for token in split:
        token = re.sub(r'[^\w\s]', '', token)
        if token:
            clean.append(token)
    return clean
def prepare_corpus(texts, min_count=1, min_word_len=1):
    """Map each kept token to an integer index.

    A token receives an index the moment its count reaches min_count; tokens
    shorter than min_word_len or listed in stop_words are ignored.
    """
    corpus = {}
    counters = defaultdict(lambda: 0)
    idx_counter = 0
    for text in texts:

        # add to corpus
        for token in text:
            if len(token) < min_word_len or token in stop_words:
                continue
            counters[token] += 1
            if token not in corpus and counters[token] == min_count:
                corpus[token] = idx_counter
                idx_counter += 1
    return corpus
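A quick sanity check on made-up toy sentences (not part of the training data): with min_count=2 a token only receives an index once it has been seen twice, and indices are assigned in the order that threshold is reached.

toy_texts = [
    ["ala", "ma", "kota"],
    ["kot", "ma", "mleko"],
    ["ala", "pije", "mleko"],
]
print(prepare_corpus(toy_texts, min_count=2, min_word_len=1))
# Roughly {'ma': 0, 'ala': 1, 'mleko': 2} – minus any token that happens to be in stopwords.txt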

class WordCorpus:
    def __init__(self, corpus=None, texts=None, min_count=1, min_word_len=1):
        if corpus:
            self.corpus = corpus
        else:
            self.corpus = prepare_corpus(texts, min_count, min_word_len)

    def get_word_idx(self, token):
        token = token.lower()
        token = re.sub(r'[^\w\s]', '', token)

        return self.corpus.get(token, None)

    def get_embedding(self, token, encode=False):
        # One-hot vector for a single token. With encode=True the token is
        # assumed to already be an integer index into the corpus; otherwise it
        # is a raw string that gets cleaned and looked up first.
        embedding = np.zeros(len(self.corpus), dtype=np.int32)
        if encode:
            token_idx = token
        else:
            token = token.lower()
            token = re.sub(r'[^\w\s]', '', token)
            if not token or token not in self.corpus:
                return embedding

            token_idx = self.corpus[token]
        embedding[token_idx] = 1
        return embedding

    def get_bow(self, text, encode=False):
        # Bag-of-words vector for a sequence of already-encoded token indices.
        # encode=True sums one-hot embeddings; encode=False increments counts
        # directly, which is faster but otherwise equivalent.
        if encode:
            embeddings = [
                self.get_embedding(token, encode) for token in text
            ]

            return np.sum(embeddings, axis=0)
        else:
            bow = np.zeros(len(self.corpus), dtype=np.int32)
            for token in text:
                bow[token] += 1
            return bow
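A minimal usage sketch with a tiny hand-built mapping (purely illustrative); note that get_bow is fed already-encoded indices, which is how it is called later in the pipeline.

toy = WordCorpus(corpus={"kota": 0, "mleko": 1, "pije": 2})
print(toy.get_word_idx("Kota,"))             # 0 – lowercased, punctuation stripped
print(toy.get_embedding("mleko"))            # [0 1 0] – one-hot vector
print(toy.get_bow([0, 2, 2], encode=False))  # [1 0 2] – counts per index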
def load_train_data(train_path):
    texts = []
    with open(train_path, "r") as file:
        for line in file:
            # Column 5 of the tab-separated line holds the text to train on.
            _, _, _, _, text, *_ = line.split("\t")
            texts.append(clean_text(text))
    print(f"Loaded {len(texts)} texts from train_set.")
    return texts
class LanguageNeuralModel(nn.Module):
    def __init__(self, corpus_size, hidden_size):
        super().__init__()
        self.input = nn.Linear(corpus_size, hidden_size)
        self.hidden = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, corpus_size)

    def forward(self, x):
        x = self.input(x)
        x = F.relu(x)
        x = self.hidden(x)
        x = F.relu(x)

        x = self.output(x)
        return x
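A quick shape check with a toy vocabulary size (arbitrary, for illustration only): the model maps a bag-of-words context vector to one unnormalised score per vocabulary word; softmax/cross-entropy are applied outside the module.

toy_model = LanguageNeuralModel(corpus_size=1000, hidden_size=250)
dummy = torch.rand(4, 1000)    # batch of 4 bag-of-words context vectors
print(toy_model(dummy).shape)  # torch.Size([4, 1000])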
def get_random_word_with_contexts(text, context_size):
    # Pick a random position that has a full context window on both sides;
    # return (None, None) when the text is too short.
    allowed_indexes = np.arange(context_size, len(text) - context_size)
    if not len(allowed_indexes):
        return None, None
    word_idx = np.random.choice(allowed_indexes)
    word = text[word_idx]
    context = text[(word_idx - context_size):word_idx] + text[(word_idx + 1):(word_idx + 1 + context_size)]
    return word, context
a = clean_text("Ala ma kota , kot pije mleko")
get_random_word_with_contexts(a, 2)
('kota', ['ala', 'ma', 'kot', 'pije'])
train_texts = load_train_data("drive/MyDrive/train.tsv")
Loaded 107471 texts from train_set.
corpus = WordCorpus(texts=train_texts, min_count=20, min_word_len=5)
len(corpus.corpus)
111418
def remove_words_outside_corpus_and_encode(text, corpus):
    return [corpus.get_word_idx(token) for token in text if token in corpus.corpus]
train_texts = [remove_words_outside_corpus_and_encode(text, corpus) for text in train_texts]
BATCH_SIZE = 96
CONTEXT_SIZE = 15
def get_batch(texts):
    # Sample BATCH_SIZE (context, target) pairs: X holds normalised
    # bag-of-words context vectors, y the encoded target word indices.
    X, y = [], []
    size = len(texts)
    for _ in range(BATCH_SIZE):
        target = None
        while target is None:
            # Resample until we hit a text long enough for a full context window.
            text_idx = np.random.randint(size)
            text = texts[text_idx]
            target, context = get_random_word_with_contexts(text, CONTEXT_SIZE)
        bow = corpus.get_bow(context, encode=False)
        X.append(bow)
        y.append(target)
    return (np.array(X) / (CONTEXT_SIZE * 2)).astype(np.float32), np.array(y).astype(np.int64)
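An optional sanity check (an extra cell, not in the original notebook) confirming the batch shapes once the corpus and the encoded train_texts from the cells above are available:

X, y = get_batch(train_texts)
print(X.shape, X.dtype)  # (96, 111418) float32 – normalised context bag-of-words
print(y.shape, y.dtype)  # (96,) int64 – encoded target words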
model = LanguageNeuralModel(len(corpus.corpus), 250)
model = model.to(device)
model.train()
LanguageNeuralModel(
  (input): Linear(in_features=111418, out_features=250, bias=True)
  (hidden): Linear(in_features=250, out_features=250, bias=True)
  (output): Linear(in_features=250, out_features=111418, bias=True)
)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
from tqdm.notebook import tqdm
running_loss = 0.0

for i in tqdm(range(20000)):
    X, y = get_batch(train_texts)
    X, y = torch.from_numpy(X).to(device), torch.from_numpy(y).to(device)

    optimizer.zero_grad()

    outputs = model(X)
    loss = criterion(outputs, y)

    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if i % 500 == 499:
        torch.save(model.state_dict(), "model.pth")
        print('[%d, %5d] loss: %.3f' %
              (1, i + 1, running_loss / 500))
        running_loss = 0.0
[1,   500] loss: 10.873
[1,  1000] loss: 10.559
[1,  1500] loss: 10.505
[1,  2000] loss: 10.437
[1,  2500] loss: 10.371
[1,  3000] loss: 10.371
[1,  3500] loss: 10.336
[1,  4000] loss: 10.338
[1,  4500] loss: 10.325
[1,  5000] loss: 10.325
[1,  5500] loss: 10.335
[1,  6000] loss: 10.366
[1,  6500] loss: 10.366
[1,  7000] loss: 10.377
[1,  7500] loss: 10.392
[1,  8000] loss: 10.422
[1,  8500] loss: 10.477
[1,  9000] loss: 10.525
[1,  9500] loss: 10.562
[1, 10000] loss: 10.593
[1, 10500] loss: 10.657
[1, 11000] loss: 10.711
[1, 11500] loss: 10.706
[1, 12000] loss: 10.781
[1, 12500] loss: 10.799
[1, 13000] loss: 10.875
[1, 13500] loss: 10.882
[1, 14000] loss: 10.921
[1, 14500] loss: 10.946
[1, 15000] loss: 10.979
[1, 15500] loss: 11.001
[1, 16000] loss: 11.032
[1, 16500] loss: 11.069
[1, 17000] loss: 11.090
[1, 17500] loss: 11.112
[1, 18000] loss: 11.119
[1, 18500] loss: 11.132
[1, 19000] loss: 11.212
[1, 19500] loss: 11.188
[1, 20000] loss: 11.213

model.eval()
LanguageNeuralModel(
  (input): Linear(in_features=111418, out_features=250, bias=True)
  (hidden): Linear(in_features=250, out_features=250, bias=True)
  (output): Linear(in_features=250, out_features=111418, bias=True)
)
sets_to_eval = ["drive/MyDrive/dev0/", "drive/MyDrive/dev1/", "drive/MyDrive/test/"]
def load_test_data(test_path, corpus):
    texts = []
    with open(test_path, "r") as file:
        for line in file:
            # Columns 3 and 4 hold the left and right context of the gap.
            _, _, left, right, *_ = line.split("\t")
            texts.append(
                (
                    remove_words_outside_corpus_and_encode(clean_text(left), corpus),
                    remove_words_outside_corpus_and_encode(clean_text(right), corpus)
                )
            )
    print(f"Loaded {len(texts)} texts from {test_path}.")
    return texts
words = list(corpus.corpus)

def format_predictions(out):
    # Turn one softmax row into the out.tsv format: the 10000 most likely
    # words as "word:logprob" pairs, then ":logprob" of the remaining mass.
    indexes = np.argsort(out)[::-1]
    res = ""
    prob0 = 1.
    for idx in indexes[:10000]:
        prob0 -= out[idx]
        res += f"{words[idx]}:{np.log(out[idx])} "
    res += f":{np.log(prob0)}"
    return res

with torch.no_grad():
    for path in sets_to_eval:
        data = load_test_data(path + "in.tsv", corpus)
        results = []
        for start in tqdm(range(0, len(data), BATCH_SIZE)):
            batch = []
            for left, right in data[start:start + BATCH_SIZE]:
                # The gap sits between `left` and `right`; take CONTEXT_SIZE
                # encoded words from each side as the model input.
                context = left[-CONTEXT_SIZE:] + right[:CONTEXT_SIZE]
                batch.append(corpus.get_bow(context, encode=False))
            X = (np.array(batch) / (2 * CONTEXT_SIZE)).astype(np.float32)
            X = torch.from_numpy(X).to(device)
            out = F.softmax(model(X), dim=1).cpu().numpy()
            # One prediction line per example, including the final partial batch.
            for row in out:
                results.append(format_predictions(row))
        with open(path + "out.tsv", "w+") as f:
            f.write("\n".join(results))
Loaded 19986 texts from drive/MyDrive/dev0/in.tsv.
Loaded 11628 texts from drive/MyDrive/dev1/in.tsv.
Loaded 14132 texts from drive/MyDrive/test/in.tsv.
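For reference, each line of out.tsv holds the 10000 most likely words as word:log-probability pairs, followed by a trailing :log-probability for the remaining mass. A small sketch (a hypothetical helper, not part of the original pipeline) of how such a line can be parsed back:

def parse_prediction_line(line):
    # Split into (word, log_prob) pairs plus the log-mass left for all other words.
    *pairs, rest = line.strip().split(" ")
    preds = []
    for pair in pairs:
        word, logp = pair.rsplit(":", 1)
        preds.append((word, float(logp)))
    leftover = float(rest.lstrip(":"))
    return preds, leftover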