retro-gap/full_pipeline.ipynb at 5a7d6dc854fd9440aff238f029ca344760e6ffdd

Eryk Sokołowski 5a7d6dc854 corrected

2021-01-12 23:00:18 +01:00

59 KiB

Raw Blame History

import re
import numpy as np
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Stop words are stored one per line. The file is only read, so it is opened
# read-only: "r+" needlessly requires write permission and fails on read-only
# storage.
with open("stopwords.txt", "r") as f:
  stop_words = f.read().split("\n")

def clean_text(text):
    """Tokenise *text* into lower-cased tokens with punctuation removed.

    The text is lower-cased and split on single space characters only.
    Every character that is neither a word character nor whitespace is
    deleted from each piece, and pieces that end up empty are dropped.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    stripped = (
        re.sub(r'[^\w\s]', '', piece) for piece in text.lower().split(" ")
    )
    return [piece for piece in stripped if piece]

def prepare_corpus(texts, min_count=1, min_word_len=1):
    """Build a token -> index vocabulary from tokenised texts.

    Args:
        texts: iterable of token sequences.
        min_count: a token enters the vocabulary once it has been seen this
            many times. Values below 1 behave like 1.
        min_word_len: tokens shorter than this are ignored.

    Returns:
        A dict mapping each accepted token to a consecutive integer index,
        numbered in the order the tokens reached ``min_count``.

    Tokens listed in the module-level ``stop_words`` are ignored and are not
    counted.
    """
    # A set gives O(1) membership tests; scanning the stop-word list for
    # every token of every text is needlessly linear.
    excluded = set(stop_words)

    corpus = {}
    counters = defaultdict(int)
    for text in texts:
        for token in text:
            if len(token) < min_word_len or token in excluded:
                continue
            counters[token] += 1
            # ">=" instead of "==": identical for min_count >= 1 (the token is
            # added the moment its count first reaches the threshold), but a
            # threshold of 0 or less no longer yields an empty vocabulary.
            if token not in corpus and counters[token] >= min_count:
                corpus[token] = len(corpus)
    return corpus

# NOTE(review): this module-level counter is not referenced by any code visible
# in this file (prepare_corpus creates its own local `counters`). It looks like
# a leftover - confirm before removing.
counters = defaultdict(lambda: 0)

class WordCorpus:
    """Vocabulary wrapper that maps tokens to indices and builds count vectors.

    ``self.corpus`` is a dict mapping a normalised token to its integer index.
    """

    def __init__(self, corpus=None, texts=None, min_count=1, min_word_len=1):
        """Use a ready-made vocabulary, or build one from tokenised texts.

        Args:
            corpus: existing token -> index dict. When it is ``None`` or
                empty, the vocabulary is built from ``texts`` instead.
            texts: iterable of token sequences passed to ``prepare_corpus``.
            min_count: forwarded to ``prepare_corpus``.
            min_word_len: forwarded to ``prepare_corpus``.
        """
        if corpus:
            self.corpus = corpus
        else:
            self.corpus = prepare_corpus(texts, min_count, min_word_len)

    @staticmethod
    def _normalize(token):
        """Lower-case ``token`` and delete non-word, non-whitespace characters."""
        return re.sub(r'[^\w\s]', '', token.lower())

    def get_word_idx(self, token):
        """Return the vocabulary index of ``token``, or ``None`` if unknown."""
        return self.corpus.get(self._normalize(token), None)

    def get_embedding(self, token, encode=False):
        """Return a one-hot ``int32`` vector of vocabulary length.

        Args:
            token: a vocabulary index when ``encode`` is true, otherwise a
                string that is normalised and looked up.
            encode: whether ``token`` is already an index.

        Returns:
            The one-hot vector, or an all-zero vector when the token is
            unknown, normalises to an empty string, or the index is ``None``.
        """
        embedding = np.zeros(len(self.corpus), dtype=np.int32)
        if encode:
            token_idx = token
        else:
            normalized = self._normalize(token)
            token_idx = self.corpus.get(normalized) if normalized else None

        # get_word_idx reports unknown words as None. Indexing a NumPy array
        # with None is "newaxis", so assigning through it would silently set
        # every position to 1.
        if token_idx is None:
            return embedding
        embedding[token_idx] = 1
        return embedding

    def get_bow(self, text, encode=False):
        """Return a vector counting how often each index occurs in ``text``.

        Args:
            text: iterable of vocabulary indices. ``None`` entries (unknown
                words) are skipped.
            encode: kept for backward compatibility. Both settings have
                always interpreted ``text`` as indices and yield the same
                counts.

        Returns:
            An ``int32`` vector of vocabulary length. An empty ``text`` gives
            an all-zero vector for either value of ``encode``.
        """
        # One accumulator for both modes: summing a list of one-hot vectors
        # allocated len(text) full-size arrays and collapsed to a scalar 0.0
        # for an empty text.
        bow = np.zeros(len(self.corpus), dtype=np.int32)
        for token_idx in text:
            if token_idx is None:
                continue
            bow[token_idx] += 1
        return bow

def load_train_data(train_path):
    """Read the training file and return its texts as token lists.

    Each line is split on tabs and the fifth field is tokenised with
    ``clean_text``; any further fields are ignored.

    Args:
        train_path: path of the tab-separated training file.

    Returns:
        A list with one token list per line of the file.

    Raises:
        ValueError: if a line has fewer than five tab-separated fields.
    """
    texts = []
    # Read-only mode: the file is never written, and "r+" fails when the
    # caller lacks write permission on the data.
    with open(train_path, "r") as file:
        for line in file:
            _, _, _, _, text, *_ = line.split("\t")
            texts.append(clean_text(text))
    print(f"Loaded {len(texts)} texts from train_set.")
    return texts

class LanguageNeuralModel(nn.Module):
    """Feed-forward network with two ReLU layers and a linear output layer.

    Inputs have ``corpus_size`` features, both inner layers have
    ``hidden_size`` units, and the output has ``corpus_size`` values again.
    No softmax is applied, so ``forward`` returns raw scores.
    """

    def __init__(self, corpus_size, hidden_size):
        super().__init__()
        # The attribute names are the state_dict key prefixes, so they must
        # stay as they are for saved checkpoints to load.
        self.input = nn.Linear(in_features=corpus_size, out_features=hidden_size)
        self.hidden = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.output = nn.Linear(in_features=hidden_size, out_features=corpus_size)

    def forward(self, x):
        """Return the output-layer scores for the input batch ``x``."""
        first = F.relu(self.input(x))
        second = F.relu(self.hidden(first))
        return self.output(second)

def get_random_word_with_contexts(text, context_size):
    """Pick a random element of ``text`` together with its surrounding context.

    Only positions with at least ``context_size`` elements on both sides are
    eligible.

    Returns:
        ``(word, context)``, where ``context`` is the ``context_size``
        elements before the word followed by the ``context_size`` elements
        after it. ``(None, None)`` when no position is eligible.
    """
    candidates = np.arange(context_size, len(text) - context_size)
    if len(candidates) == 0:
        return None, None

    centre = np.random.choice(candidates)
    before = text[(centre - context_size):centre]
    after = text[(centre + 1):(centre + 1 + context_size)]
    return text[centre], before + after

# Quick sanity check: tokenise a sample sentence and draw one random word
# together with two context tokens on each side.
a = clean_text("Ala ma kota , kot pije mleko")
get_random_word_with_contexts(a, 2)

('kot', ['ma', 'kota', 'pije', 'mleko'])

# Load the training file; each entry of train_texts is a list of tokens.
train_texts = load_train_data("drive/MyDrive/train.tsv")

Loaded 107471 texts from train_set.

# Build the vocabulary from the training texts: a token is kept once it has
# been seen 20 times, and tokens shorter than 5 characters are ignored.
corpus = WordCorpus(texts=train_texts, min_count=20, min_word_len=5)

# Vocabulary size; it is also used as the model's input and output width.
len(corpus.corpus)

def remove_words_outside_corpus_and_encode(text, corpus):
    """Encode the tokens of ``text`` that are keys of ``corpus.corpus``.

    Tokens missing from the vocabulary are dropped. Every remaining token is
    replaced by the result of ``corpus.get_word_idx``, keeping the order.
    """
    vocabulary = corpus.corpus
    encoded = []
    for token in text:
        if token in vocabulary:
            encoded.append(corpus.get_word_idx(token))
    return encoded

# Replace every training text by the list of vocabulary indices of its
# in-vocabulary tokens. From here on train_texts holds integers, not strings.
train_texts = [remove_words_outside_corpus_and_encode(text, corpus) for text in train_texts]

# Number of examples drawn per batch.
BATCH_SIZE = 96
# Number of context tokens taken on EACH side of the target word.
CONTEXT_SIZE = 15

import time

def get_batch(texts):
    """Sample one training batch of ``BATCH_SIZE`` examples from ``texts``.

    For every example a random text is drawn until one yields a target word
    with a full context of ``CONTEXT_SIZE`` elements on each side. The
    context is turned into a count vector via the global ``corpus``.

    Returns:
        ``(X, y)``: ``X`` is a float32 array of count vectors divided by
        ``2 * CONTEXT_SIZE``, ``y`` is an int64 array of the target words.
    """
    n_texts = len(texts)
    features, targets = [], []
    for _ in range(BATCH_SIZE):
        target = None
        # Texts without an eligible position give (None, None); redraw.
        while target is None:
            sample = texts[np.random.randint(n_texts)]
            target, context = get_random_word_with_contexts(sample, CONTEXT_SIZE)
        features.append(corpus.get_bow(context, encode=False))
        targets.append(target)

    inputs = np.array(features) / (CONTEXT_SIZE * 2)
    return inputs.astype(np.float32), np.array(targets).astype(np.int64)

# Input and output width equal the vocabulary size; the hidden width is 250.
model = LanguageNeuralModel(len(corpus.corpus), 250)

model = model.to(device)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
# NOTE(review): the training loop saves to "model.pth" in the working
# directory, not to this path - confirm which checkpoint is meant to be resumed.
model.load_state_dict(torch.load("drive/MyDrive/model.pth", map_location=device))

<All keys matched successfully>

# Put the module into training mode before the optimisation loop.
model.train()

LanguageNeuralModel(
  (input): Linear(in_features=111418, out_features=250, bias=True)
  (hidden): Linear(in_features=250, out_features=250, bias=True)
  (output): Linear(in_features=250, out_features=111418, bias=True)
)

# Cross-entropy over the vocabulary; the model returns raw scores (no softmax
# in forward), which is the input this loss expects.
criterion = nn.CrossEntropyLoss().to(device)
# RMSprop over all model parameters with a learning rate of 0.001.
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)

import tqdm

# tqdm.tqdm_notebook is deprecated (see the warning it emits); the supported
# notebook progress bar lives in tqdm.notebook.
from tqdm.notebook import tqdm as notebook_tqdm

# Loss accumulated since the last report.
running_loss = 0.0

for i in notebook_tqdm(range(10000)):
    # Draw a fresh random batch and move it to the training device.
    X, y = get_batch(train_texts)
    X, y = torch.from_numpy(X).to(device), torch.from_numpy(y).to(device)

    optimizer.zero_grad()

    outputs = model(X)
    loss = criterion(outputs, y)

    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    # Every 500 iterations: checkpoint the weights and report the mean loss.
    # NOTE(review): the checkpoint goes to "model.pth" in the working
    # directory, while the weights were loaded from "drive/MyDrive/model.pth"
    # - confirm this is intended.
    if i % 500 == 499:
        torch.save(model.state_dict(), "model.pth")
        print('[%d, %5d] loss: %.3f' %
              (1, i + 1, running_loss / 500))
        running_loss = 0.0

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

[1,   500] loss: 11.095
[1,  1000] loss: 11.138
[1,  1500] loss: 11.202
[1,  2000] loss: 11.237
[1,  2500] loss: 11.209
[1,  3000] loss: 11.261
[1,  3500] loss: 11.302
[1,  4000] loss: 11.303
[1,  4500] loss: 11.283
[1,  5000] loss: 11.305
[1,  5500] loss: 11.321
[1,  6000] loss: 11.348
[1,  6500] loss: 11.335
[1,  7000] loss: 11.272
[1,  7500] loss: 11.347
[1,  8000] loss: 11.320
[1,  8500] loss: 11.301
[1,  9000] loss: 11.307
[1,  9500] loss: 11.310
[1, 10000] loss: 11.274

# Put the module into evaluation mode before generating predictions.
model.eval()

LanguageNeuralModel(
  (input): Linear(in_features=111418, out_features=250, bias=True)
  (hidden): Linear(in_features=250, out_features=250, bias=True)
  (output): Linear(in_features=250, out_features=111418, bias=True)
)

# Directories to predict for: the evaluation loop reads "in.tsv" from each
# and writes "out.tsv" into it. Every entry must end with "/" because the
# file names are appended by plain string concatenation.
sets_to_eval = ["drive/MyDrive/dev0/", "drive/MyDrive/dev1/", "drive/MyDrive/test/"]

def load_test_data(test_path, corpus):
    """Read an evaluation file and return encoded (left, right) contexts.

    Each line is split on tabs; the third and fourth fields are taken as the
    left and right context. Both are tokenised with ``clean_text`` and then
    encoded with ``remove_words_outside_corpus_and_encode``.

    Args:
        test_path: path of the tab-separated input file.
        corpus: vocabulary object used to filter and encode the tokens.

    Returns:
        A list with one ``(left, right)`` tuple of encoded token lists per
        line of the file.

    Raises:
        ValueError: if a line has fewer than four tab-separated fields.
    """
    texts = []
    # Read-only mode: the file is never written, and "r+" fails when the
    # caller lacks write permission on the data.
    with open(test_path, "r") as file:
        for line in file:
            _, _, left, right, *_ = line.split("\t")
            texts.append(
                (
                    remove_words_outside_corpus_and_encode(clean_text(left), corpus),
                    remove_words_outside_corpus_and_encode(clean_text(right), corpus)
                )
            )
    # Report the file actually read; the message used to say "train_set" for
    # every evaluation set.
    print(f"Loaded {len(texts)} texts from {test_path}.")
    return texts

# tqdm.tqdm_notebook is deprecated (see the warning it emits); the supported
# notebook progress bar lives in tqdm.notebook.
from tqdm.notebook import tqdm as notebook_tqdm

# Number of most probable words written for every input line.
_TOP_K = 500

# Position i holds the word whose vocabulary index is i (dict insertion order).
words = list(corpus.corpus)


def _write_predictions(contexts, out_file):
    """Score a list of context count vectors and write one line per context.

    Each line lists the top words as ``word:log_prob`` separated by spaces and
    ends with ``:log_prob`` for the probability mass not covered by them.
    """
    features = (np.array(contexts) / (2 * CONTEXT_SIZE)).astype(np.float32)
    scores = model(torch.from_numpy(features).to(device))
    # dim=1 normalises over the vocabulary for each row; relying on the
    # implicit dimension is deprecated.
    probs = F.softmax(scores, dim=1)

    # topk returns the k largest entries per row in descending order, which
    # avoids sorting the whole vocabulary in Python for every row.
    k = min(_TOP_K, probs.shape[1])
    top_probs, top_indexes = torch.topk(probs, k, dim=1)

    for row_probs, row_indexes in zip(top_probs.tolist(), top_indexes.tolist()):
        remaining = 1.
        parts = []
        for prob, idx in zip(row_probs, row_indexes):
            remaining -= prob
            parts.append(f"{words[idx]}:{np.log(prob)} ")
        # Rounding can push the left-over mass to zero or below; clamp it so
        # the logarithm stays finite.
        remaining = max(remaining, float(np.finfo(np.float32).tiny))
        parts.append(f":{np.log(remaining)}\n")
        out_file.write("".join(parts))


with torch.no_grad():
    for path in sets_to_eval:
        data = load_test_data(path + "in.tsv", corpus)
        # Open the output once per set and truncate it: appending would
        # duplicate lines when the cell is run again.
        with open(path + "out.tsv", "w") as out_file:
            batch = []
            for left, right in notebook_tqdm(data):
                # Last CONTEXT_SIZE tokens before the gap, first CONTEXT_SIZE after it.
                context = left[-CONTEXT_SIZE:] + right[:CONTEXT_SIZE]
                batch.append(corpus.get_bow(context, encode=False))
                if len(batch) == BATCH_SIZE:
                    _write_predictions(batch, out_file)
                    batch = []
            # Flush the final partial batch; without this the last
            # len(data) % BATCH_SIZE inputs get no output line.
            if batch:
                _write_predictions(batch, out_file)

Loaded 19986 texts from train_set.

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:8: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`

HBox(children=(FloatProgress(value=0.0, max=19986.0), HTML(value='')))

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:16: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  app.launch_new_instance()

Loaded 11628 texts from train_set.

HBox(children=(FloatProgress(value=0.0, max=11628.0), HTML(value='')))

Loaded 14132 texts from train_set.

HBox(children=(FloatProgress(value=0.0, max=14132.0), HTML(value='')))

59 KiB Raw Blame History

59 KiB

Raw Blame History