nn bigram

Bartosz Karwacki 2022-05-08 17:23:26 +02:00
parent 20cb6b9e97
commit 3349d6ee6b
4 changed files with 18059 additions and 18043 deletions

File diff suppressed because it is too large

BIN nn_model.bin (new file)

Binary file not shown.

run2.py (236 lines changed)

@@ -1,76 +1,53 @@
+import csv
 import itertools
 import lzma
+from os.path import exists
+import pandas as pd
 import regex as re
 import torch
-from nltk.tokenize import RegexpTokenizer
 from torch import nn
-from torch.utils.data import DataLoader, IterableDataset
+from torch.utils.data import DataLoader
 from torchtext.vocab import build_vocab_from_iterator
-VOCAB_SIZE = 40000
-EMBED_SIZE = 100
-DEVICE = "cuda"
-tokenizer = RegexpTokenizer(r"\w+")
+IN_INPUT_PATH = "train/in.tsv.xz"
+IN_OUTPUT_PATH = "train/expected.tsv"
+VOCAB_SIZE = 30000
+EMBED_SIZE = 150
+BATCH_SIZE = 8000
+DEV_PATH = "dev-0/"
+TEST_PATH = "test-A/"
+DEVICE = "cpu"
-def read_file(file):
-    for line in file:
-        text = line.split("\t")
-        yield re.sub(
-            r"[^\w\d'\s]+",
-            "",
-            re.sub(" +", " ", text[6].replace("\\n", " ").replace("\n", "").lower()),
-        )
+def clean(text):
+    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
+    return re.sub(r"\p{P}", "", text)
-def get_words(line):
+def get_words_from_line(line, specials=True):
     line = line.rstrip()
-    yield "<s>"
+    if specials:
+        yield "<s>"
     for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
         yield m.group(0).lower()
-    yield "</s>"
+    if specials:
+        yield "</s>"
-def get_line(file_path):
-    with lzma.open(file_path, mode="rt") as file:
-        for _, line in enumerate(file):
-            text = line.split("\t")
-            yield get_words(
-                re.sub(
-                    r"[^\w\d'\s]+",
-                    "",
-                    re.sub(
-                        " +",
-                        " ",
-                        " ".join([text[6], text[7]])
-                        .replace("\\n", " ")
-                        .replace("\n", "")
-                        .lower(),
-                    ),
-                )
-            )
-def buidl_vocab():
-    vocab = build_vocab_from_iterator(
-        get_line("train/in.tsv.xz"), max_tokens=VOCAB_SIZE, specials=["<unk>"]
-    )
-    vocab.set_default_index(vocab["<unk>"])
-    return vocab
+def get_word_lines_from_data(d):
+    for line in d:
+        yield get_words_from_line(line)
 def look_ahead_iterator(gen):
-    prev = None
+    w1 = None
     for item in gen:
-        if prev is not None:
-            yield (prev, item)
-        prev = item
+        if w1 is not None:
+            yield (w1, item)
+        w1 = item
-class SimpleBigramNeuralLanguageModel(nn.Module):
+class SimpleBigramNeuralLanguageModel(torch.nn.Module):
     def __init__(self, vocabulary_size, embedding_size):
         super(SimpleBigramNeuralLanguageModel, self).__init__()
         self.model = nn.Sequential(
@@ -83,89 +60,128 @@ class SimpleBigramNeuralLanguageModel(nn.Module):
         return self.model(x)
-class Bigrams(IterableDataset):
-    def __init__(self, text_file, vocabulary_size):
+class Bigrams(torch.utils.data.IterableDataset):
+    def __init__(self, data, vocabulary_size):
         self.vocab = build_vocab_from_iterator(
-            get_line(text_file), max_tokens=vocabulary_size, specials=["<unk>"]
+            get_word_lines_from_data(data),
+            max_tokens=vocabulary_size,
+            specials=["<unk>"],
         )
         self.vocab.set_default_index(self.vocab["<unk>"])
         self.vocabulary_size = vocabulary_size
-        self.text_file = text_file
+        self.data = data
     def __iter__(self):
         return look_ahead_iterator(
             (
                 self.vocab[t]
-                for t in itertools.chain.from_iterable(get_line(self.text_file))
+                for t in itertools.chain.from_iterable(
+                    get_word_lines_from_data(self.data)
+                )
             )
         )
-vocab = buidl_vocab()
+def get_dataset():
+    X_train = pd.read_csv(
+        IN_INPUT_PATH,
+        sep="\t",
+        header=None,
+        quoting=csv.QUOTE_NONE,
+        nrows=200000,
+        on_bad_lines="skip",
+        encoding="UTF-8",
+    )
+    Y_train = pd.read_csv(
+        IN_OUTPUT_PATH,
+        sep="\t",
+        header=None,
+        quoting=csv.QUOTE_NONE,
+        nrows=200000,
+        on_bad_lines="skip",
+        encoding="UTF-8",
+    )
+    X_train = X_train[[6, 7]]
+    X_train = pd.concat([X_train, Y_train], axis=1)
+    X_train = X_train[6] + X_train[0] + X_train[7]
+    X_train = X_train.apply(clean)
+    return Bigrams(X_train, VOCAB_SIZE)
-def train():
-    batch_size = 10000
-    train_dataset = Bigrams("train/in.tsv.xz", VOCAB_SIZE)
-    device = "cuda"
-    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(device)
-    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
-    optimizer = torch.optim.Adam(model.parameters())
-    criterion = torch.nn.NLLLoss()
-    model.train()
-    step = 0
-    for x, y in train_data_loader:
-        x = x.to(device)
-        y = y.to(device)
-        optimizer.zero_grad()
-        ypredicted = model(x)
-        loss = criterion(torch.log(ypredicted), y)
-        if step % 100 == 0:
-            print(step, loss)
-        step += 1
-        loss.backward()
-        optimizer.step()
-    torch.save(model.state_dict(), "model1.bin")
+dataset = get_dataset()
-def predict(word, model):
-    ixs = torch.tensor(vocab.forward([word])).to(DEVICE)
+def get_model():
+    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
+    if not exists("nn_model.bin"):
+        data = DataLoader(dataset, batch_size=BATCH_SIZE)
+        optimizer = torch.optim.Adam(model.parameters())
+        criterion = torch.nn.NLLLoss()
+        model.train()
+        step = 0
+        for i in range(2):
+            for x, y in data:
+                x = x.to(DEVICE)
+                y = y.to(DEVICE)
+                optimizer.zero_grad()
+                y_predicted = model(x)
+                loss = criterion(torch.log(y_predicted), y)
+                if step % 100 == 0:
+                    print(step, loss)
+                step += 1
+                loss.backward()
+                optimizer.step()
+        torch.save(model.state_dict(), "nn_model.bin")
+    else:
+        model.load_state_dict(torch.load("nn_model.bin"))
+    return model
+vocab = dataset.vocab
+model = get_model()
+def predict(ws):
+    ixs = torch.tensor(vocab.forward(ws)).to(DEVICE)
     out = model(ixs)
     top = torch.topk(out[0], 8)
     top_indices = top.indices.tolist()
     top_probs = top.values.tolist()
     top_words = vocab.lookup_tokens(top_indices)
-    str_predictions = ""
-    lht = 1.0
-    for pred_word in list(zip(top_words, top_indices, top_probs)):
-        if lht - pred_word[2] >= 0:
-            str_predictions += f"{pred_word[0]}:{pred_word[2]} "
-            lht -= pred_word[2]
-    if lht != 1.0:
-        str_predictions += f":{lht}"
-    return str_predictions
+    pred_str = ""
+    for word, prob in list(zip(top_words, top_probs)):
+        pred_str += f"{word}:{prob} "
+    return pred_str
-def generate_predictions(input_file, output_file, model):
-    with open(output_file, "w") as outputf:
-        with lzma.open(input_file, mode="rt") as file:
-            for _, text in enumerate(read_file(file)):
-                tokens = tokenizer.tokenize(text)
-                if len(tokens) < 4:
-                    prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
-                else:
-                    prediction = predict(tokens[-1], model)
-                outputf.write(prediction + "\n")
+def predict_input(file):
+    X_test = pd.read_csv(
+        f"{file}/in.tsv.xz",
+        sep="\t",
+        header=None,
+        quoting=csv.QUOTE_NONE,
+        on_bad_lines="skip",
+        encoding="UTF-8",
+    )[6]
+    X_test = X_test.apply(clean)
+    with open(f"{file}/out.tsv", "w+", encoding="UTF-8") as f:
+        for row in X_test:
+            before = None
+            for before in get_words_from_line(clean(str(row)), False):
+                pass
+            before = [before]
+            if len(before) < 1:
+                pred_str = "a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 of:0.1 :0.1"
+            else:
+                pred_str = predict(before)
+            pred_str = pred_str.strip()
+            f.write(pred_str + "\n")
 if __name__ == "__main__":
-    train()
-    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
-    model.load_state_dict(torch.load("model1.bin"))
-    model.eval()
-    generate_predictions("dev-0/in.tsv.xz", "dev-0/out.tsv", model)
-    generate_predictions("test-A/in.tsv.xz", "test-A/out.tsv", model)
+    predict_input(DEV_PATH)
+    predict_input(TEST_PATH)
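
The layers inside nn.Sequential are unchanged by this commit and fall between the two hunks, so they do not appear in the diff. Below is a minimal, self-contained sketch of the bigram language model that run2.py trains; the Embedding -> Linear -> Softmax stack, the toy vocabulary and embedding sizes, and the toy index pairs are assumptions for illustration, not code from this repository. Only the torch.log + NLLLoss training pattern and the torch.topk-based prediction mirror the diff.

# Illustrative sketch only (not part of the commit); toy sizes and data are assumed.
import torch
from torch import nn

VOCAB = 10  # toy vocabulary size; run2.py uses VOCAB_SIZE = 30000
EMBED = 4   # toy embedding size; run2.py uses EMBED_SIZE = 150

# Presumed bigram architecture: previous-word index -> embedding -> linear ->
# softmax over the vocabulary, i.e. P(next word | previous word).
model = nn.Sequential(
    nn.Embedding(VOCAB, EMBED),
    nn.Linear(EMBED, VOCAB),
    nn.Softmax(dim=1),
)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.NLLLoss()

# Toy (previous word, next word) index pairs, standing in for the output of
# look_ahead_iterator over the vocabulary-encoded training stream.
x = torch.tensor([1, 2, 3, 4])
y = torch.tensor([2, 3, 4, 5])

for _ in range(100):
    optimizer.zero_grad()
    probs = model(x)                       # probabilities, one row per pair
    loss = criterion(torch.log(probs), y)  # same log + NLLLoss pattern as run2.py
    loss.backward()
    optimizer.step()

# Top-3 next-word candidates after word index 1, mirroring predict()'s torch.topk call.
top = torch.topk(model(torch.tensor([1]))[0], 3)
print(top.indices.tolist(), top.values.tolist())

Note that Softmax followed by torch.log and NLLLoss is mathematically equivalent to LogSoftmax with NLLLoss (or raw logits with CrossEntropyLoss), just less numerically stable.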

File diff suppressed because it is too large