434624

2023-05-10 02:52:02 +02:00 · 2023-05-10 02:52:02 +02:00 · 14d3dc0e04
commit 14d3dc0e04
parent a7ec11ca27
10 changed files with 36016 additions and 31 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,4 @@
-
+geval
 *~
 *.swp
 *.bak
@ -6,3 +6,5 @@
 *.o
 .DS_Store
 .token
+*.pickle
+*.xz
--- a/dev-0/out-embed-100.tsv
+++ b/dev-0/out-embed-100.tsv
--- a/dev-0/out-embed-500.tsv
+++ b/dev-0/out-embed-500.tsv
--- a/gonito.yaml
+++ b/gonito.yaml
@ -5,6 +5,9 @@ tags:
 params:
  epochs: 1
  vocab-size: 20000
-  batch-size: 5000
-  embed-size: 100
-  topk: 150
+  batch-size: 10000
+  embed-size:
+    - 100
+    - 500
+    - 1000
+  topk: 10
--- a/run.py
+++ b/run.py
@ -1,5 +1,3 @@
-from itertools import islice
-import sys
 import lzma
 import regex as re
 from torchtext.vocab import build_vocab_from_iterator
@ -16,10 +14,12 @@ from tqdm import tqdm

 def get_words_from_line(line):
    line = line.rstrip()
-    yield "<s>"
-    for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
-        yield m.group(0).lower()
-    yield "</s>"
+    line = line.split("\t")  # split the row into tab-separated fields
+    text = line[-2] + " " + line[-1]  # join the last two fields with a single space
+    text = re.sub(r"\\\\+n", " ", text)  # two or more backslashes followed by "n" become a space
+    text = re.sub('[^A-Za-z ]+', '', text)  # drop everything except ASCII letters and spaces
+    for t in text.split():  # yield the remaining whitespace-separated tokens one by one
+        yield t


 def get_word_lines_from_file(file_name):
@ -64,25 +64,28 @@ class TrigramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TrigramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
-        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
-        self.linear2 = nn.Linear(hidden_dim, vocab_size)
+        self.hidden = nn.Linear(embedding_dim * 2, hidden_dim)
+        self.output = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.Softmax()

    def forward(self, x, y):
        x = self.embeddings(x)
        y = self.embeddings(y)
-        z = self.linear1(x + y)
-        z = self.linear2(z)
+        z = self.hidden(torch.cat([x, y], dim=1))
+        z = self.output(z)
        z = self.softmax(z)
        return z


+embed_size = 500
 vocab_size = 20000
 vocab_path = "vocabulary.pickle"
 if exists(vocab_path):
+    print("Loading vocabulary from file...")
    with open(vocab_path, "rb") as fh:
        vocab = pickle.load(fh)
 else:
+    print("Building vocabulary...")
    vocab = build_vocab_from_iterator(
        get_word_lines_from_file("train/in.tsv.xz"),
        max_tokens=vocab_size,
@ -92,28 +95,139 @@ else:
    with open(vocab_path, "wb") as fh:
        pickle.dump(vocab, fh)

-device = "cpu"
-train_dataset = Trigrams("train/in.tsv.xz", vocab_size)
-model = TrigramModel(vocab_size, 100, 64).to(device)
-data = DataLoader(train_dataset, batch_size=5000)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+print("Using device:", device)
+dataset_path = 'train/dataset.pickle'
+if exists(dataset_path):
+    print("Loading dataset from file...")
+    with open(dataset_path, "rb") as fh:
+        train_dataset = pickle.load(fh)
+else:
+    print("Building dataset...")
+    train_dataset = Trigrams("train/in.tsv.xz", vocab_size)
+    with open(dataset_path, "wb") as fh:
+        pickle.dump(train_dataset, fh)
+
+print("Building model...")
+model = TrigramModel(vocab_size, embed_size, 64).to(device)
+data = DataLoader(train_dataset, batch_size=10000)
 optimizer = torch.optim.Adam(model.parameters())
 criterion = torch.nn.NLLLoss()

+print("Training model...")
 model.train()
 losses = []
-for epoch in tqdm(range(10)):
-    for x, y, z in tqdm(data):
-        x = x.to(device)
-        y = y.to(device)
-        z = z.to(device)
+step = 0
+max_steps = 1000

-        optimizer.zero_grad()
-        ypredicted = model(x, z)
-        loss = criterion(torch.log(ypredicted), y)
-        losses.append(loss)
-        loss.backward()
-        optimizer.step()
-    print(f"Epoch {epoch} loss:", loss.item())
+for x, y, z in tqdm(data):  # x and z are fed to the model, y is the target passed to the loss
+    x = x.to(device)
+    y = y.to(device)
+    z = z.to(device)
+    # One optimisation step on the current batch.
+    optimizer.zero_grad()
+    ypredicted = model(x, z)
+    loss = criterion(torch.log(ypredicted), y)  # model output is already softmaxed, NLLLoss needs logs
+    losses.append(loss.item())  # keep a plain float so the autograd graph is not retained
+    loss.backward()
+    optimizer.step()
+    step += 1
+    if step >= max_steps:  # stop after exactly max_steps updates (">" allowed one extra step)
+        break

 plt.plot(losses)
-torch.save(model.state_dict(), "model1.bin")
+plt.show()
+
+torch.save(model.state_dict(), f"trigram_model-embed_{embed_size}.bin")
+
+vocab_unique = set(train_dataset.vocab.get_stoi())  # known tokens, used for the OOV checks below
+model.eval()  # switch to evaluation mode before predicting, as is done before the test-A pass
+output = []
+print('Predicting dev...')
+with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
+    for line in tqdm(file):
+        line = line.split("\t")
+        # Last word of the left context; an empty field yields "" and is mapped to "<unk>" below.
+        first_word = (re.sub(r"\\\\+n", " ", line[-2]).split() or [""])[-1]
+        first_word = re.sub('[^A-Za-z]+', '', first_word)
+        # First word of the right context, cleaned the same way as first_word.
+        next_word = (re.sub(r"\\\\+n", " ", line[-1]).split() or [""])[0]
+        next_word = re.sub('[^A-Za-z]+', '', next_word)
+        # Replace out-of-vocabulary words (assumes "<unk>" is a vocabulary entry -- TODO confirm).
+        if first_word not in vocab_unique:
+            first_word = "<unk>"
+        if next_word not in vocab_unique:
+            next_word = "<unk>"
+
+        first_word = torch.tensor(train_dataset.vocab.forward([first_word])).to(device)
+        next_word = torch.tensor(train_dataset.vocab.forward([next_word])).to(device)
+
+        out = model(first_word, next_word)
+
+        top = torch.topk(out[0], 10)
+        top_indices = top.indices.tolist()
+        top_probs = top.values.tolist()
+        unk_bonus = 1 - sum(top_probs)  # probability mass not covered by the top 10
+        top_words = vocab.lookup_tokens(top_indices)
+        top_zipped = list(zip(top_words, top_probs))
+        # NOTE(review): unk_bonus is only written when "<unk>" is itself among the top 10.
+        res = ""
+        for w, p in top_zipped:
+            if w == "<unk>":
+                res += f":{(p + unk_bonus):.4f} "
+            else:
+                res += f"{w}:{p:.4f} "
+        # Drop the trailing space and terminate the output row.
+        res = res[:-1]
+        res += "\n"
+        output.append(res)
+
+with open(f"dev-0/out-embed-{embed_size}.tsv", mode="w") as file:
+    file.writelines(output)
+
+
+model.eval()  # evaluation mode for prediction
+
+output = []
+print('Predicting test...')
+with lzma.open("test-A/in.tsv.xz", encoding='utf8', mode="rt") as file:
+    for line in tqdm(file):
+        line = line.split("\t")
+        # Last word of the left context; an empty field yields "" and is mapped to "<unk>" below.
+        first_word = (re.sub(r"\\\\+n", " ", line[-2]).split() or [""])[-1]
+        first_word = re.sub('[^A-Za-z]+', '', first_word)
+        # First word of the right context, cleaned the same way as first_word.
+        next_word = (re.sub(r"\\\\+n", " ", line[-1]).split() or [""])[0]
+        next_word = re.sub('[^A-Za-z]+', '', next_word)
+        # Replace out-of-vocabulary words (assumes "<unk>" is a vocabulary entry -- TODO confirm).
+        if first_word not in vocab_unique:
+            first_word = "<unk>"
+        if next_word not in vocab_unique:
+            next_word = "<unk>"
+
+        first_word = torch.tensor(train_dataset.vocab.forward([first_word])).to(device)
+        next_word = torch.tensor(train_dataset.vocab.forward([next_word])).to(device)
+
+        out = model(first_word, next_word)
+
+        top = torch.topk(out[0], 10)
+        top_indices = top.indices.tolist()
+        top_probs = top.values.tolist()
+        unk_bonus = 1 - sum(top_probs)  # probability mass not covered by the top 10
+        top_words = vocab.lookup_tokens(top_indices)
+        top_zipped = list(zip(top_words, top_probs))
+        # NOTE(review): unk_bonus is only written when "<unk>" is itself among the top 10.
+        res = ""
+        for w, p in top_zipped:
+            if w == "<unk>":
+                res += f":{(p + unk_bonus):.4f} "
+            else:
+                res += f"{w}:{p:.4f} "
+        # Drop the trailing space and terminate the output row.
+        res = res[:-1]
+        res += "\n"
+        output.append(res)
+
+with open(f"test-A/out-embed-{embed_size}.tsv", mode="w") as file:
+    file.writelines(output)
--- a/test-A/out-embed-100.tsv
+++ b/test-A/out-embed-100.tsv
--- a/test-A/out-embed-500.tsv
+++ b/test-A/out-embed-500.tsv
--- a/trigram_model-50_steps-embed_100.bin
+++ b/trigram_model-50_steps-embed_100.bin
--- a/trigram_model-embed_100.bin
+++ b/trigram_model-embed_100.bin
--- a/trigram_model-embed_500.bin
+++ b/trigram_model-embed_500.bin