Compare commits


5 Commits
master...gpt

Author SHA1 Message Date
Jakub Kaczmarek 40c36dce44 Final out fix 2023-06-09 02:33:57 +02:00
Jakub Kaczmarek 5bc8f3f6f7 Fix output 2023-06-09 02:26:52 +02:00
Jakub Kaczmarek c8247f077f Fix output 2023-06-09 02:24:43 +02:00
Jakub Kaczmarek ab56101d2c Remove results for dev-0 2023-06-09 02:07:56 +02:00
Jakub Kaczmarek f8892f9209 Add gpt2 solution 2023-06-09 01:55:15 +02:00
13 changed files with 7463 additions and 46535 deletions

.gitignore (vendored, 4 changes)

@@ -1,4 +1,4 @@
geval
*~
*.swp
*.bak
@@ -6,5 +6,3 @@ geval
*.o
.DS_Store
.token
*.pickle
*.xz

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

Configuration file (file name not shown)

@@ -5,9 +5,6 @@ tags:
 params:
   epochs: 1
   vocab-size: 20000
-  batch-size: 10000
-  embed-size:
-  - 100
-  - 500
-  - 1000
-  topk: 10
+  batch-size: 5000
+  embed-size: 100
+  topk: 150

gpt_predict.py (new file, 18 lines)

@@ -0,0 +1,18 @@
from transformers import pipeline
import lzma

generator = pipeline("text-generation", model="gpt2")

with open("test-A/in.tsv", "r") as input_file, open(
    "test-A/out.tsv", "w"
) as output_file:
    for line in input_file:
        line = line.rstrip()
        line = line.replace("\\n", " ")
        prompt = line.split("\t")[6]
        result = generator(prompt, max_new_tokens=1, num_return_sequences=1)[0][
            "generated_text"
        ]
        output_file.write(f"{result.split()[-1]}:1\n")
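
A note on the script above: the text-generation pipeline returns the prompt together with the newly generated text, so result.split()[-1] picks out the last whitespace-delimited token of that combined string. A minimal sketch of the same idea that asks the pipeline for the continuation only, via return_full_text=False; the gpt2 checkpoint, the one-token limit, and the word:1 output format come from the script above, while the helper name and the fallback word are illustrative assumptions:

from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

def predict_next_word(prompt: str) -> str:
    # return_full_text=False makes the pipeline return only the newly generated
    # tokens, so there is no need to strip the prompt off the front of the result.
    out = generator(prompt, max_new_tokens=1, return_full_text=False)[0]["generated_text"]
    words = out.split()
    return words[-1] if words else "the"  # assumed fallback when the new token is pure whitespace

print(f"{predict_next_word('I went to the')}:1")  # same word:1 line format as gpt_predict.py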

run.py (168 changes)

@@ -1,3 +1,5 @@
from itertools import islice
import sys
import lzma
import regex as re
from torchtext.vocab import build_vocab_from_iterator
@@ -14,12 +16,10 @@ from tqdm import tqdm
def get_words_from_line(line):
    line = line.rstrip()
    line = line.split("\t")
    text = line[-2] + " " + line[-1]
    text = re.sub(r"\\\\+n", " ", text)
    text = re.sub('[^A-Za-z ]+', '', text)
    for t in text.split():
        yield t
    yield "<s>"
    for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
        yield m.group(0).lower()
    yield "</s>"
def get_word_lines_from_file(file_name):
@@ -64,28 +64,25 @@ class TrigramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TrigramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden = nn.Linear(embedding_dim * 2, hidden_dim)
        self.output = nn.Linear(hidden_dim, vocab_size)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.Softmax()
    def forward(self, x, y):
        x = self.embeddings(x)
        y = self.embeddings(y)
        z = self.hidden(torch.cat([x, y], dim=1))
        z = self.output(z)
        z = self.linear1(x + y)
        z = self.linear2(z)
        z = self.softmax(z)
        return z
embed_size = 500
vocab_size = 20000
vocab_path = "vocabulary.pickle"
if exists(vocab_path):
    print("Loading vocabulary from file...")
    with open(vocab_path, "rb") as fh:
        vocab = pickle.load(fh)
else:
    print("Building vocabulary...")
    vocab = build_vocab_from_iterator(
        get_word_lines_from_file("train/in.tsv.xz"),
        max_tokens=vocab_size,
@@ -95,139 +92,28 @@ else:
    with open(vocab_path, "wb") as fh:
        pickle.dump(vocab, fh)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
dataset_path = 'train/dataset.pickle'
if exists(dataset_path):
print("Loading dataset from file...")
with open(dataset_path, "rb") as fh:
train_dataset = pickle.load(fh)
else:
print("Building dataset...")
train_dataset = Trigrams("train/in.tsv.xz", vocab_size)
with open(dataset_path, "wb") as fh:
pickle.dump(train_dataset, fh)
print("Building model...")
model = TrigramModel(vocab_size, embed_size, 64).to(device)
data = DataLoader(train_dataset, batch_size=10000)
device = "cpu"
train_dataset = Trigrams("train/in.tsv.xz", vocab_size)
model = TrigramModel(vocab_size, 100, 64).to(device)
data = DataLoader(train_dataset, batch_size=5000)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()
print("Training model...")
model.train()
losses = []
step = 0
max_steps = 1000
for epoch in tqdm(range(10)):
    for x, y, z in tqdm(data):
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
for x, y, z in tqdm(data):
    x = x.to(device)
    y = y.to(device)
    z = z.to(device)
        optimizer.zero_grad()
        ypredicted = model(x, z)
        loss = criterion(torch.log(ypredicted), y)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        step += 1
        if step > max_steps:
            break
    optimizer.zero_grad()
    ypredicted = model(x, z)
    loss = criterion(torch.log(ypredicted), y)
    losses.append(loss)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch} loss:", loss.item())
plt.plot(losses)
plt.show()
torch.save(model.state_dict(), f"trigram_model-embed_{embed_size}.bin")
vocab_unique = set(train_dataset.vocab.get_stoi().keys())
output = []
print('Predicting dev...')
with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
for line in tqdm(file):
line = line.split("\t")
first_word = re.sub(r"\\\\+n", " ", line[-2]).split()[-1]
first_word = re.sub('[^A-Za-z]+', '', first_word)
next_word = re.sub(r"\\\\+n", " ", line[-1]).split()[0]
nenxt_word = re.sub('[^A-Za-z]+', '', next_word)
if first_word not in vocab_unique:
word = "<unk>"
if next_word not in vocab_unique:
word = "<unk>"
first_word = torch.tensor(train_dataset.vocab.forward([first_word])).to(device)
next_word = torch.tensor(train_dataset.vocab.forward([next_word])).to(device)
out = model(first_word, next_word)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
unk_bonus = 1 - sum(top_probs)
top_words = vocab.lookup_tokens(top_indices)
top_zipped = list(zip(top_words, top_probs))
res = ""
for w, p in top_zipped:
if w == "<unk>":
res += f":{(p + unk_bonus):.4f} "
else:
res += f"{w}:{p:.4f} "
res = res[:-1]
res += "\n"
output.append(res)
with open(f"dev-0/out-embed-{embed_size}.tsv", mode="w") as file:
file.writelines(output)
model.eval()
output = []
print('Predicting test...')
with lzma.open("test-A/in.tsv.xz", encoding='utf8', mode="rt") as file:
for line in tqdm(file):
line = line.split("\t")
first_word = re.sub(r"\\\\+n", " ", line[-2]).split()[-1]
first_word = re.sub('[^A-Za-z]+', '', first_word)
next_word = re.sub(r"\\\\+n", " ", line[-1]).split()[0]
next_word = re.sub('[^A-Za-z]+', '', next_word)
if first_word not in vocab_unique:
word = "<unk>"
if next_word not in vocab_unique:
word = "<unk>"
first_word = torch.tensor(train_dataset.vocab.forward([first_word])).to(device)
next_word = torch.tensor(train_dataset.vocab.forward([next_word])).to(device)
out = model(first_word, next_word)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
unk_bonus = 1 - sum(top_probs)
top_words = vocab.lookup_tokens(top_indices)
top_zipped = list(zip(top_words, top_probs))
res = ""
for w, p in top_zipped:
if w == "<unk>":
res += f":{(p + unk_bonus):.4f} "
else:
res += f"{w}:{p:.4f} "
res = res[:-1]
res += "\n"
output.append(res)
with open(f"test-A/out-embed-{embed_size}.tsv", mode="w") as file:
file.writelines(output)
torch.save(model.state_dict(), "model1.bin")

File diff suppressed because it is too large

File diff suppressed because it is too large

test-A/out.tsv (new file, 7414 lines)

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.

Binary file not shown.