07_neural

Aleksandra 2024-05-15 09:27:53 +02:00
parent 1eb60bd963
commit 362a192af8
3 changed files with 603 additions and 174 deletions


@@ -13,7 +13,6 @@ Perplexity hashed by
-----------------
1. Statistical language model (task 5)
- branch: master - Perplexity hashed on `dev-0`: 555.75
- branch: 05_ngram - Perplexity hashed on `dev-0`: xxx
<br><br>
2. Neural language model (task 7)
- branch: 07_neural - Perplexity hashed on `dev-0`: xxx
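For context, a minimal sketch of how a PerplexityHashed-style score relates to the probabilities a model assigns to the gold words, assuming perplexity is the exponential of the average negative log-probability (the `perplexity` helper below is illustrative, not part of the repository):

import math

def perplexity(word_probs, eps=1e-9):
    # word_probs: probability the model assigned to each gold word.
    # Perplexity = exp of the mean negative log-probability; eps guards against log(0).
    return math.exp(-sum(math.log(max(p, eps)) for p in word_probs) / len(word_probs))

# perplexity([0.1, 0.1, 0.1]) -> 10.0 (up to float rounding)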

File diff suppressed because one or more lines are too long

run.py

@@ -1,11 +1,10 @@
import pandas as pd
import numpy as np
import csv
import os
import re
import random
from collections import Counter, defaultdict
import nltk
import torch
from bidict import bidict
import math
from tqdm import tqdm
@@ -13,60 +12,17 @@ directory = "train/in.tsv.xz"
directory_dev_0 = "dev-0/in.tsv.xz"
directory_test_A = "test-A/in.tsv.xz"

class Model():
    def __init__(self, vocab_size=30_000, UNK_token='<UNK>', n=3):
        if n <= 1 or n % 2 == 0:
            raise ValueError("n must be an odd number greater than 1")
        self.n = n
        self.vocab_size = vocab_size
        self.UNK_token = UNK_token

    def train(self, corpus: list) -> None:
        if self.n > 1:
            self.n_grams = list(nltk.ngrams(corpus, n=self.n))
        else:
            self.n_grams = corpus
        self.counter = Counter(self.n_grams)
        self.words_counter = Counter(corpus)
        self.all_quantities = Counter([gram[:math.floor(self.n/2)] + gram[math.ceil(self.n/2):] for gram in self.n_grams])
        self.all_grams = defaultdict(set)
        for gram in tqdm(self.n_grams):
            previous_words = tuple(gram[:math.floor(self.n/2)])
            next_words = tuple(gram[math.ceil(self.n/2):])
            word = gram[math.floor(self.n/2)]
            self.all_grams[(previous_words, next_words)].add(word)

    def get_conditional_prob_for_word(self, left_text: list, right_text: list, word: str) -> float:
        previous_words = tuple(left_text[-math.floor(self.n/2):])
        next_words = tuple(right_text[:math.floor(self.n/2)])
        quantity = self.counter[previous_words + tuple([word]) + next_words]
        all_quantity = self.all_quantities[previous_words + next_words]
        if all_quantity <= 0:
            return 0
        return quantity / all_quantity

    def get_prob_for_text(self, text: list) -> float:
        prob = 1
        for gram in list(nltk.ngrams(text, self.n)):
            prob *= self.get_conditional_prob_for_word(gram[:math.floor(self.n/2)], gram[math.ceil(self.n/2):], gram[math.floor(self.n/2)])
        return prob

    def most_probable_words(self, left_text: list, right_text: list) -> str:
        previous_words = tuple(left_text[-math.floor(self.n/2):])
        next_words = tuple(right_text[:math.floor(self.n/2)])
        all_words = self.all_grams[(previous_words, next_words)]
        best_words = []
        for word in all_words:
            probability = self.get_conditional_prob_for_word(list(previous_words), list(next_words), word)
            best_words.append((word, probability))
        return sorted(best_words, key=(lambda l: l[1]), reverse=True)[:20]

    def generate_text(self, text_beginning: list, text_ending: list, greedy: bool) -> list:
        words = self.most_probable_words(text_beginning, text_ending)
        return words

# --------------------- PARAMETERS ---------------------
n = 3
device = torch.device("cuda")
batch_size = 512
learning_rate = 0.004
epochs = 1
embedding_size = 64

# --------------------- DATASET ---------------------
dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)
@@ -85,13 +41,120 @@ for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList))
    lines = list(map(lambda l: " ".join(l), lines))
    DATASET = DATASET + " ".join(lines)
    if number == 15:
        break

FINAL_DATASET = re.split(r"\s+", DATASET)
print(FINAL_DATASET[:100])

model_3gram = Model(n=3)
model_3gram.train(FINAL_DATASET)
model = model_3gram

# --------------------- TOKENIZE ---------------------
FINAL_DATASET_TOKENIZED = []
tokenize_dict = bidict({})
token = 1
for i, word in enumerate(FINAL_DATASET):
    if word in tokenize_dict:
        FINAL_DATASET_TOKENIZED.append(tokenize_dict[word])
    else:
        tokenize_dict[word] = token
        FINAL_DATASET_TOKENIZED.append(token)
        token = token + 1
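As an aside, a tiny self-contained illustration of the two-way mapping that `bidict` keeps in sync (the word used is just an example); `token_to_word` in the model section below relies on the `.inverse` view:

from bidict import bidict

d = bidict({})
d["the"] = 1                  # forward lookup: word -> token id
assert d.inverse[1] == "the"  # inverse view stays in sync: token id -> word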
# --------------------- N-GRAM & TENSORS ---------------------
ngram_list = list(nltk.ngrams(FINAL_DATASET_TOKENIZED, n=n))
np.random.shuffle(ngram_list)
tensor_ngram = torch.tensor(ngram_list, device=device)
X = torch.cat((tensor_ngram[:, :math.floor(n/2)], tensor_ngram[:, math.ceil(n/2):]), dim = 1).to(device)
Y = tensor_ngram[:, math.floor(n/2)].reshape(-1, 1).to(device)
X_split = torch.split(X, batch_size)
Y_split = torch.split(Y, batch_size)
# vocab_size = len(tokenize_dict) + 1
vocab_size = 20000
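One caveat worth flagging: with the hard-coded `vocab_size = 20000`, any token id that the tokenization loop assigned at or above 20000 is out of range for `torch.nn.Embedding` and for the `CrossEntropyLoss` targets, and would raise an index error. A minimal sketch of one possible guard, remapping out-of-range ids to the `<UNK>` id 0 before the tensors are built (`clamp_oov` is a name introduced here purely for illustration):

def clamp_oov(token_ids, vocab_size):
    # Map any id outside [0, vocab_size) to 0, the <UNK> id, so the
    # embedding layer and the loss never see an out-of-range index.
    return [t if 0 <= t < vocab_size else 0 for t in token_ids]

# e.g. FINAL_DATASET_TOKENIZED = clamp_oov(FINAL_DATASET_TOKENIZED, vocab_size)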
# --------------------- MODEL N-GRAM ---------------------
def token_to_word(token):
    token = int(token)
    if token in tokenize_dict.inverse:
        return tokenize_dict.inverse[token]
    else:
        return "<UNK>"

class Model(torch.nn.Module):
    def __init__(self, vocab_size=vocab_size, UNK_token='<UNK>', n=n):
        super(Model, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.linear = torch.nn.Linear(embedding_size * (n - 1), vocab_size)

    def forward(self, inputs):
        out = self.embedding(inputs)
        out = out.view(inputs.size(0), -1)
        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
        # so an extra softmax here would distort the training signal.
        out = self.linear(out)
        return out

    def train(self, input, output) -> None:
        criterion = torch.nn.CrossEntropyLoss()
        # optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        optimizer = torch.optim.Adam(self.parameters())
        batch_list = list(zip(input, output))
        for epoch in range(epochs):
            total_loss = 0
            for batch_input, batch_output in tqdm(batch_list):
                self.zero_grad()
                result = self(batch_input)
                loss = criterion(result, batch_output.view(-1))
                total_loss = total_loss + loss.item()
                loss.backward()
                optimizer.step()
            total_loss = total_loss / len(batch_list)
            print("EPOCH: ", epoch, "LOSS: ", total_loss)

    def predict(self, text_beginning: list, text_ending: list) -> list:
        text_beginning = text_beginning[-math.floor(n/2):]
        text_ending = text_ending[:math.floor(n/2)]
        beginning = []
        for word in text_beginning:
            if word in tokenize_dict:
                beginning.append(tokenize_dict[word])
            else:
                beginning.append(0)
        ending = []
        for word in text_ending:
            if word in tokenize_dict:
                ending.append(tokenize_dict[word])
            else:
                ending.append(0)
        tensor_context = torch.tensor([beginning + ending]).to(device)
        with torch.no_grad():
            # Softmax here turns the logits into probabilities for the top-10 list.
            result = torch.softmax(self(tensor_context), dim=1)
            result_pred, result_tokens = torch.topk(result, 10)
            words = list(zip(result_tokens[0], result_pred[0]))
            words = [(token_to_word(token), round(float(score), 2)) for token, score in words]
        return words
# --------------------- TRAIN ---------------------
model = Model()
model.to(device)
model.train(X_split[:2000], Y_split[:2000])
# 39607
# model.train(X_split, Y_split)
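A short usage sketch of the trained model's `predict` (the example context below is made up for illustration, not taken from the data):

left_context = "mr smith went to the".split()
right_context = "and bought a newspaper".split()
print(model.predict(left_context, right_context))
# -> the 10 most probable (word, probability) pairs, highest probability first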
# --------------------- PREDICTION ---------------------
def convert_predictions(line):
    sum_predictions = np.sum([pred[1] for pred in line])
@@ -101,6 +164,8 @@ def convert_predictions(line):
        new_pred = math.floor(pred / sum_predictions * 100) / 100
        if new_pred == 1.0:
            new_pred = 0.99
        elif new_pred == 0.0:
            continue
        all_pred = all_pred + new_pred
        result = result + word + ":" + str(new_pred) + " "
    if round(all_pred, 2) < 1: