07_neural fix
commit 0fc40ace22 (parent 362a192af8)
@@ -15,7 +15,7 @@ Perplexity hashed by
- branch: master - Perplexity hashed on `dev-0`: 555.75
<br><br>
2. Neural language model (task 7)
- branch: 07_neural - Perplexity hashed on `dev-0`: xxx
- branch: 07_neural - Perplexity hashed on `dev-0`: 588.67
<br><br>
3. Recurrent neural model (task 9)
- branch: 09_neural - Perplexity hashed on `dev-0`: xxx
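As an illustration (not part of the commit): plain perplexity is the exponential of the average negative log-probability the model assigns to each expected word; the hashed variant scored by the challenge evaluator follows the same idea, though its exact definition is not shown here. A minimal sketch with invented probabilities:

import math

def perplexity(probs):
    # plain perplexity: exp of the mean negative log-probability
    # assigned to the expected word in each gap
    nll = -sum(math.log(p) for p in probs) / len(probs)
    return math.exp(nll)

# invented probabilities for three gaps; lower perplexity is better
print(perplexity([0.01, 0.002, 0.05]))  # ~100.0 (geometric mean prob = 0.01)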
Changed files:
dev-0/out.tsv: 21038 changed lines (file diff suppressed because it is too large)
main.ipynb: 591 changed lines (file diff suppressed because one or more lines are too long)
run.py: 109 changed lines
@@ -9,6 +9,7 @@ import math
from tqdm import tqdm

directory = "train/in.tsv.xz"
directory_expected = "train/expected.tsv"
directory_dev_0 = "dev-0/in.tsv.xz"
directory_test_A = "test-A/in.tsv.xz"

@@ -17,64 +18,92 @@ directory_test_A = "test-A/in.tsv.xz"
n = 3
device = torch.device("cuda")

batch_size = 512
learning_rate = 0.004
epochs = 1
embedding_size = 64
batch_size = 1024
learning_rate = 0.001
epochs = 15
embedding_size = 128

# --------------------- DATASET ---------------------

dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)

expectedList = pd.read_csv(directory, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)
expectedList = pd.read_csv(directory_expected, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)

DATASET = ""
count = n - 1

n_gram = []
for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):
dataframe = dataframe.reset_index()
dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True)

left_text = dataframe['LeftContext'].to_list()
right_text = dataframe['RightContext'].to_list()
word = expected['Word'].to_list()
expected['Word'] = expected['Word'].apply(lambda x: [str(x).strip()])
word = expected['Word']

lines = zip(left_text, word, right_text)
lines = list(map(lambda l: " ".join(l), lines))
DATASET = DATASET + " ".join(lines)
# ------------------------------ LEFT ------------------------------
# dataframe['LeftContext'] = dataframe['LeftContext'].apply(lambda x: re.split(r"\s+", x.strip())[-count:])
# left_text = dataframe['LeftContext']

if(number == 15):
break
# lines = list(zip(left_text, word))
# lines = list(map(lambda l: l[0] + l[1], lines))

FINAL_DATASET = re.split(r"\s+", DATASET)
print(FINAL_DATASET[:100])
# ------------------------------ MIDDLE ------------------------------
dataframe['LeftContext'] = dataframe['LeftContext'].apply(lambda x: re.split(r"\s+", x.strip())[-math.floor(n/2):])
left_text = dataframe['LeftContext']
dataframe['RightContext'] = dataframe['RightContext'].apply(lambda x: re.split(r"\s+", x.strip())[:math.floor(n/2)])
right_text = dataframe['RightContext']

lines = list(zip(left_text, word, right_text))
lines = list(map(lambda l: l[0] + l[1] + l[2], lines))

# ------------------------------ END ------------------------------

n_gram.extend(lines)
print(n_gram[:100])

FINAL_DATASET = n_gram
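As a standalone illustration (invented row, not repository data): with n = 3 the loop above keeps floor(n/2) = 1 word on each side of the gap, so one input row turns into one [left, expected, right] triple:

import math
import re

n = 3
left_context = "he went to the"
expected_word = "store"
right_context = "to buy some milk"

left = re.split(r"\s+", left_context.strip())[-math.floor(n/2):]    # ['the']
right = re.split(r"\s+", right_context.strip())[:math.floor(n/2)]   # ['to']
ngram = left + [str(expected_word).strip()] + right
print(ngram)  # ['the', 'store', 'to']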

# --------------------- TOKENIZE ---------------------

FINAL_DATASET_TOKENIZED = []
tokenize_dict = bidict({})
token = 1
for i, word in enumerate(FINAL_DATASET):
if(word in tokenize_dict):
FINAL_DATASET_TOKENIZED.append(tokenize_dict[word])
else:
tokenize_dict[word] = token
FINAL_DATASET_TOKENIZED.append(token)
token = token + 1
for i, n_words in enumerate(FINAL_DATASET):
n_gram = []
for j in range(n):
if(n_words[j] in tokenize_dict):
n_gram.append(tokenize_dict[n_words[j]])
else:
tokenize_dict[n_words[j]] = token
n_gram.append(token)
token = token + 1
FINAL_DATASET_TOKENIZED.append(n_gram)
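As an illustration (not part of the commit): bidict provides a two-way word/token mapping, which is presumably what the token_to_word helper used later relies on; that helper is outside this diff, so the version below is an assumption:

from bidict import bidict

tokenize_dict = bidict({})
tokenize_dict["the"] = 1
tokenize_dict["store"] = 2

print(tokenize_dict["store"])    # word -> token: 2
print(tokenize_dict.inverse[2])  # token -> word: 'store'

def token_to_word(token):
    # hypothetical helper matching the name used further down in run.py
    return tokenize_dict.inverse[int(token)]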

# --------------------- N-GRAM & TENSORS ---------------------

ngram_list = list(nltk.ngrams(FINAL_DATASET_TOKENIZED, n=n))
# ngram_list = list(nltk.ngrams(FINAL_DATASET_TOKENIZED, n=n))
ngram_list = FINAL_DATASET_TOKENIZED
np.random.shuffle(ngram_list)

tensor_ngram = torch.tensor(ngram_list, device=device)

# ------------------------------ MIDDLE ------------------------------
X = torch.cat((tensor_ngram[:, :math.floor(n/2)], tensor_ngram[:, math.ceil(n/2):]), dim = 1).to(device)
Y = tensor_ngram[:, math.floor(n/2)].reshape(-1, 1).to(device)

# ------------------------------ LEFT ------------------------------
# X = tensor_ngram[:, :count].to(device)
# Y = tensor_ngram[:, count].reshape(-1, 1).to(device)

# ------------------------------ END ------------------------------

X_split = torch.split(X, batch_size)
Y_split = torch.split(Y, batch_size)
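As an illustration (not part of the commit): for n = 3, each tokenized triple is split into a context pair X (the outer tokens) and a target Y (the middle token), then cut into batches. A CPU-only sketch with invented token ids:

import math
import torch

n = 3
tensor_ngram = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# context = everything except the middle column, target = the middle column
X = torch.cat((tensor_ngram[:, :math.floor(n/2)], tensor_ngram[:, math.ceil(n/2):]), dim=1)
Y = tensor_ngram[:, math.floor(n/2)].reshape(-1, 1)
print(X.tolist())  # [[1, 3], [4, 6], [7, 9], [10, 12]]
print(Y.tolist())  # [[2], [5], [8], [11]]

X_split = torch.split(X, 2)  # batch_size = 2 here
print(len(X_split), tuple(X_split[0].shape))  # 2 (2, 2)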

# vocab_size = len(tokenize_dict) + 1
vocab_size = 20000
vocab_size = len(tokenize_dict) + 1
# vocab_size = 50000

# --------------------- MODEL N-GRAM ---------------------

@@ -95,14 +124,15 @@ class Model(torch.nn.Module):
def forward(self, inputs):
out = self.embedding(inputs)
out = out.view(inputs.size(0), -1)
# out = torch.relu(out)
out = torch.softmax(out, dim=1)
out = self.linear(out)
out = torch.softmax(out, dim = 1)
return out
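Sketch (not part of the commit): the __init__ of Model is outside this hunk, so the layer shapes below are inferred from forward, which flattens the embeddings of the n-1 context tokens before a single linear layer; treat it as an assumption, not the repository's actual code. Note that forward above applies softmax both before and after the linear layer even though CrossEntropyLoss already expects raw logits; the sketch keeps only the logits path.

import torch

class ModelSketch(torch.nn.Module):
    # hypothetical reconstruction; names mirror run.py, sizes are guesses
    def __init__(self, vocab_size=20000, embedding_size=128, context_len=2):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        # forward flattens (batch, context_len, embedding_size), so the
        # linear layer takes context_len * embedding_size inputs
        self.linear = torch.nn.Linear(context_len * embedding_size, vocab_size)

    def forward(self, inputs):
        out = self.embedding(inputs)        # (batch, context_len, emb)
        out = out.view(inputs.size(0), -1)  # (batch, context_len * emb)
        return self.linear(out)             # (batch, vocab_size) logits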

def train(self, input, output) -> None:
criterion = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(self.parameters(), lr = learning_rate)
optimizer = torch.optim.Adam(self.parameters())
optimizer = torch.optim.Adam(self.parameters(), lr = learning_rate)
# optimizer = torch.optim.Adam(self.parameters())

batch_list = list(zip(input, output))
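Sketch (not part of the commit): the epoch loop body is outside this hunk; a generic single-batch step for the CrossEntropyLoss/Adam pair chosen above could look like this. CrossEntropyLoss expects targets of shape (batch,) as class indices, hence the squeeze of the (batch, 1) Y tensors. This is an assumption about the elided code, not a copy of it.

import torch

def train_one_batch(model, criterion, optimizer, x_batch, y_batch):
    # x_batch: (batch, context_len) token ids, y_batch: (batch, 1) target ids
    optimizer.zero_grad()
    logits = model(x_batch)
    loss = criterion(logits, y_batch.squeeze(1))
    loss.backward()
    optimizer.step()
    return loss.item()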

@@ -121,8 +151,13 @@ class Model(torch.nn.Module):
print("EPOCH: ", epoch, "LOSS: ", total_loss)

def predict(self, text_beggining:list, text_ending:list) -> list:
# ------------------------------ MIDDLE ------------------------------
text_beggining = text_beggining[-math.floor(n/2):]
text_ending = text_ending[:math.floor(n/2)]
# ------------------------------ LEFT ------------------------------
# text_ending = []
# text_beggining = text_beggining[-count:]
# ------------------------------ END ------------------------------

beginning = []
for word in text_beggining:
@@ -141,7 +176,7 @@ class Model(torch.nn.Module):
tensor_context = torch.tensor([beginning + ending]).to(device)
with torch.no_grad():
result = self(tensor_context)
result_pred, result_tokens = torch.topk(result, 10)
result_pred, result_tokens = torch.topk(result, 20)
words = list(zip(result_tokens[0], result_pred[0]))
words = [(token_to_word(token), round(float(score), 2)) for token, score in words]
return words
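As an illustration (not part of the commit): torch.topk returns the k largest scores and their indices, which predict then maps back to words; the change above widens the candidate list from 10 to 20. A tiny sketch with an invented score row:

import torch

result = torch.tensor([[0.05, 0.40, 0.10, 0.30, 0.15]])  # fake 1 x vocab scores
result_pred, result_tokens = torch.topk(result, 3)
print(result_tokens[0].tolist())  # [1, 3, 4]: indices of the three best scores
print(result_pred[0])             # tensor([0.4000, 0.3000, 0.1500])

# usage sketch, with predict as defined in run.py:
# words = model.predict("he went to the".split(), "to buy some milk".split())
# -> a list of up to 20 (word, score) pairs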

@@ -150,9 +185,7 @@ class Model(torch.nn.Module):

model = Model()
model.to(device)
model.train(X_split[:2000], Y_split[:2000])
# 39607
# model.train(X_split, Y_split)
model.train(X_split, Y_split)

# --------------------- PREDICTION ---------------------

@@ -160,14 +193,14 @@ def convert_predictions(line):
sum_predictions = np.sum([pred[1] for pred in line])
result = ""
all_pred = 0
for word, pred in line:
for word, pred in line[:10]:
new_pred = math.floor(pred / sum_predictions * 100) / 100
if(new_pred == 1.0):
new_pred = 0.99
elif(new_pred == 0.0):
continue
all_pred = all_pred + new_pred
result = result + word + ":" + str(new_pred) + " "
result = result + str(word) + ":" + str(new_pred) + " "
if(round(all_pred, 2) < 1):
result = result + ":" + str(round(1 - all_pred, 2))
else:
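As an illustration (not part of the commit): convert_predictions turns a list of (word, score) pairs into one output line of word:probability entries plus a trailing :rest bucket when the floored shares do not reach 1. A trimmed, runnable restatement of the branch shown above, with invented scores:

import math
import numpy as np

line = [("the", 1.0), ("a", 1.0), ("an", 1.0)]  # hypothetical model.predict() output

sum_predictions = np.sum([pred for _, pred in line])
result, all_pred = "", 0
for word, pred in line[:10]:
    new_pred = math.floor(pred / sum_predictions * 100) / 100
    if new_pred == 1.0:
        new_pred = 0.99
    elif new_pred == 0.0:
        continue
    all_pred = all_pred + new_pred
    result = result + str(word) + ":" + str(new_pred) + " "
if round(all_pred, 2) < 1:
    result = result + ":" + str(round(1 - all_pred, 2))
print(result)  # the:0.33 a:0.33 an:0.33 :0.01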
@@ -183,8 +216,8 @@ left_text = dataframe['LeftContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()

lines = zip(left_text, right_text)
lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))
print(lines[:100])
lines = list(map(lambda l: model.predict(l[0], l[1]), tqdm(lines)))
print(lines[:40])

with open("dev-0/out.tsv", "w", encoding="UTF-8") as file:
result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))

@@ -200,8 +233,8 @@ left_text = dataframe['LeftContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()

lines = zip(left_text, right_text)
lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))
print(lines[:100])
lines = list(map(lambda l: model.predict(l[0], l[1]), tqdm(lines)))
print(lines[:40])

with open("test-A/out.tsv", "w", encoding="UTF-8") as file:
result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
test-A/out.tsv: 14828 changed lines (file diff suppressed because it is too large)