07_neural

Aleksandra 2024-05-15 09:27:53 +02:00
parent 1eb60bd963
commit 362a192af8
3 changed files with 603 additions and 174 deletions


@@ -13,7 +13,6 @@ Perplexity hashed by
-----------------
1. Statistical language model (task 5)
- branch: master - Perplexity hashed on `dev-0`: 555.75
- branch: 05_ngram - Perplexity hashed on `dev-0`: xxx
<br><br>
2. Neural language model (task 7)
- branch: 07_neural - Perplexity hashed on `dev-0`: xxx
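For context, a minimal sketch of how a PerplexityHashed-style score relates to the probabilities a model assigns to the gold words, assuming perplexity is the exponential of the average negative log-probability (the `perplexity` helper below is illustrative, not part of the repository):

import math

def perplexity(word_probs, eps=1e-9):
    # word_probs: probability the model assigned to each gold word.
    # Perplexity = exp of the mean negative log-probability; eps guards against log(0).
    return math.exp(-sum(math.log(max(p, eps)) for p in word_probs) / len(word_probs))

# perplexity([0.1, 0.1, 0.1]) -> 10.0 (up to float rounding)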

File diff suppressed because one or more lines are too long

run.py

@@ -1,11 +1,10 @@
import pandas as pd
import numpy as np
import csv
import os
import re
import random
from collections import Counter, defaultdict
import nltk
import torch
from bidict import bidict
import math
from tqdm import tqdm
@@ -13,60 +12,17 @@ directory = "train/in.tsv.xz"
directory_dev_0 = "dev-0/in.tsv.xz"
directory_test_A = "test-A/in.tsv.xz"

class Model():
    def __init__(self, vocab_size=30_000, UNK_token='<UNK>', n=3):
        if n <= 1 or n % 2 == 0:
            raise ValueError("n must be an odd number greater than 1")
        self.n = n
        self.vocab_size = vocab_size
        self.UNK_token = UNK_token

    def train(self, corpus: list) -> None:
        if self.n > 1:
            self.n_grams = list(nltk.ngrams(corpus, n=self.n))
        else:
            self.n_grams = corpus
        self.counter = Counter(self.n_grams)
        self.words_counter = Counter(corpus)
        self.all_quantities = Counter([gram[:math.floor(self.n/2)] + gram[math.ceil(self.n/2):] for gram in self.n_grams])
        self.all_grams = defaultdict(set)
        for gram in tqdm(self.n_grams):
            previous_words = tuple(gram[:math.floor(self.n/2)])
            next_words = tuple(gram[math.ceil(self.n/2):])
            word = gram[math.floor(self.n/2)]
            self.all_grams[(previous_words, next_words)].add(word)

    def get_conditional_prob_for_word(self, left_text: list, right_text: list, word: str) -> float:
        previous_words = tuple(left_text[-math.floor(self.n/2):])
        next_words = tuple(right_text[:math.floor(self.n/2)])
        quantity = self.counter[previous_words + tuple([word]) + next_words]
        all_quantity = self.all_quantities[previous_words + next_words]
        if all_quantity <= 0:
            return 0
        return quantity / all_quantity

    def get_prob_for_text(self, text: list) -> float:
        prob = 1
        for gram in list(nltk.ngrams(text, self.n)):
            prob *= self.get_conditional_prob_for_word(gram[:math.floor(self.n/2)], gram[math.ceil(self.n/2):], gram[math.floor(self.n/2)])
        return prob

    def most_probable_words(self, left_text: list, right_text: list) -> str:
        previous_words = tuple(left_text[-math.floor(self.n/2):])
        next_words = tuple(right_text[:math.floor(self.n/2)])
        all_words = self.all_grams[(previous_words, next_words)]
        best_words = []
        for word in all_words:
            probability = self.get_conditional_prob_for_word(list(previous_words), list(next_words), word)
            best_words.append((word, probability))
        return sorted(best_words, key=(lambda l: l[1]), reverse=True)[:20]

    def generate_text(self, text_beginning: list, text_ending: list, greedy: bool) -> list:
        words = self.most_probable_words(text_beginning, text_ending)
        return words

# --------------------- PARAMETERS ---------------------
n = 3
device = torch.device("cuda")
batch_size = 512
learning_rate = 0.004
epochs = 1
embedding_size = 64

# --------------------- DATASET ---------------------
dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)
@@ -85,13 +41,120 @@ for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList))
    lines = list(map(lambda l: " ".join(l), lines))
    DATASET = DATASET + " ".join(lines)
    if number == 15:
        break

FINAL_DATASET = re.split(r"\s+", DATASET)
print(FINAL_DATASET[:100])

model_3gram = Model(n=3)
model_3gram.train(FINAL_DATASET)
model = model_3gram

# --------------------- TOKENIZE ---------------------
FINAL_DATASET_TOKENIZED = []
tokenize_dict = bidict({})
token = 1
for i, word in enumerate(FINAL_DATASET):
    if word in tokenize_dict:
        FINAL_DATASET_TOKENIZED.append(tokenize_dict[word])
    else:
        tokenize_dict[word] = token
        FINAL_DATASET_TOKENIZED.append(token)
        token = token + 1
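As an aside, a tiny self-contained illustration of the two-way mapping that `bidict` keeps in sync (the word used is just an example); `token_to_word` in the model section below relies on the `.inverse` view:

from bidict import bidict

d = bidict({})
d["the"] = 1                  # forward lookup: word -> token id
assert d.inverse[1] == "the"  # inverse view stays in sync: token id -> word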
# --------------------- N-GRAM & TENSORS ---------------------
ngram_list = list(nltk.ngrams(FINAL_DATASET_TOKENIZED, n=n))
np.random.shuffle(ngram_list)
tensor_ngram = torch.tensor(ngram_list, device=device)
X = torch.cat((tensor_ngram[:, :math.floor(n/2)], tensor_ngram[:, math.ceil(n/2):]), dim = 1).to(device)
Y = tensor_ngram[:, math.floor(n/2)].reshape(-1, 1).to(device)
X_split = torch.split(X, batch_size)
Y_split = torch.split(Y, batch_size)
# vocab_size = len(tokenize_dict) + 1
vocab_size = 20000
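One caveat worth flagging: with the hard-coded `vocab_size = 20000`, any token id that the tokenization loop assigned at or above 20000 is out of range for `torch.nn.Embedding` and for the `CrossEntropyLoss` targets, and would raise an index error. A minimal sketch of one possible guard, remapping out-of-range ids to the `<UNK>` id 0 before the tensors are built (`clamp_oov` is a name introduced here purely for illustration):

def clamp_oov(token_ids, vocab_size):
    # Map any id outside [0, vocab_size) to 0, the <UNK> id, so the
    # embedding layer and the loss never see an out-of-range index.
    return [t if 0 <= t < vocab_size else 0 for t in token_ids]

# e.g. FINAL_DATASET_TOKENIZED = clamp_oov(FINAL_DATASET_TOKENIZED, vocab_size)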
# --------------------- MODEL N-GRAM ---------------------
def token_to_word(token):
    token = int(token)
    if token in tokenize_dict.inverse:
        return tokenize_dict.inverse[token]
    else:
        return "<UNK>"

class Model(torch.nn.Module):
    def __init__(self, vocab_size=vocab_size, UNK_token='<UNK>', n=n):
        super(Model, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.linear = torch.nn.Linear(embedding_size * (n - 1), vocab_size)

    def forward(self, inputs):
        out = self.embedding(inputs)
        out = out.view(inputs.size(0), -1)
        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
        # so an extra softmax here would distort the training signal.
        out = self.linear(out)
        return out

    def train(self, input, output) -> None:
        criterion = torch.nn.CrossEntropyLoss()
        # optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        optimizer = torch.optim.Adam(self.parameters())
        batch_list = list(zip(input, output))
        for epoch in range(epochs):
            total_loss = 0
            for batch_input, batch_output in tqdm(batch_list):
                self.zero_grad()
                result = self(batch_input)
                loss = criterion(result, batch_output.view(-1))
                total_loss = total_loss + loss.item()
                loss.backward()
                optimizer.step()
            total_loss = total_loss / len(batch_list)
            print("EPOCH: ", epoch, "LOSS: ", total_loss)

    def predict(self, text_beginning: list, text_ending: list) -> list:
        text_beginning = text_beginning[-math.floor(n/2):]
        text_ending = text_ending[:math.floor(n/2)]
        beginning = []
        for word in text_beginning:
            if word in tokenize_dict:
                beginning.append(tokenize_dict[word])
            else:
                beginning.append(0)
        ending = []
        for word in text_ending:
            if word in tokenize_dict:
                ending.append(tokenize_dict[word])
            else:
                ending.append(0)
        tensor_context = torch.tensor([beginning + ending]).to(device)
        with torch.no_grad():
            # Softmax here turns the logits into probabilities for the top-10 list.
            result = torch.softmax(self(tensor_context), dim=1)
            result_pred, result_tokens = torch.topk(result, 10)
            words = list(zip(result_tokens[0], result_pred[0]))
            words = [(token_to_word(token), round(float(score), 2)) for token, score in words]
        return words
# --------------------- TRAIN ---------------------
model = Model()
model.to(device)
model.train(X_split[:2000], Y_split[:2000])
# 39607
# model.train(X_split, Y_split)
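A short usage sketch of the trained model's `predict` (the example context below is made up for illustration, not taken from the data):

left_context = "mr smith went to the".split()
right_context = "and bought a newspaper".split()
print(model.predict(left_context, right_context))
# -> the 10 most probable (word, probability) pairs, highest probability first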
# --------------------- PREDICTION ---------------------
def convert_predictions(line):
    sum_predictions = np.sum([pred[1] for pred in line])
@@ -101,6 +164,8 @@ def convert_predictions(line):
        new_pred = math.floor(pred / sum_predictions * 100) / 100
        if new_pred == 1.0:
            new_pred = 0.99
        elif new_pred == 0.0:
            continue
        all_pred = all_pred + new_pred
        result = result + word + ":" + str(new_pred) + " "
    if round(all_pred, 2) < 1: