07_neural

parent 1eb60bd963
commit 362a192af8
@@ -13,7 +13,6 @@ Perplexity hashed by
-----------------

1. Statistical language model (task 5)
- branch: master - Perplexity hashed on `dev-0`: 555.75
- branch: 05_ngram - Perplexity hashed on `dev-0`: xxx

<br><br>

2. Neural language model (task 7)
- branch: 07_neural - Perplexity hashed on `dev-0`: xxx
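For reference, perplexity here is presumably the usual exponentiated mean negative log-probability of the gold word over the dev-0 lines; a minimal sketch of that definition (the gold_probs input and the exact weighting used by the "Perplexity hashed" metric are assumptions):

import math

def perplexity(gold_probs):
    # gold_probs: probability the model assigned to the gold word on each dev-0 line (assumed input)
    return math.exp(-sum(math.log(p) for p in gold_probs) / len(gold_probs))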
597 main.ipynb
File diff suppressed because one or more lines are too long

179 run.py
@@ -1,11 +1,10 @@
import pandas as pd
import numpy as np
import csv
import os
import re
import random
from collections import Counter, defaultdict
import nltk
import torch
from bidict import bidict
import math
from tqdm import tqdm
@@ -13,60 +12,17 @@ directory = "train/in.tsv.xz"
directory_dev_0 = "dev-0/in.tsv.xz"
directory_test_A = "test-A/in.tsv.xz"

class Model():

    def __init__(self, vocab_size=30_000, UNK_token='<UNK>', n=3):
        if (n <= 1 or n % 2 == 0):
            raise ValueError("change N value !!!")
        self.n = n
        self.vocab_size = vocab_size
        self.UNK_token = UNK_token

    def train(self, corpus: list) -> None:
        if (self.n > 1):
            self.n_grams = list(nltk.ngrams(corpus, n=self.n))
        else:
            self.n_grams = corpus
        self.counter = Counter(self.n_grams)
        self.words_counter = Counter(corpus)
        self.all_quantities = Counter([gram[:math.floor(self.n/2)] + gram[math.ceil(self.n/2):] for gram in self.n_grams])
# --------------------- PARAMETERS ---------------------

        self.all_grams = defaultdict(set)
n = 3
device = torch.device("cuda")

        for gram in tqdm(self.n_grams):
            previous_words = tuple(gram[:math.floor(self.n/2)])
            next_words = tuple(gram[math.ceil(self.n/2):])
            word = gram[math.floor(self.n/2)]
            self.all_grams[(previous_words, next_words)].add(word)

batch_size = 512
learning_rate = 0.004
epochs = 1
embedding_size = 64
    def get_conditional_prob_for_word(self, left_text: list, right_text: list, word: str) -> float:
        previous_words = tuple(left_text[-math.floor(self.n/2):])
        next_words = tuple(right_text[:math.floor(self.n/2)])
        quantity = self.counter[previous_words + tuple([word]) + next_words]
        all_quantity = self.all_quantities[previous_words + next_words]
        if (all_quantity <= 0):
            return 0
        return quantity / all_quantity

    def get_prob_for_text(self, text: list) -> float:
        prob = 1
        for gram in list(nltk.ngrams(text, self.n)):
            prob *= self.get_conditional_prob_for_word(gram[:math.floor(self.n/2)], gram[math.ceil(self.n/2):], gram[math.floor(self.n/2)])
        return prob

    def most_probable_words(self, left_text: list, right_text: list) -> list:
        previous_words = tuple(left_text[-math.floor(self.n/2):])
        next_words = tuple(right_text[:math.floor(self.n/2)])
        all_words = self.all_grams[(previous_words, next_words)]
        best_words = []
        for word in all_words:
            probability = self.get_conditional_prob_for_word(list(previous_words), list(next_words), word)
            best_words.append((word, probability))
        return sorted(best_words, key=(lambda l: l[1]), reverse=True)[:20]

    def generate_text(self, text_beginning: list, text_ending: list, greedy: bool) -> list:
        words = self.most_probable_words(text_beginning, text_ending)
        return words

# --------------------- DATASET ---------------------

dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)
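Because chunksize=10000 is passed, pd.read_csv returns an iterator of DataFrames rather than a single frame, which is why the loop below enumerates chunks. expectedList is presumably built the same way from the gold-label file; a minimal sketch (the train/expected.tsv.xz path and the 'Word' column name are assumptions, since that line is not shown in this diff):

# gold labels read in matching 10000-row chunks (path and column name assumed)
expectedList = pd.read_csv("train/expected.tsv.xz", sep='\t', header=None,
                           names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)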
@@ -85,13 +41,120 @@ for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList))
    lines = list(map(lambda l: " ".join(l), lines))
    DATASET = DATASET + " ".join(lines)

    if (number == 15):
        break

FINAL_DATASET = re.split(r"\s+", DATASET)
print(FINAL_DATASET[:100])

model_3gram = Model(n = 3)
model_3gram.train(FINAL_DATASET)
# --------------------- TOKENIZE ---------------------

model = model_3gram
FINAL_DATASET_TOKENIZED = []
tokenize_dict = bidict({})
token = 1
for i, word in enumerate(FINAL_DATASET):
    if (word in tokenize_dict):
        FINAL_DATASET_TOKENIZED.append(tokenize_dict[word])
    else:
        tokenize_dict[word] = token
        FINAL_DATASET_TOKENIZED.append(token)
        token = token + 1
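Token ids start at 1 so that 0 stays free as an out-of-vocabulary id (predict below appends 0 for words missing from tokenize_dict). A small illustration of the mapping this loop builds, with made-up words:

example = bidict({})
tid = 1
for w in ["the", "cat", "sat", "the"]:
    if w not in example:
        example[w] = tid
        tid += 1
# example == bidict({'the': 1, 'cat': 2, 'sat': 3}); example.inverse[2] == 'cat'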

# --------------------- N-GRAM & TENSORS ---------------------

ngram_list = list(nltk.ngrams(FINAL_DATASET_TOKENIZED, n=n))
np.random.shuffle(ngram_list)

tensor_ngram = torch.tensor(ngram_list, device=device)

X = torch.cat((tensor_ngram[:, :math.floor(n/2)], tensor_ngram[:, math.ceil(n/2):]), dim=1).to(device)
Y = tensor_ngram[:, math.floor(n/2)].reshape(-1, 1).to(device)

X_split = torch.split(X, batch_size)
Y_split = torch.split(Y, batch_size)

# vocab_size = len(tokenize_dict) + 1
vocab_size = 20000
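With n = 3, each row of tensor_ngram holds (left, centre, right) token ids, so X keeps the two outer context tokens and Y the centre token the network has to predict. A small worked example with made-up ids:

# one trigram of token ids: (12, 7, 31)
# X row -> [12, 31]   (left and right context concatenated along dim=1)
# Y row -> [7]        (the centre word to be predicted)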

# --------------------- MODEL N-GRAM ---------------------

def token_to_word(token):
    token = int(token)
    if (token in tokenize_dict.inverse):
        return tokenize_dict.inverse[token]
    else:
        return "<UNK>"

class Model(torch.nn.Module):

    def __init__(self, vocab_size=vocab_size, UNK_token='<UNK>', n=n):
        super(Model, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.linear = torch.nn.Linear(embedding_size * (n - 1), vocab_size)

    def forward(self, inputs):
        out = self.embedding(inputs)
        out = out.view(inputs.size(0), -1)
        out = self.linear(out)
        out = torch.softmax(out, dim=1)
        return out

    def train(self, input, output) -> None:
        criterion = torch.nn.CrossEntropyLoss()
        # optimizer = torch.optim.Adam(self.parameters(), lr = learning_rate)
        optimizer = torch.optim.Adam(self.parameters())

        batch_list = list(zip(input, output))

        for epoch in range(epochs):
            total_loss = 0
            for batch_input, batch_output in tqdm(batch_list):
                self.zero_grad()
                result = self(batch_input)

                loss = criterion(result, batch_output.view(-1))
                total_loss = total_loss + loss.item()
                loss.backward()
                optimizer.step()

            total_loss = total_loss / len(batch_list)
            print("EPOCH: ", epoch, "LOSS: ", total_loss)

    def predict(self, text_beginning: list, text_ending: list) -> list:
        text_beginning = text_beginning[-math.floor(n/2):]
        text_ending = text_ending[:math.floor(n/2)]

        beginning = []
        for word in text_beginning:
            if (word in tokenize_dict):
                beginning.append(tokenize_dict[word])
            else:
                beginning.append(0)

        ending = []
        for word in text_ending:
            if (word in tokenize_dict):
                ending.append(tokenize_dict[word])
            else:
                ending.append(0)

        tensor_context = torch.tensor([beginning + ending]).to(device)
        with torch.no_grad():
            result = self(tensor_context)
        result_pred, result_tokens = torch.topk(result, 10)
        words = list(zip(result_tokens[0], result_pred[0]))
        words = [(token_to_word(token), round(float(score), 2)) for token, score in words]
        return words
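A hypothetical call, once the model below has been trained, for a gap like "he went ___ the store" (the context words are made up):

# returns up to 10 (word, probability) pairs for the gap,
# e.g. [('to', 0.84), ('into', 0.07), ...]
candidates = model.predict(["he", "went"], ["the", "store"])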

# --------------------- TRAIN ---------------------

model = Model()
model.to(device)
model.train(X_split[:2000], Y_split[:2000])
# 39607
# model.train(X_split, Y_split)
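One detail worth noting: torch.nn.CrossEntropyLoss expects raw logits and applies log-softmax internally, while forward above already returns softmax probabilities, so the loss here is computed on re-normalised probabilities. A sketch of the more conventional pairing (an alternative, not what this commit does):

# either return raw logits from forward and keep CrossEntropyLoss:
#     out = self.linear(out)
#     return out
# or keep a normalised output and switch to NLLLoss on log-probabilities:
#     out = torch.log_softmax(out, dim=1)
#     criterion = torch.nn.NLLLoss()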

# --------------------- PREDICTION ---------------------

def convert_predictions(line):
    sum_predictions = np.sum([pred[1] for pred in line])
@@ -101,6 +164,8 @@ def convert_predictions(line):
        new_pred = math.floor(pred / sum_predictions * 100) / 100
        if (new_pred == 1.0):
            new_pred = 0.99
        elif (new_pred == 0.0):
            continue
        all_pred = all_pred + new_pred
        result = result + word + ":" + str(new_pred) + " "
    if (round(all_pred, 2) < 1):