07_neural fix
This commit is contained in:
parent
362a192af8
commit
0fc40ace22
@ -15,7 +15,7 @@ Perplexity hashed by
|
|||||||
- branch: master - Perplexity hashed on `dev-0`: 555.75
|
- branch: master - Perplexity hashed on `dev-0`: 555.75
|
||||||
<br><br>
|
<br><br>
|
||||||
2. Neuronowy model językowy (zadanie 7)
|
2. Neuronowy model językowy (zadanie 7)
|
||||||
- branch: 07_neural - Perplexity hashed on `dev-0`: xxx
|
- branch: 07_neural - Perplexity hashed on `dev-0`: 588.67
|
||||||
<br><br>
|
<br><br>
|
||||||
3. Model neuronowy rekurencyjny (zadanie 9)
|
3. Model neuronowy rekurencyjny (zadanie 9)
|
||||||
- branch: 09_neural - Perplexity hashed on `dev-0`: xxx
|
- branch: 09_neural - Perplexity hashed on `dev-0`: xxx
|
||||||
|
21038
dev-0/out.tsv
21038
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
591
main.ipynb
591
main.ipynb
File diff suppressed because one or more lines are too long
109
run.py
109
run.py
@ -9,6 +9,7 @@ import math
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
directory = "train/in.tsv.xz"
|
directory = "train/in.tsv.xz"
|
||||||
|
directory_expected = "train/expected.tsv"
|
||||||
directory_dev_0 = "dev-0/in.tsv.xz"
|
directory_dev_0 = "dev-0/in.tsv.xz"
|
||||||
directory_test_A = "test-A/in.tsv.xz"
|
directory_test_A = "test-A/in.tsv.xz"
|
||||||
|
|
||||||
@ -17,64 +18,92 @@ directory_test_A = "test-A/in.tsv.xz"
|
|||||||
n = 3
|
n = 3
|
||||||
device = torch.device("cuda")
|
device = torch.device("cuda")
|
||||||
|
|
||||||
batch_size = 512
|
batch_size = 1024
|
||||||
learning_rate = 0.004
|
learning_rate = 0.001
|
||||||
epochs = 1
|
epochs = 15
|
||||||
embedding_size = 64
|
embedding_size = 128
|
||||||
|
|
||||||
# --------------------- DATASET ---------------------
|
# --------------------- DATASET ---------------------
|
||||||
|
|
||||||
dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)
|
dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)
|
||||||
|
|
||||||
expectedList = pd.read_csv(directory, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)
|
expectedList = pd.read_csv(directory_expected, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)
|
||||||
|
|
||||||
DATASET = ""
|
DATASET = ""
|
||||||
|
count = n - 1
|
||||||
|
|
||||||
|
n_gram = []
|
||||||
|
|
||||||
for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):
|
for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):
|
||||||
|
dataframe = dataframe.reset_index()
|
||||||
dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True)
|
dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True)
|
||||||
|
|
||||||
left_text = dataframe['LeftContext'].to_list()
|
expected['Word'] = expected['Word'].apply(lambda x: [str(x).strip()])
|
||||||
right_text = dataframe['RightContext'].to_list()
|
word = expected['Word']
|
||||||
word = expected['Word'].to_list()
|
|
||||||
|
|
||||||
lines = zip(left_text, word, right_text)
|
# ------------------------------ LEFT ------------------------------
|
||||||
lines = list(map(lambda l: " ".join(l), lines))
|
# dataframe['LeftContext'] = dataframe['LeftContext'].apply(lambda x: re.split(r"\s+", x.strip())[-count:])
|
||||||
DATASET = DATASET + " ".join(lines)
|
# left_text = dataframe['LeftContext']
|
||||||
|
|
||||||
if(number == 15):
|
# lines = list(zip(left_text, word))
|
||||||
break
|
# lines = list(map(lambda l: l[0] + l[1], lines))
|
||||||
|
|
||||||
FINAL_DATASET = re.split(r"\s+", DATASET)
|
# ------------------------------ MIDDLE ------------------------------
|
||||||
print(FINAL_DATASET[:100])
|
dataframe['LeftContext'] = dataframe['LeftContext'].apply(lambda x: re.split(r"\s+", x.strip())[-math.floor(n/2):])
|
||||||
|
left_text = dataframe['LeftContext']
|
||||||
|
dataframe['RightContext'] = dataframe['RightContext'].apply(lambda x: re.split(r"\s+", x.strip())[:math.floor(n/2)])
|
||||||
|
right_text = dataframe['RightContext']
|
||||||
|
|
||||||
|
lines = list(zip(left_text, word, right_text))
|
||||||
|
lines = list(map(lambda l: l[0] + l[1] + l[2], lines))
|
||||||
|
|
||||||
|
# ------------------------------ END ------------------------------
|
||||||
|
|
||||||
|
n_gram.extend(lines)
|
||||||
|
print(n_gram[:100])
|
||||||
|
|
||||||
|
FINAL_DATASET = n_gram
|
||||||
|
|
||||||
# --------------------- TOKENIZE ---------------------
|
# --------------------- TOKENIZE ---------------------
|
||||||
|
|
||||||
FINAL_DATASET_TOKENIZED = []
|
FINAL_DATASET_TOKENIZED = []
|
||||||
tokenize_dict = bidict({})
|
tokenize_dict = bidict({})
|
||||||
token = 1
|
token = 1
|
||||||
for i, word in enumerate(FINAL_DATASET):
|
for i, n_words in enumerate(FINAL_DATASET):
|
||||||
if(word in tokenize_dict):
|
n_gram = []
|
||||||
FINAL_DATASET_TOKENIZED.append(tokenize_dict[word])
|
for j in range(n):
|
||||||
else:
|
if(n_words[j] in tokenize_dict):
|
||||||
tokenize_dict[word] = token
|
n_gram.append(tokenize_dict[n_words[j]])
|
||||||
FINAL_DATASET_TOKENIZED.append(token)
|
else:
|
||||||
token = token + 1
|
tokenize_dict[n_words[j]] = token
|
||||||
|
n_gram.append(token)
|
||||||
|
token = token + 1
|
||||||
|
FINAL_DATASET_TOKENIZED.append(n_gram)
|
||||||
|
|
||||||
|
|
||||||
# --------------------- N-GRAM & TENSORS ---------------------
|
# --------------------- N-GRAM & TENSORS ---------------------
|
||||||
|
|
||||||
ngram_list = list(nltk.ngrams(FINAL_DATASET_TOKENIZED, n=n))
|
# ngram_list = list(nltk.ngrams(FINAL_DATASET_TOKENIZED, n=n))
|
||||||
|
ngram_list = FINAL_DATASET_TOKENIZED
|
||||||
np.random.shuffle(ngram_list)
|
np.random.shuffle(ngram_list)
|
||||||
|
|
||||||
tensor_ngram = torch.tensor(ngram_list, device=device)
|
tensor_ngram = torch.tensor(ngram_list, device=device)
|
||||||
|
|
||||||
|
# ------------------------------ MIDDLE ------------------------------
|
||||||
X = torch.cat((tensor_ngram[:, :math.floor(n/2)], tensor_ngram[:, math.ceil(n/2):]), dim = 1).to(device)
|
X = torch.cat((tensor_ngram[:, :math.floor(n/2)], tensor_ngram[:, math.ceil(n/2):]), dim = 1).to(device)
|
||||||
Y = tensor_ngram[:, math.floor(n/2)].reshape(-1, 1).to(device)
|
Y = tensor_ngram[:, math.floor(n/2)].reshape(-1, 1).to(device)
|
||||||
|
|
||||||
|
# ------------------------------ LEFT ------------------------------
|
||||||
|
# X = tensor_ngram[:, :count].to(device)
|
||||||
|
# Y = tensor_ngram[:, count].reshape(-1, 1).to(device)
|
||||||
|
|
||||||
|
# ------------------------------ END ------------------------------
|
||||||
|
|
||||||
X_split = torch.split(X, batch_size)
|
X_split = torch.split(X, batch_size)
|
||||||
Y_split = torch.split(Y, batch_size)
|
Y_split = torch.split(Y, batch_size)
|
||||||
|
|
||||||
# vocab_size = len(tokenize_dict) + 1
|
vocab_size = len(tokenize_dict) + 1
|
||||||
vocab_size = 20000
|
# vocab_size = 50000
|
||||||
|
|
||||||
# --------------------- MODEL N-GRAM ---------------------
|
# --------------------- MODEL N-GRAM ---------------------
|
||||||
|
|
||||||
@ -95,14 +124,15 @@ class Model(torch.nn.Module):
|
|||||||
def forward(self, inputs):
|
def forward(self, inputs):
|
||||||
out = self.embedding(inputs)
|
out = self.embedding(inputs)
|
||||||
out = out.view(inputs.size(0), -1)
|
out = out.view(inputs.size(0), -1)
|
||||||
|
# out = torch.relu(out)
|
||||||
|
out = torch.softmax(out, dim=1)
|
||||||
out = self.linear(out)
|
out = self.linear(out)
|
||||||
out = torch.softmax(out, dim = 1)
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def train(self, input, output) -> None:
|
def train(self, input, output) -> None:
|
||||||
criterion = torch.nn.CrossEntropyLoss()
|
criterion = torch.nn.CrossEntropyLoss()
|
||||||
# optimizer = torch.optim.Adam(self.parameters(), lr = learning_rate)
|
optimizer = torch.optim.Adam(self.parameters(), lr = learning_rate)
|
||||||
optimizer = torch.optim.Adam(self.parameters())
|
# optimizer = torch.optim.Adam(self.parameters())
|
||||||
|
|
||||||
batch_list = list(zip(input, output))
|
batch_list = list(zip(input, output))
|
||||||
|
|
||||||
@ -121,8 +151,13 @@ class Model(torch.nn.Module):
|
|||||||
print("EPOCH: ", epoch, "LOSS: ", total_loss)
|
print("EPOCH: ", epoch, "LOSS: ", total_loss)
|
||||||
|
|
||||||
def predict(self, text_beggining:list, text_ending:list) -> list:
|
def predict(self, text_beggining:list, text_ending:list) -> list:
|
||||||
|
# ------------------------------ MIDDLE ------------------------------
|
||||||
text_beggining = text_beggining[-math.floor(n/2):]
|
text_beggining = text_beggining[-math.floor(n/2):]
|
||||||
text_ending = text_ending[:math.floor(n/2)]
|
text_ending = text_ending[:math.floor(n/2)]
|
||||||
|
# ------------------------------ LEFT ------------------------------
|
||||||
|
# text_ending = []
|
||||||
|
# text_beggining = text_beggining[-count:]
|
||||||
|
# ------------------------------ END ------------------------------
|
||||||
|
|
||||||
beginning = []
|
beginning = []
|
||||||
for word in text_beggining:
|
for word in text_beggining:
|
||||||
@ -141,7 +176,7 @@ class Model(torch.nn.Module):
|
|||||||
tensor_context = torch.tensor([beginning + ending]).to(device)
|
tensor_context = torch.tensor([beginning + ending]).to(device)
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
result = self(tensor_context)
|
result = self(tensor_context)
|
||||||
result_pred, result_tokens = torch.topk(result, 10)
|
result_pred, result_tokens = torch.topk(result, 20)
|
||||||
words = list(zip(result_tokens[0], result_pred[0]))
|
words = list(zip(result_tokens[0], result_pred[0]))
|
||||||
words = [(token_to_word(token), round(float(score), 2)) for token, score in words]
|
words = [(token_to_word(token), round(float(score), 2)) for token, score in words]
|
||||||
return words
|
return words
|
||||||
@ -150,9 +185,7 @@ class Model(torch.nn.Module):
|
|||||||
|
|
||||||
model = Model()
|
model = Model()
|
||||||
model.to(device)
|
model.to(device)
|
||||||
model.train(X_split[:2000], Y_split[:2000])
|
model.train(X_split, Y_split)
|
||||||
# 39607
|
|
||||||
# model.train(X_split, Y_split)
|
|
||||||
|
|
||||||
# --------------------- PREDICTION ---------------------
|
# --------------------- PREDICTION ---------------------
|
||||||
|
|
||||||
@ -160,14 +193,14 @@ def convert_predictions(line):
|
|||||||
sum_predictions = np.sum([pred[1] for pred in line])
|
sum_predictions = np.sum([pred[1] for pred in line])
|
||||||
result = ""
|
result = ""
|
||||||
all_pred = 0
|
all_pred = 0
|
||||||
for word, pred in line:
|
for word, pred in line[:10]:
|
||||||
new_pred = math.floor(pred / sum_predictions * 100) / 100
|
new_pred = math.floor(pred / sum_predictions * 100) / 100
|
||||||
if(new_pred == 1.0):
|
if(new_pred == 1.0):
|
||||||
new_pred = 0.99
|
new_pred = 0.99
|
||||||
elif(new_pred == 0.0):
|
elif(new_pred == 0.0):
|
||||||
continue
|
continue
|
||||||
all_pred = all_pred + new_pred
|
all_pred = all_pred + new_pred
|
||||||
result = result + word + ":" + str(new_pred) + " "
|
result = result + str(word) + ":" + str(new_pred) + " "
|
||||||
if(round(all_pred, 2) < 1):
|
if(round(all_pred, 2) < 1):
|
||||||
result = result + ":" + str(round(1 - all_pred, 2))
|
result = result + ":" + str(round(1 - all_pred, 2))
|
||||||
else:
|
else:
|
||||||
@ -183,8 +216,8 @@ left_text = dataframe['LeftContext'].apply(lambda l: re.split(r"\s+", l)).to_lis
|
|||||||
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
|
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
|
||||||
|
|
||||||
lines = zip(left_text, right_text)
|
lines = zip(left_text, right_text)
|
||||||
lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))
|
lines = list(map(lambda l: model.predict(l[0], l[1]), tqdm(lines)))
|
||||||
print(lines[:100])
|
print(lines[:40])
|
||||||
|
|
||||||
with open("dev-0/out.tsv", "w", encoding="UTF-8") as file:
|
with open("dev-0/out.tsv", "w", encoding="UTF-8") as file:
|
||||||
result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
|
result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
|
||||||
@ -200,8 +233,8 @@ left_text = dataframe['LeftContext'].apply(lambda l: re.split(r"\s+", l)).to_lis
|
|||||||
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
|
right_text = dataframe['RightContext'].apply(lambda l: re.split(r"\s+", l)).to_list()
|
||||||
|
|
||||||
lines = zip(left_text, right_text)
|
lines = zip(left_text, right_text)
|
||||||
lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))
|
lines = list(map(lambda l: model.predict(l[0], l[1]), tqdm(lines)))
|
||||||
print(lines[:100])
|
print(lines[:40])
|
||||||
|
|
||||||
with open("test-A/out.tsv", "w", encoding="UTF-8") as file:
|
with open("test-A/out.tsv", "w", encoding="UTF-8") as file:
|
||||||
result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
|
result = "\n".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))
|
||||||
|
14828
test-A/out.tsv
14828
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user