Compare commits

..

4 Commits

Author SHA1 Message Date
Marcin Czerniak
0c45624062 Perplexity: 422 2024-05-22 01:51:51 +02:00
a2d183f2e3 Score: 6.04 2024-05-18 22:57:13 +02:00
Marcin Czerniak
337d2ffc42 PerplexityHashed: 990 2024-05-16 01:26:17 +02:00
Marcin Czerniak
d380959afc Neural network 2024-05-15 04:51:48 +02:00
4 changed files with 12330 additions and 10519 deletions

File diff suppressed because it is too large Load Diff

248
run.py Normal file
View File

@ -0,0 +1,248 @@
# %% [markdown]
# # <b>Trigram</b> neural network model for gap fill task
# %% [markdown]
# ## Import required packages
# %%
from tqdm import tqdm
import re
import nltk
import os
import csv
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import sys
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from bidict import bidict
import math
from sklearn.utils import shuffle
from collections import Counter
import random
# %%
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'
# %% [markdown]
# ## Global configuration variables
# %%
vocab_size = 60_000
batch_size = 64
embedding_dim = 64
hidden_dim = 1024
learning_rate = 0.001
epochs = 20
output_size = vocab_size
# %%
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)
# %% [markdown]
# ## Load train data corpus
# %%
dataset_dir = os.path.join('..', 'train', 'in.tsv.xz')
expected_dir = os.path.join('..', 'train', 'expected.tsv')
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
input_corpus = []
target_corpus = []
left_tokens = 1
right_tokens = 1
for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=433):
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
target_corpus.append([str(word).strip()])
input_corpus.append(re.split(r"\s+", left_context.strip())[-left_tokens:] + re.split(r"\s+", right_context.strip())[:right_tokens])
# %% [markdown]
# ## Create dictionaries for mapping words to indices
# %%
def flatten(matrix):
flat_list = []
for row in matrix:
flat_list += row
return flat_list
# %%
word_to_ix = bidict({})
words_corpus = flatten(input_corpus) + flatten(target_corpus)
counts = Counter(words_corpus)
for word, _ in tqdm(counts.most_common(vocab_size - 1)):
if word not in word_to_ix:
word_to_ix[word] = len(word_to_ix) + 1
# %% [markdown]
# ## Tokenize entire corpus
# %%
def tokenize(w):
if w in word_to_ix:
return word_to_ix[w]
else:
return 0
tokenized_input_corpus = []
tokenized_target_corpus = []
for words in tqdm(input_corpus):
tokenized_input_corpus.append([tokenize(word) for word in words])
for words in tqdm(target_corpus):
tokenized_target_corpus.append([tokenize(word) for word in words])
# %%
tokenized_input_corpus, tokenized_target_corpus = shuffle(tokenized_input_corpus, tokenized_target_corpus)
# %% [markdown]
# ## Create dataset
# %%
indices = np.nonzero(np.array(tokenized_target_corpus).flatten())
tokenized_input_corpus = np.take(tokenized_input_corpus, indices, axis=0)
tokenized_target_corpus = np.take(tokenized_target_corpus, indices, axis=0)
# %%
input_corpus_tensor = torch.flatten(torch.tensor(tokenized_input_corpus, dtype=torch.long, device=device), end_dim=-2)
target_corpus_tensor = torch.flatten(torch.tensor(tokenized_target_corpus, dtype=torch.long, device=device)).reshape(-1, 1)
# %%
print(input_corpus_tensor.size())
print(target_corpus_tensor.size())
# %%
random_index = random.randint(0, len(input_corpus_tensor) - 1)
# Get random element from input corpus
random_input_element = input_corpus_tensor[random_index]
# Get corresponding element from target corpus
random_target_element = target_corpus_tensor[random_index]
print([word_to_ix.inverse[int(idx)] if int(idx) > 0 else '<UNK>' for idx in random_input_element])
print([word_to_ix.inverse[int(idx)] if int(idx) > 0 else '<UNK>' for idx in random_target_element])
# %%
dataset = TensorDataset(input_corpus_tensor[:10_000], target_corpus_tensor[:10_000])
# %%
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# %% [markdown]
# ## Define the trigram neural network model
# %%
class TrigramNN(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
super(TrigramNN, self).__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.linear1 = nn.Linear(embedding_dim * (left_tokens + right_tokens), hidden_dim)
self.linear2 = nn.Linear(hidden_dim, output_size)
def forward(self, inputs):
out = self.embedding(inputs)
out = out.view(inputs.size(0), -1)
out = torch.softmax(self.linear1(out), dim=1)
out = self.linear2(out)
return out
# %% [markdown]
# ## Initialize the model, loss function, and optimizer
# %%
model = TrigramNN(vocab_size, embedding_dim, hidden_dim, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
# %% [markdown]
# ## Training loop
# %%
model.to(device)
for epoch in range(epochs):
total_loss = 0
for batch_inputs, batch_targets in tqdm(dataloader):
batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)
model.zero_grad()
output = model(batch_inputs)
loss = criterion(output, batch_targets.view(-1))
total_loss += loss.item()
loss.backward()
optimizer.step()
print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}")
# %% [markdown]
# ## Write function to convert index to word
# %%
def idx_to_word(idx):
idx = int(idx)
if idx not in word_to_ix.inverse:
return '<UNK>'
return word_to_ix.inverse[idx]
# %% [markdown]
# ## test the model
# %%
def predict(left_context, right_context):
with torch.no_grad():
context = left_context + right_context
test_context_idxs = torch.tensor([[tokenize(x) for x in context]], device=device)
output = model(test_context_idxs)
top_predicted_scores, top_predicted_indices = torch.topk(output, 5)
predictions = list(zip(top_predicted_scores[0], top_predicted_indices[0]))
predictions = [(float(score), idx_to_word(idx)) for score, idx in predictions]
total_score = np.sum([score for score, _ in predictions])
predictions = ' '.join([f"{word}:{score}" for score, word in predictions]) + ' :' + str(1.0 - total_score)
return predictions
# %%
print(predict(["came", "fiom"], []))
# %% [markdown]
# # Generate result for dev dataset
# %%
dataset_dir = os.path.join('..', 'dev-0', 'in.tsv.xz')
output_dir = os.path.join('..', 'dev-0', 'out.tsv')
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
# %%
final = ""
for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
left_context = re.split(r"\s+", row['LeftContext'].strip())[-left_tokens:]
right_context = re.split(r"\s+", row['RightContext'].strip())[:right_tokens]
final += predict(left_context, right_context) + '\n'
with open(output_dir, 'w', encoding="UTF-8") as f:
f.write(final)

824
src/07_trigram_neural.ipynb Normal file
View File

@ -0,0 +1,824 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <b>Trigram</b> neural network model for gap fill task"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import required packages"
]
},
{
"cell_type": "code",
"execution_count": 414,
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"import re\n",
"import nltk\n",
"import os\n",
"import csv\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import sys\n",
"import numpy as np\n",
"from torch.utils.data import DataLoader, TensorDataset\n",
"from bidict import bidict\n",
"import math\n",
"from sklearn.utils import shuffle\n",
"from collections import Counter\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 415,
"metadata": {},
"outputs": [],
"source": [
"os.environ['CUDA_LAUNCH_BLOCKING'] = '1'\n",
"os.environ['TORCH_USE_CUDA_DSA'] = '1'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Global configuration variables"
]
},
{
"cell_type": "code",
"execution_count": 416,
"metadata": {},
"outputs": [],
"source": [
"vocab_size = 60_000\n",
"batch_size = 64\n",
"embedding_dim = 64\n",
"hidden_dim = 1024\n",
"learning_rate = 0.001\n",
"epochs = 20\n",
"\n",
"output_size = vocab_size"
]
},
{
"cell_type": "code",
"execution_count": 417,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cpu\n"
]
}
],
"source": [
"# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"device = torch.device(\"cpu\")\n",
"print(device)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load train data corpus"
]
},
{
"cell_type": "code",
"execution_count": 418,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/433 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 433/433 [01:03<00:00, 6.77it/s]\n"
]
}
],
"source": [
"dataset_dir = os.path.join('..', 'train', 'in.tsv.xz')\n",
"expected_dir = os.path.join('..', 'train', 'expected.tsv')\n",
"\n",
"df = pd.read_csv(dataset_dir, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)\n",
"expected_df = pd.read_csv(expected_dir, sep='\\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)\n",
"\n",
"\n",
"input_corpus = []\n",
"target_corpus = []\n",
"\n",
"left_tokens = 1\n",
"right_tokens = 1\n",
"\n",
"for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=433):\n",
" df = df.replace(r'\\\\r+|\\\\n+|\\\\t+', ' ', regex=True)\n",
" \n",
" for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):\n",
" target_corpus.append([str(word).strip()])\n",
" input_corpus.append(re.split(r\"\\s+\", left_context.strip())[-left_tokens:] + re.split(r\"\\s+\", right_context.strip())[:right_tokens])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create dictionaries for mapping words to indices"
]
},
{
"cell_type": "code",
"execution_count": 419,
"metadata": {},
"outputs": [],
"source": [
"def flatten(matrix):\n",
" flat_list = []\n",
" for row in matrix:\n",
" flat_list += row\n",
" return flat_list"
]
},
{
"cell_type": "code",
"execution_count": 420,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 59999/59999 [00:00<00:00, 131034.12it/s]\n"
]
}
],
"source": [
"word_to_ix = bidict({})\n",
"words_corpus = flatten(input_corpus) + flatten(target_corpus)\n",
"\n",
"counts = Counter(words_corpus)\n",
"\n",
"for word, _ in tqdm(counts.most_common(vocab_size - 1)):\n",
" if word not in word_to_ix:\n",
" word_to_ix[word] = len(word_to_ix) + 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokenize entire corpus"
]
},
{
"cell_type": "code",
"execution_count": 421,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 432022/432022 [00:01<00:00, 255044.26it/s]\n",
"100%|██████████| 432022/432022 [00:01<00:00, 348618.53it/s]\n"
]
}
],
"source": [
"def tokenize(w):\n",
" if w in word_to_ix:\n",
" return word_to_ix[w]\n",
" else:\n",
" return 0\n",
"\n",
"tokenized_input_corpus = []\n",
"tokenized_target_corpus = []\n",
"\n",
"for words in tqdm(input_corpus):\n",
" tokenized_input_corpus.append([tokenize(word) for word in words])\n",
"\n",
"for words in tqdm(target_corpus):\n",
" tokenized_target_corpus.append([tokenize(word) for word in words])"
]
},
{
"cell_type": "code",
"execution_count": 422,
"metadata": {},
"outputs": [],
"source": [
"tokenized_input_corpus, tokenized_target_corpus = shuffle(tokenized_input_corpus, tokenized_target_corpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create dataset"
]
},
{
"cell_type": "code",
"execution_count": 423,
"metadata": {},
"outputs": [],
"source": [
"indices = np.nonzero(np.array(tokenized_target_corpus).flatten())\n",
"\n",
"tokenized_input_corpus = np.take(tokenized_input_corpus, indices, axis=0)\n",
"tokenized_target_corpus = np.take(tokenized_target_corpus, indices, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 424,
"metadata": {},
"outputs": [],
"source": [
"input_corpus_tensor = torch.flatten(torch.tensor(tokenized_input_corpus, dtype=torch.long, device=device), end_dim=-2)\n",
"target_corpus_tensor = torch.flatten(torch.tensor(tokenized_target_corpus, dtype=torch.long, device=device)).reshape(-1, 1)"
]
},
{
"cell_type": "code",
"execution_count": 425,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([389892, 2])\n",
"torch.Size([389892, 1])\n"
]
}
],
"source": [
"print(input_corpus_tensor.size())\n",
"print(target_corpus_tensor.size())"
]
},
{
"cell_type": "code",
"execution_count": 426,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['end', 'the']\n",
"['of']\n"
]
}
],
"source": [
"random_index = random.randint(0, len(input_corpus_tensor) - 1)\n",
"\n",
"# Get random element from input corpus\n",
"random_input_element = input_corpus_tensor[random_index]\n",
"\n",
"# Get corresponding element from target corpus\n",
"random_target_element = target_corpus_tensor[random_index]\n",
"\n",
"print([word_to_ix.inverse[int(idx)] if int(idx) > 0 else '<UNK>' for idx in random_input_element])\n",
"print([word_to_ix.inverse[int(idx)] if int(idx) > 0 else '<UNK>' for idx in random_target_element])"
]
},
{
"cell_type": "code",
"execution_count": 427,
"metadata": {},
"outputs": [],
"source": [
"dataset = TensorDataset(input_corpus_tensor[:10_000], target_corpus_tensor[:10_000])"
]
},
{
"cell_type": "code",
"execution_count": 428,
"metadata": {},
"outputs": [],
"source": [
"dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the trigram neural network model"
]
},
{
"cell_type": "code",
"execution_count": 429,
"metadata": {},
"outputs": [],
"source": [
"class TrigramNN(nn.Module):\n",
" def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):\n",
" super(TrigramNN, self).__init__()\n",
" self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
" self.linear1 = nn.Linear(embedding_dim * (left_tokens + right_tokens), hidden_dim)\n",
" self.linear2 = nn.Linear(hidden_dim, output_size)\n",
" \n",
" def forward(self, inputs):\n",
" out = self.embedding(inputs)\n",
" out = out.view(inputs.size(0), -1)\n",
" out = torch.softmax(self.linear1(out), dim=1)\n",
" out = self.linear2(out)\n",
" return out"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize the model, loss function, and optimizer"
]
},
{
"cell_type": "code",
"execution_count": 430,
"metadata": {},
"outputs": [],
"source": [
"model = TrigramNN(vocab_size, embedding_dim, hidden_dim, output_size)\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=learning_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training loop"
]
},
{
"cell_type": "code",
"execution_count": 431,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:32<00:00, 4.81it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: 10.999195001687214\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:32<00:00, 4.86it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 2, Loss: 10.997720451112006\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:32<00:00, 4.88it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 3, Loss: 10.99624701214444\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.17it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 4, Loss: 10.994744385883306\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.21it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 5, Loss: 10.993266263585182\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.22it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 6, Loss: 10.991843545512788\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:31<00:00, 4.92it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 7, Loss: 10.990350304135852\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:28<00:00, 5.60it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 8, Loss: 10.988877800619527\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:32<00:00, 4.81it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 9, Loss: 10.987337306806236\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:29<00:00, 5.32it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 10, Loss: 10.985873113012618\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.13it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 11, Loss: 10.98438450637137\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:28<00:00, 5.45it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 12, Loss: 10.9829175548189\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.11it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 13, Loss: 10.981461263765954\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.08it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 14, Loss: 10.97996347269435\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.22it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 15, Loss: 10.978485234983408\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:31<00:00, 4.98it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 16, Loss: 10.977057912547117\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.23it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 17, Loss: 10.97553843601494\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:29<00:00, 5.34it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 18, Loss: 10.974108489455691\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:32<00:00, 4.82it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 19, Loss: 10.972679308265638\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 157/157 [00:30<00:00, 5.23it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 20, Loss: 10.971182902147815\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"model.to(device)\n",
"\n",
"for epoch in range(epochs):\n",
" total_loss = 0\n",
" for batch_inputs, batch_targets in tqdm(dataloader):\n",
" batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)\n",
" \n",
" model.zero_grad()\n",
" output = model(batch_inputs)\n",
"\n",
" loss = criterion(output, batch_targets.view(-1))\n",
" total_loss += loss.item()\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" print(f\"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Write function to convert index to word"
]
},
{
"cell_type": "code",
"execution_count": 432,
"metadata": {},
"outputs": [],
"source": [
"def idx_to_word(idx):\n",
" idx = int(idx)\n",
" if idx not in word_to_ix.inverse:\n",
" return '<UNK>'\n",
" return word_to_ix.inverse[idx]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## test the model"
]
},
{
"cell_type": "code",
"execution_count": 433,
"metadata": {},
"outputs": [],
"source": [
"def predict(left_context, right_context):\n",
" with torch.no_grad():\n",
" context = left_context + right_context\n",
" test_context_idxs = torch.tensor([[tokenize(x) for x in context]], device=device)\n",
" output = model(test_context_idxs)\n",
" top_predicted_scores, top_predicted_indices = torch.topk(output, 5)\n",
" predictions = list(zip(top_predicted_scores[0], top_predicted_indices[0]))\n",
" predictions = [(float(score), idx_to_word(idx)) for score, idx in predictions]\n",
" total_score = np.sum([score for score, _ in predictions])\n",
" predictions = ' '.join([f\"{word}:{score}\" for score, word in predictions]) + ' :' + str(1.0 - total_score)\n",
" return predictions"
]
},
{
"cell_type": "code",
"execution_count": 434,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"the:0.210836723446846 of:0.13834647834300995 and:0.11819174885749817 to:0.09819918870925903 a:0.0662047415971756 :0.36822111904621124\n"
]
}
],
"source": [
"print(predict([\"came\", \"fiom\"], []))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate result for dev dataset"
]
},
{
"cell_type": "code",
"execution_count": 435,
"metadata": {},
"outputs": [],
"source": [
"dataset_dir = os.path.join('..', 'dev-0', 'in.tsv.xz')\n",
"output_dir = os.path.join('..', 'dev-0', 'out.tsv')\n",
"\n",
"df = pd.read_csv(dataset_dir, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)\n",
"df = df.replace(r'\\\\r+|\\\\n+|\\\\t+', ' ', regex=True)"
]
},
{
"cell_type": "code",
"execution_count": 436,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10519/10519 [02:25<00:00, 72.19it/s]\n"
]
}
],
"source": [
"final = \"\"\n",
"\n",
"for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):\n",
" left_context = re.split(r\"\\s+\", row['LeftContext'].strip())[-left_tokens:]\n",
" right_context = re.split(r\"\\s+\", row['RightContext'].strip())[:right_tokens]\n",
"\n",
" final += predict(left_context, right_context) + '\\n'\n",
"\n",
"with open(output_dir, 'w', encoding=\"UTF-8\") as f:\n",
" f.write(final)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "p311-cu121",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

739
src/08_word2vec.ipynb Normal file
View File

@ -0,0 +1,739 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <b>Trigram</b> neural network model for gap fill task"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import required packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Marcin\\.conda\\envs\\p311-cu121\\Lib\\site-packages\\torchtext\\vocab\\__init__.py:4: UserWarning: \n",
"/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n",
"Torchtext is deprecated and the last released version will be 0.18 (this one). You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n",
" warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n",
"c:\\Users\\Marcin\\.conda\\envs\\p311-cu121\\Lib\\site-packages\\torchtext\\utils.py:4: UserWarning: \n",
"/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n",
"Torchtext is deprecated and the last released version will be 0.18 (this one). You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n",
" warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"import re\n",
"import os\n",
"import csv\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import numpy as np\n",
"from torch.utils.data import DataLoader, TensorDataset\n",
"from sklearn.utils import shuffle\n",
"import random\n",
"from torchtext.vocab import build_vocab_from_iterator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"os.environ['CUDA_LAUNCH_BLOCKING'] = '1'\n",
"os.environ['TORCH_USE_CUDA_DSA'] = '1'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Global configuration variables"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"vocab_size = 40_000\n",
"batch_size = 256\n",
"embedding_dim = 128\n",
"hidden_dim = 1024\n",
"learning_rate = 0.001\n",
"epochs = 5\n",
"\n",
"output_size = vocab_size"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cuda\n"
]
}
],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"# device = torch.device(\"cpu\")\n",
"print(device)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build vocabulary"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def get_word_lines_from_dataset():\n",
" dataset_dir = os.path.join('..', 'train', 'in.tsv.xz')\n",
" expected_dir = os.path.join('..', 'train', 'expected.tsv')\n",
"\n",
" df = pd.read_csv(dataset_dir, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)\n",
" expected_df = pd.read_csv(expected_dir, sep='\\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)\n",
"\n",
" for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=433):\n",
" df = df.replace(r'\\\\r+|\\\\n+|\\\\t+', ' ', regex=True)\n",
" \n",
" for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):\n",
" yield re.split(r\"\\s+\", left_context.strip()) + [str(word).strip()] + re.split(r\"\\s+\", right_context.strip())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/433 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 433/433 [01:34<00:00, 4.57it/s]\n"
]
}
],
"source": [
"vocab = build_vocab_from_iterator(\n",
" get_word_lines_from_dataset(),\n",
" max_tokens = vocab_size,\n",
" specials = ['<unk>'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"vocab.set_default_index(vocab['<unk>'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<unk>', 'the', 'of', 'houses']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.lookup_tokens([0, 1, 2, 1245])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/433 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 433/433 [01:08<00:00, 6.31it/s]\n"
]
}
],
"source": [
"dataset_dir = os.path.join('..', 'train', 'in.tsv.xz')\n",
"expected_dir = os.path.join('..', 'train', 'expected.tsv')\n",
"\n",
"df = pd.read_csv(dataset_dir, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)\n",
"expected_df = pd.read_csv(expected_dir, sep='\\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)\n",
"\n",
"\n",
"input_corpus = []\n",
"target_corpus = []\n",
"\n",
"left_tokens = 2\n",
"right_tokens = 2\n",
"\n",
"for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=433):\n",
" df = df.replace(r'\\\\r+|\\\\n+|\\\\t+', ' ', regex=True)\n",
" \n",
" for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):\n",
" target_corpus.append([str(word).strip()])\n",
" input_corpus.append(re.split(r\"\\s+\", left_context.strip())[-left_tokens:] + re.split(r\"\\s+\", right_context.strip())[:right_tokens])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokenize entire corpus"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 432022/432022 [00:02<00:00, 172153.99it/s]\n",
"100%|██████████| 432022/432022 [00:01<00:00, 316818.82it/s]\n"
]
}
],
"source": [
"def tokenize(w):\n",
" return vocab[w]\n",
" \n",
"def detokenize(t):\n",
" return vocab.lookup_tokens([t])[0]\n",
"\n",
"tokenized_input_corpus = []\n",
"tokenized_target_corpus = []\n",
"\n",
"for words in tqdm(input_corpus):\n",
" tokenized_input_corpus.append([tokenize(word) for word in words])\n",
"\n",
"for words in tqdm(target_corpus):\n",
" tokenized_target_corpus.append([tokenize(word) for word in words])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"tokenized_input_corpus, tokenized_target_corpus = shuffle(tokenized_input_corpus, tokenized_target_corpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create dataset"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"indices = np.nonzero(np.array(tokenized_target_corpus).flatten())\n",
"\n",
"tokenized_input_corpus = np.take(tokenized_input_corpus, indices, axis=0)\n",
"tokenized_target_corpus = np.take(tokenized_target_corpus, indices, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"input_corpus_tensor = torch.flatten(torch.tensor(tokenized_input_corpus, dtype=torch.long, device=device), end_dim=-2)\n",
"target_corpus_tensor = torch.flatten(torch.tensor(tokenized_target_corpus, dtype=torch.long, device=device)).reshape(-1, 1)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([378666, 4])\n",
"torch.Size([378666, 1])\n"
]
}
],
"source": [
"print(input_corpus_tensor.size())\n",
"print(target_corpus_tensor.size())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['a', 'charming', 'woman', 'would']\n",
"['young']\n"
]
}
],
"source": [
"random_index = random.randint(0, len(input_corpus_tensor) - 1)\n",
"\n",
"# Get random element from input corpus\n",
"random_input_element = input_corpus_tensor[random_index]\n",
"\n",
"# Get corresponding element from target corpus\n",
"random_target_element = target_corpus_tensor[random_index]\n",
"\n",
"print([detokenize(int(idx)) for idx in random_input_element])\n",
"print([detokenize(int(idx)) for idx in random_target_element])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"dataset = TensorDataset(input_corpus_tensor, target_corpus_tensor)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the trigram neural network model"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"class NGramNN(nn.Module):\n",
" def __init__(self, vocab_size, embedding_dim):\n",
" super(NGramNN, self).__init__()\n",
" self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
" self.linear = nn.Linear(embedding_dim * (left_tokens + right_tokens), vocab_size)\n",
" \n",
" def forward(self, inputs):\n",
" out = self.embedding(inputs)\n",
" out = out.view(inputs.size(0), -1)\n",
" out = self.linear(out)\n",
" # out = torch.softmax(out, dim=1)\n",
" return out"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize the model, loss function, and optimizer"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Marcin\\.conda\\envs\\p311-cu121\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"model = NGramNN(vocab_size, embedding_dim)\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=learning_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training loop"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/1480 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 45.24it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: 7.488175111847955\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 45.62it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 2, Loss: 5.083534079629022\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 44.91it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 3, Loss: 3.8214319522316393\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:33<00:00, 44.65it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 4, Loss: 3.1464366490776476\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 45.94it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 5, Loss: 2.743303858589482\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 46.07it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 6, Loss: 2.456264268949225\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 45.59it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 7, Loss: 2.2358319317972337\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 46.15it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 8, Loss: 2.0536118873067806\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 45.90it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 9, Loss: 1.89841981795994\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1480/1480 [00:32<00:00, 44.89it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 10, Loss: 1.7637179977990485\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"model.to(device)\n",
"\n",
"for epoch in range(epochs):\n",
" total_loss = 0\n",
" for batch_inputs, batch_targets in tqdm(dataloader):\n",
" batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)\n",
" \n",
" model.zero_grad()\n",
" output = model(batch_inputs)\n",
"\n",
" loss = criterion(output, batch_targets.view(-1))\n",
" total_loss += loss.item()\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" print(f\"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## test the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def predict(left_context, right_context):\n",
" with torch.no_grad():\n",
" context = left_context + right_context\n",
" test_context_idxs = torch.tensor([[tokenize(x) for x in context]], device=device)\n",
" output = model(test_context_idxs)\n",
" top_predicted_scores, top_predicted_indices = torch.topk(output, vocab_size)\n",
"\n",
" top_predicted_scores = np.array(top_predicted_scores[0].cpu())\n",
" top_predicted_indices = top_predicted_indices[0]\n",
"\n",
" top_predicted_scores = top_predicted_scores[top_predicted_scores > 0]\n",
" top_predicted_indices = top_predicted_indices[:len(top_predicted_scores)]\n",
"\n",
" total_score = np.sum([score for score in top_predicted_scores[:20]])\n",
"\n",
" predictions = list(zip(top_predicted_scores, top_predicted_indices))\n",
" predictions = [(round(float(score), 2), detokenize(idx)) for score, idx in predictions[:10]]\n",
" \n",
" words = [word for _, word in predictions]\n",
" scores = [round(score/total_score, 2) for score, _ in predictions]\n",
"\n",
" remaining_score = round(1.0 - np.sum(scores), 2)\n",
"\n",
" predictions = ' '.join([f\"{word}:{score}\" for score, word in zip(scores, words)]) + ' :' + str(remaining_score)\n",
"\n",
" return predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"the:0.07 his:0.06 a:0.06 their:0.06 John:0.06 tho:0.05 he:0.05 its:0.05 and:0.05 my:0.05 :0.44\n",
"to:0.06 a:0.06 the:0.06 and:0.06 in:0.05 when:0.05 of:0.05 up:0.05 such:0.05 for:0.05 :0.46\n"
]
}
],
"source": [
"print(predict([\"came\", \"fiom\"], [\"26th\", \"place\"]))\n",
"print(predict([\"will\", \"buy\"], [\"telephone\", \"and\"]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate result for dev dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_dir = os.path.join('..', 'dev-0', 'in.tsv.xz')\n",
"output_dir = os.path.join('..', 'dev-0', 'out.tsv')\n",
"\n",
"df = pd.read_csv(dataset_dir, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)\n",
"df = df.replace(r'\\\\r+|\\\\n+|\\\\t+', ' ', regex=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10519/10519 [00:50<00:00, 206.40it/s]\n"
]
}
],
"source": [
"final = \"\"\n",
"\n",
"for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):\n",
" left_context = re.split(r\"\\s+\", row['LeftContext'].strip())[-left_tokens:]\n",
" right_context = re.split(r\"\\s+\", row['RightContext'].strip())[:right_tokens]\n",
"\n",
" final += predict(left_context, right_context) + '\\n'\n",
"\n",
"with open(output_dir, 'w', encoding=\"UTF-8\") as f:\n",
" f.write(final)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "p311-cu121",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}