Neural network

parent 0734c5d906
commit d380959afc

21038  dev-0/out.tsv
File diff suppressed because it is too large

202  run.py (new file)
@@ -0,0 +1,202 @@
# %% [markdown]
# # <b>Trigram</b> neural network model for the gap-fill task

# %% [markdown]
# ## Import required packages

# %%
from collections import Counter

import csv
import os
import re

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from bidict import bidict
from tqdm import tqdm

# %% [markdown]
# ## Load train data corpus

# %%
dataset_dir = os.path.join('..', 'train', 'in.tsv.xz')
expected_dir = os.path.join('..', 'train', 'expected.tsv')

df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)

corpus = []
for df_chunk, expected_chunk in tqdm(zip(df, expected_df), total=433):
    # The corpus stores line breaks and tabs as literal escape sequences
    # (a backslash followed by r/n/t), which is what this pattern strips.
    df_chunk = df_chunk.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)

    for left_context, word, right_context in zip(df_chunk['LeftContext'].to_list(), expected_chunk['Word'].to_list(), df_chunk['RightContext'].to_list()):
        corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))

# %% [markdown]
# ## Create dictionaries for mapping words to indices

# %%
word_to_ix = bidict({})
counts = Counter(corpus)

# Index 0 is reserved for out-of-vocabulary words, so known words start at 1.
for word, _ in tqdm(counts.most_common(1_500_000)):
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix) + 1

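# %% [markdown]
# Quick illustration (not part of the pipeline): a `bidict` keeps the forward
# and inverse mappings in sync, which `tokenize` and `idx_to_word` below rely
# on. The words here are made up for the example.

# %%
_example = bidict({'the': 1, 'of': 2})
assert _example['the'] == 1         # word -> index
assert _example.inverse[2] == 'of'  # index -> word
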
# %% [markdown]
# ## Tokenize entire corpus

# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def tokenize(w):
    # Known words map to their index; everything else maps to the OOV index 0.
    if w in word_to_ix:
        return word_to_ix[w]
    return 0

tokenized_corpus = [tokenize(word) for word in tqdm(corpus)]

# %% [markdown]
# ## Create n-grams

# %%
# Build 7-grams: a center word with three context words on each side.
ngrams = list(nltk.ngrams(tokenized_corpus, n=7))
np.random.shuffle(ngrams)
ngrams = ngrams[:100_000]
ngrams_tensor = torch.tensor(ngrams, dtype=torch.long, device=device)

# Discard any n-gram that contains the OOV index 0.
indices = torch.any(ngrams_tensor == 0, dim=1)
ngrams_tensor = ngrams_tensor[~indices]

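# %% [markdown]
# For reference (illustrative only): `nltk.ngrams` slides a fixed-size window
# over the sequence, so each 7-gram is a center token with three neighbours on
# either side.

# %%
_toy = list(nltk.ngrams([1, 2, 3, 4, 5, 6, 7, 8], n=7))
assert _toy[0] == (1, 2, 3, 4, 5, 6, 7)  # center token is _toy[0][3]
assert _toy[1] == (2, 3, 4, 5, 6, 7, 8)
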
# %%
# The center word (position 3) is the target; the six surrounding words form the input.
target_tensor = ngrams_tensor[:, 3].reshape(-1, 1).to(device)
input_tensor = torch.cat((ngrams_tensor[:, :3], ngrams_tensor[:, 4:]), dim=1).to(device)

# %%
batched_input_tensor = torch.split(input_tensor, 512)
batched_target_tensor = torch.split(target_tensor, 512)

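# %% [markdown]
# A small sketch of what `torch.split` does here (illustrative): it chunks the
# first dimension into batches of at most 512 rows, so the final batch may be
# smaller.

# %%
_chunks = torch.split(torch.arange(10).reshape(5, 2), 2)
assert [c.shape[0] for c in _chunks] == [2, 2, 1]
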
# %% [markdown]
# ## Define the trigram neural network model

# %%
class TrigramNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Six context words (three per side), each embedded into embedding_dim.
        self.linear1 = nn.Linear(embedding_dim * 6, output_size)

    def forward(self, inputs):
        out = self.embedding(inputs)
        out = out.view(inputs.size(0), -1)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
        # so an extra softmax here would cripple the gradients.
        return self.linear1(out)

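# %% [markdown]
# A minimal shape check (illustrative, with a made-up tiny vocabulary): the
# model maps a batch of six context indices to one logit per vocabulary entry.

# %%
_tiny = TrigramNN(vocab_size=10, embedding_dim=4, output_size=10)
_logits = _tiny(torch.tensor([[1, 2, 3, 4, 5, 6]]))
assert _logits.shape == (1, 10)
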
# %% [markdown]
# ## Define training parameters

# %%
batch_size = 512
vocab_size = len(word_to_ix) + 1  # +1 for the OOV index 0
embedding_dim = 50
output_size = vocab_size
learning_rate = 0.005
epochs = 1

# %% [markdown]
# ## Initialize the model, loss function, and optimizer

# %%
model = TrigramNN(vocab_size, embedding_dim, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# %% [markdown]
# ## Training loop

# %%
model.to(device)

batches = list(zip(batched_input_tensor, batched_target_tensor))

for epoch in range(epochs):
    total_loss = 0
    for batch_inputs, batch_targets in tqdm(batches):
        optimizer.zero_grad()
        output = model(batch_inputs)

        # CrossEntropyLoss expects a 1-D tensor of class indices as the target.
        loss = criterion(output, batch_targets.view(-1))
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(batches)}")

# %% [markdown]
# ## Write function to convert index to word

# %%
def idx_to_word(idx):
    idx = int(idx)
    if idx not in word_to_ix.inverse:
        return '<UNK>'
    return word_to_ix.inverse[idx]

# %% [markdown]
# ## Test the model

# %%
def predict(left_context, right_context):
    with torch.no_grad():
        context = left_context + right_context
        test_context_idxs = torch.tensor([[tokenize(x) for x in context]], device=device)
        # The model outputs logits, so apply softmax here to get probabilities.
        probs = torch.softmax(model(test_context_idxs), dim=1)
        top_scores, top_indices = torch.topk(probs, 5)
        predictions = [(float(score), idx_to_word(idx)) for score, idx in zip(top_scores[0], top_indices[0])]
        # Normalize before rounding: rounding tiny scores first can zero them
        # all out and turn the division below into 0/0.
        total_score = sum(score for score, _ in predictions)
        predictions = ' '.join([f"{word}:{round(score / total_score, 2)}" for score, word in predictions]) + ' :0.01'
        return predictions

# %%
test_context = ["came", "fiom", "the", "place", "to", "this"]
print(predict(test_context[:3], test_context[3:]))

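# %% [markdown]
# Each output line has the form `word1:p1 word2:p2 ... :p_rest`, where the
# trailing `:0.01` reserves probability mass for all other words, e.g.
# `the:0.45 a:0.30 an:0.15 this:0.07 that:0.03 :0.01` (the probabilities here
# are made up for illustration).
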
# %% [markdown]
# # Generate results for the dev dataset

# %%
dataset_dir = os.path.join('..', 'dev-0', 'in.tsv.xz')
output_dir = os.path.join('..', 'dev-0', 'out.tsv')

df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str)
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)

# %%
final = ""

for _, row in tqdm(df.iterrows(), total=len(df)):
    left_context = re.split(r"\s+", row['LeftContext'].strip())[-3:]
    right_context = re.split(r"\s+", row['RightContext'].strip())[:3]

    final += predict(left_context, right_context) + '\n'

with open(output_dir, 'w', encoding="UTF-8") as f:
    f.write(final)

458  src/07_trigram_neural.ipynb (new file; notebook version of run.py)