Neural network

Marcin Czerniak 2024-05-15 04:51:48 +02:00
parent 0734c5d906
commit d380959afc
3 changed files with 11179 additions and 10519 deletions

File diff suppressed because it is too large

run.py (new file, 202 lines)

@@ -0,0 +1,202 @@
# %% [markdown]
# # <b>Trigram</b> neural network model for gap fill task
# %% [markdown]
# ## Import required packages
# %%
from tqdm import tqdm
import re
import nltk
import os
import csv
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import sys
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from bidict import bidict
import torchtext.vocab as vocab
import math
from collections import Counter
# %% [markdown]
# ## Load train data corpus
# %%
dataset_dir = os.path.join('..', 'train', 'in.tsv.xz')
expected_dir = os.path.join('..', 'train', 'expected.tsv')
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
expected_df = pd.read_csv(expected_dir, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)
corpus = []
for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=433):
    # With the doubled backslashes this pattern matches literal '\r', '\n' and '\t'
    # escape sequences in the raw text, not actual control characters.
    df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)

    # Rebuild each training example as: left context + expected word + right context
    for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['RightContext'].to_list()):
        corpus.extend(re.split(r"\s+", left_context.strip()) + [str(word).strip()] + re.split(r"\s+", right_context.strip()))
# %% [markdown]
# ## Create dictionaries for mapping words to indices
# %%
word_to_ix = bidict({})
counts = Counter(corpus)
for word, _ in tqdm(counts.most_common(1_500_000)):
    if word not in word_to_ix:
        # Ids start at 1; 0 is reserved for out-of-vocabulary words
        word_to_ix[word] = len(word_to_ix) + 1
# %% [markdown]
# ## Tokenize entire corpus
# %%
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def tokenize(w):
    # Map a word to its id; unknown words fall back to the reserved id 0
    if w in word_to_ix:
        return word_to_ix[w]
    else:
        return 0

tokenized_corpus = []
for word in tqdm(corpus):
    tokenized_corpus.append(tokenize(word))
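# %% [markdown]
# A quick check of the mapping (illustrative only; the exact id depends on corpus frequencies). The second probe string is assumed not to occur in the corpus, so it should fall back to the reserved id 0.
# %%
print(tokenize("the"), tokenize("word-that-never-occurs"))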
# %% [markdown]
# ## Create n-grams
# %%
tokenized_training_corpus = []
# 7-grams: 3 words of left context, the target word, then 3 words of right context
ngrams = list(nltk.ngrams(tokenized_corpus, n=7))
np.random.shuffle(ngrams)
ngrams = ngrams[:100_000]
ngrams_tensor = torch.tensor(ngrams, dtype=torch.long, device=device)

# Drop 7-grams that contain the out-of-vocabulary id 0
indices = torch.any(ngrams_tensor == 0, dim=1)
ngrams_tensor = ngrams_tensor[~indices]
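# %% [markdown]
# Optional sanity check (illustrative; the exact count depends on the corpus and the random sample): how many of the 100,000 sampled 7-grams survive the OOV filter.
# %%
print(f"{ngrams_tensor.size(0)} of the sampled 7-grams contain only in-vocabulary words")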
# %%
# The middle word (position 3) is the prediction target; the remaining 6 words form the input context
target_tensor = ngrams_tensor[:, 3].reshape(-1, 1).to(device)
input_tensor = torch.cat((ngrams_tensor[:, :3], ngrams_tensor[:, 4:]), dim=1).to(device)
# %%
batched_input_tensor = torch.split(input_tensor, 512)
batched_target_tensor = torch.split(target_tensor, 512)
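# %% [markdown]
# Each split yields chunks of at most 512 rows; a quick look at the batch count and shapes (illustrative only):
# %%
print(len(batched_input_tensor), batched_input_tensor[0].shape, batched_target_tensor[0].shape)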
# %% [markdown]
# ## Define the trigram neural network model
# %%
class TrigramNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(TrigramNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 6 context words (3 left + 3 right), each represented by its embedding
        self.linear1 = nn.Linear(embedding_dim * 6, output_size)

    def forward(self, inputs):
        out = self.embedding(inputs)
        out = out.view(inputs.size(0), -1)
        out = self.linear1(out)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax internally,
        # so adding a softmax here would apply it twice and flatten the gradients.
        return out
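# %% [markdown]
# Quick shape check on a throwaway instance (illustrative only, with a toy vocabulary of 1,000 words): a batch of 6 context ids should map to one logit per vocabulary entry.
# %%
_toy_model = TrigramNN(vocab_size=1000, embedding_dim=8, hidden_dim=16, output_size=1000)
_toy_batch = torch.randint(0, 1000, (2, 6))
print(_toy_model(_toy_batch).shape)  # expected: torch.Size([2, 1000])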
# %% [markdown]
# ## Define training parameters
# %%
batch_size = 512
vocab_size = len(word_to_ix) + 1  # +1 for the reserved OOV id 0
embedding_dim = 50  # dimensionality of the word embeddings
hidden_dim = 64  # accepted by the constructor but unused in this single-layer model
output_size = vocab_size
learning_rate = 0.005
epochs = 1
# %% [markdown]
# ## Initialize the model, loss function, and optimizer
# %%
model = TrigramNN(vocab_size, embedding_dim, hidden_dim, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
# %% [markdown]
# ## Training loop
# %%
model.to(device)

batches = list(zip(batched_input_tensor, batched_target_tensor))

for epoch in range(epochs):
    total_loss = 0
    for batch_inputs, batch_targets in tqdm(batches):
        model.zero_grad()
        output = model(batch_inputs)

        loss = criterion(output, batch_targets.view(-1))
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(batches)}")
# %% [markdown]
# ## Write function to convert index to word
# %%
def idx_to_word(idx):
    idx = int(idx)
    if idx not in word_to_ix.inverse:
        return '<UNK>'
    return word_to_ix.inverse[idx]
# %% [markdown]
# ## Test the model
# %%
def predict(left_context, right_context):
    with torch.no_grad():
        context = left_context + right_context
        test_context_idxs = torch.tensor([[tokenize(x) for x in context]], device=device)
        # The model returns raw logits, so convert them to probabilities first
        output = torch.softmax(model(test_context_idxs), dim=1)
        top_predicted_scores, top_predicted_indices = torch.topk(output, 5)
        predictions = list(zip(top_predicted_scores[0], top_predicted_indices[0]))
        predictions = [(float(score), idx_to_word(idx)) for score, idx in predictions]
        # Normalize before rounding; rounding tiny scores first can zero them all
        # out and cause a division by zero
        total_score = np.sum([score for score, _ in predictions])
        predictions = ' '.join([f"{word}:{round(score/total_score, 2)}" for score, word in predictions]) + ' :0.01'
        return predictions
# %%
test_context = ["came", "fiom", "the", "place", "to", "this"]
print(predict(test_context[:3], test_context[3:]))
# %% [markdown]
# # Generate result for dev dataset
# %%
dataset_dir = os.path.join('..', 'dev-0', 'in.tsv.xz')
output_dir = os.path.join('..', 'dev-0', 'out.tsv')
df = pd.read_csv(dataset_dir, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)
df = df.replace(r'\\r+|\\n+|\\t+', ' ', regex=True)
# %%
final = ""
for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):
    left_context = re.split(r"\s+", row['LeftContext'].strip())[-3:]
    right_context = re.split(r"\s+", row['RightContext'].strip())[:3]

    final += predict(left_context, right_context) + '\n'

with open(output_dir, 'w', encoding="UTF-8") as f:
    f.write(final)

src/07_trigram_neural.ipynb (new file, 458 lines)

@@ -0,0 +1,458 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <b>Trigram</b> neural network model for gap fill task"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import required packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Marcin\\.conda\\envs\\p311-cu121\\Lib\\site-packages\\torchtext\\vocab\\__init__.py:4: UserWarning: \n",
"/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n",
"Torchtext is deprecated and the last released version will be 0.18 (this one). You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n",
" warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n",
"c:\\Users\\Marcin\\.conda\\envs\\p311-cu121\\Lib\\site-packages\\torchtext\\utils.py:4: UserWarning: \n",
"/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n",
"Torchtext is deprecated and the last released version will be 0.18 (this one). You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n",
" warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"import re\n",
"import nltk\n",
"import os\n",
"import csv\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import sys\n",
"import numpy as np\n",
"from torch.utils.data import DataLoader, TensorDataset\n",
"from bidict import bidict\n",
"import torchtext.vocab as vocab\n",
"import math\n",
"from collections import Counter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load train data corpus"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 433/433 [01:10<00:00, 6.12it/s]\n"
]
}
],
"source": [
"dataset_dir = os.path.join('..', 'train', 'in.tsv.xz')\n",
"expected_dir = os.path.join('..', 'train', 'expected.tsv')\n",
"\n",
"df = pd.read_csv(dataset_dir, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)\n",
"expected_df = pd.read_csv(expected_dir, sep='\\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, dtype=str, chunksize=1000)\n",
"\n",
"corpus = []\n",
"for j, (df, expected_df) in tqdm(enumerate(zip(df, expected_df)), total=433):\n",
" df = df.replace(r'\\\\r+|\\\\n+|\\\\t+', ' ', regex=True)\n",
" \n",
" for left_context, word, right_context in zip(df['LeftContext'].to_list(), expected_df['Word'].to_list(), df['LeftContext'].to_list()):\n",
" corpus.extend(re.split(r\"\\s+\", left_context.strip()) + [str(word).strip()] + re.split(r\"\\s+\", right_context.strip()))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create dictionaries for mapping words to indices"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1500000/1500000 [00:11<00:00, 128039.35it/s]\n"
]
}
],
"source": [
"word_to_ix = bidict({})\n",
"counts = Counter(corpus)\n",
"\n",
"for word, _ in tqdm(counts.most_common(1_500_000)):\n",
" if word not in word_to_ix:\n",
" word_to_ix[word] = len(word_to_ix) + 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokenize entire corpus"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 139456816/139456816 [01:28<00:00, 1569462.31it/s]\n"
]
}
],
"source": [
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"def tokenize(w):\n",
" if w in word_to_ix:\n",
" return word_to_ix[w]\n",
" else:\n",
" return 0\n",
"\n",
"tokenized_corpus = []\n",
"\n",
"for word in tqdm(corpus):\n",
" tokenized_corpus.append(tokenize(word))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create n-grams"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"tokenized_training_corpus = []\n",
"ngrams = list(nltk.ngrams(tokenized_corpus, n=7))\n",
"np.random.shuffle(ngrams)\n",
"ngrams = ngrams[:100_000]\n",
"ngrams_tensor = torch.tensor(ngrams, dtype=torch.long, device=device)\n",
"\n",
"indices = torch.any(ngrams_tensor == 0, dim=1)\n",
"ngrams_tensor = ngrams_tensor[~indices]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"target_tensor = ngrams_tensor[:, 3].reshape(-1, 1).to(device)\n",
"input_tensor = torch.cat((ngrams_tensor[:, :3], ngrams_tensor[:, 4:]), dim=1).to(device)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"batched_input_tensor = torch.split(input_tensor, 512)\n",
"batched_target_tensor = torch.split(target_tensor, 512)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the trigram neural network model"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"class TrigramNN(nn.Module):\n",
" def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):\n",
" super(TrigramNN, self).__init__()\n",
" self.embedding = nn.Embedding(vocab_size, 50)\n",
" self.linear1 = nn.Linear(50 * 6, output_size)\n",
" \n",
" def forward(self, inputs):\n",
" out = self.embedding(inputs)\n",
" out = out.view(inputs.size(0), -1)\n",
" out = self.linear1(out)\n",
" out = torch.softmax(out, dim=1)\n",
" return out"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define training parameters"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"batch_size = 512\n",
"vocab_size = len(word_to_ix) + 1\n",
"embedding_dim = 10\n",
"hidden_dim = 64\n",
"output_size = vocab_size\n",
"learning_rate = 0.005\n",
"epochs = 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize the model, loss function, and optimizer"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"model = TrigramNN(vocab_size, embedding_dim, hidden_dim, output_size)\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=learning_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training loop"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/164 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 164/164 [29:56<00:00, 10.95s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: 14.220980655856248\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"model.to(device)\n",
"\n",
"batches = list(zip(batched_input_tensor, batched_target_tensor))\n",
"\n",
"for epoch in range(epochs):\n",
" total_loss = 0\n",
" for batch_inputs, batch_targets in tqdm(batches):\n",
" model.zero_grad()\n",
" output = model(batch_inputs)\n",
"\n",
" loss = criterion(output, batch_targets.view(-1))\n",
" total_loss += loss.item()\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" print(f\"Epoch {epoch+1}, Loss: {total_loss/len(batches)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Write function to convert index to word"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def idx_to_word(idx):\n",
" idx = int(idx)\n",
" if idx not in word_to_ix.inverse:\n",
" return '<UNK>'\n",
" return word_to_ix.inverse[idx]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## test the model"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def predict(left_context, right_context):\n",
" with torch.no_grad():\n",
" context = left_context + right_context\n",
" test_context_idxs = torch.tensor([[tokenize(x) for x in context]], device=device)\n",
" output = model(test_context_idxs)\n",
" top_predicted_scores, top_predicted_indices = torch.topk(output, 5)\n",
" predictions = list(zip(top_predicted_scores[0], top_predicted_indices[0]))\n",
" predictions = [(round(float(score), 2), idx_to_word(idx)) for score, idx in predictions]\n",
" total_score = np.sum([score for score, _ in predictions])\n",
" predictions = ' '.join([f\"{word}:{round(score/total_score, 2)}\" for score, word in predictions]) + ' :0.01'\n",
" return predictions"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AmiTlceaa.:nan Allentown.:nan thereuntoi:nan Jugo-Slav:nan Sallie,:nan :0.01\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Marcin\\AppData\\Local\\Temp\\ipykernel_36872\\3389363719.py:10: RuntimeWarning: invalid value encountered in scalar divide\n",
" predictions = ' '.join([f\"{word}:{round(score/total_score, 2)}\" for score, word in predictions]) + ' :0.01'\n"
]
}
],
"source": [
"test_context = [\"came\", \"fiom\", \"the\", \"place\", \"to\", \"this\"]\n",
"print(predict(test_context[:3], test_context[3:]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generate result for dev dataset"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"dataset_dir = os.path.join('..', 'dev-0', 'in.tsv.xz')\n",
"output_dir = os.path.join('..', 'dev-0', 'out.tsv')\n",
"\n",
"df = pd.read_csv(dataset_dir, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)\n",
"df = df.replace(r'\\\\r+|\\\\n+|\\\\t+', ' ', regex=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"final = \"\"\n",
"\n",
"for i, (_, row) in tqdm(enumerate(df.iterrows()), total=len(df)):\n",
" left_context = re.split(r\"\\s+\", row['LeftContext'].strip())[-3:]\n",
" right_context = re.split(r\"\\s+\", row['RightContext'].strip())[:3]\n",
"\n",
" final += predict(left_context, right_context) + '\\n'\n",
"\n",
"with open(output_dir, 'w', encoding=\"UTF-8\") as f:\n",
" f.write(final)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "p311-cu121",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}