ipynb file
This commit is contained in:
parent 5ebc5d3f12
commit b7ee0fb834
534
RNN.ipynb
Normal file
@@ -0,0 +1,534 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ae9d73b0-9e7a-4259-aa04-2d3176864d40",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch import nn, optim\n",
    "from torch.utils.data import DataLoader\n",
    "from collections import Counter\n",
    "import regex as re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ae22808c-8957-4d38-94bc-8f9cfc5f8b99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CUDA Available: True\n",
      "CUDA Device Name: NVIDIA GeForce RTX 3050\n"
     ]
    }
   ],
   "source": [
    "cuda_available = torch.cuda.is_available()\n",
    "print(f\"CUDA Available: {cuda_available}\")\n",
    "if cuda_available:\n",
    "    print(f\"CUDA Device Name: {torch.cuda.get_device_name(0)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "41daea76-75a5-4098-b5ae-b770d3aa9e1b",
   "metadata": {},
   "outputs": [],
"source": [
|
||||||
|
"device = 'cuda'"
|
||||||
|
]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fa76fb6d-c5cf-4711-a65e-8ec004e3b6fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = \"C:/Users/Mauri/Desktop/UAM - 3 semestr/modelowanie języka/gap_pred/challenging-america-word-gap-prediction/train/train.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e40859e9-88e4-4ff5-a78c-bb11b3822fd3",
   "metadata": {},
   "outputs": [],
   "source": [
"class Dataset(torch.utils.data.Dataset):\n",
|
||||||
|
" def __init__(\n",
|
||||||
|
" self,\n",
|
||||||
|
" sequence_length,\n",
|
||||||
|
" train_path,\n",
|
||||||
|
" max_vocab_size=20000\n",
|
||||||
|
" ):\n",
|
||||||
|
" self.sequence_length = sequence_length\n",
|
||||||
|
" self.train_path = train_path\n",
|
||||||
|
" self.max_vocab_size = max_vocab_size\n",
|
||||||
|
"\n",
|
||||||
|
" self.words = self.load()\n",
|
||||||
|
" self.uniq_words = self.get_uniq_words()\n",
|
||||||
|
"\n",
|
||||||
|
" self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}\n",
|
||||||
|
" self.index_to_word[len(self.index_to_word)] = '<UNK>'\n",
|
||||||
|
" self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}\n",
|
||||||
|
" self.word_to_index['<UNK>'] = len(self.word_to_index)\n",
|
||||||
|
"\n",
|
||||||
|
" self.words_indexes = [self.word_to_index.get(w, self.word_to_index['<UNK>']) for w in self.words]\n",
|
||||||
|
"\n",
|
||||||
|
" def load(self):\n",
|
||||||
|
" with open(self.train_path, 'r', encoding='utf-8') as f_in:\n",
|
||||||
|
" text = [x.rstrip() for x in f_in.readlines() if x.strip()]\n",
|
||||||
|
" text = ' '.join(text).lower()\n",
|
||||||
|
" text = text.replace('-\\\\\\\\n', '').replace('\\\\\\\\n', ' ').replace('\\\\\\\\t', ' ')\n",
|
||||||
|
" text = re.sub(r'\\n', ' ', text)\n",
|
||||||
|
" text = re.sub(r'(?<=\\w)[,-](?=\\w)', '', text)\n",
|
||||||
|
" text = re.sub(r'\\s+', ' ', text)\n",
|
||||||
|
" text = re.sub(r'\\p{P}', '', text)\n",
|
||||||
|
" text = text.split(' ')\n",
|
||||||
|
" return text\n",
|
||||||
|
"\n",
|
||||||
|
" def get_uniq_words(self):\n",
|
||||||
|
" word_counts = Counter(self.words)\n",
|
||||||
|
" most_common_words = word_counts.most_common(self.max_vocab_size)\n",
|
||||||
|
" return [word for word, _ in most_common_words]\n",
|
||||||
|
"\n",
|
||||||
|
" def __len__(self):\n",
|
||||||
|
" return len(self.words_indexes) - self.sequence_length\n",
|
||||||
|
"\n",
|
||||||
|
" def __getitem__(self, index):\n",
|
||||||
|
" # Get the sequence\n",
|
||||||
|
" sequence = self.words_indexes[index:index+self.sequence_length]\n",
|
||||||
|
" # Split the sequence into x and y\n",
|
||||||
|
" x = sequence[:2] + sequence[-2:]\n",
|
||||||
|
" y = sequence[len(sequence) // 2]\n",
|
||||||
|
" return torch.tensor(x), torch.tensor(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "bf0efaba-86a2-4368-a31d-de7d08a759a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = Dataset(5, train_path)"
   ]
  },
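  {
   "cell_type": "code",
   "execution_count": null,
   "id": "oov-rate-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough coverage check (a sketch; unk_ix and unk_count are illustrative helper\n",
    "# names, not used elsewhere): what fraction of training tokens falls outside\n",
    "# the 20k-word vocabulary and is mapped to <UNK>?\n",
    "unk_ix = train_dataset.word_to_index['<UNK>']\n",
    "unk_count = sum(1 for ix in train_dataset.words_indexes if ix == unk_ix)\n",
    "print(f\"<UNK> rate: {unk_count / len(train_dataset.words_indexes):.2%}\")"
   ]
  },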
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "7aa7bd72-5978-484e-b541-36f737f22b0d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor([ 14, 110, 3, 28]), tensor(208))"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_dataset[420]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "2a13298c-e0dd-4181-9093-7cec414b5b79",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['at', 'last', 'to', 'tho']"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[train_dataset.index_to_word[x] for x in [14, 110, 3, 28]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "192c4d6d-3fc1-4687-9ce4-b1a8cbea7d82",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['come']"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[train_dataset.index_to_word[208]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "3f0cd5b3-3937-4ad8-a9f8-766d27ad9d70",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(tensor([ 218, 104, 8207, 3121]), tensor(20000))"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_dataset[21237]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b1302c90-d77e-49e4-8b9d-9a8aeca675b0",
   "metadata": {},
   "outputs": [],
   "source": [
"import torch\n",
|
||||||
|
"import torch.nn as nn\n",
|
||||||
|
"\n",
|
||||||
|
"class Model(nn.Module):\n",
|
||||||
|
" def __init__(self, vocab_size, lstm_size=128, embedding_dim=128, num_layers=3, dropout=0.2):\n",
|
||||||
|
" super(Model, self).__init__()\n",
|
||||||
|
" self.lstm_size = lstm_size\n",
|
||||||
|
" self.embedding_dim = embedding_dim\n",
|
||||||
|
" self.num_layers = num_layers\n",
|
||||||
|
" self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
|
||||||
|
"\n",
|
||||||
|
" self.embedding = nn.Embedding(\n",
|
||||||
|
" num_embeddings=vocab_size,\n",
|
||||||
|
" embedding_dim=self.embedding_dim,\n",
|
||||||
|
" )\n",
|
||||||
|
" self.lstm = nn.LSTM(\n",
|
||||||
|
" input_size=self.embedding_dim,\n",
|
||||||
|
" hidden_size=self.lstm_size,\n",
|
||||||
|
" num_layers=self.num_layers,\n",
|
||||||
|
" dropout=dropout,\n",
|
||||||
|
" )\n",
|
||||||
|
" self.fc1 = nn.Linear(self.lstm_size, 256) \n",
|
||||||
|
" self.fc2 = nn.Linear(256, vocab_size)\n",
|
||||||
|
" self.softmax = nn.Softmax(dim=1)\n",
|
||||||
|
" \n",
|
||||||
|
" def forward(self, x, prev_state=None):\n",
|
||||||
|
" x = x.to(self.device)\n",
|
||||||
|
" embed = self.embedding(x)\n",
|
||||||
|
" embed = embed.transpose(0, 1)\n",
|
||||||
|
" \n",
|
||||||
|
" if prev_state is None:\n",
|
||||||
|
" prev_state = self.init_state(x.size(0))\n",
|
||||||
|
" \n",
|
||||||
|
" output, state = self.lstm(embed, prev_state)\n",
|
||||||
|
" logits = self.fc1(output[-1])\n",
|
||||||
|
" logits = self.fc2(logits)\n",
|
||||||
|
" probabilities = self.softmax(logits)\n",
|
||||||
|
" return probabilities\n",
|
||||||
|
"\n",
|
||||||
|
" def init_state(self, batch_size):\n",
|
||||||
|
" return (torch.zeros(self.num_layers, batch_size, self.lstm_size).to(self.device),\n",
|
||||||
|
" torch.zeros(self.num_layers, batch_size, self.lstm_size).to(self.device))\n"
|
||||||
|
]
|
||||||
|
},
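  {
   "cell_type": "code",
   "execution_count": null,
   "id": "model-shape-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal shape-check sketch before training: a dummy batch of 4 examples with\n",
    "# 4 context tokens each should come back as (4, vocab_size) probabilities with\n",
    "# each row summing to ~1. Names prefixed with _ are illustrative only.\n",
    "_m = Model(vocab_size=20001).to(device)\n",
    "_dummy = torch.randint(0, 20001, (4, 4))\n",
    "_probs = _m(_dummy)\n",
    "print(_probs.shape, _probs.sum(dim=1))"
   ]
  },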
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "93a29618-3283-4ad5-881f-48c84839ceeb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(dataset, model, max_epochs, batch_size):\n",
    "    model.train()\n",
    "\n",
    "    dataloader = DataLoader(dataset, batch_size=batch_size, pin_memory=True)\n",
    "    # the model already returns softmax probabilities, so take their log and\n",
    "    # use NLLLoss; CrossEntropyLoss expects raw logits and would apply\n",
    "    # log_softmax a second time\n",
    "    criterion = nn.NLLLoss()\n",
    "    optimizer = optim.Adam(model.parameters())\n",
    "\n",
    "    for epoch in range(max_epochs):\n",
    "        for batch, (x, y) in enumerate(dataloader):\n",
    "            optimizer.zero_grad()\n",
    "            x = x.to(device, non_blocking=True)\n",
    "            y = y.to(device, non_blocking=True)\n",
    "\n",
    "            y_pred = model(x)\n",
    "            loss = criterion(torch.log(y_pred), y)\n",
    "\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "            if batch % 500 == 0:\n",
    "                print({'epoch': epoch, 'update in batch': batch, '/': len(dataloader), 'loss': loss.item()})"
   ]
  },
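  {
   "cell_type": "code",
   "execution_count": null,
   "id": "loss-to-perplexity",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The printed loss is a mean per-target negative log-likelihood (base e), so\n",
    "# exp(loss) gives perplexity; a small illustrative helper, not part of the\n",
    "# original run.\n",
    "import math\n",
    "\n",
    "def perplexity(nll):\n",
    "    return math.exp(nll)"
   ]
  },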
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "2315e67d-a315-44b5-bddf-5ab4bed1e727",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'epoch': 0, 'update in batch': 0, '/': 16679, 'loss': 9.917818069458008}\n",
      "{'epoch': 0, 'update in batch': 500, '/': 16679, 'loss': 6.078440189361572}\n",
      "{'epoch': 0, 'update in batch': 1000, '/': 16679, 'loss': 5.651369571685791}\n",
      "{'epoch': 0, 'update in batch': 1500, '/': 16679, 'loss': 5.4341654777526855}\n",
      "{'epoch': 0, 'update in batch': 2000, '/': 16679, 'loss': 5.383695602416992}\n",
      "{'epoch': 0, 'update in batch': 2500, '/': 16679, 'loss': 5.225739479064941}\n",
      "{'epoch': 0, 'update in batch': 3000, '/': 16679, 'loss': 5.282474517822266}\n",
      "{'epoch': 0, 'update in batch': 3500, '/': 16679, 'loss': 5.092397689819336}\n",
      "{'epoch': 0, 'update in batch': 4000, '/': 16679, 'loss': 4.940906047821045}\n",
      "{'epoch': 0, 'update in batch': 4500, '/': 16679, 'loss': 4.908115863800049}\n",
      "{'epoch': 0, 'update in batch': 5000, '/': 16679, 'loss': 5.092423439025879}\n",
      "{'epoch': 0, 'update in batch': 5500, '/': 16679, 'loss': 4.979565620422363}\n",
      "{'epoch': 0, 'update in batch': 6000, '/': 16679, 'loss': 4.8268022537231445}\n",
      "{'epoch': 0, 'update in batch': 6500, '/': 16679, 'loss': 4.7172017097473145}\n",
      "{'epoch': 0, 'update in batch': 7000, '/': 16679, 'loss': 4.781315326690674}\n",
      "{'epoch': 0, 'update in batch': 7500, '/': 16679, 'loss': 5.0033040046691895}\n",
      "{'epoch': 0, 'update in batch': 8000, '/': 16679, 'loss': 4.663774013519287}\n",
      "{'epoch': 0, 'update in batch': 8500, '/': 16679, 'loss': 4.710158348083496}\n",
      "{'epoch': 0, 'update in batch': 9000, '/': 16679, 'loss': 4.817586898803711}\n",
      "{'epoch': 0, 'update in batch': 9500, '/': 16679, 'loss': 4.655371189117432}\n",
      "{'epoch': 0, 'update in batch': 10000, '/': 16679, 'loss': 4.679412841796875}\n",
      "{'epoch': 0, 'update in batch': 10500, '/': 16679, 'loss': 4.544621467590332}\n",
      "{'epoch': 0, 'update in batch': 11000, '/': 16679, 'loss': 4.816493511199951}\n",
      "{'epoch': 0, 'update in batch': 11500, '/': 16679, 'loss': 4.627770900726318}\n",
      "{'epoch': 0, 'update in batch': 12000, '/': 16679, 'loss': 4.525866985321045}\n",
      "{'epoch': 0, 'update in batch': 12500, '/': 16679, 'loss': 4.739295959472656}\n",
      "{'epoch': 0, 'update in batch': 13000, '/': 16679, 'loss': 4.6095709800720215}\n",
      "{'epoch': 0, 'update in batch': 13500, '/': 16679, 'loss': 4.7243266105651855}\n",
      "{'epoch': 0, 'update in batch': 14000, '/': 16679, 'loss': 4.557321071624756}\n",
      "{'epoch': 0, 'update in batch': 14500, '/': 16679, 'loss': 4.830319404602051}\n",
      "{'epoch': 0, 'update in batch': 15000, '/': 16679, 'loss': 4.536618709564209}\n",
      "{'epoch': 0, 'update in batch': 15500, '/': 16679, 'loss': 4.605734825134277}\n",
      "{'epoch': 0, 'update in batch': 16000, '/': 16679, 'loss': 4.605676651000977}\n",
      "{'epoch': 0, 'update in batch': 16500, '/': 16679, 'loss': 4.614283084869385}\n"
     ]
    }
   ],
"source": [
|
||||||
|
"model = Model(vocab_size = len(train_dataset.uniq_words) + 1).to(device)\n",
|
||||||
|
"train(train_dataset, model, 1, 8192)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 107,
|
||||||
|
"id": "8acf3dc2-f3fe-4a2a-bdf9-82a18acb1bd1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"torch.save(model.state_dict(), 'model.pth')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"id": "5e60d5b3-019d-4d63-b794-59e1356bc45e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"model = Model(20001).to(device)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "7e55b0b2-cdda-4c37-8979-0400f9973461",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<All keys matched successfully>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"model.load_state_dict(torch.load('model.pth'))"
|
||||||
|
]
|
||||||
|
},
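  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eval-after-load",
   "metadata": {},
   "outputs": [],
   "source": [
    "# After loading weights for inference, disable dropout explicitly;\n",
    "# get_words below also calls model.eval() as a safeguard.\n",
    "model.eval()"
   ]
  },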
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e842b192-8e10-438c-b8ee-781a4a7a875c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean(text):\n",
    "    # mirror the training-time preprocessing from Dataset.load, including\n",
    "    # lowercasing, so dev/test contexts match the lowercased vocabulary\n",
    "    text = text.lower()\n",
    "    text = text.replace('-\\\\n', '').replace('\\\\n', ' ').replace('\\\\t', ' ')\n",
    "    text = re.sub(r'\\n', ' ', text)\n",
    "    text = re.sub(r'(?<=\\w)[,-](?=\\w)', '', text)\n",
    "    text = re.sub(r'\\s+', ' ', text)\n",
    "    text = re.sub(r'\\p{P}', '', text)\n",
    "    text = text.strip()\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f20f8fdc-194e-415a-8343-6f590abe1166",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_words(words, model, dataset, n=20):\n",
    "    model.eval()  # disable dropout for inference\n",
    "    ixs = [dataset.word_to_index.get(word, dataset.word_to_index['<UNK>']) for word in words]\n",
    "    ixs = torch.tensor(ixs).unsqueeze(0).to(model.device)\n",
    "\n",
    "    with torch.no_grad():  # no gradients needed at inference time\n",
    "        out = model(ixs)\n",
    "    top = torch.topk(out[0], n)\n",
    "    top_indices = top.indices.tolist()\n",
    "    top_probs = top.values.tolist()\n",
    "    top_words = [dataset.index_to_word[idx] for idx in top_indices]\n",
    "    return list(zip(top_words, top_probs))"
   ]
  },
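  {
   "cell_type": "code",
   "execution_count": null,
   "id": "get-words-usage",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Usage sketch: top-5 candidates for the gap in the context decoded earlier\n",
    "# ('at last' _ 'to tho'); returns a list of (word, probability) pairs.\n",
    "get_words(['at', 'last', 'to', 'tho'], model, train_dataset, n=5)"
   ]
  },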
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "22ebafa5-d21f-4208-9aad-a4c4d90134c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def f_out(left, right, model, dataset):\n",
    "    left = clean(left)\n",
    "    right = clean(right)\n",
    "    # two words of left context + two words of right context, as in training\n",
    "    words = left.split(' ')[-2:] + right.split(' ')[:2]\n",
    "    words = get_words(words, model, dataset)\n",
    "\n",
    "    probs_sum = 0\n",
    "    output = ''\n",
    "    for word, prob in words:\n",
    "        if word == \"<UNK>\":\n",
    "            continue\n",
    "        probs_sum += prob\n",
    "        output += f\"{word}:{prob} \"\n",
    "    # assign the leftover probability mass to the catch-all empty token\n",
    "    output += f\":{1-probs_sum}\"\n",
    "\n",
    "    return output"
   ]
  },
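  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f-out-format-note",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output format sketch: f_out emits one line per gap, such as\n",
    "#   'the:0.31 a:0.12 of:0.05 :0.52'\n",
    "# where the trailing ':<mass>' is the probability left over for all words\n",
    "# outside the printed top-n (the numbers above are illustrative)."
   ]
  },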
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1dc64cee-a9a5-44d4-92da-82e1b7f8fdc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_out(input_path, model, dataset, output_path):\n",
    "    lines = []\n",
    "    with open(input_path, encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            columns = line.split('\\t')\n",
    "            # columns 6 and 7 of in.tsv hold the left and right context\n",
    "            left = columns[6]\n",
    "            right = columns[7]\n",
    "            lines.append((left, right))\n",
    "\n",
    "    with open(output_path, 'w', encoding='utf-8') as output_file:\n",
    "        for left, right in lines:\n",
    "            result = f_out(left, right, model, dataset)\n",
    "            output_file.write(result + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "348a77c1-8ff1-40bb-a243-3b702c119c2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_path = \"C:/Users/Mauri/Desktop/UAM - 3 semestr/modelowanie języka/gap_pred/challenging-america-word-gap-prediction/dev-0/in.tsv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "9377c725-3309-4590-89d2-444057ae2b80",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_out(dev_path, model, train_dataset, output_path='C:/Users/Mauri/Desktop/UAM - 3 semestr/modelowanie języka/gap_pred/challenging-america-word-gap-prediction/dev-0/out.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "50f47d4a-762f-48b2-9c19-f385d9822886",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_path = \"C:/Users/Mauri/Desktop/UAM - 3 semestr/modelowanie języka/gap_pred/challenging-america-word-gap-prediction/test-A/in.tsv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "18aa1059-88ed-4c32-af88-80a4de4be6c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_out(test_path, model, train_dataset, output_path='C:/Users/Mauri/Desktop/UAM - 3 semestr/modelowanie języka/gap_pred/challenging-america-word-gap-prediction/test-A/out.tsv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}