init simple model

2023-04-19 11:33:14 +02:00 · 2023-04-19 11:33:14 +02:00 · 37f40392aa
commit 37f40392aa
parent a05b52d6d2
1 changed files with 265 additions and 0 deletions
--- a/07_bigram_regression.ipynb
+++ b/07_bigram_regression.ipynb
@ -0,0 +1,265 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Zadanie 1\n",
+    "Wyucz prosty bigramowy model języka oparty na regresji logistycznej (jak przedstawiono na wykładzie)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from itertools import islice\n",
+    "import regex as re\n",
+    "import sys\n",
+    "from torchtext.vocab import build_vocab_from_iterator\n",
+    "\n",
+    "\n",
+    "def get_words_from_line(line):\n",
+    "    \"\"\"Yield the lower-cased tokens of one line, wrapped in '<s>' and '</s>' markers.\n",
+    "\n",
+    "    A token is either a run of letters, ASCII digits and '*' characters or a\n",
+    "    run of punctuation characters; anything else is skipped.\n",
+    "    \"\"\"\n",
+    "    token_pattern = r'[\\p{L}0-9\\*]+|\\p{P}+'\n",
+    "    stripped = line.rstrip()\n",
+    "    yield '<s>'\n",
+    "    yield from (match.group(0).lower() for match in re.finditer(token_pattern, stripped))\n",
+    "    yield '</s>'\n",
+    "\n",
+    "\n",
+    "def get_word_lines_from_file(file_name):\n",
+    "    \"\"\"Lazily yield one token generator per line of the text file `file_name`.\n",
+    "\n",
+    "    The file is decoded explicitly as UTF-8 so that tokenisation does not depend\n",
+    "    on the platform's default encoding. Each yielded item is the generator\n",
+    "    returned by `get_words_from_line` for that line.\n",
+    "    \"\"\"\n",
+    "    with open(file_name, 'r', encoding='utf-8') as fh:\n",
+    "        for line in fh:\n",
+    "            yield get_words_from_line(line)\n",
+    "\n",
+    "# Upper bound on the vocabulary size (passed as max_tokens below).\n",
+    "vocab_size = 20000\n",
+    "\n",
+    "# Build the vocabulary from the tokenised lines of the input file;\n",
+    "# '<unk>' is registered as a special token.\n",
+    "vocab = build_vocab_from_iterator(\n",
+    "    get_word_lines_from_file('test-A/in.tsv'),\n",
+    "    max_tokens = vocab_size,\n",
+    "    specials = ['<unk>'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3798"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Look up the vocabulary index of a sample token.\n",
+    "vocab['welcome']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor(5.5038e-05, grad_fn=<SelectBackward0>)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from torch import nn\n",
+    "import torch\n",
+    "\n",
+    "embed_size = 100\n",
+    "\n",
+    "class SimpleBigramNeuralLanguageModel(nn.Module):\n",
+    "    \"\"\"Bigram language model: embedding -> linear layer -> softmax.\n",
+    "\n",
+    "    Maps a tensor of token indices to, for every index, a probability\n",
+    "    distribution over the vocabulary (the last dimension of the output).\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, vocabulary_size, embedding_size):\n",
+    "        \"\"\"Create the layers.\n",
+    "\n",
+    "        vocabulary_size: number of distinct token indices (input and output size).\n",
+    "        embedding_size: dimensionality of the token embeddings.\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "        self.model = nn.Sequential(\n",
+    "            nn.Embedding(vocabulary_size, embedding_size),\n",
+    "            nn.Linear(embedding_size, vocabulary_size),\n",
+    "            # Normalise explicitly over the vocabulary axis; relying on the\n",
+    "            # implicit dim is deprecated and picks the wrong axis for some ranks.\n",
+    "            nn.Softmax(dim=-1)\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        \"\"\"Return next-token probabilities for the token indices in `x`.\"\"\"\n",
+    "        return self.model(x)\n",
+    "\n",
+    "model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size)\n",
+    "\n",
+    "# Use the index of '<unk>' as the vocabulary's default index.\n",
+    "vocab.set_default_index(vocab['<unk>'])\n",
+    "# NOTE(review): 'welcone' looks like a typo for 'welcome' (used in the other cells) - confirm intent.\n",
+    "ixs = torch.tensor(vocab.forward(['welcone']))\n",
+    "out = model(ixs)\n",
+    "# Model output for the token 'to' given the single input token above.\n",
+    "out[0][vocab['to']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data import IterableDataset\n",
+    "import itertools\n",
+    "\n",
+    "def look_ahead_iterator(gen):\n",
+    "    \"\"\"Yield (previous, current) pairs of consecutive items from `gen`.\n",
+    "\n",
+    "    No pair is produced while the remembered previous item is None, i.e. for\n",
+    "    the very first item and for an item that directly follows a None.\n",
+    "    \"\"\"\n",
+    "    previous = None\n",
+    "    for current in gen:\n",
+    "        if previous is None:\n",
+    "            previous = current\n",
+    "            continue\n",
+    "        yield (previous, current)\n",
+    "        previous = current\n",
+    "\n",
+    "class Bigrams(IterableDataset):\n",
+    "    \"\"\"Iterable dataset of consecutive token-index pairs read from a text file.\"\"\"\n",
+    "\n",
+    "    def __init__(self, text_file, vocabulary_size):\n",
+    "        \"\"\"Build the vocabulary (at most `vocabulary_size` tokens) from `text_file`.\"\"\"\n",
+    "        token_vocab = build_vocab_from_iterator(\n",
+    "            get_word_lines_from_file(text_file),\n",
+    "            max_tokens=vocabulary_size,\n",
+    "            specials=['<unk>'])\n",
+    "        # Use the index of '<unk>' as the default index of the vocabulary.\n",
+    "        token_vocab.set_default_index(token_vocab['<unk>'])\n",
+    "        self.vocab = token_vocab\n",
+    "        self.vocabulary_size = vocabulary_size\n",
+    "        self.text_file = text_file\n",
+    "\n",
+    "    def __iter__(self):\n",
+    "        \"\"\"Return a fresh iterator over (previous index, next index) pairs.\n",
+    "\n",
+    "        Tokens from all lines are chained into one stream, so pairs also span\n",
+    "        line boundaries.\n",
+    "        \"\"\"\n",
+    "        token_stream = itertools.chain.from_iterable(\n",
+    "            get_word_lines_from_file(self.text_file))\n",
+    "        index_stream = (self.vocab[token] for token in token_stream)\n",
+    "        return look_ahead_iterator(index_stream)\n",
+    "\n",
+    "train_dataset = Bigrams('test-A/in.tsv', vocab_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 tensor(10.0928, grad_fn=<NllLossBackward0>)\n",
+      "100 tensor(8.4572, grad_fn=<NllLossBackward0>)\n",
+      "200 tensor(7.6165, grad_fn=<NllLossBackward0>)\n",
+      "300 tensor(6.9356, grad_fn=<NllLossBackward0>)\n",
+      "400 tensor(6.5687, grad_fn=<NllLossBackward0>)\n",
+      "500 tensor(6.2197, grad_fn=<NllLossBackward0>)\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[13], line 15\u001b[0m\n\u001b[0;32m     13\u001b[0m y \u001b[39m=\u001b[39m y\u001b[39m.\u001b[39mto(device)\n\u001b[0;32m     14\u001b[0m optimizer\u001b[39m.\u001b[39mzero_grad()\n\u001b[1;32m---> 15\u001b[0m ypredicted \u001b[39m=\u001b[39m model(x)\n\u001b[0;32m     16\u001b[0m loss \u001b[39m=\u001b[39m criterion(torch\u001b[39m.\u001b[39mlog(ypredicted), y)\n\u001b[0;32m     17\u001b[0m \u001b[39mif\u001b[39;00m step \u001b[39m%\u001b[39m \u001b[39m100\u001b[39m \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n",
+      "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
+      "Cell \u001b[1;32mIn[10], line 16\u001b[0m, in \u001b[0;36mSimpleBigramNeuralLanguageModel.forward\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m     15\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, x):\n\u001b[1;32m---> 16\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel(x)\n",
+      "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
+      "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\container.py:217\u001b[0m, in \u001b[0;36mSequential.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m    215\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m):\n\u001b[0;32m    216\u001b[0m     \u001b[39mfor\u001b[39;00m module \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m:\n\u001b[1;32m--> 217\u001b[0m         \u001b[39minput\u001b[39m \u001b[39m=\u001b[39m module(\u001b[39minput\u001b[39;49m)\n\u001b[0;32m    218\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39minput\u001b[39m\n",
+      "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
+      "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\linear.py:114\u001b[0m, in \u001b[0;36mLinear.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m    113\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m: Tensor) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tensor:\n\u001b[1;32m--> 114\u001b[0m     \u001b[39mreturn\u001b[39;00m F\u001b[39m.\u001b[39;49mlinear(\u001b[39minput\u001b[39;49m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mweight, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbias)\n",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "from torch.utils.data import DataLoader\n",
+    "\n",
+    "# Train a fresh model with one pass over train_dataset, then save its weights.\n",
+    "device = 'cpu' # cuda\n",
+    "model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
+    "data = DataLoader(train_dataset, batch_size=5000)\n",
+    "optimizer = torch.optim.Adam(model.parameters())\n",
+    "criterion = torch.nn.NLLLoss()\n",
+    "\n",
+    "model.train()\n",
+    "step = 0\n",
+    "for x, y in data:\n",
+    "   x = x.to(device)\n",
+    "   y = y.to(device)\n",
+    "   optimizer.zero_grad()\n",
+    "   ypredicted = model(x)\n",
+    "   # The model ends in a softmax, so take the log before passing it to NLLLoss.\n",
+    "   # NOTE(review): log of a softmax output may be numerically fragile for very\n",
+    "   # small probabilities; a log-softmax output would avoid that - consider.\n",
+    "   loss = criterion(torch.log(ypredicted), y)\n",
+    "   # Report the loss every 100 batches.\n",
+    "   if step % 100 == 0:\n",
+    "      print(step, loss)\n",
+    "   step += 1\n",
+    "   loss.backward()\n",
+    "   optimizer.step()\n",
+    "\n",
+    "# Reached only when the loop above runs to completion.\n",
+    "torch.save(model.state_dict(), 'model1.bin')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('liquid', 6933, 0.0004737793351523578),\n",
+       " ('bia', 5842, 0.00043268679291941226),\n",
+       " ('sole', 6386, 0.0004295798426028341),\n",
+       " ('nmeant', 17711, 0.00034942160709761083),\n",
+       " ('savs', 16709, 0.00034736539237201214),\n",
+       " ('striving', 12414, 0.0003441996523179114),\n",
+       " ('nol', 2640, 0.00032789510441944003),\n",
+       " ('imposing', 8457, 0.0003199590719304979),\n",
+       " ('hound', 17348, 0.00031824613688513637),\n",
+       " ('?\"\\\\', 4294, 0.0003141215711366385)]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "device = 'cpu' # cuda\n",
+    "model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
+    "# Load the trained weights; without them the predictions below come from a\n",
+    "# randomly initialised model and are meaningless.\n",
+    "try:\n",
+    "    model.load_state_dict(torch.load('model1.bin', map_location=device))\n",
+    "except FileNotFoundError:\n",
+    "    print('WARNING: model1.bin not found - using randomly initialised weights')\n",
+    "model.eval()\n",
+    "\n",
+    "ixs = torch.tensor(vocab.forward(['welcome'])).to(device)\n",
+    "\n",
+    "# Inference only: no autograd graph is needed.\n",
+    "with torch.no_grad():\n",
+    "    out = model(ixs)\n",
+    "top = torch.topk(out[0], 10)\n",
+    "top_indices = top.indices.tolist()\n",
+    "top_probs = top.values.tolist()\n",
+    "top_words = vocab.lookup_tokens(top_indices)\n",
+    "# Ten highest-probability tokens as (token, index, probability) triples.\n",
+    "list(zip(top_words, top_indices, top_probs))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "modelowanie",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}