diff --git a/07_bigram_regression.ipynb b/07_bigram_regression.ipynb new file mode 100644 index 0000000..4c7cb5a --- /dev/null +++ b/07_bigram_regression.ipynb @@ -0,0 +1,265 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Zadanie 1\n", + "Wyucz prosty bigramowy model języka oparty na regresji logistycznej (jak przedstawiono na wykładzie)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from itertools import islice\n", + "import regex as re\n", + "import sys\n", + "from torchtext.vocab import build_vocab_from_iterator\n", + "\n", + "\n", + "def get_words_from_line(line):\n", + " line = line.rstrip()\n", + " yield ''\n", + " for m in re.finditer(r'[\\p{L}0-9\\*]+|\\p{P}+', line):\n", + " yield m.group(0).lower()\n", + " yield ''\n", + "\n", + "\n", + "def get_word_lines_from_file(file_name):\n", + " with open(file_name, 'r') as fh:\n", + " for line in fh:\n", + " yield get_words_from_line(line)\n", + "\n", + "vocab_size = 20000\n", + "\n", + "vocab = build_vocab_from_iterator(\n", + " get_word_lines_from_file('test-A/in.tsv'),\n", + " max_tokens = vocab_size,\n", + " specials = [''])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3798" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocab['welcome']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor(5.5038e-05, grad_fn=)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from torch import nn\n", + "import torch\n", + "\n", + "embed_size = 100\n", + "\n", + "class SimpleBigramNeuralLanguageModel(nn.Module):\n", + " def __init__(self, vocabulary_size, embedding_size):\n", + " super(SimpleBigramNeuralLanguageModel, self).__init__()\n", + " self.model = nn.Sequential(\n", + " nn.Embedding(vocabulary_size, embedding_size),\n", + " nn.Linear(embedding_size, vocabulary_size),\n", + " nn.Softmax()\n", + " )\n", + "\n", + " def forward(self, x):\n", + " return self.model(x)\n", + "\n", + "model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size)\n", + "\n", + "vocab.set_default_index(vocab[''])\n", + "ixs = torch.tensor(vocab.forward(['welcone']))\n", + "out = model(ixs)\n", + "out[0][vocab['to']]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import IterableDataset\n", + "import itertools\n", + "\n", + "def look_ahead_iterator(gen):\n", + " prev = None\n", + " for item in gen:\n", + " if prev is not None:\n", + " yield (prev, item)\n", + " prev = item\n", + "\n", + "class Bigrams(IterableDataset):\n", + " def __init__(self, text_file, vocabulary_size):\n", + " self.vocab = build_vocab_from_iterator(\n", + " get_word_lines_from_file(text_file),\n", + " max_tokens = vocabulary_size,\n", + " specials = [''])\n", + " self.vocab.set_default_index(self.vocab[''])\n", + " self.vocabulary_size = vocabulary_size\n", + " self.text_file = text_file\n", + "\n", + " def __iter__(self):\n", + " return look_ahead_iterator(\n", + " (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))\n", + "\n", + "train_dataset = Bigrams('test-A/in.tsv', vocab_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 tensor(10.0928, grad_fn=)\n", + "100 tensor(8.4572, grad_fn=)\n", + "200 tensor(7.6165, grad_fn=)\n", + "300 tensor(6.9356, grad_fn=)\n", + "400 tensor(6.5687, grad_fn=)\n", + "500 tensor(6.2197, grad_fn=)\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[13], line 15\u001b[0m\n\u001b[0;32m 13\u001b[0m y \u001b[39m=\u001b[39m y\u001b[39m.\u001b[39mto(device)\n\u001b[0;32m 14\u001b[0m optimizer\u001b[39m.\u001b[39mzero_grad()\n\u001b[1;32m---> 15\u001b[0m ypredicted \u001b[39m=\u001b[39m model(x)\n\u001b[0;32m 16\u001b[0m loss \u001b[39m=\u001b[39m criterion(torch\u001b[39m.\u001b[39mlog(ypredicted), y)\n\u001b[0;32m 17\u001b[0m \u001b[39mif\u001b[39;00m step \u001b[39m%\u001b[39m \u001b[39m100\u001b[39m \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", + "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n", + "Cell \u001b[1;32mIn[10], line 16\u001b[0m, in \u001b[0;36mSimpleBigramNeuralLanguageModel.forward\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, x):\n\u001b[1;32m---> 16\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel(x)\n", + "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n", + "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\container.py:217\u001b[0m, in \u001b[0;36mSequential.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 215\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m):\n\u001b[0;32m 216\u001b[0m \u001b[39mfor\u001b[39;00m module \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m:\n\u001b[1;32m--> 217\u001b[0m \u001b[39minput\u001b[39m \u001b[39m=\u001b[39m module(\u001b[39minput\u001b[39;49m)\n\u001b[0;32m 218\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39minput\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n", + "File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\linear.py:114\u001b[0m, in \u001b[0;36mLinear.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 113\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m: Tensor) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tensor:\n\u001b[1;32m--> 114\u001b[0m \u001b[39mreturn\u001b[39;00m F\u001b[39m.\u001b[39;49mlinear(\u001b[39minput\u001b[39;49m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mweight, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbias)\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "from torch.utils.data import DataLoader\n", + "\n", + "device = 'cpu' # cuda\n", + "model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n", + "data = DataLoader(train_dataset, batch_size=5000)\n", + "optimizer = torch.optim.Adam(model.parameters())\n", + "criterion = torch.nn.NLLLoss()\n", + "\n", + "model.train()\n", + "step = 0\n", + "for x, y in data:\n", + " x = x.to(device)\n", + " y = y.to(device)\n", + " optimizer.zero_grad()\n", + " ypredicted = model(x)\n", + " loss = criterion(torch.log(ypredicted), y)\n", + " if step % 100 == 0:\n", + " print(step, loss)\n", + " step += 1\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + "torch.save(model.state_dict(), 'model1.bin')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('liquid', 6933, 0.0004737793351523578),\n", + " ('bia', 5842, 0.00043268679291941226),\n", + " ('sole', 6386, 0.0004295798426028341),\n", + " ('nmeant', 17711, 0.00034942160709761083),\n", + " ('savs', 16709, 0.00034736539237201214),\n", + " ('striving', 12414, 0.0003441996523179114),\n", + " ('nol', 2640, 0.00032789510441944003),\n", + " ('imposing', 8457, 0.0003199590719304979),\n", + " ('hound', 17348, 0.00031824613688513637),\n", + " ('?\"\\\\', 4294, 0.0003141215711366385)]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "device = 'cpu' # cuda\n", + "model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n", + "#model.load_state_dict(torch.load('model1.bin'))\n", + "model.eval()\n", + "\n", + "ixs = torch.tensor(vocab.forward(['welcome'])).to(device)\n", + "\n", + "out = model(ixs)\n", + "top = torch.topk(out[0], 10)\n", + "top_indices = top.indices.tolist()\n", + "top_probs = top.values.tolist()\n", + "top_words = vocab.lookup_tokens(top_indices)\n", + "list(zip(top_words, top_indices, top_probs))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "modelowanie", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}