commit 3b0cab7eef (parent 64d2b9ebda)

    zad12

dev-0/out.tsv | 21694 lines (file diff suppressed because it is too large)
lab12.ipynb | 443 lines (new file)
@@ -0,0 +1,443 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "W8-j-5oV0o46",
    "outputId": "5cf81efc-7e9b-46a6-d3bd-792a4b4b39b9"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
      "Collecting transformers\n",
      "  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m105.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n",
      "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n",
      "  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n",
      "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n",
      "  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m117.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n",
      "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.4.0)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.5.0)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n",
      "Installing collected packages: tokenizers, huggingface-hub, transformers\n",
      "Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2\n",
      "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
      "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n",
      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)\n",
      "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n",
      "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n",
      "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n",
      "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n",
      "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n",
      "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n",
      "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n",
      "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install transformers\n",
    "!pip install torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "O6aa5mpE0s6H",
    "outputId": "18112d31-6a14-4b91-b9db-44ea197c8d0c"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Cloning into 'challenging-america-word-gap-prediction'...\n",
      "remote: Enumerating objects: 27, done.\u001b[K\n",
      "remote: Counting objects: 100% (27/27), done.\u001b[K\n",
      "remote: Compressing objects: 100% (23/23), done.\u001b[K\n",
      "remote: Total 27 (delta 2), reused 17 (delta 0), pack-reused 0\u001b[K\n",
      "Receiving objects: 100% (27/27), 278.33 MiB | 8.52 MiB/s, done.\n",
      "Resolving deltas: 100% (2/2), done.\n"
     ]
    }
   ],
   "source": [
    "!git clone --single-branch git://gonito.net/challenging-america-word-gap-prediction -b master"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "uHkXCRs-0iSr"
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "import sys\n",
    "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "HyKM4zn41YvQ"
   },
   "outputs": [],
   "source": [
    "import lzma\n",
    "from itertools import islice\n",
    "import regex as re\n",
    "import sys\n",
    "from torchtext.vocab import build_vocab_from_iterator\n",
    "from torch import nn\n",
    "from torch.utils.data import IterableDataset\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "-k8RhlmI06mQ",
    "outputId": "e2ef4117-5d5b-40e9-f774-9faba825042c"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "/content/challenging-america-word-gap-prediction\n"
     ]
    }
   ],
   "source": [
    "%cd /content/challenging-america-word-gap-prediction"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
   ],
   "metadata": {
    "id": "PCA7Ank2dnwM"
   },
   "execution_count": 28,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Load the pretrained GPT-2 tokenizer and LM, moving the model to the device\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
    "model = GPT2LMHeadModel.from_pretrained(\"gpt2\").to(device)"
   ],
   "metadata": {
    "id": "U0kG_W5AY7uE"
   },
   "execution_count": 32,
   "outputs": []
  },
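  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal sketch, not part of the original pipeline: switch off dropout\n",
    "# (eval mode) and gradient tracking (no_grad) before the per-line\n",
    "# predictions below, which keeps outputs deterministic and saves memory.\n",
    "# The prompt here is arbitrary, chosen only for the shape check.\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    ids = tokenizer.encode('the next word', return_tensors='pt').to(device)\n",
    "    print(model(ids).logits.shape)  # (batch, seq_len, vocab_size)"
   ]
  },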
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "id": "F4MXeKLxMQ4N"
   },
   "outputs": [],
   "source": [
    "def prediction(word: str) -> str:\n",
    "    # Encode the left context and run it through GPT-2\n",
    "    left_context = tokenizer.encode(word, return_tensors=\"pt\").to(device)\n",
    "    out = model(left_context)\n",
    "    # out[0] is (batch, seq_len, vocab); take the last batch entry and\n",
    "    # softmax over the vocabulary at every position\n",
    "    prob_dist = torch.softmax(out[0][-1], dim=1)\n",
    "    # top-5 candidates at the final position\n",
    "    values, index = prob_dist.topk(5)\n",
    "    tokens = []\n",
    "    for x in index[-1]:\n",
    "        tokens.append(tokenizer.decode(x))\n",
    "    zipped = list(zip(values[-1], tokens))\n",
    "    # If an '<unk>' token shows up, move it to the catch-all slot at the\n",
    "    # end; otherwise turn the weakest candidate into the catch-all\n",
    "    unk = None\n",
    "    for i, element in enumerate(zipped):\n",
    "        if element[1] == '<unk>':\n",
    "            unk = zipped.pop(i)\n",
    "            zipped.append(('', unk[1]))\n",
    "            break\n",
    "    if unk is None:\n",
    "        zipped[-1] = ('', zipped[-1][1])\n",
    "    return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])"
   ]
  },
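  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check (a sketch; the word is arbitrary). Each line that\n",
    "# prediction() returns is a space-separated list of probability:token\n",
    "# pairs, with the last pair's probability slot emptied as a catch-all.\n",
    "print(prediction('the'))"
   ]
  },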
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "id": "My_1_4L5MMc3"
   },
   "outputs": [],
   "source": [
    "def create_outputs(folder_name):\n",
    "    print(f'Creating outputs in {folder_name}')\n",
    "    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
    "        with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\\n') as f:\n",
    "            for line in fid:\n",
    "                separated = line.split('\\t')\n",
    "                # column 6 holds the left context; keep only its last word\n",
    "                prefix = separated[6].replace(r'\\n', ' ').split()[-1]\n",
    "                output_line = prediction(prefix)\n",
    "                f.write(output_line + '\\n')"
   ]
  },
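  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only, on a made-up line (not from the real data): the\n",
    "# challenge TSV keeps the left context in column 6, with literal '\\\\n'\n",
    "# markers standing in for line breaks; create_outputs() keeps just the\n",
    "# final word of that context as the prompt.\n",
    "example = 'id\\tx\\tx\\tx\\tx\\tx\\tthe quick brown\\tright context'\n",
    "fields = example.split('\\t')\n",
    "print(fields[6].replace(r'\\n', ' ').split()[-1])  # -> brown"
   ]
  },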
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4VQPcLF-OChJ",
    "outputId": "7a7408b2-ad26-4041-887e-99a8e9d36d0f"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Creating outputs in dev-0\n",
      "Creating outputs in test-A\n"
     ]
    }
   ],
   "source": [
    "create_outputs('dev-0')\n",
    "create_outputs('test-A')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "YCGOd41pzfAC"
   },
   "outputs": [],
   "source": [
    "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
    "model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n",
    "\n",
    "\n",
    "def get_words_from_line(line):\n",
    "    line = line.rstrip()\n",
    "    yield '<s>'\n",
    "    for t in line.split():\n",
    "        yield t\n",
    "    yield '</s>'\n",
    "\n",
    "\n",
    "def get_word_lines_from_file(file_name):\n",
    "    with lzma.open(file_name, encoding='utf8', mode=\"rt\") as fh:\n",
    "        for line in fh:\n",
    "            pattern = r'\\^\\^|\\n|\\\\|[<>]|[()]'\n",
    "            line = re.sub(pattern, '', line)\n",
    "            yield line\n",
    "\n",
    "\n",
    "for line in get_word_lines_from_file(\"train/in.tsv.xz\"):\n",
    "    # line = line.strip('\\n')\n",
    "    # fields = line.split(\"\\t\")\n",
    "    # print(line)\n",
    "    left_context = str(line)\n",
    "    input_ids = tokenizer.encode(left_context, return_tensors=\"pt\")\n",
    "    # print(input_ids)\n",
    "    output = model(input_ids)\n",
    "    # print(output[0].shape())\n",
    "    prob_dist = torch.softmax(output[0][-1], dim=1)\n",
    "    values, index = prob_dist.topk(20)\n",
    "    print(left_context[-100:])\n",
    "    print(values.size())\n",
    "    print(index.size())\n",
    "    break\n",
    "    for x, indx in zip(values, index):\n",
    "        for i in range(20):\n",
    "            token = tokenizer.decode(indx[i])\n",
    "            print(f'{x[i]} {indx[i]} {token}')\n",
    "        print('-------------------------')"
   ]
  },
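  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# An equivalent but arguably clearer variant (a sketch, not part of the\n",
    "# pipeline; the prompt is arbitrary): index the logits as\n",
    "# [batch, position, vocab] and softmax only the final position's row\n",
    "# instead of every position at once.\n",
    "with torch.no_grad():\n",
    "    input_ids = tokenizer.encode('he went to the', return_tensors='pt')\n",
    "    last_logits = model(input_ids).logits[0, -1]  # (vocab_size,)\n",
    "    last_dist = torch.softmax(last_logits, dim=0)\n",
    "    top = last_dist.topk(5)\n",
    "    for p, i in zip(top.values.tolist(), top.indices.tolist()):\n",
    "        print(f'{p:.4f} {tokenizer.decode(i)!r}')"
   ]
  },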
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "01zkM5giNUR3"
   },
   "outputs": [],
   "source": [
    "# line = line.strip('\\n')\n",
    "# fields = line.split(\"\\t\")\n",
    "# print(line)\n",
    "left_context = \"he\"\n",
    "input_ids = tokenizer.encode(left_context, return_tensors=\"pt\")\n",
    "# print(input_ids)\n",
    "output = model(input_ids)\n",
    "# print(output[0].shape())\n",
    "prob_dist = torch.softmax(output[0][-1], dim=1)\n",
    "values, index = prob_dist.topk(5)\n",
    "tokens = []\n",
    "for x in index[-1]:\n",
    "    tokens.append(tokenizer.decode(x))\n",
    "    # print(tokens)\n",
    "for x, token in zip(values[-1], tokens):\n",
    "    # token = tokenizer.decode(indx)\n",
    "    print(f'{x} {token}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lDc9Nw40C3dr"
   },
   "outputs": [],
   "source": [
    "for line in get_word_lines_from_file(\"dev-0/in.tsv.xz\"):\n",
    "    # line = line.strip('\\n')\n",
    "    # fields = line.split(\"\\t\")\n",
    "    # print(line)\n",
    "    left_context = str(line)\n",
    "    input_ids = tokenizer.encode(left_context, return_tensors=\"pt\")\n",
    "    # print(input_ids)\n",
    "    output = model(input_ids)\n",
    "    # print(output[0].shape())\n",
    "    prob_dist = torch.softmax(output[0][-1], dim=1)\n",
    "    values, index = prob_dist.topk(20)\n",
    "    print(left_context[-100:])\n",
    "    # print(values.size())\n",
    "    # print(index.size())\n",
    "    # break\n",
    "    for x, indx in zip(values[-1], index[-1]):\n",
    "        token = tokenizer.decode(indx)\n",
    "        print(f'{x} {indx} {token}')\n",
    "    print('-------------------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "si7wLC2Tx-kg"
   },
   "outputs": [],
   "source": [
    "token = tokenizer.decode(256)\n",
    "print(token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lJoE0Cwz0JCM"
   },
   "outputs": [],
   "source": [
    "top_indices[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tgmT1vG20U_1"
   },
   "outputs": [],
   "source": [
    "top_probs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "U9GVSAZz4SlW"
   },
   "outputs": [],
   "source": [
    "# note: assumes prob_dist holds a single position's distribution (1-D);\n",
    "# decode() expects a flat list of ids\n",
    "top = prob_dist.topk(20)\n",
    "top_indices = top.indices.tolist()\n",
    "top_probs = top.values.tolist()\n",
    "top_words = tokenizer.decode(top_indices)\n",
    "print(top_words, '\\n', top_indices, '\\n', top_probs)"
   ]
  },
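  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch for the 2-D case (one distribution per position, as in the\n",
    "# exploration cells above): decode ids one at a time instead of handing\n",
    "# decode() the nested topk list.\n",
    "last = prob_dist[-1].topk(20)\n",
    "for p, i in zip(last.values.tolist(), last.indices.tolist()):\n",
    "    print(f'{p:.4f} {tokenizer.decode(i)!r}')"
   ]
  },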
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8_WSZ_v99xSH"
   },
   "outputs": [],
   "source": [
    "print(index[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "OAiJNMNMwNNg"
   },
   "outputs": [],
   "source": [
    "print(prob_dist.topk(2)[0].size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PIUjH8-ow1y9"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
model1.bin | BIN (binary file not shown)
test-A/out.tsv | 23284 lines (file diff suppressed because it is too large)