{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "W8-j-5oV0o46", "outputId": "2b90f9b1-acb4-45e2-9a2f-2818e6fe03b8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n", "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n", "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n", "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting transformers\n", " Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m91.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n", "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n", " Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m111.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n", " Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m73.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.4.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.5.0)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Installing collected packages: tokenizers, safetensors, huggingface-hub, transformers\n", "Successfully installed huggingface-hub-0.15.1 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting datasets\n", " Downloading datasets-2.13.0-py3-none-any.whl (485 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m485.6/485.6 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.22.4)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", "Collecting dill<0.3.7,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.27.1)\n", "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.65.0)\n", "Collecting xxhash (from datasets)\n", " Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m25.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting multiprocess (from datasets)\n", " Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.4.0)\n", "Collecting aiohttp (from datasets)\n", " Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m32.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: huggingface-hub<1.0.0,>=0.11.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.15.1)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.0.12)\n", "Collecting multidict<7.0,>=4.5 (from aiohttp->datasets)\n", " Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->datasets)\n", " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", "Collecting yarl<2.0,>=1.0 (from aiohttp->datasets)\n", " Downloading yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (268 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting frozenlist>=1.1.1 (from aiohttp->datasets)\n", " Downloading frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (149 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m19.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting aiosignal>=1.1.2 (from aiohttp->datasets)\n", " Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (3.12.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (4.5.0)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (1.26.15)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2022.12.7)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.4)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2022.7.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n", "Installing collected packages: xxhash, multidict, frozenlist, dill, async-timeout, yarl, multiprocess, aiosignal, aiohttp, datasets\n", "Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 datasets-2.13.0 dill-0.3.6 frozenlist-1.3.3 multidict-6.0.4 multiprocess-0.70.14 xxhash-3.2.0 yarl-1.9.2\n" ] } ], "source": [ "!pip install torch\n", "!pip install transformers\n", "!pip install datasets" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "O6aa5mpE0s6H", "outputId": "43b0e006-1678-41f1-9b40-6964f9127a74" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cloning into 'challenging-america-word-gap-prediction'...\n", "remote: Wymienianie obiektów: 27, gotowe.\u001b[K\n", "remote: Zliczanie obiektów: 100% (27/27), gotowe.\u001b[K\n", "remote: Kompresowanie obiektów: 100% (23/23), gotowe.\u001b[K\n", "remote: Razem 27 (delty 2), użyte ponownie 17 (delty 0), paczki użyte ponownie 0\u001b[K\n", "Receiving objects: 100% (27/27), 278.33 MiB | 21.80 MiB/s, done.\n", "Resolving deltas: 100% (2/2), done.\n" ] } ], "source": [ "!git clone --single-branch git://gonito.net/challenging-america-word-gap-prediction -b master" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "uHkXCRs-0iSr" }, "outputs": [], "source": [ "import torch\n", "import sys\n", "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "HyKM4zn41YvQ" }, "outputs": [], "source": [ "import torch\n", "import sys\n", "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n", "\n", "import lzma\n", "from itertools import islice\n", "import regex as re\n", "import math\n", "import sys\n", "from torchtext.vocab import build_vocab_from_iterator\n", "from torch import nn\n", "from torch.utils.data import IterableDataset\n", "import itertools" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-k8RhlmI06mQ", "outputId": "39d0fc5d-c6ea-4f4b-e75e-cbe199bd175a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/content/challenging-america-word-gap-prediction\n" ] } ], "source": [ "%cd /content/challenging-america-word-gap-prediction" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "PCA7Ank2dnwM" }, "outputs": [], "source": [ "\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 177, "referenced_widgets": [ "679cc8100859428eaed49abdb40f93d1", "c4a853a74ab44187a321bba0e0c242a9", "2196c28d51fc4e7da12de6b95f915c83", "eab433dc0a754595be140a7bebac7d53", "2716e1a313324f17a613d453496da711", "493d2cdcbe1048c498818ba831518823", "c0683a76665147959ac3590d686dac79", "1f4322bee4ab472ab079e125367565f9", "fbe1a50893a440ac9f94da7da561724c", "6680540157cb4a408f23f5139b5afb7e", "7f8864f4e2194bb19ae2259b8fcae171", "744ae4c109ee48fb9b5dfd20174a1ba8", "069716cdd6a249389631e7372e8bdd1f", "2a4ab12c36134facafed947c59e56c1f", "db7297bd37e140978362223b898b296e", "75b0b357c261489e9de1013f9e744d7b", "f6fcf563441841e382b22946a0c62141", "2f34c38768f4451490eceea17929ea28", "59faac5f23584f419e76f46ff2c67eb8", "42434cce3dfc4d71a822bad1c25403ce", "e2804fce63cc45caa0a0b10a454b8d5e", "137db27055614a15b3254c70caa394ba", "54c307e63f804593ad0a31a5cdae1cd1", "6933bf7f852148a790a8cda8d6a9c249", "632496f950244273b54b25aecd11c744", "bde223731a20426a91cf605fdf2b31bf", "12d37d60b9fe479c9f2e1b8fcff1a9da", "3f17bc5fc47140239f82d75bbb464dbd", "07e02b1760ff4ba1a56e53f69f906059", "5717dca4a03b41f3af7a31807ce4614e", "82dd58f03b1a45c4ba1139ed5e24aab7", "f1dc48052bf9460a9687f53f81b68a77", "f033acb2cb564a27af86761ecb92cf94", "5f6148c4123e40ce83c418b612bab240", "9fed501af07041589bdf1cb2e1ae43f6", "a498757719ba41a3ba9cc3d7b82ce708", "1ae0c81fc80e424a986f872d37dfd105", "bc4479f17c904fab8c48437893860666", "9eda7ec9e0654d0ebd69b893b2a82446", "3fc6686709e54b08a136857ac6746e0f", "45a0c58a53644682921c0b7719a5fbef", "3e907191449145a9be3ee67fa7608032", "65abb05649a643a78f74bf980e733544", "10024f2ae6b64ae3b6fbcf9f648fa606", "47f2c295b55544358873b0c10e86c3ae", "dc857be677954e33af409b3ad0eb8bad", "ea664560c3c644bcb1e9dd9aa10fa226", "5e754b29dfab4ce0993b352f0db2ae74", "410b775106004eb28f186d2465271ee8", "86383e136b274ef5b508e863bf0ad8f9", "b46a15fb1d574ed9a2271746fad0ab49", "6c4bf13f3bc741a3846cbcb303bd3a4b", "c088cf42853849e3a368c6079c2ffed3", "03509301c905465faff707374157194b", "d99a5d547a9a4e1aad28d7bced59712e" ] }, "id": "U0kG_W5AY7uE", "outputId": "578418b8-11d6-4db0-c55a-1084b895b3ce" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "679cc8100859428eaed49abdb40f93d1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)olve/main/vocab.json: 0%| | 0.00/1.04M [00:00