commit 3b0cab7eef (parent 64d2b9ebda)

    zad12

dev-0/out.tsv | 21694 lines (file diff suppressed because it is too large)
lab12.ipynb | 443 lines (new file)
@@ -0,0 +1,443 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "W8-j-5oV0o46",
    "outputId": "5cf81efc-7e9b-46a6-d3bd-792a4b4b39b9"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
      "Collecting transformers\n",
      "  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m105.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.0)\n",
      "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n",
      "  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n",
      "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n",
      "  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m117.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n",
      "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.4.0)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.5.0)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.15)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2022.12.7)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.12)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n",
      "Installing collected packages: tokenizers, huggingface-hub, transformers\n",
      "Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2\n",
      "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
      "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)\n",
      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.12.0)\n",
      "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n",
      "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.11.1)\n",
      "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.1)\n",
      "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.2)\n",
      "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.0.0)\n",
      "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (3.25.2)\n",
      "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch) (16.0.5)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.2)\n",
      "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install transformers\n",
    "!pip install torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "O6aa5mpE0s6H",
    "outputId": "18112d31-6a14-4b91-b9db-44ea197c8d0c"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Cloning into 'challenging-america-word-gap-prediction'...\n",
      "remote: Enumerating objects: 27, done.\u001b[K\n",
      "remote: Counting objects: 100% (27/27), done.\u001b[K\n",
      "remote: Compressing objects: 100% (23/23), done.\u001b[K\n",
      "remote: Total 27 (delta 2), reused 17 (delta 0), pack-reused 0\u001b[K\n",
      "Receiving objects: 100% (27/27), 278.33 MiB | 8.52 MiB/s, done.\n",
      "Resolving deltas: 100% (2/2), done.\n"
     ]
    }
   ],
   "source": [
    "!git clone --single-branch git://gonito.net/challenging-america-word-gap-prediction -b master"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "uHkXCRs-0iSr"
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "import sys\n",
    "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "HyKM4zn41YvQ"
   },
   "outputs": [],
   "source": [
    "import lzma\n",
    "from itertools import islice\n",
    "import regex as re\n",
    "import sys\n",
    "from torchtext.vocab import build_vocab_from_iterator\n",
    "from torch import nn\n",
    "from torch.utils.data import IterableDataset\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "-k8RhlmI06mQ",
    "outputId": "e2ef4117-5d5b-40e9-f774-9faba825042c"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "/content/challenging-america-word-gap-prediction\n"
     ]
    }
   ],
   "source": [
    "%cd /content/challenging-america-word-gap-prediction"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
   ],
   "metadata": {
    "id": "PCA7Ank2dnwM"
   },
   "execution_count": 28,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Load the pretrained GPT-2 tokenizer and LM, moving the model to the device\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
    "model = GPT2LMHeadModel.from_pretrained(\"gpt2\").to(device)"
   ],
   "metadata": {
    "id": "U0kG_W5AY7uE"
   },
   "execution_count": 32,
   "outputs": []
  },
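  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal sketch, not part of the original pipeline: switch off dropout\n",
    "# (eval mode) and gradient tracking (no_grad) before the per-line\n",
    "# predictions below, which keeps outputs deterministic and saves memory.\n",
    "# The prompt here is arbitrary, chosen only for the shape check.\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    ids = tokenizer.encode('the next word', return_tensors='pt').to(device)\n",
    "    print(model(ids).logits.shape)  # (batch, seq_len, vocab_size)"
   ]
  },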
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "id": "F4MXeKLxMQ4N"
   },
   "outputs": [],
   "source": [
    "def prediction(word: str) -> str:\n",
    "    # Encode the left context and run it through GPT-2\n",
    "    left_context = tokenizer.encode(word, return_tensors=\"pt\").to(device)\n",
    "    out = model(left_context)\n",
    "    # out[0] is (batch, seq_len, vocab); take the last batch entry and\n",
    "    # softmax over the vocabulary at every position\n",
    "    prob_dist = torch.softmax(out[0][-1], dim=1)\n",
    "    # top-5 candidates at the final position\n",
    "    values, index = prob_dist.topk(5)\n",
    "    tokens = []\n",
    "    for x in index[-1]:\n",
    "        tokens.append(tokenizer.decode(x))\n",
    "    zipped = list(zip(values[-1], tokens))\n",
    "    # If an '<unk>' token shows up, move it to the catch-all slot at the\n",
    "    # end; otherwise turn the weakest candidate into the catch-all\n",
    "    unk = None\n",
    "    for i, element in enumerate(zipped):\n",
    "        if element[1] == '<unk>':\n",
    "            unk = zipped.pop(i)\n",
    "            zipped.append(('', unk[1]))\n",
    "            break\n",
    "    if unk is None:\n",
    "        zipped[-1] = ('', zipped[-1][1])\n",
    "    return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])"
   ]
  },
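  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check (a sketch; the word is arbitrary). Each line that\n",
    "# prediction() returns is a space-separated list of probability:token\n",
    "# pairs, with the last pair's probability slot emptied as a catch-all.\n",
    "print(prediction('the'))"
   ]
  },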
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "id": "My_1_4L5MMc3"
   },
   "outputs": [],
   "source": [
    "def create_outputs(folder_name):\n",
    "    print(f'Creating outputs in {folder_name}')\n",
    "    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
    "        with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\\n') as f:\n",
    "            for line in fid:\n",
    "                separated = line.split('\\t')\n",
    "                # column 6 holds the left context; keep only its last word\n",
    "                prefix = separated[6].replace(r'\\n', ' ').split()[-1]\n",
    "                output_line = prediction(prefix)\n",
    "                f.write(output_line + '\\n')"
   ]
  },
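  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustration only, on a made-up line (not from the real data): the\n",
    "# challenge TSV keeps the left context in column 6, with literal '\\\\n'\n",
    "# markers standing in for line breaks; create_outputs() keeps just the\n",
    "# final word of that context as the prompt.\n",
    "example = 'id\\tx\\tx\\tx\\tx\\tx\\tthe quick brown\\tright context'\n",
    "fields = example.split('\\t')\n",
    "print(fields[6].replace(r'\\n', ' ').split()[-1])  # -> brown"
   ]
  },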
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "4VQPcLF-OChJ",
    "outputId": "7a7408b2-ad26-4041-887e-99a8e9d36d0f"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Creating outputs in dev-0\n",
      "Creating outputs in test-A\n"
     ]
    }
   ],
   "source": [
    "create_outputs('dev-0')\n",
    "create_outputs('test-A')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "YCGOd41pzfAC"
   },
   "outputs": [],
   "source": [
    "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
    "model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n",
    "\n",
    "\n",
    "def get_words_from_line(line):\n",
    "    line = line.rstrip()\n",
    "    yield '<s>'\n",
    "    for t in line.split():\n",
    "        yield t\n",
    "    yield '</s>'\n",
    "\n",
    "\n",
    "def get_word_lines_from_file(file_name):\n",
    "    with lzma.open(file_name, encoding='utf8', mode=\"rt\") as fh:\n",
    "        for line in fh:\n",
    "            pattern = r'\\^\\^|\\n|\\\\|[<>]|[()]'\n",
    "            line = re.sub(pattern, '', line)\n",
    "            yield line\n",
    "\n",
    "\n",
    "for line in get_word_lines_from_file(\"train/in.tsv.xz\"):\n",
    "    # line = line.strip('\\n')\n",
    "    # fields = line.split(\"\\t\")\n",
    "    # print(line)\n",
    "    left_context = str(line)\n",
    "    input_ids = tokenizer.encode(left_context, return_tensors=\"pt\")\n",
    "    # print(input_ids)\n",
    "    output = model(input_ids)\n",
    "    # print(output[0].shape())\n",
    "    prob_dist = torch.softmax(output[0][-1], dim=1)\n",
    "    values, index = prob_dist.topk(20)\n",
    "    print(left_context[-100:])\n",
    "    print(values.size())\n",
    "    print(index.size())\n",
    "    break\n",
    "    for x, indx in zip(values, index):\n",
    "        for i in range(20):\n",
    "            token = tokenizer.decode(indx[i])\n",
    "            print(f'{x[i]} {indx[i]} {token}')\n",
    "        print('-------------------------')"
   ]
  },
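  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# An equivalent but arguably clearer variant (a sketch, not part of the\n",
    "# pipeline; the prompt is arbitrary): index the logits as\n",
    "# [batch, position, vocab] and softmax only the final position's row\n",
    "# instead of every position at once.\n",
    "with torch.no_grad():\n",
    "    input_ids = tokenizer.encode('he went to the', return_tensors='pt')\n",
    "    last_logits = model(input_ids).logits[0, -1]  # (vocab_size,)\n",
    "    last_dist = torch.softmax(last_logits, dim=0)\n",
    "    top = last_dist.topk(5)\n",
    "    for p, i in zip(top.values.tolist(), top.indices.tolist()):\n",
    "        print(f'{p:.4f} {tokenizer.decode(i)!r}')"
   ]
  },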
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "01zkM5giNUR3"
   },
   "outputs": [],
   "source": [
    "# line = line.strip('\\n')\n",
    "# fields = line.split(\"\\t\")\n",
    "# print(line)\n",
    "left_context = \"he\"\n",
    "input_ids = tokenizer.encode(left_context, return_tensors=\"pt\")\n",
    "# print(input_ids)\n",
    "output = model(input_ids)\n",
    "# print(output[0].shape())\n",
    "prob_dist = torch.softmax(output[0][-1], dim=1)\n",
    "values, index = prob_dist.topk(5)\n",
    "tokens = []\n",
    "for x in index[-1]:\n",
    "    tokens.append(tokenizer.decode(x))\n",
    "    # print(tokens)\n",
    "for x, token in zip(values[-1], tokens):\n",
    "    # token = tokenizer.decode(indx)\n",
    "    print(f'{x} {token}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lDc9Nw40C3dr"
   },
   "outputs": [],
   "source": [
    "for line in get_word_lines_from_file(\"dev-0/in.tsv.xz\"):\n",
    "    # line = line.strip('\\n')\n",
    "    # fields = line.split(\"\\t\")\n",
    "    # print(line)\n",
    "    left_context = str(line)\n",
    "    input_ids = tokenizer.encode(left_context, return_tensors=\"pt\")\n",
    "    # print(input_ids)\n",
    "    output = model(input_ids)\n",
    "    # print(output[0].shape())\n",
    "    prob_dist = torch.softmax(output[0][-1], dim=1)\n",
    "    values, index = prob_dist.topk(20)\n",
    "    print(left_context[-100:])\n",
    "    # print(values.size())\n",
    "    # print(index.size())\n",
    "    # break\n",
    "    for x, indx in zip(values[-1], index[-1]):\n",
    "        token = tokenizer.decode(indx)\n",
    "        print(f'{x} {indx} {token}')\n",
    "    print('-------------------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "si7wLC2Tx-kg"
   },
   "outputs": [],
   "source": [
    "token = tokenizer.decode(256)\n",
    "print(token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lJoE0Cwz0JCM"
   },
   "outputs": [],
   "source": [
    "top_indices[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tgmT1vG20U_1"
   },
   "outputs": [],
   "source": [
    "top_probs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "U9GVSAZz4SlW"
   },
   "outputs": [],
   "source": [
    "# note: assumes prob_dist holds a single position's distribution (1-D);\n",
    "# decode() expects a flat list of ids\n",
    "top = prob_dist.topk(20)\n",
    "top_indices = top.indices.tolist()\n",
    "top_probs = top.values.tolist()\n",
    "top_words = tokenizer.decode(top_indices)\n",
    "print(top_words, '\\n', top_indices, '\\n', top_probs)"
   ]
  },
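  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch for the 2-D case (one distribution per position, as in the\n",
    "# exploration cells above): decode ids one at a time instead of handing\n",
    "# decode() the nested topk list.\n",
    "last = prob_dist[-1].topk(20)\n",
    "for p, i in zip(last.values.tolist(), last.indices.tolist()):\n",
    "    print(f'{p:.4f} {tokenizer.decode(i)!r}')"
   ]
  },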
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8_WSZ_v99xSH"
   },
   "outputs": [],
   "source": [
    "print(index[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "OAiJNMNMwNNg"
   },
   "outputs": [],
   "source": [
    "print(prob_dist.topk(2)[0].size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PIUjH8-ow1y9"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
model1.bin | BIN (binary file not shown)
test-A/out.tsv | 23284 lines (file diff suppressed because it is too large)