{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPT-2 next-token prediction\n",
    "\n",
    "For every row of `dev-0/in.tsv.xz` and `test-A/in.tsv.xz`, one column of the row is fed to `gpt2-medium` and the `K` most probable next tokens are written to `out-tr-dec.tsv` in the same folder, one line per input row, as space-separated `token:probability` pairs followed by `:remaining_probability`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lzma\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n",
    "\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.__version__, device"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_NAME = 'gpt2-medium'\n",
    "K = 20  # number of top tokens written per input row\n",
    "DATASET_DIRS = ('dev-0', 'test-A')\n",
    "# Column of in.tsv that is fed to the model. The code treats it as the left\n",
    "# context of the word to predict -- TODO confirm against the dataset description.\n",
    "LEFT_CONTEXT_COLUMN = 6"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load tokenizer and model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)\n",
    "\n",
    "model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)\n",
    "model.to(device)\n",
    "model.eval()  # inference only: make sure dropout is switched off"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def file_iterator(file_path):\n",
    "    \"\"\"Yield the lines of ``file_path`` one at a time as ``str``.\n",
    "\n",
    "    Paths ending in ``.xz`` are decompressed with ``lzma`` and each line is\n",
    "    decoded as UTF-8; any other path is opened as UTF-8 text. Lines keep\n",
    "    their trailing newline.\n",
    "    \"\"\"\n",
    "    if file_path.endswith(\".xz\"):\n",
    "        with lzma.open(file_path, mode=\"r\") as fp:\n",
    "            # Iterate the handle instead of calling readlines() so the whole\n",
    "            # decompressed file is never held in memory at once.\n",
    "            for raw_line in fp:\n",
    "                yield raw_line.decode(\"utf-8\")\n",
    "    else:\n",
    "        with open(file_path, \"r\", encoding=\"utf-8\") as fp:\n",
    "            yield from fp\n",
    "\n",
    "\n",
    "def clear_line(line):\n",
    "    \"\"\"Lower-case ``line``, strip leading/trailing newlines and drop literal backslash-n sequences.\"\"\"\n",
    "    return line.lower().strip(\"\\n\").replace(\"\\\\n\", \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_top_k(left_context, k=K):\n",
    "    \"\"\"Return the ``k`` most probable next tokens after ``left_context``.\n",
    "\n",
    "    Returns a tuple ``(token_ids, probabilities, remainder)`` where\n",
    "    ``token_ids`` and ``probabilities`` are parallel lists of length ``k``\n",
    "    (most probable first) and ``remainder`` is one minus the sum of the\n",
    "    returned probabilities.\n",
    "    \"\"\"\n",
    "    token_ids = tokenizer.encode(left_context)\n",
    "    if not token_ids:\n",
    "        # An empty context would hand the model a zero-length sequence;\n",
    "        # condition on the beginning-of-sequence token instead.\n",
    "        token_ids = [tokenizer.bos_token_id]\n",
    "    # The model only has n_positions position embeddings, so keep the most\n",
    "    # recent tokens; longer inputs would index past the embedding table.\n",
    "    token_ids = token_ids[-model.config.n_positions:]\n",
    "    inputs = torch.tensor([token_ids], device=device)\n",
    "\n",
    "    # No gradients are needed for prediction; skipping them avoids building\n",
    "    # an autograd graph for every row.\n",
    "    with torch.no_grad():\n",
    "        # Output element 0 holds the logits; take batch item 0, last position.\n",
    "        last_token_logits = model(inputs)[0][0, -1]\n",
    "\n",
    "    probabilities = torch.softmax(last_token_logits, dim=0)\n",
    "    top_k = probabilities.topk(k)\n",
    "    remainder = (1 - torch.sum(top_k.values)).item()\n",
    "    return top_k.indices.tolist(), top_k.values.tolist(), remainder\n",
    "\n",
    "\n",
    "def format_prediction(token_ids, probabilities, remainder):\n",
    "    \"\"\"Format one output line: ``token:prob`` pairs, then ``:remainder`` and a newline.\"\"\"\n",
    "    entries = [\n",
    "        f'{tokenizer.decode([token_id])}:{probability}'\n",
    "        for token_id, probability in zip(token_ids, probabilities)\n",
    "    ]\n",
    "    return ' '.join(entries) + f' :{remainder}\\n'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write predictions\n",
    "\n",
    "One output line is written per input row, in input order."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dataset_dir in DATASET_DIRS:\n",
    "    print('Working on file from folder:', dataset_dir)\n",
    "    with open(f'{dataset_dir}/out-tr-dec.tsv', 'w', encoding='utf-8') as fp:\n",
    "        for line in file_iterator(f'{dataset_dir}/in.tsv.xz'):\n",
    "            left_context = clear_line(line.split('\\t')[LEFT_CONTEXT_COLUMN])\n",
    "            fp.write(format_prediction(*predict_top_k(left_context)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mj_venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}