fix

cos tam
solution
2021-06-24 19:07:58 +02:00 · 2021-06-22 19:27:08 +02:00 · 2021-06-09 03:01:30 +02:00 · 2021-06-09 00:19:16 +02:00 · 2021-06-08 12:39:08 +02:00
11 changed files with 3124 additions and 0 deletions
--- a/.ipynb_checkpoints/Program-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Program-checkpoint.ipynb
@ -0,0 +1,438 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e574fca4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
      "  warnings.warn(msg)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import csv\n",
    "import os.path\n",
    "import shutil\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "from itertools import islice\n",
    "from sklearn.model_selection import train_test_split\n",
    "from torchtext.vocab import Vocab\n",
    "from collections import Counter\n",
    "from nltk.tokenize import word_tokenize\n",
    "import gensim.downloader as api\n",
    "from gensim.models.word2vec import Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b476f295",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting gensim\n",
      "  Downloading gensim-4.0.1-cp38-cp38-win_amd64.whl (23.9 MB)\n",
      "Requirement already satisfied: scipy>=0.18.1 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.6.2)\n",
      "Collecting Cython==0.29.21\n",
      "  Downloading Cython-0.29.21-cp38-cp38-win_amd64.whl (1.7 MB)\n",
      "Requirement already satisfied: numpy>=1.11.3 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.20.1)\n",
      "Collecting smart-open>=1.8.1\n",
      "  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)\n",
      "Installing collected packages: smart-open, Cython, gensim\n",
      "  Attempting uninstall: Cython\n",
      "    Found existing installation: Cython 0.29.23\n",
      "    Uninstalling Cython-0.29.23:\n",
      "      Successfully uninstalled Cython-0.29.23\n",
      "Successfully installed Cython-0.29.21 gensim-4.0.1 smart-open-5.1.0\n"
     ]
    }
   ],
   "source": [
    "!pip install gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fbe3a657",
   "metadata": {},
   "outputs": [],
   "source": [
    "class NERModel(torch.nn.Module):\n",
    "\n",
    "    def __init__(self,):\n",
    "        super(NERModel, self).__init__()\n",
    "        self.emb = torch.nn.Embedding(23628,200)\n",
    "        self.fc1 = torch.nn.Linear(600,9)\n",
    "        \n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.emb(x)\n",
    "        x = x.reshape(600) \n",
    "        x = self.fc1(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3497a580",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_output(lines):\n",
    "    result = []\n",
    "    for line in lines:\n",
    "        last_label = None\n",
    "        new_line = []\n",
    "        for label in line:\n",
    "            if(label != \"O\" and label[0:2] == \"I-\"):\n",
    "                if last_label == None or last_label == \"O\":\n",
    "                    label = label.replace('I-', 'B-')\n",
    "                else:\n",
    "                    label = \"I-\" + last_label[2:]\n",
    "            last_label = label\n",
    "            new_line.append(label)\n",
    "            x = (\" \".join(new_line))\n",
    "        result.append(\" \".join(new_line))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3e78d902",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_vocab(dataset):\n",
    "    counter = Counter()\n",
    "    for document in dataset:\n",
    "        counter.update(document)\n",
    "    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ec8537cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def data_process(dt):\n",
    "    return [ torch.tensor([vocab['<bos>']] +[vocab[token]  for token in  document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "847c958a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def labels_process(dt):\n",
    "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "66bee163",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(input_tokens, labels):\n",
    "\n",
    "  results = []\n",
    "  \n",
    "  for i in range(len(input_tokens)):\n",
    "    line_results = []\n",
    "    for j in range(1, len(input_tokens[i]) - 1):\n",
    "        x = input_tokens[i][j-1: j+2].to(device_gpu)\n",
    "        predicted = ner_model(x.long())\n",
    "        result = torch.argmax(predicted)\n",
    "        label = labels[result]\n",
    "        line_results.append(label)\n",
    "    results.append(line_results)\n",
    "\n",
    "  return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "39046f3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9b40a8b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'] \n",
    "train[\"a\"]=train[\"a\"].apply(lambda x: [labels.index(y) for y in  x.split()])\n",
    "train[\"b\"]=train[\"b\"].apply(lambda x: x.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "02a12cbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = build_vocab(train['b'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8cc6d19d",
   "metadata": {},
   "outputs": [],
   "source": [
    "  tensors = []\n",
    "\n",
    "  for sent in train[\"b\"]:\n",
    "    sent_tensor = torch.tensor(())\n",
    "    for word in sent:\n",
    "      temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
    "      sent_tensor = torch.cat((sent_tensor, temp))\n",
    "\n",
    "    tensors.append(sent_tensor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "690085f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NVIDIA GeForce RTX 2060'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "torch.cuda.get_device_name(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "64b2d751",
   "metadata": {},
   "outputs": [],
   "source": [
    "device_gpu = torch.device(\"cuda:0\")\n",
    "ner_model = NERModel().to(device_gpu)\n",
    "criterion = torch.nn.CrossEntropyLoss()\n",
    "optimizer = torch.optim.Adam(ner_model.parameters())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "094d7e69",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_labels = labels_process(train['a'])\n",
    "train_tokens_ids = data_process(train['b'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "17291b41",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "045b7186",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0\n",
      "f1: 0.6373470953763748\n",
      "acc: 0.9116419913061858\n",
      "epoch: 1\n",
      "f1: 0.7973076923076923\n",
      "acc: 0.9540771782783307\n",
      "epoch: 2\n",
      "f1: 0.8640167364016735\n",
      "acc: 0.9702287410511612\n",
      "epoch: 3\n",
      "f1: 0.9038441719055962\n",
      "acc: 0.9793820591289644\n",
      "epoch: 4\n",
      "f1: 0.928903400400047\n",
      "acc: 0.9850890978100043\n"
     ]
    }
   ],
   "source": [
    "for epoch in range(5):\n",
    "    acc_score = 0\n",
    "    prec_score = 0\n",
    "    selected_items = 0\n",
    "    recall_score = 0\n",
    "    relevant_items = 0\n",
    "    items_total = 0\n",
    "    ner_model.train()\n",
    "    for i in range(len(train_labels)):\n",
    "        for j in range(1, len(train_labels[i]) - 1):\n",
    "            X = train_tensors[i][j - 1: j + 2].to(device_gpu)\n",
    "\n",
    "            Y = train_labels[i][j: j + 1].to(device_gpu)\n",
    "\n",
    "            Y_predictions = ner_model(X.long())\n",
    "\n",
    "            acc_score += int(torch.argmax(Y_predictions) == Y)\n",
    "            if torch.argmax(Y_predictions) != 0:\n",
    "                selected_items += 1\n",
    "            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
    "                prec_score += 1\n",
    "            if Y.item() != 0:\n",
    "                relevant_items += 1\n",
    "            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
    "                recall_score += 1\n",
    "\n",
    "            items_total += 1\n",
    "            optimizer.zero_grad()\n",
    "            loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "    precision = prec_score / selected_items\n",
    "    recall = recall_score / relevant_items\n",
    "    f1_score = (2 * precision * recall) / (precision + recall)\n",
    "    print(f'epoch: {epoch}')\n",
    "    print(f'f1: {f1_score}')\n",
    "    print(f'acc: {acc_score / items_total}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "f75aa5e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_tensors_list(data):\n",
    "  tensors = []\n",
    "\n",
    "  for sent in data[\"a\"]:\n",
    "    sent_tensor = torch.tensor(())\n",
    "    for word in sent:\n",
    "      temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
    "      sent_tensor = torch.cat((sent_tensor, temp))\n",
    "\n",
    "    tensors.append(sent_tensor)\n",
    "\n",
    "  return tensors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "49215802",
   "metadata": {},
   "outputs": [],
   "source": [
    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
    "dev[\"a\"] = dev[\"a\"].apply(lambda x: x.split())\n",
    "\n",
    "dev_tokens_ids = data_process(dev[\"a\"])\n",
    "\n",
    "dev_extra_tensors = create_tensors_list(dev)\n",
    "\n",
    "dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]\n",
    "\n",
    "results = predict(dev_tensors, labels)\n",
    "results_processed = process_output(results)\n",
    "\n",
    "with open(\"dev-0/out.tsv\", \"w\") as f:\n",
    "  for line in results_processed:\n",
    "    f.write(line + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "8c5b007e",
   "metadata": {},
   "outputs": [],
   "source": [
    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])\n",
    "test[\"a\"] = test[\"a\"].apply(lambda x: x.split())\n",
    "\n",
    "test_tokens_ids = data_process(test[\"a\"])\n",
    "\n",
    "test_extra_tensors = create_tensors_list(test)\n",
    "\n",
    "test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]\n",
    "\n",
    "results = predict(test_tensors, labels)\n",
    "results_processed = process_output(results)\n",
    "\n",
    "with open(\"test-A/out.tsv\", \"w\") as f:\n",
    "  for line in results_processed:\n",
    "    f.write(line + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
@ -0,0 +1,6 @@
 {
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/.ipynb_checkpoints/gru-checkpoint.ipynb
+++ b/.ipynb_checkpoints/gru-checkpoint.ipynb
@ -0,0 +1,376 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bce0cfa7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
      "  warnings.warn(msg)\n"
     ]
    }
   ],
   "source": [
    "from os import sep\n",
    "from nltk import word_tokenize\n",
    "import pandas as pd\n",
    "import torch\n",
    "from TorchCRF import CRF\n",
    "import gensim\n",
    "from torch._C import device\n",
    "from tqdm import tqdm\n",
    "from torchtext.vocab import Vocab\n",
    "from collections import Counter, OrderedDict\n",
    "import spacy\n",
    "\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "import numpy as np\n",
    "from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
    "import csv\n",
    "import pickle\n",
    "\n",
    "import lzma\n",
    "import re\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "67ace382",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pytorch-crf in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (0.7.2)\n"
     ]
    }
   ],
   "source": [
    "!pip3 install pytorch-crf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "adc9a4de",
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'torchcrf'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-3-2a643b4fc1bb>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     19\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 20\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mtorchcrf\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mCRF\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torchcrf'"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import gensim\n",
    "import torch\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from torchtext.vocab import Vocab\n",
    "from collections import Counter\n",
    "\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "import torch\n",
    "from torchcrf import CRF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6695751c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_vocab(dataset):\n",
    "    counter = Counter()\n",
    "    for document in dataset:\n",
    "        counter.update(document)\n",
    "    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d247e4fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "def data_process(dt, vocab):\n",
    "    return [torch.tensor([vocab[token] for token in document], dtype=torch.long) for document in dt]\n",
    "\n",
    "\n",
    "def get_scores(y_true, y_pred):\n",
    "    acc_score = 0\n",
    "    tp = 0\n",
    "    fp = 0\n",
    "    selected_items = 0\n",
    "    relevant_items = 0\n",
    "    for p, t in zip(y_pred, y_true):\n",
    "        if p == t:\n",
    "            acc_score += 1\n",
    "        if p > 0 and p == t:\n",
    "            tp += 1\n",
    "        if p > 0:\n",
    "            selected_items += 1\n",
    "        if t > 0:\n",
    "            relevant_items += 1\n",
    "\n",
    "    if selected_items == 0:\n",
    "        precision = 1.0\n",
    "    else:\n",
    "        precision = tp / selected_items\n",
    "\n",
    "    if relevant_items == 0:\n",
    "        recall = 1.0\n",
    "    else:\n",
    "        recall = tp / relevant_items\n",
    "\n",
    "    if precision + recall == 0.0:\n",
    "        f1 = 0.0\n",
    "    else:\n",
    "        f1 = 2 * precision * recall / (precision + recall)\n",
    "\n",
    "    return precision, recall, f1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6061642",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_output(lines):\n",
    "    result = []\n",
    "    for line in lines:\n",
    "        last_label = None\n",
    "        new_line = []\n",
    "        for label in line:\n",
    "            if(label != \"O\" and label[0:2] == \"I-\"):\n",
    "                if last_label == None or last_label == \"O\":\n",
    "                    label = label.replace('I-', 'B-')\n",
    "                else:\n",
    "                    label = \"I-\" + last_label[2:]\n",
    "            last_label = label\n",
    "            new_line.append(label)\n",
    "            x = (\" \".join(new_line))\n",
    "        result.append(\" \".join(new_line))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d7c4dd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "class GRU(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(GRU, self).__init__()\n",
    "        self.emb = torch.nn.Embedding(len(vocab_x.itos),100)\n",
    "        self.dropout = torch.nn.Dropout(0.2)\n",
    "        self.rec = torch.nn.GRU(100, 256, 2, batch_first = True, bidirectional = True)\n",
    "        self.fc1 = torch.nn.Linear(2* 256 , 9)\n",
    "        \n",
    "    def forward(self, x):\n",
    "        emb = torch.relu(self.emb(x))\n",
    "        emb = self.dropout(emb)        \n",
    "        gru_output, h_n = self.rec(emb)        \n",
    "        out_weights = self.fc1(gru_output)\n",
    "        return out_weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd5e419d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dev_eval(model, crf, dev_tokens, dev_labels_tokens, vocab):\n",
    "    Y_true = []\n",
    "    Y_pred = []\n",
    "    model.eval()\n",
    "    crf.eval()\n",
    "    for i in tqdm(range(len(dev_labels_tokens))):\n",
    "        batch_tokens = dev_tokens[i].unsqueeze(0)\n",
    "        tags = list(dev_labels_tokens[i].numpy())\n",
    "        Y_true += tags\n",
    "\n",
    "        Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
    "        Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)\n",
    "        Y_pred += [crf.decode(Y_batch_pred)[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c808bbd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('train/train.tsv', sep='\\t',\n",
    "                    names=['labels', 'document'])\n",
    "\n",
    "Y_train = [y.split(sep=\" \") for y in train['labels'].values]\n",
    "X_train = [x.split(sep=\" \") for x in train['document'].values]\n",
    "\n",
    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['document'])\n",
    "exp = pd.read_csv('dev-0/expected.tsv', sep='\\t', names=['labels'])\n",
    "X_dev = [x.split(sep=\" \") for x in dev['document'].values]\n",
    "Y_dev = [y.split(sep=\" \") for y in exp['labels'].values]\n",
    "\n",
    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['document'])\n",
    "X_test = test['document'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79485c9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_x = build_vocab(X_train)\n",
    "vocab_y = build_vocab(Y_train)\n",
    "train_tokens = data_process(X_train, vocab_x)\n",
    "labels_tokens = data_process(Y_train, vocab_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3726c82a",
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.cuda.get_device_name(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f29e3b63",
   "metadata": {},
   "outputs": [],
   "source": [
    "device_gpu = torch.device(\"cuda:0\")\n",
    "model = GRU()\n",
    "crf = CRF(9)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c321d58",
   "metadata": {},
   "outputs": [],
   "source": [
    "mask = torch.ByteTensor([1, 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05482a7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "criterion = torch.nn.CrossEntropyLoss()\n",
    "params = list(model.parameters()) + list(crf.parameters())\n",
    "optimizer = torch.optim.Adam(params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21a5282e",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(2):\n",
    "    crf.train()\n",
    "    model.train()\n",
    "    for i in tqdm(range(len(labels_tokens))):\n",
    "        batch_tokens = train_tokens[i].unsqueeze(0)\n",
    "        tags = labels_tokens[i].unsqueeze(1)\n",
    "\n",
    "        predicted_tags = model(batch_tokens).squeeze(0).unsqueeze(1)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        loss = -crf(predicted_tags, tags)\n",
    "\n",
    "        loss.backward()\n",
    "        optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cec14c35",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip3 install pytorch-crf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ee634f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torchcrf import CRF"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/Program.ipynb
+++ b/Program.ipynb
@ -0,0 +1,418 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e574fca4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
      "  warnings.warn(msg)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import csv\n",
    "import os.path\n",
    "import shutil\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "from itertools import islice\n",
    "from sklearn.model_selection import train_test_split\n",
    "from torchtext.vocab import Vocab\n",
    "from collections import Counter\n",
    "from nltk.tokenize import word_tokenize\n",
    "import gensim.downloader as api\n",
    "from gensim.models.word2vec import Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fbe3a657",
   "metadata": {},
   "outputs": [],
   "source": [
    "class NERModel(torch.nn.Module):\n",
    "\n",
    "    def __init__(self,):\n",
    "        super(NERModel, self).__init__()\n",
    "        self.emb = torch.nn.Embedding(23628,200)\n",
    "        self.fc1 = torch.nn.Linear(600,9)\n",
    "        \n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.emb(x)\n",
    "        x = x.reshape(600) \n",
    "        x = self.fc1(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3497a580",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_output(lines):\n",
    "    result = []\n",
    "    for line in lines:\n",
    "        last_label = None\n",
    "        new_line = []\n",
    "        for label in line:\n",
    "            if(label != \"O\" and label[0:2] == \"I-\"):\n",
    "                if last_label == None or last_label == \"O\":\n",
    "                    label = label.replace('I-', 'B-')\n",
    "                else:\n",
    "                    label = \"I-\" + last_label[2:]\n",
    "            last_label = label\n",
    "            new_line.append(label)\n",
    "            x = (\" \".join(new_line))\n",
    "        result.append(\" \".join(new_line))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3e78d902",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_vocab(dataset):\n",
    "    counter = Counter()\n",
    "    for document in dataset:\n",
    "        counter.update(document)\n",
    "    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ec8537cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def data_process(dt):\n",
    "    return [ torch.tensor([vocab['<bos>']] +[vocab[token]  for token in  document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "847c958a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def labels_process(dt):\n",
    "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "66bee163",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(input_tokens, labels):\n",
    "\n",
    "  results = []\n",
    "  \n",
    "  for i in range(len(input_tokens)):\n",
    "    line_results = []\n",
    "    for j in range(1, len(input_tokens[i]) - 1):\n",
    "        x = input_tokens[i][j-1: j+2].to(device_gpu)\n",
    "        predicted = ner_model(x.long())\n",
    "        result = torch.argmax(predicted)\n",
    "        label = labels[result]\n",
    "        line_results.append(label)\n",
    "    results.append(line_results)\n",
    "\n",
    "  return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "39046f3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9b40a8b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'] \n",
    "train[\"a\"]=train[\"a\"].apply(lambda x: [labels.index(y) for y in  x.split()])\n",
    "train[\"b\"]=train[\"b\"].apply(lambda x: x.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "02a12cbd",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = build_vocab(train['b'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8cc6d19d",
   "metadata": {},
   "outputs": [],
   "source": [
    "  tensors = []\n",
    "\n",
    "  for sent in train[\"b\"]:\n",
    "    sent_tensor = torch.tensor(())\n",
    "    for word in sent:\n",
    "      temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
    "      sent_tensor = torch.cat((sent_tensor, temp))\n",
    "\n",
    "    tensors.append(sent_tensor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "690085f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NVIDIA GeForce RTX 2060'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "torch.cuda.get_device_name(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "64b2d751",
   "metadata": {},
   "outputs": [],
   "source": [
    "device_gpu = torch.device(\"cuda:0\")\n",
    "ner_model = NERModel().to(device_gpu)\n",
    "criterion = torch.nn.CrossEntropyLoss()\n",
    "optimizer = torch.optim.Adam(ner_model.parameters())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "094d7e69",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_labels = labels_process(train['a'])\n",
    "train_tokens_ids = data_process(train['b'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "17291b41",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "045b7186",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0\n",
      "f1: 0.6310260230881535\n",
      "acc: 0.9099004714510215\n",
      "epoch: 1\n",
      "f1: 0.7977381727751791\n",
      "acc: 0.9539025667888947\n",
      "epoch: 2\n",
      "f1: 0.8635445687583837\n",
      "acc: 0.9699162783858546\n",
      "epoch: 3\n",
      "f1: 0.9047002002591589\n",
      "acc: 0.9794417946385082\n",
      "epoch: 4\n",
      "f1: 0.9300697243387956\n",
      "acc: 0.9852774944170274\n"
     ]
    }
   ],
   "source": [
    "for epoch in range(5):\n",
    "    acc_score = 0\n",
    "    prec_score = 0\n",
    "    selected_items = 0\n",
    "    recall_score = 0\n",
    "    relevant_items = 0\n",
    "    items_total = 0\n",
    "    ner_model.train()\n",
    "    for i in range(len(train_labels)):\n",
    "        for j in range(1, len(train_labels[i]) - 1):\n",
    "            X = train_tensors[i][j - 1: j + 2].to(device_gpu)\n",
    "\n",
    "            Y = train_labels[i][j: j + 1].to(device_gpu)\n",
    "\n",
    "            Y_predictions = ner_model(X.long())\n",
    "\n",
    "            acc_score += int(torch.argmax(Y_predictions) == Y)\n",
    "            if torch.argmax(Y_predictions) != 0:\n",
    "                selected_items += 1\n",
    "            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
    "                prec_score += 1\n",
    "            if Y.item() != 0:\n",
    "                relevant_items += 1\n",
    "            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
    "                recall_score += 1\n",
    "\n",
    "            items_total += 1\n",
    "            optimizer.zero_grad()\n",
    "            loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "    precision = prec_score / selected_items\n",
    "    recall = recall_score / relevant_items\n",
    "    f1_score = (2 * precision * recall) / (precision + recall)\n",
    "    print(f'epoch: {epoch}')\n",
    "    print(f'f1: {f1_score}')\n",
    "    print(f'acc: {acc_score / items_total}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f75aa5e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_tensors_list(data):\n",
    "  tensors = []\n",
    "\n",
    "  for sent in data[\"a\"]:\n",
    "    sent_tensor = torch.tensor(())\n",
    "    for word in sent:\n",
    "      temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
    "      sent_tensor = torch.cat((sent_tensor, temp))\n",
    "\n",
    "    tensors.append(sent_tensor)\n",
    "\n",
    "  return tensors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "49215802",
   "metadata": {},
   "outputs": [],
   "source": [
    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
    "dev[\"a\"] = dev[\"a\"].apply(lambda x: x.split())\n",
    "\n",
    "dev_tokens_ids = data_process(dev[\"a\"])\n",
    "\n",
    "dev_extra_tensors = create_tensors_list(dev)\n",
    "\n",
    "dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]\n",
    "\n",
    "results = predict(dev_tensors, labels)\n",
    "results_processed = process_output(results)\n",
    "\n",
    "with open(\"dev-0/out.tsv\", \"w\") as f:\n",
    "  for line in results_processed:\n",
    "    f.write(line + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "8c5b007e",
   "metadata": {},
   "outputs": [],
   "source": [
    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])\n",
    "test[\"a\"] = test[\"a\"].apply(lambda x: x.split())\n",
    "\n",
    "test_tokens_ids = data_process(test[\"a\"])\n",
    "\n",
    "test_extra_tensors = create_tensors_list(test)\n",
    "\n",
    "test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]\n",
    "\n",
    "results = predict(test_tensors, labels)\n",
    "results_processed = process_output(results)\n",
    "\n",
    "with open(\"test-A/out.tsv\", \"w\") as f:\n",
    "  for line in results_processed:\n",
    "    f.write(line + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "000dd425",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_path = \"seq_labeling.model\"\n",
    "torch.save(ner_model.state_dict(), model_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/BIN
+++ b/BIN
--- a/gru.ipynb
+++ b/gru.ipynb
@ -0,0 +1,307 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bce0cfa7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from os import sep\n",
    "from nltk import word_tokenize\n",
    "import pandas as pd\n",
    "import torch\n",
    "from torchcrf import CRF\n",
    "import gensim\n",
    "from torch._C import device\n",
    "from tqdm import tqdm\n",
    "from torchtext.vocab import Vocab\n",
    "from collections import Counter, OrderedDict\n",
    "\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "import numpy as np\n",
    "from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
    "import csv\n",
    "import pickle\n",
    "\n",
    "import lzma\n",
    "import re\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6695751c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_vocab(dataset):\n",
    "    counter = Counter()\n",
    "    for document in dataset:\n",
    "        counter.update(document)\n",
    "    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d247e4fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "def data_process(dt, vocab):\n",
    "    return [torch.tensor([vocab[token] for token in document], dtype=torch.long) for document in dt]\n",
    "\n",
    "\n",
    "def get_scores(y_true, y_pred):\n",
    "    acc_score = 0\n",
    "    tp = 0\n",
    "    fp = 0\n",
    "    selected_items = 0\n",
    "    relevant_items = 0\n",
    "    for p, t in zip(y_pred, y_true):\n",
    "        if p == t:\n",
    "            acc_score += 1\n",
    "        if p > 0 and p == t:\n",
    "            tp += 1\n",
    "        if p > 0:\n",
    "            selected_items += 1\n",
    "        if t > 0:\n",
    "            relevant_items += 1\n",
    "\n",
    "    if selected_items == 0:\n",
    "        precision = 1.0\n",
    "    else:\n",
    "        precision = tp / selected_items\n",
    "\n",
    "    if relevant_items == 0:\n",
    "        recall = 1.0\n",
    "    else:\n",
    "        recall = tp / relevant_items\n",
    "\n",
    "    if precision + recall == 0.0:\n",
    "        f1 = 0.0\n",
    "    else:\n",
    "        f1 = 2 * precision * recall / (precision + recall)\n",
    "\n",
    "    return precision, recall, f1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b6061642",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_output(lines):\n",
    "    result = []\n",
    "    for line in lines:\n",
    "        last_label = None\n",
    "        new_line = []\n",
    "        for label in line:\n",
    "            if(label != \"O\" and label[0:2] == \"I-\"):\n",
    "                if last_label == None or last_label == \"O\":\n",
    "                    label = label.replace('I-', 'B-')\n",
    "                else:\n",
    "                    label = \"I-\" + last_label[2:]\n",
    "            last_label = label\n",
    "            new_line.append(label)\n",
    "            x = (\" \".join(new_line))\n",
    "        result.append(\" \".join(new_line))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3d7c4dd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "class GRU(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(GRU, self).__init__()\n",
    "        self.emb = torch.nn.Embedding(len(vocab_x.itos),100)\n",
    "        self.dropout = torch.nn.Dropout(0.2)\n",
    "        self.rec = torch.nn.GRU(100, 256, 2, batch_first = True, bidirectional = True)\n",
    "        self.fc1 = torch.nn.Linear(2* 256 , 9)\n",
    "        \n",
    "    def forward(self, x):\n",
    "        emb = torch.relu(self.emb(x))\n",
    "        emb = self.dropout(emb)        \n",
    "        gru_output, h_n = self.rec(emb)        \n",
    "        out_weights = self.fc1(gru_output)\n",
    "        return out_weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "cd5e419d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dev_eval(model, crf, dev_tokens, dev_labels_tokens, vocab):\n",
    "    Y_true = []\n",
    "    Y_pred = []\n",
    "    model.eval()\n",
    "    crf.eval()\n",
    "    for i in tqdm(range(len(dev_labels_tokens))):\n",
    "        batch_tokens = dev_tokens[i].unsqueeze(0)\n",
    "        tags = list(dev_labels_tokens[i].numpy())\n",
    "        Y_true += tags\n",
    "\n",
    "        Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
    "        Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)\n",
    "        Y_pred += [crf.decode(Y_batch_pred)[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c808bbd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('train/train.tsv', sep='\\t',\n",
    "                    names=['labels', 'document'])\n",
    "\n",
    "Y_train = [y.split(sep=\" \") for y in train['labels'].values]\n",
    "X_train = [x.split(sep=\" \") for x in train['document'].values]\n",
    "\n",
    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['document'])\n",
    "exp = pd.read_csv('dev-0/expected.tsv', sep='\\t', names=['labels'])\n",
    "X_dev = [x.split(sep=\" \") for x in dev['document'].values]\n",
    "Y_dev = [y.split(sep=\" \") for y in exp['labels'].values]\n",
    "\n",
    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['document'])\n",
    "X_test = test['document'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "79485c9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_x = build_vocab(X_train)\n",
    "vocab_y = build_vocab(Y_train)\n",
    "train_tokens = data_process(X_train, vocab_x)\n",
    "labels_tokens = data_process(Y_train, vocab_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f29e3b63",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = GRU()\n",
    "crf = CRF(9)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "05482a7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "criterion = torch.nn.CrossEntropyLoss()\n",
    "params = list(model.parameters()) + list(crf.parameters())\n",
    "optimizer = torch.optim.Adam(params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "21a5282e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/945 [00:00<?, ?it/s]\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "expected last dimension of emissions is 10, got 9",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-14-6dc1a1c63d46>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     10\u001b[0m         \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m         \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mcrf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpredicted_tags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m         \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    887\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    888\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 889\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    890\u001b[0m         for hook in itertools.chain(\n\u001b[1;32m    891\u001b[0m                 \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/torchcrf/__init__.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, emissions, tags, mask, reduction)\u001b[0m\n\u001b[1;32m     88\u001b[0m             \u001b[0mreduction\u001b[0m \u001b[0;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mnone\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0motherwise\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     89\u001b[0m         \"\"\"\n\u001b[0;32m---> 90\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0memissions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     91\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mreduction\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'none'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'mean'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'token_mean'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     92\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'invalid reduction: {reduction}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/torchcrf/__init__.py\u001b[0m in \u001b[0;36m_validate\u001b[0;34m(self, emissions, tags, mask)\u001b[0m\n\u001b[1;32m    147\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'emissions must have dimension of 3, got {emissions.dim()}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    148\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0memissions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_tags\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 149\u001b[0;31m             raise ValueError(\n\u001b[0m\u001b[1;32m    150\u001b[0m                 \u001b[0;34mf'expected last dimension of emissions is {self.num_tags}, '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    151\u001b[0m                 f'got {emissions.size(2)}')\n",
      "\u001b[0;31mValueError\u001b[0m: expected last dimension of emissions is 10, got 9"
     ]
    }
   ],
   "source": [
    "for i in range(2):\n",
    "    crf.train()\n",
    "    model.train()\n",
    "    for i in tqdm(range(len(labels_tokens))):\n",
    "        batch_tokens = train_tokens[i].unsqueeze(0)\n",
    "        tags = labels_tokens[i].unsqueeze(1)\n",
    "\n",
    "        predicted_tags = model(batch_tokens).squeeze(0).unsqueeze(1)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        loss = -crf(predicted_tags, tags)\n",
    "\n",
    "        loss.backward()\n",
    "        optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "366ab1fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_pred = []\n",
    "model.eval()\n",
    "crf.eval()\n",
    "for i in tqdm(range(len(test_tokens))):\n",
    "    batch_tokens = test_tokens[i].unsqueeze(0)\n",
    "\n",
    "    Y_batch_pred = model(batch_tokens).squeeze(0).unsqueeze(1)\n",
    "    Y_pred += [crf.decode(Y_batch_pred)[0]]\n",
    "\n",
    "Y_pred_translate = translate(Y_pred, vocab)\n",
    "return Y_pred_translate"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/seq_labeling.model
+++ b/seq_labeling.model
--- a/solution.py
+++ b/solution.py
@ -0,0 +1,189 @@
 import pandas as pd
 import numpy as np
 import csv
 import os.path
 import shutil
 import torch
 from tqdm import tqdm
 from itertools import islice
 from sklearn.model_selection import train_test_split
 from torchtext.vocab import Vocab
 from collections import Counter
 from nltk.tokenize import word_tokenize
 import gensim.downloader as api
 from gensim.models.word2vec import Word2Vec
 class NERModel(torch.nn.Module):
     """Window classifier: embed a fixed window of token ids and score labels.

     The embeddings of all tokens in the window are flattened into a single
     vector and passed through one linear layer.

     Args:
         vocab_size: Number of rows in the embedding table.
         embed_dim: Size of each token embedding.
         window: Number of token ids expected per forward call.
         num_labels: Number of output scores.

     The defaults reproduce the previously hard-coded network (23628 x 200
     embedding, 3-token window, 9 labels); parameter names ``emb`` and ``fc1``
     are unchanged, so ``NERModel()`` behaves exactly as before.
     """

     def __init__(self, vocab_size=23628, embed_dim=200, window=3, num_labels=9):
         super().__init__()
         self.emb = torch.nn.Embedding(vocab_size, embed_dim)
         self.fc1 = torch.nn.Linear(window * embed_dim, num_labels)

     def forward(self, x):
         """Return a 1-D tensor of label scores for one window of token ids.

         ``x`` must contain exactly ``window`` integer ids; any other element
         count makes the reshape below raise.
         """
         x = self.emb(x)
         # Flatten (window, embed_dim) into the single vector fc1 expects.
         x = x.reshape(self.fc1.in_features)
         x = self.fc1(x)
         return x
 def process_output(lines):
     """Repair BIO tag sequences and join each into a space-separated string.

     Every tag that starts with ``I-`` is rewritten: when it opens an entity
     (first tag of the line, or the previous tag is ``O``) its ``I-`` becomes
     ``B-``; otherwise it takes over the entity type of the previous tag.
     All other tags pass through unchanged.

     Args:
         lines: Iterable of tag sequences (each an iterable of strings).

     Returns:
         A list with one joined string per input sequence; an empty sequence
         yields an empty string.
     """
     result = []
     for line in lines:
         last_label = None
         new_line = []
         for label in line:
             if label.startswith("I-"):
                 if last_label is None or last_label == "O":
                     label = label.replace('I-', 'B-')
                 else:
                     # Continue the entity that is already open.
                     label = "I-" + last_label[2:]
             last_label = label
             new_line.append(label)
         # Join once per line; the old per-label join was unused work.
         result.append(" ".join(new_line))
     return result
 def build_vocab(dataset):
     """Count tokens over all documents and wrap the counts in a ``Vocab``.

     Args:
         dataset: Iterable of documents, each an iterable of tokens.

     Returns:
         A ``Vocab`` built from the token counts with the special tokens
         ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
     """
     token_counts = Counter(token for document in dataset for token in document)
     return Vocab(token_counts, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
 def data_process(dt):
     """Encode tokenised documents as id tensors framed by <bos> and <eos>.

     Tokens are looked up in the module-level ``vocab``.

     Args:
         dt: Iterable of documents, each an iterable of tokens.

     Returns:
         A list with one ``torch.long`` tensor per document.
     """
     encoded = []
     for document in dt:
         ids = [vocab['<bos>']]
         ids.extend(vocab[token] for token in document)
         ids.append(vocab['<eos>'])
         encoded.append(torch.tensor(ids, dtype=torch.long))
     return encoded
 def labels_process(dt):
     """Turn lists of label ids into tensors padded with 0 on both ends.

     Args:
         dt: Iterable of lists of integer label ids.

     Returns:
         A list with one ``torch.long`` tensor per input list, two elements
         longer than its source.
     """
     padded = []
     for document in dt:
         framed = [0] + document + [0]
         padded.append(torch.tensor(framed, dtype=torch.long))
     return padded
 def predict(input_tokens, labels):
     """Label every inner position of each sequence with the global model.

     For each position ``j`` from 1 to ``len(sequence) - 2`` the three values
     ``sequence[j-1 : j+2]`` are moved to ``device_gpu``, cast to long and fed
     to ``ner_model``; the index of the highest score selects the label.

     Args:
         input_tokens: Sequence of 1-D tensors, one per line.
         labels: Sequence mapping a class index to its label string.

     Returns:
         A list with one list of label strings per input tensor.
     """
     results = []
     # Inference only: no gradients are needed, so skip autograd bookkeeping.
     with torch.no_grad():
         for sequence in input_tokens:
             line_results = []
             for j in range(1, len(sequence) - 1):
                 x = sequence[j - 1: j + 2].to(device_gpu)
                 predicted = ner_model(x.long())
                 best = torch.argmax(predicted).item()
                 line_results.append(labels[best])
             results.append(line_results)
     return results
 # Training data: column 'a' holds the space-separated tags, 'b' the tokens.
 train = pd.read_csv('train/train.tsv.xz', sep='\t', names=['a', 'b'])
 labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
 # Encode every tag as its index in `labels`.
 train["a"] = train["a"].apply(lambda x: [labels.index(y) for y in x.split()])
 train["b"] = train["b"].apply(lambda x: x.split())
 vocab = build_vocab(train['b'])

 # One float tensor per sentence holding two flags per word: whether the first
 # character is upper-case and whether it is a digit. Each tensor is built in a
 # single call instead of being grown with repeated torch.cat.
 tensors = [
     torch.tensor(
         [flag for word in sent for flag in (word[0].isupper(), word[0].isdigit())],
         dtype=torch.float,
     )
     for sent in train["b"]
 ]
 # Use the first CUDA device when one is available; otherwise run on the CPU
 # instead of failing on machines without CUDA. The name is kept because
 # other parts of the script refer to `device_gpu`.
 device_gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 ner_model = NERModel().to(device_gpu)
 criterion = torch.nn.CrossEntropyLoss()
 optimizer = torch.optim.Adam(ner_model.parameters())
 train_labels = labels_process(train['a'])
 train_tokens_ids = data_process(train['b'])
 # Each training tensor is the token ids followed by that sentence's flags.
 train_tensors = [torch.cat((token, extra)) for token, extra in zip(train_tokens_ids, tensors)]
 # Train for 5 epochs, one optimizer step per token window, and report
 # F1 / accuracy measured on the training predictions of each epoch.
 for epoch in range(5):
     acc_score = 0
     prec_score = 0
     selected_items = 0
     recall_score = 0
     relevant_items = 0
     items_total = 0
     ner_model.train()
     for i in range(len(train_labels)):
         for j in range(1, len(train_labels[i]) - 1):
             X = train_tensors[i][j - 1: j + 2].to(device_gpu)
             Y = train_labels[i][j: j + 1].to(device_gpu)
             Y_predictions = ner_model(X.long())

             # Compute the predicted and gold class once per step.
             predicted_label = torch.argmax(Y_predictions).item()
             gold_label = Y.item()

             if predicted_label == gold_label:
                 acc_score += 1
                 if gold_label != 0:
                     # A correct non-zero prediction counts for both precision
                     # and recall.
                     prec_score += 1
                     recall_score += 1
             if predicted_label != 0:
                 selected_items += 1
             if gold_label != 0:
                 relevant_items += 1
             items_total += 1

             optimizer.zero_grad()
             loss = criterion(Y_predictions.unsqueeze(0), Y)
             loss.backward()
             optimizer.step()

     # Guard the ratios so an epoch with no non-zero predictions (or no
     # non-zero gold labels) does not raise ZeroDivisionError.
     precision = prec_score / selected_items if selected_items else 1.0
     recall = recall_score / relevant_items if relevant_items else 1.0
     if precision + recall == 0:
         f1_score = 0.0
     else:
         f1_score = (2 * precision * recall) / (precision + recall)
     accuracy = acc_score / items_total if items_total else 0.0
     print(f'epoch: {epoch}')
     print(f'f1: {f1_score}')
     print(f'acc: {accuracy}')
 def create_tensors_list(data):
     """Build per-sentence capitalisation/digit flag tensors.

     For every word two flags are emitted, in order: whether its first
     character is upper-case and whether its first character is a digit.

     Args:
         data: Mapping or DataFrame whose ``"a"`` entry is an iterable of
             sentences, each a sequence of non-empty strings.

     Returns:
         A list with one 1-D float tensor per sentence, holding two values per
         word; an empty sentence yields an empty tensor.
     """
     # Build each tensor in one call rather than growing it with torch.cat
     # word by word, which copies the whole tensor on every step.
     return [
         torch.tensor(
             [flag for word in sent for flag in (word[0].isupper(), word[0].isdigit())],
             dtype=torch.float,
         )
         for sent in data["a"]
     ]
 # Label the dev-0 split and write one line of tags per input line.
 dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['a'])
 dev["a"] = dev["a"].apply(lambda x: x.split())
 dev_tokens_ids = data_process(dev["a"])
 # Predict on the id tensors alone. predict() labels every inner position of
 # the tensor it receives, so feeding it the ids concatenated with the two
 # per-word feature flags produced three labels per token instead of one.
 results = predict(dev_tokens_ids, labels)
 results_processed = process_output(results)
 with open("dev-0/out.tsv", "w") as f:
     f.writelines(line + "\n" for line in results_processed)
 # Label the test-A split and write one line of tags per input line.
 test = pd.read_csv('test-A/in.tsv', sep='\t', names=['a'])
 test["a"] = test["a"].apply(lambda x: x.split())
 test_tokens_ids = data_process(test["a"])
 # Predict on the id tensors alone. predict() labels every inner position of
 # the tensor it receives, so feeding it the ids concatenated with the two
 # per-word feature flags produced three labels per token instead of one.
 results = predict(test_tokens_ids, labels)
 results_processed = process_output(results)
 with open("test-A/out.tsv", "w") as f:
     f.writelines(line + "\n" for line in results_processed)
 # Persist the trained weights (state dict only) next to the script.
 model_path = "seq_labeling.model"
 trained_state = ner_model.state_dict()
 torch.save(trained_state, model_path)
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/train.tsv
+++ b/train/train.tsv
Author	SHA1	Message	Date
s434695	17184e30e8	fix	2021-06-24 19:07:58 +02:00
Tomasz Grzybowski	142eed56c0	cos tam	2021-06-22 19:27:08 +02:00
Tomasz Grzybowski	c6aaaf6544	solution	2021-06-09 03:01:30 +02:00
s434695	d6b3d1c0d1	zmiana pc	2021-06-09 00:19:16 +02:00
s434695	e26b491316	geval	2021-06-08 12:39:08 +02:00