forked from kubapok/en-ner-conll-2003

commit c6aaaf6544 ("solution")
parent d6b3d1c0d1
@@ -1,6 +1,438 @@
 {
- "cells": [],
- "metadata": {},
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e574fca4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
+      " warnings.warn(msg)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import csv\n",
+    "import os.path\n",
+    "import shutil\n",
+    "import torch\n",
+    "from tqdm import tqdm\n",
+    "from itertools import islice\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from torchtext.vocab import Vocab\n",
+    "from collections import Counter\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "import gensim.downloader as api\n",
+    "from gensim.models.word2vec import Word2Vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b476f295",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting gensim\n",
+      "  Downloading gensim-4.0.1-cp38-cp38-win_amd64.whl (23.9 MB)\n",
+      "Requirement already satisfied: scipy>=0.18.1 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.6.2)\n",
+      "Collecting Cython==0.29.21\n",
+      "  Downloading Cython-0.29.21-cp38-cp38-win_amd64.whl (1.7 MB)\n",
+      "Requirement already satisfied: numpy>=1.11.3 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.20.1)\n",
+      "Collecting smart-open>=1.8.1\n",
+      "  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)\n",
+      "Installing collected packages: smart-open, Cython, gensim\n",
+      "  Attempting uninstall: Cython\n",
+      "    Found existing installation: Cython 0.29.23\n",
+      "    Uninstalling Cython-0.29.23:\n",
+      "      Successfully uninstalled Cython-0.29.23\n",
+      "Successfully installed Cython-0.29.21 gensim-4.0.1 smart-open-5.1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install gensim"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fbe3a657",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class NERModel(torch.nn.Module):\n",
+    "\n",
+    "    def __init__(self,):\n",
+    "        super(NERModel, self).__init__()\n",
+    "        self.emb = torch.nn.Embedding(23628,200)\n",
+    "        self.fc1 = torch.nn.Linear(600,9)\n",
+    "        \n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        x = self.emb(x)\n",
+    "        x = x.reshape(600) \n",
+    "        x = self.fc1(x)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3497a580",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_output(lines):\n",
+    "    result = []\n",
+    "    for line in lines:\n",
+    "        last_label = None\n",
+    "        new_line = []\n",
+    "        for label in line:\n",
+    "            if(label != \"O\" and label[0:2] == \"I-\"):\n",
+    "                if last_label == None or last_label == \"O\":\n",
+    "                    label = label.replace('I-', 'B-')\n",
+    "                else:\n",
+    "                    label = \"I-\" + last_label[2:]\n",
+    "            last_label = label\n",
+    "            new_line.append(label)\n",
+    "        x = (\" \".join(new_line))\n",
+    "        result.append(\" \".join(new_line))\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "3e78d902",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_vocab(dataset):\n",
+    "    counter = Counter()\n",
+    "    for document in dataset:\n",
+    "        counter.update(document)\n",
+    "    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ec8537cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def data_process(dt):\n",
+    "    return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "847c958a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def labels_process(dt):\n",
+    "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "66bee163",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict(input_tokens, labels):\n",
+    "\n",
+    "    results = []\n",
+    "    \n",
+    "    for i in range(len(input_tokens)):\n",
+    "        line_results = []\n",
+    "        for j in range(1, len(input_tokens[i]) - 1):\n",
+    "            x = input_tokens[i][j-1: j+2].to(device_gpu)\n",
+    "            predicted = ner_model(x.long())\n",
+    "            result = torch.argmax(predicted)\n",
+    "            label = labels[result]\n",
+    "            line_results.append(label)\n",
+    "        results.append(line_results)\n",
+    "\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "39046f3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "9b40a8b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'] \n",
+    "train[\"a\"]=train[\"a\"].apply(lambda x: [labels.index(y) for y in x.split()])\n",
+    "train[\"b\"]=train[\"b\"].apply(lambda x: x.split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "02a12cbd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vocab = build_vocab(train['b'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "8cc6d19d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tensors = []\n",
+    "\n",
+    "for sent in train[\"b\"]:\n",
+    "    sent_tensor = torch.tensor(())\n",
+    "    for word in sent:\n",
+    "        temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
+    "        sent_tensor = torch.cat((sent_tensor, temp))\n",
+    "\n",
+    "    tensors.append(sent_tensor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "690085f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'NVIDIA GeForce RTX 2060'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.cuda.get_device_name(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "64b2d751",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device_gpu = torch.device(\"cuda:0\")\n",
+    "ner_model = NERModel().to(device_gpu)\n",
+    "criterion = torch.nn.CrossEntropyLoss()\n",
+    "optimizer = torch.optim.Adam(ner_model.parameters())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "094d7e69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_labels = labels_process(train['a'])\n",
+    "train_tokens_ids = data_process(train['b'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "17291b41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "045b7186",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch: 0\n",
+      "f1: 0.6373470953763748\n",
+      "acc: 0.9116419913061858\n",
+      "epoch: 1\n",
+      "f1: 0.7973076923076923\n",
+      "acc: 0.9540771782783307\n",
+      "epoch: 2\n",
+      "f1: 0.8640167364016735\n",
+      "acc: 0.9702287410511612\n",
+      "epoch: 3\n",
+      "f1: 0.9038441719055962\n",
+      "acc: 0.9793820591289644\n",
+      "epoch: 4\n",
+      "f1: 0.928903400400047\n",
+      "acc: 0.9850890978100043\n"
+     ]
+    }
+   ],
+   "source": [
+    "for epoch in range(5):\n",
+    "    acc_score = 0\n",
+    "    prec_score = 0\n",
+    "    selected_items = 0\n",
+    "    recall_score = 0\n",
+    "    relevant_items = 0\n",
+    "    items_total = 0\n",
+    "    ner_model.train()\n",
+    "    for i in range(len(train_labels)):\n",
+    "        for j in range(1, len(train_labels[i]) - 1):\n",
+    "            X = train_tensors[i][j - 1: j + 2].to(device_gpu)\n",
+    "\n",
+    "            Y = train_labels[i][j: j + 1].to(device_gpu)\n",
+    "\n",
+    "            Y_predictions = ner_model(X.long())\n",
+    "\n",
+    "            acc_score += int(torch.argmax(Y_predictions) == Y)\n",
+    "            if torch.argmax(Y_predictions) != 0:\n",
+    "                selected_items += 1\n",
+    "            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
+    "                prec_score += 1\n",
+    "            if Y.item() != 0:\n",
+    "                relevant_items += 1\n",
+    "            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
+    "                recall_score += 1\n",
+    "\n",
+    "            items_total += 1\n",
+    "            optimizer.zero_grad()\n",
+    "            loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
+    "            loss.backward()\n",
+    "            optimizer.step()\n",
+    "\n",
+    "    precision = prec_score / selected_items\n",
+    "    recall = recall_score / relevant_items\n",
+    "    f1_score = (2 * precision * recall) / (precision + recall)\n",
+    "    print(f'epoch: {epoch}')\n",
+    "    print(f'f1: {f1_score}')\n",
+    "    print(f'acc: {acc_score / items_total}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "f75aa5e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_tensors_list(data):\n",
+    "    tensors = []\n",
+    "\n",
+    "    for sent in data[\"a\"]:\n",
+    "        sent_tensor = torch.tensor(())\n",
+    "        for word in sent:\n",
+    "            temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
+    "            sent_tensor = torch.cat((sent_tensor, temp))\n",
+    "\n",
+    "        tensors.append(sent_tensor)\n",
+    "\n",
+    "    return tensors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "49215802",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
+    "dev[\"a\"] = dev[\"a\"].apply(lambda x: x.split())\n",
+    "\n",
+    "dev_tokens_ids = data_process(dev[\"a\"])\n",
+    "\n",
+    "dev_extra_tensors = create_tensors_list(dev)\n",
+    "\n",
+    "dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]\n",
+    "\n",
+    "results = predict(dev_tensors, labels)\n",
+    "results_processed = process_output(results)\n",
+    "\n",
+    "with open(\"dev-0/out.tsv\", \"w\") as f:\n",
+    "    for line in results_processed:\n",
+    "        f.write(line + \"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "8c5b007e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])\n",
+    "test[\"a\"] = test[\"a\"].apply(lambda x: x.split())\n",
+    "\n",
+    "test_tokens_ids = data_process(test[\"a\"])\n",
+    "\n",
+    "test_extra_tensors = create_tensors_list(test)\n",
+    "\n",
+    "test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]\n",
+    "\n",
+    "results = predict(test_tensors, labels)\n",
+    "results_processed = process_output(results)\n",
+    "\n",
+    "with open(\"test-A/out.tsv\", \"w\") as f:\n",
+    "    for line in results_processed:\n",
+    "        f.write(line + \"\\n\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
  "nbformat": 4,
  "nbformat_minor": 5
 }
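The notebook committed above tags each token with a window classifier: the ids of the previous, current, and next token are embedded (200 dimensions each), flattened into a 600-dimensional vector, and mapped by one linear layer onto the 9 IOB labels. A minimal standalone sketch of that forward pass, with made-up token ids and the sizes taken from the NERModel cell:

import torch

# Same shapes as the committed NERModel: vocab 23628, embedding dim 200,
# a 3-token window -> 600 features, 9 IOB labels.
emb = torch.nn.Embedding(23628, 200)
fc1 = torch.nn.Linear(600, 9)

window = torch.tensor([17, 42, 5])   # hypothetical ids: [previous, current, next]
x = emb(window).reshape(600)         # 3 x 200 -> flat 600-dimensional vector
logits = fc1(x)                      # one unnormalised score per label
print(logits.argmax().item())        # index into the 9-element label list

Because the window is flattened rather than pooled, the linear layer can learn position-specific cues (for instance, a capitalised next token), at the cost of a fixed window size.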
Program.ipynb (202 changed lines)

@@ -5,7 +5,16 @@
    "execution_count": 1,
    "id": "e574fca4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
+      " warnings.warn(msg)\n"
+     ]
+    }
+   ],
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
@@ -23,6 +32,37 @@
     "from gensim.models.word2vec import Word2Vec"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b476f295",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting gensim\n",
+      "  Downloading gensim-4.0.1-cp38-cp38-win_amd64.whl (23.9 MB)\n",
+      "Requirement already satisfied: scipy>=0.18.1 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.6.2)\n",
+      "Collecting Cython==0.29.21\n",
+      "  Downloading Cython-0.29.21-cp38-cp38-win_amd64.whl (1.7 MB)\n",
+      "Requirement already satisfied: numpy>=1.11.3 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.20.1)\n",
+      "Collecting smart-open>=1.8.1\n",
+      "  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)\n",
+      "Installing collected packages: smart-open, Cython, gensim\n",
+      "  Attempting uninstall: Cython\n",
+      "    Found existing installation: Cython 0.29.23\n",
+      "    Uninstalling Cython-0.29.23:\n",
+      "      Successfully uninstalled Cython-0.29.23\n",
+      "Successfully installed Cython-0.29.21 gensim-4.0.1 smart-open-5.1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install gensim"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
@@ -106,6 +146,30 @@
     "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "66bee163",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict(input_tokens, labels):\n",
+    "\n",
+    "    results = []\n",
+    "    \n",
+    "    for i in range(len(input_tokens)):\n",
+    "        line_results = []\n",
+    "        for j in range(1, len(input_tokens[i]) - 1):\n",
+    "            x = input_tokens[i][j-1: j+2].to(device_gpu)\n",
+    "            predicted = ner_model(x.long())\n",
+    "            result = torch.argmax(predicted)\n",
+    "            label = labels[result]\n",
+    "            line_results.append(label)\n",
+    "        results.append(line_results)\n",
+    "\n",
+    "    return results"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,
@@ -113,9 +177,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])\n",
-    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
-    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])"
+    "train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])"
    ]
   },
   {
@@ -142,7 +204,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 10,
    "id": "8cc6d19d",
    "metadata": {},
    "outputs": [],
@@ -160,20 +222,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 15,
+   "id": "690085f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'NVIDIA GeForce RTX 2060'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.cuda.get_device_name(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
    "id": "64b2d751",
    "metadata": {},
    "outputs": [],
    "source": [
-    "device_cpu = torch.device(\"cpu\")\n",
-    "ner_model = NERModel().to(device_cpu)\n",
+    "device_gpu = torch.device(\"cuda:0\")\n",
+    "ner_model = NERModel().to(device_gpu)\n",
     "criterion = torch.nn.CrossEntropyLoss()\n",
     "optimizer = torch.optim.Adam(ner_model.parameters())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 17,
    "id": "094d7e69",
    "metadata": {},
    "outputs": [],
@@ -184,7 +267,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 18,
    "id": "17291b41",
    "metadata": {},
    "outputs": [],
@@ -194,10 +277,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "id": "045b7186",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch: 0\n",
+      "f1: 0.6373470953763748\n",
+      "acc: 0.9116419913061858\n",
+      "epoch: 1\n",
+      "f1: 0.7973076923076923\n",
+      "acc: 0.9540771782783307\n",
+      "epoch: 2\n",
+      "f1: 0.8640167364016735\n",
+      "acc: 0.9702287410511612\n",
+      "epoch: 3\n",
+      "f1: 0.9038441719055962\n",
+      "acc: 0.9793820591289644\n",
+      "epoch: 4\n",
+      "f1: 0.928903400400047\n",
+      "acc: 0.9850890978100043\n"
+     ]
+    }
+   ],
    "source": [
     "for epoch in range(5):\n",
     "    acc_score = 0\n",
@@ -209,9 +314,9 @@
     "    ner_model.train()\n",
     "    for i in range(len(train_labels)):\n",
     "        for j in range(1, len(train_labels[i]) - 1):\n",
-    "            X = train_tensors[i][j - 1: j + 2].to(device_cpu)\n",
+    "            X = train_tensors[i][j - 1: j + 2].to(device_gpu)\n",
     "\n",
-    "            Y = train_labels[i][j: j + 1].to(device_cpu)\n",
+    "            Y = train_labels[i][j: j + 1].to(device_gpu)\n",
     "\n",
     "            Y_predictions = ner_model(X.long())\n",
     "\n",
@@ -238,6 +343,75 @@
     "    print(f'f1: {f1_score}')\n",
     "    print(f'acc: {acc_score / items_total}')"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "f75aa5e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_tensors_list(data):\n",
+    "    tensors = []\n",
+    "\n",
+    "    for sent in data[\"a\"]:\n",
+    "        sent_tensor = torch.tensor(())\n",
+    "        for word in sent:\n",
+    "            temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
+    "            sent_tensor = torch.cat((sent_tensor, temp))\n",
+    "\n",
+    "        tensors.append(sent_tensor)\n",
+    "\n",
+    "    return tensors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "49215802",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
+    "dev[\"a\"] = dev[\"a\"].apply(lambda x: x.split())\n",
+    "\n",
+    "dev_tokens_ids = data_process(dev[\"a\"])\n",
+    "\n",
+    "dev_extra_tensors = create_tensors_list(dev)\n",
+    "\n",
+    "dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]\n",
+    "\n",
+    "results = predict(dev_tensors, labels)\n",
+    "results_processed = process_output(results)\n",
+    "\n",
+    "with open(\"dev-0/out.tsv\", \"w\") as f:\n",
+    "    for line in results_processed:\n",
+    "        f.write(line + \"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "8c5b007e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])\n",
+    "test[\"a\"] = test[\"a\"].apply(lambda x: x.split())\n",
+    "\n",
+    "test_tokens_ids = data_process(test[\"a\"])\n",
+    "\n",
+    "test_extra_tensors = create_tensors_list(test)\n",
+    "\n",
+    "test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]\n",
+    "\n",
+    "results = predict(test_tensors, labels)\n",
+    "results_processed = process_output(results)\n",
+    "\n",
+    "with open(\"test-A/out.tsv\", \"w\") as f:\n",
+    "    for line in results_processed:\n",
+    "        f.write(line + \"\\n\")"
+   ]
   }
  ],
  "metadata": {
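Apart from recording cell outputs and renumbering executions, the Program.ipynb diff above moves training and prediction from device_cpu to device_gpu ("cuda:0"). The committed code hard-codes the CUDA device; a defensive variant (an assumption on my part, not in the commit) would fall back to the CPU when no GPU is present:

import torch

# Fallback device selection; the commit itself assumes cuda:0 exists.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = torch.nn.Linear(600, 9).to(device)  # stand-in for NERModel
x = torch.zeros(600, device=device)         # inputs must live on the same device
print(device, model(x).shape)               # -> torch.Size([9])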
Untitled.ipynb (331 deleted lines)

@@ -1,331 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "0895b7c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Information about the commented-out code and the results can be found in README.md \n",
-    "\n",
-    "import pandas as pd\n",
-    "import os.path\n",
-    "import shutil\n",
-    "import torch\n",
-    "import pandas as pd\n",
-    "from torchtext.vocab import Vocab\n",
-    "from collections import Counter\n",
-    "\n",
-    "# class NERModelWithAlpha(torch.nn.Module):\n",
-    "#     def __init__(self,):\n",
-    "#         super(NERModel, self).__init__()\n",
-    "#         self.emb = torch.nn.Embedding(23629,200)\n",
-    "#         self.fc1 = torch.nn.Linear(1200,9) \n",
-    "\n",
-    "#     def forward(self, x):\n",
-    "#         x = self.emb(x)\n",
-    "#         x = x.reshape(1200) \n",
-    "#         x = self.fc1(x)\n",
-    "#         return x\n",
-    "\n",
-    "class NERModel(torch.nn.Module):\n",
-    "    def __init__(self,):\n",
-    "        super(NERModel, self).__init__()\n",
-    "        self.emb = torch.nn.Embedding(23628,200)\n",
-    "        self.fc1 = torch.nn.Linear(600,9) \n",
-    "\n",
-    "    def forward(self, x):\n",
-    "        x = self.emb(x)\n",
-    "        x = x.reshape(600) \n",
-    "        x = self.fc1(x)\n",
-    "        return x\n",
-    "\n",
-    "def data_process(dt):\n",
-    "    return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]\n",
-    "    \n",
-    "# def data_process(dt):\n",
-    "#     result = []\n",
-    "#     for document in dt:\n",
-    "#         sentence = [vocab['<bos>'],vocab['<alpha>']]\n",
-    "#         for token in document:\n",
-    "#             sentence += [vocab[token]]\n",
-    "#             sentence += [vocab['<alpha>'] if token.isalpha() else vocab['<notalpha>']]\n",
-    "#         sentence += [vocab['<eos>'],vocab['<alpha>']]\n",
-    "#         result.append(torch.tensor(sentence, dtype = torch.long))\n",
-    "#     return result\n",
-    "\n",
-    "def build_vocab(dataset):\n",
-    "    counter = Counter()\n",
-    "    for document in dataset:\n",
-    "        counter.update(document)\n",
-    "    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>']) #, '<alpha>', '<notalpha>'])\n",
-    "\n",
-    "def labels_process(dt):\n",
-    "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n",
-    "\n",
-    "def process(model, x):\n",
-    "    predicted = model(x)\n",
-    "    result = torch.argmax(predicted)\n",
-    "    return labels[result]\n",
-    "\n",
-    "def process_dataset(model, path):\n",
-    "    with open(path, 'r') as f:\n",
-    "        lines = f.readlines()\n",
-    "    X = [x.split() for x in lines]\n",
-    "    data_tokens_ids = data_process(X)\n",
-    "    results = []\n",
-    "    for i in range(len(data_tokens_ids)):\n",
-    "        line_results = []\n",
-    "        for j in range(1, len(data_tokens_ids[i]) - 1):\n",
-    "#         for j in range(2, len(data_tokens_ids[i]) - 3, 2):\n",
-    "            #x = data_tokens_ids[i][j-2: j+4].to(device_gpu)\n",
-    "            x = data_tokens_ids[i][j-1: j+2].to(device_cpu)\n",
-    "            label = process(model, x)\n",
-    "            line_results.append(label)\n",
-    "        results.append(line_results)\n",
-    "    return results\n",
-    "\n",
-    "# Post-processing of the model output (when the B- and I- tags do not refer to the same label)\n",
-    "def process_output(lines):\n",
-    "    result = []\n",
-    "    for line in lines:\n",
-    "        last_label = None\n",
-    "        new_line = []\n",
-    "        for label in line:\n",
-    "            if(label != \"O\" and label[0:2] == \"I-\"):\n",
-    "                if last_label == None or last_label == \"O\":\n",
-    "                    label = label.replace('I-', 'B-')\n",
-    "                else:\n",
-    "                    label = \"I-\" + last_label[2:]\n",
-    "            last_label = label\n",
-    "            new_line.append(label)\n",
-    "        result.append(\" \".join(new_line))\n",
-    "    return result\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "b2f73f9e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'] "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "2a94110d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if not os.path.isfile('train/train.tsv'):\n",
-    "    import lzma\n",
-    "    with lzma.open('train/train.tsv.xz', 'rb') as f_in:\n",
-    "        with open('train/train.tsv', 'wb') as f_out:\n",
-    "            shutil.copyfileobj(f_in, f_out)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "02b81af3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = pd.read_csv('train/train.tsv', sep='\\t', names=['iob', 'tokens'])\n",
-    "data[\"iob\"]=data[\"iob\"].apply(lambda x: [labels.index(y) for y in x.split()])\n",
-    "data[\"tokens\"]=data[\"tokens\"].apply(lambda x: x.split())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "f005db98",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>iob</th>\n",
-       "      <th>tokens</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>[5, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 8, 0, 1, 0, ...</td>\n",
-       "      <td>[EU, rejects, German, call, to, boycott, Briti...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...</td>\n",
-       "      <td>[Rare, Hendrix, song, draft, sells, for, almos...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...</td>\n",
-       "      <td>[China, says, Taiwan, spoils, atmosphere, for,...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>[1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...</td>\n",
-       "      <td>[China, says, time, right, for, Taiwan, talks,...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...</td>\n",
-       "      <td>[German, July, car, registrations, up, 14.2, p...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>940</th>\n",
-       "      <td>[0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 1, 0, ...</td>\n",
-       "      <td>[CYCLING, -, BALLANGER, KEEPS, SPRINT, TITLE, ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>941</th>\n",
-       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...</td>\n",
-       "      <td>[CYCLING, -, WORLD, TRACK, CHAMPIONSHIP, RESUL...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>942</th>\n",
-       "      <td>[0, 0, 3, 0, 7, 0, 5, 0, 0, 1, 0, 1, 0, 0, 3, ...</td>\n",
-       "      <td>[SOCCER, -, FRENCH, DEFENDER, KOMBOUARE, JOINS...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>943</th>\n",
-       "      <td>[0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...</td>\n",
-       "      <td>[MOTORCYCLING, -, SAN, MARINO, GRAND, PRIX, PR...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>944</th>\n",
-       "      <td>[0, 0, 3, 4, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...</td>\n",
-       "      <td>[GOLF, -, BRITISH, MASTERS, THIRD, ROUND, SCOR...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>945 rows × 2 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       " iob \\\n",
-       "0 [5, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 8, 0, 1, 0, ... \n",
-       "1 [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ... \n",
-       "2 [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ... \n",
-       "3 [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ... \n",
-       "4 [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ... \n",
-       ".. ... \n",
-       "940 [0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 1, 0, ... \n",
-       "941 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ... \n",
-       "942 [0, 0, 3, 0, 7, 0, 5, 0, 0, 1, 0, 1, 0, 0, 3, ... \n",
-       "943 [0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, ... \n",
-       "944 [0, 0, 3, 4, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ... \n",
-       "\n",
-       " tokens \n",
-       "0 [EU, rejects, German, call, to, boycott, Briti... \n",
-       "1 [Rare, Hendrix, song, draft, sells, for, almos... \n",
-       "2 [China, says, Taiwan, spoils, atmosphere, for,... \n",
-       "3 [China, says, time, right, for, Taiwan, talks,... \n",
-       "4 [German, July, car, registrations, up, 14.2, p... \n",
-       ".. ... \n",
-       "940 [CYCLING, -, BALLANGER, KEEPS, SPRINT, TITLE, ... \n",
-       "941 [CYCLING, -, WORLD, TRACK, CHAMPIONSHIP, RESUL... \n",
-       "942 [SOCCER, -, FRENCH, DEFENDER, KOMBOUARE, JOINS... \n",
-       "943 [MOTORCYCLING, -, SAN, MARINO, GRAND, PRIX, PR... \n",
-       "944 [GOLF, -, BRITISH, MASTERS, THIRD, ROUND, SCOR... \n",
-       "\n",
-       "[945 rows x 2 columns]"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "4a114973",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<torchtext.vocab.Vocab at 0x7ff2dd0edac0>"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "vocab = build_vocab(data['tokens'])\n",
-    "vocab"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "c666872d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "device_cpu = torch.device(\"cpu\")\n",
-    "ner_model = NERModel().to(device_cpu)\n",
-    "criterion = torch.nn.CrossEntropyLoss()\n",
-    "optimizer = torch.optim.Adam(ner_model.parameters())"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
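The deleted scratch notebook above carries the same process_output helper that survives in Program.ipynb and solution.py: it repairs a predicted tag sequence into valid IOB, promoting an I- tag that follows O (or nothing) to B-, and otherwise coercing it to the entity type of the previous tag. A simplified, self-contained variant for illustration -- it promotes orphan I- tags instead of coercing the type, which differs slightly from the committed logic:

def fix_iob(tags):
    # Promote I- tags that actually start an entity to B- (simplified variant).
    fixed, last = [], "O"
    for tag in tags:
        if tag.startswith("I-") and (last == "O" or last[2:] != tag[2:]):
            tag = "B-" + tag[2:]
        fixed.append(tag)
        last = tag
    return fixed

print(fix_iob(["I-ORG", "I-ORG", "O", "I-PER"]))
# -> ['B-ORG', 'I-ORG', 'O', 'B-PER']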
Untitled1.ipynb (435217 changed lines): file diff suppressed because it is too large.
dev-0/out.tsv (430 changed lines): file diff suppressed because one or more lines are too long.
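The suppressed out.tsv files hold the predicted tag sequences. They are produced by sliding the 3-token window over inputs that data_process has wrapped in <bos>/<eos> ids, with labels_process padding the label side with 0 (the index of O) to keep the alignment; the wrapping is what gives the first and last real token a full window. A sketch of that step, with a hypothetical toy vocabulary:

import torch

def data_process(docs, vocab):
    # Wrap every document in <bos>/<eos> so the window j-1..j+1 exists at the edges.
    return [torch.tensor([vocab['<bos>']] + [vocab[t] for t in doc] + [vocab['<eos>']],
                         dtype=torch.long) for doc in docs]

def labels_process(docs):
    # Pad label sequences with 0 ('O') to stay aligned with the wrapped tokens.
    return [torch.tensor([0] + doc + [0], dtype=torch.long) for doc in docs]

toy_vocab = {'<bos>': 2, '<eos>': 3, 'EU': 7, 'rejects': 8}  # hypothetical ids
print(data_process([['EU', 'rejects']], toy_vocab))  # [tensor([2, 7, 8, 3])]
print(labels_process([[5, 0]]))                      # [tensor([0, 5, 0, 0])]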
solution.py (186 added lines, new file)

@@ -0,0 +1,186 @@
+import pandas as pd
+import numpy as np
+import csv
+import os.path
+import shutil
+import torch
+from tqdm import tqdm
+from itertools import islice
+from sklearn.model_selection import train_test_split
+from torchtext.vocab import Vocab
+from collections import Counter
+from nltk.tokenize import word_tokenize
+import gensim.downloader as api
+from gensim.models.word2vec import Word2Vec
+
+
+class NERModel(torch.nn.Module):
+
+    def __init__(self,):
+        super(NERModel, self).__init__()
+        self.emb = torch.nn.Embedding(23628,200)
+        self.fc1 = torch.nn.Linear(600,9)
+
+
+    def forward(self, x):
+        x = self.emb(x)
+        x = x.reshape(600)
+        x = self.fc1(x)
+        return x
+
+
+def process_output(lines):
+    result = []
+    for line in lines:
+        last_label = None
+        new_line = []
+        for label in line:
+            if(label != "O" and label[0:2] == "I-"):
+                if last_label == None or last_label == "O":
+                    label = label.replace('I-', 'B-')
+                else:
+                    label = "I-" + last_label[2:]
+            last_label = label
+            new_line.append(label)
+        x = (" ".join(new_line))
+        result.append(" ".join(new_line))
+    return result
+
+
+def build_vocab(dataset):
+    counter = Counter()
+    for document in dataset:
+        counter.update(document)
+    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
+
+
+def data_process(dt):
+    return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]
+
+
+def labels_process(dt):
+    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]
+
+
+def predict(input_tokens, labels):
+
+    results = []
+
+    for i in range(len(input_tokens)):
+        line_results = []
+        for j in range(1, len(input_tokens[i]) - 1):
+            x = input_tokens[i][j-1: j+2].to(device_gpu)
+            predicted = ner_model(x.long())
+            result = torch.argmax(predicted)
+            label = labels[result]
+            line_results.append(label)
+        results.append(line_results)
+
+    return results
+
+
+train = pd.read_csv('train/train.tsv.xz', sep='\t', names=['a', 'b'])
+
+labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
+train["a"]=train["a"].apply(lambda x: [labels.index(y) for y in x.split()])
+train["b"]=train["b"].apply(lambda x: x.split())
+
+vocab = build_vocab(train['b'])
+
+tensors = []
+
+for sent in train["b"]:
+    sent_tensor = torch.tensor(())
+    for word in sent:
+        temp = torch.tensor([word[0].isupper(), word[0].isdigit()])
+        sent_tensor = torch.cat((sent_tensor, temp))
+
+    tensors.append(sent_tensor)
+
+device_gpu = torch.device("cuda:0")
+ner_model = NERModel().to(device_gpu)
+criterion = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(ner_model.parameters())
+
+train_labels = labels_process(train['a'])
+train_tokens_ids = data_process(train['b'])
+
+train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]
+
+for epoch in range(5):
+    acc_score = 0
+    prec_score = 0
+    selected_items = 0
+    recall_score = 0
+    relevant_items = 0
+    items_total = 0
+    ner_model.train()
+    for i in range(len(train_labels)):
+        for j in range(1, len(train_labels[i]) - 1):
+            X = train_tensors[i][j - 1: j + 2].to(device_gpu)
+
+            Y = train_labels[i][j: j + 1].to(device_gpu)
+
+            Y_predictions = ner_model(X.long())
+
+            acc_score += int(torch.argmax(Y_predictions) == Y)
+            if torch.argmax(Y_predictions) != 0:
+                selected_items += 1
+            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
+                prec_score += 1
+            if Y.item() != 0:
+                relevant_items += 1
+            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
+                recall_score += 1
+
+            items_total += 1
+            optimizer.zero_grad()
+            loss = criterion(Y_predictions.unsqueeze(0), Y)
+            loss.backward()
+            optimizer.step()
+
+    precision = prec_score / selected_items
+    recall = recall_score / relevant_items
+    f1_score = (2 * precision * recall) / (precision + recall)
+    print(f'epoch: {epoch}')
+    print(f'f1: {f1_score}')
+    print(f'acc: {acc_score / items_total}')
+
+
+def create_tensors_list(data):
+    tensors = []
+
+    for sent in data["a"]:
+        sent_tensor = torch.tensor(())
+        for word in sent:
+            temp = torch.tensor([word[0].isupper(), word[0].isdigit()])
+            sent_tensor = torch.cat((sent_tensor, temp))
+
+        tensors.append(sent_tensor)
+
+    return tensors
+
+
+dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['a'])
+dev["a"] = dev["a"].apply(lambda x: x.split())
+
+dev_tokens_ids = data_process(dev["a"])
+
+dev_extra_tensors = create_tensors_list(dev)
+
+dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]
+
+results = predict(dev_tensors, labels)
+results_processed = process_output(results)
+
+with open("dev-0/out.tsv", "w") as f:
+    for line in results_processed:
+        f.write(line + "\n")
+
+test = pd.read_csv('test-A/in.tsv', sep='\t', names=['a'])
+test["a"] = test["a"].apply(lambda x: x.split())
+
+test_tokens_ids = data_process(test["a"])
+
+test_extra_tensors = create_tensors_list(test)
+
+test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]
+
+results = predict(test_tensors, labels)
+results_processed = process_output(results)
+
+with open("test-A/out.tsv", "w") as f:
+    for line in results_processed:
+        f.write(line + "\n")
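The per-epoch metrics printed by solution.py treat label 0 (O) as the negative class: precision is the share of non-O predictions that match the gold label, recall the share of non-O gold labels that are recovered. A compact restatement of that bookkeeping (the function name and toy data are illustrative, not from the commit):

def prf(preds, golds):
    # 0 is the 'O' tag; only non-O decisions count as selected/relevant.
    selected = sum(p != 0 for p in preds)
    relevant = sum(g != 0 for g in golds)
    hits = sum(p == g != 0 for p, g in zip(preds, golds))
    precision = hits / selected if selected else 0.0
    recall = hits / relevant if relevant else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

print(prf([1, 0, 2, 2], [1, 0, 2, 3]))  # -> (0.667, 0.667, 0.667) up to rounding

Unlike this sketch, the committed loop divides without zero-guards, which only matters in an epoch where the model never predicts an entity.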
test-A/out.tsv (460 changed lines): file diff suppressed because one or more lines are too long.