solution

2021-06-09 03:01:30 +02:00 · 2021-06-09 03:01:30 +02:00 · c6aaaf6544
commit c6aaaf6544
parent d6b3d1c0d1
7 changed files with 1253 additions and 436009 deletions
--- a/.ipynb_checkpoints/Program-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Program-checkpoint.ipynb
@ -1,6 +1,438 @@
 {
- "cells": [],
- "metadata": {},
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e574fca4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
+      "  warnings.warn(msg)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import csv\n",
+    "import os.path\n",
+    "import shutil\n",
+    "import torch\n",
+    "from tqdm import tqdm\n",
+    "from itertools import islice\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from torchtext.vocab import Vocab\n",
+    "from collections import Counter\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "import gensim.downloader as api\n",
+    "from gensim.models.word2vec import Word2Vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b476f295",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting gensim\n",
+      "  Downloading gensim-4.0.1-cp38-cp38-win_amd64.whl (23.9 MB)\n",
+      "Requirement already satisfied: scipy>=0.18.1 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.6.2)\n",
+      "Collecting Cython==0.29.21\n",
+      "  Downloading Cython-0.29.21-cp38-cp38-win_amd64.whl (1.7 MB)\n",
+      "Requirement already satisfied: numpy>=1.11.3 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.20.1)\n",
+      "Collecting smart-open>=1.8.1\n",
+      "  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)\n",
+      "Installing collected packages: smart-open, Cython, gensim\n",
+      "  Attempting uninstall: Cython\n",
+      "    Found existing installation: Cython 0.29.23\n",
+      "    Uninstalling Cython-0.29.23:\n",
+      "      Successfully uninstalled Cython-0.29.23\n",
+      "Successfully installed Cython-0.29.21 gensim-4.0.1 smart-open-5.1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install gensim"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "fbe3a657",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class NERModel(torch.nn.Module):\n",
+    "\n",
+    "    def __init__(self,):\n",
+    "        super(NERModel, self).__init__()\n",
+    "        self.emb = torch.nn.Embedding(23628,200)\n",
+    "        self.fc1 = torch.nn.Linear(600,9)\n",
+    "        \n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        x = self.emb(x)\n",
+    "        x = x.reshape(600) \n",
+    "        x = self.fc1(x)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "3497a580",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_output(lines):\n",
+    "    result = []\n",
+    "    for line in lines:\n",
+    "        last_label = None\n",
+    "        new_line = []\n",
+    "        for label in line:\n",
+    "            if(label != \"O\" and label[0:2] == \"I-\"):\n",
+    "                if last_label == None or last_label == \"O\":\n",
+    "                    label = label.replace('I-', 'B-')\n",
+    "                else:\n",
+    "                    label = \"I-\" + last_label[2:]\n",
+    "            last_label = label\n",
+    "            new_line.append(label)\n",
+    "            x = (\" \".join(new_line))\n",
+    "        result.append(\" \".join(new_line))\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "3e78d902",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_vocab(dataset):\n",
+    "    counter = Counter()\n",
+    "    for document in dataset:\n",
+    "        counter.update(document)\n",
+    "    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ec8537cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def data_process(dt):\n",
+    "    return [ torch.tensor([vocab['<bos>']] +[vocab[token]  for token in  document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "847c958a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def labels_process(dt):\n",
+    "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "66bee163",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict(input_tokens, labels):\n",
+    "\n",
+    "  results = []\n",
+    "  \n",
+    "  for i in range(len(input_tokens)):\n",
+    "    line_results = []\n",
+    "    for j in range(1, len(input_tokens[i]) - 1):\n",
+    "        x = input_tokens[i][j-1: j+2].to(device_gpu)\n",
+    "        predicted = ner_model(x.long())\n",
+    "        result = torch.argmax(predicted)\n",
+    "        label = labels[result]\n",
+    "        line_results.append(label)\n",
+    "    results.append(line_results)\n",
+    "\n",
+    "  return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "39046f3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "9b40a8b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'] \n",
+    "train[\"a\"]=train[\"a\"].apply(lambda x: [labels.index(y) for y in  x.split()])\n",
+    "train[\"b\"]=train[\"b\"].apply(lambda x: x.split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "02a12cbd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vocab = build_vocab(train['b'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "8cc6d19d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "  tensors = []\n",
+    "\n",
+    "  for sent in train[\"b\"]:\n",
+    "    sent_tensor = torch.tensor(())\n",
+    "    for word in sent:\n",
+    "      temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
+    "      sent_tensor = torch.cat((sent_tensor, temp))\n",
+    "\n",
+    "    tensors.append(sent_tensor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "690085f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'NVIDIA GeForce RTX 2060'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.cuda.get_device_name(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "64b2d751",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device_gpu = torch.device(\"cuda:0\")\n",
+    "ner_model = NERModel().to(device_gpu)\n",
+    "criterion = torch.nn.CrossEntropyLoss()\n",
+    "optimizer = torch.optim.Adam(ner_model.parameters())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "094d7e69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_labels = labels_process(train['a'])\n",
+    "train_tokens_ids = data_process(train['b'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "17291b41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "045b7186",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch: 0\n",
+      "f1: 0.6373470953763748\n",
+      "acc: 0.9116419913061858\n",
+      "epoch: 1\n",
+      "f1: 0.7973076923076923\n",
+      "acc: 0.9540771782783307\n",
+      "epoch: 2\n",
+      "f1: 0.8640167364016735\n",
+      "acc: 0.9702287410511612\n",
+      "epoch: 3\n",
+      "f1: 0.9038441719055962\n",
+      "acc: 0.9793820591289644\n",
+      "epoch: 4\n",
+      "f1: 0.928903400400047\n",
+      "acc: 0.9850890978100043\n"
+     ]
+    }
+   ],
+   "source": [
+    "for epoch in range(5):\n",
+    "    acc_score = 0\n",
+    "    prec_score = 0\n",
+    "    selected_items = 0\n",
+    "    recall_score = 0\n",
+    "    relevant_items = 0\n",
+    "    items_total = 0\n",
+    "    ner_model.train()\n",
+    "    for i in range(len(train_labels)):\n",
+    "        for j in range(1, len(train_labels[i]) - 1):\n",
+    "            X = train_tensors[i][j - 1: j + 2].to(device_gpu)\n",
+    "\n",
+    "            Y = train_labels[i][j: j + 1].to(device_gpu)\n",
+    "\n",
+    "            Y_predictions = ner_model(X.long())\n",
+    "\n",
+    "            acc_score += int(torch.argmax(Y_predictions) == Y)\n",
+    "            if torch.argmax(Y_predictions) != 0:\n",
+    "                selected_items += 1\n",
+    "            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
+    "                prec_score += 1\n",
+    "            if Y.item() != 0:\n",
+    "                relevant_items += 1\n",
+    "            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
+    "                recall_score += 1\n",
+    "\n",
+    "            items_total += 1\n",
+    "            optimizer.zero_grad()\n",
+    "            loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
+    "            loss.backward()\n",
+    "            optimizer.step()\n",
+    "\n",
+    "    precision = prec_score / selected_items\n",
+    "    recall = recall_score / relevant_items\n",
+    "    f1_score = (2 * precision * recall) / (precision + recall)\n",
+    "    print(f'epoch: {epoch}')\n",
+    "    print(f'f1: {f1_score}')\n",
+    "    print(f'acc: {acc_score / items_total}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "f75aa5e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_tensors_list(data):\n",
+    "  tensors = []\n",
+    "\n",
+    "  for sent in data[\"a\"]:\n",
+    "    sent_tensor = torch.tensor(())\n",
+    "    for word in sent:\n",
+    "      temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
+    "      sent_tensor = torch.cat((sent_tensor, temp))\n",
+    "\n",
+    "    tensors.append(sent_tensor)\n",
+    "\n",
+    "  return tensors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "49215802",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
+    "dev[\"a\"] = dev[\"a\"].apply(lambda x: x.split())\n",
+    "\n",
+    "dev_tokens_ids = data_process(dev[\"a\"])\n",
+    "\n",
+    "dev_extra_tensors = create_tensors_list(dev)\n",
+    "\n",
+    "dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]\n",
+    "\n",
+    "results = predict(dev_tensors, labels)\n",
+    "results_processed = process_output(results)\n",
+    "\n",
+    "with open(\"dev-0/out.tsv\", \"w\") as f:\n",
+    "  for line in results_processed:\n",
+    "    f.write(line + \"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "8c5b007e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])\n",
+    "test[\"a\"] = test[\"a\"].apply(lambda x: x.split())\n",
+    "\n",
+    "test_tokens_ids = data_process(test[\"a\"])\n",
+    "\n",
+    "test_extra_tensors = create_tensors_list(test)\n",
+    "\n",
+    "test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]\n",
+    "\n",
+    "results = predict(test_tensors, labels)\n",
+    "results_processed = process_output(results)\n",
+    "\n",
+    "with open(\"test-A/out.tsv\", \"w\") as f:\n",
+    "  for line in results_processed:\n",
+    "    f.write(line + \"\\n\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/Program.ipynb
+++ b/Program.ipynb
@ -5,7 +5,16 @@
   "execution_count": 1,
   "id": "e574fca4",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
+      "  warnings.warn(msg)\n"
+     ]
+    }
+   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
@ -23,6 +32,37 @@
    "from gensim.models.word2vec import Word2Vec"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b476f295",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting gensim\n",
+      "  Downloading gensim-4.0.1-cp38-cp38-win_amd64.whl (23.9 MB)\n",
+      "Requirement already satisfied: scipy>=0.18.1 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.6.2)\n",
+      "Collecting Cython==0.29.21\n",
+      "  Downloading Cython-0.29.21-cp38-cp38-win_amd64.whl (1.7 MB)\n",
+      "Requirement already satisfied: numpy>=1.11.3 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.20.1)\n",
+      "Collecting smart-open>=1.8.1\n",
+      "  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)\n",
+      "Installing collected packages: smart-open, Cython, gensim\n",
+      "  Attempting uninstall: Cython\n",
+      "    Found existing installation: Cython 0.29.23\n",
+      "    Uninstalling Cython-0.29.23:\n",
+      "      Successfully uninstalled Cython-0.29.23\n",
+      "Successfully installed Cython-0.29.21 gensim-4.0.1 smart-open-5.1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install gensim"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 2,
@ -106,6 +146,30 @@
    "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "66bee163",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict(input_tokens, labels):\n",
+    "\n",
+    "  results = []\n",
+    "  \n",
+    "  for i in range(len(input_tokens)):\n",
+    "    line_results = []\n",
+    "    for j in range(1, len(input_tokens[i]) - 1):\n",
+    "        x = input_tokens[i][j-1: j+2].to(device_gpu)\n",
+    "        predicted = ner_model(x.long())\n",
+    "        result = torch.argmax(predicted)\n",
+    "        label = labels[result]\n",
+    "        line_results.append(label)\n",
+    "    results.append(line_results)\n",
+    "\n",
+    "  return results"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 7,
@ -113,9 +177,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])\n",
-    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
-    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])"
+    "train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])"
   ]
  },
  {
@ -142,7 +204,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 10,
   "id": "8cc6d19d",
   "metadata": {},
   "outputs": [],
@ -160,20 +222,41 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 15,
+   "id": "690085f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'NVIDIA GeForce RTX 2060'"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.cuda.get_device_name(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
   "id": "64b2d751",
   "metadata": {},
   "outputs": [],
   "source": [
-    "device_cpu = torch.device(\"cpu\")\n",
-    "ner_model = NERModel().to(device_cpu)\n",
+    "device_gpu = torch.device(\"cuda:0\")\n",
+    "ner_model = NERModel().to(device_gpu)\n",
    "criterion = torch.nn.CrossEntropyLoss()\n",
    "optimizer = torch.optim.Adam(ner_model.parameters())"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 17,
   "id": "094d7e69",
   "metadata": {},
   "outputs": [],
@ -184,7 +267,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 18,
   "id": "17291b41",
   "metadata": {},
   "outputs": [],
@ -194,10 +277,32 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "id": "045b7186",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epoch: 0\n",
+      "f1: 0.6373470953763748\n",
+      "acc: 0.9116419913061858\n",
+      "epoch: 1\n",
+      "f1: 0.7973076923076923\n",
+      "acc: 0.9540771782783307\n",
+      "epoch: 2\n",
+      "f1: 0.8640167364016735\n",
+      "acc: 0.9702287410511612\n",
+      "epoch: 3\n",
+      "f1: 0.9038441719055962\n",
+      "acc: 0.9793820591289644\n",
+      "epoch: 4\n",
+      "f1: 0.928903400400047\n",
+      "acc: 0.9850890978100043\n"
+     ]
+    }
+   ],
   "source": [
    "for epoch in range(5):\n",
    "    acc_score = 0\n",
@ -209,9 +314,9 @@
    "    ner_model.train()\n",
    "    for i in range(len(train_labels)):\n",
    "        for j in range(1, len(train_labels[i]) - 1):\n",
-    "            X = train_tensors[i][j - 1: j + 2].to(device_cpu)\n",
+    "            X = train_tensors[i][j - 1: j + 2].to(device_gpu)\n",
    "\n",
-    "            Y = train_labels[i][j: j + 1].to(device_cpu)\n",
+    "            Y = train_labels[i][j: j + 1].to(device_gpu)\n",
    "\n",
    "            Y_predictions = ner_model(X.long())\n",
    "\n",
@ -238,6 +343,75 @@
    "    print(f'f1: {f1_score}')\n",
    "    print(f'acc: {acc_score / items_total}')"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "f75aa5e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_tensors_list(data):\n",
+    "  tensors = []\n",
+    "\n",
+    "  for sent in data[\"a\"]:\n",
+    "    sent_tensor = torch.tensor(())\n",
+    "    for word in sent:\n",
+    "      temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
+    "      sent_tensor = torch.cat((sent_tensor, temp))\n",
+    "\n",
+    "    tensors.append(sent_tensor)\n",
+    "\n",
+    "  return tensors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "49215802",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
+    "dev[\"a\"] = dev[\"a\"].apply(lambda x: x.split())\n",
+    "\n",
+    "dev_tokens_ids = data_process(dev[\"a\"])\n",
+    "\n",
+    "dev_extra_tensors = create_tensors_list(dev)\n",
+    "\n",
+    "dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]\n",
+    "\n",
+    "results = predict(dev_tensors, labels)\n",
+    "results_processed = process_output(results)\n",
+    "\n",
+    "with open(\"dev-0/out.tsv\", \"w\") as f:\n",
+    "  for line in results_processed:\n",
+    "    f.write(line + \"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "8c5b007e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])\n",
+    "test[\"a\"] = test[\"a\"].apply(lambda x: x.split())\n",
+    "\n",
+    "test_tokens_ids = data_process(test[\"a\"])\n",
+    "\n",
+    "test_extra_tensors = create_tensors_list(test)\n",
+    "\n",
+    "test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]\n",
+    "\n",
+    "results = predict(test_tensors, labels)\n",
+    "results_processed = process_output(results)\n",
+    "\n",
+    "with open(\"test-A/out.tsv\", \"w\") as f:\n",
+    "  for line in results_processed:\n",
+    "    f.write(line + \"\\n\")"
+   ]
  }
 ],
 "metadata": {
--- a/Untitled.ipynb
+++ b/Untitled.ipynb
@ -1,331 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "0895b7c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Informacje na temat zakomentowanego kodu oraz wyników znajdują się w README.md \n",
-    "\n",
-    "import pandas as pd\n",
-    "import os.path\n",
-    "import shutil\n",
-    "import torch\n",
-    "import pandas as pd\n",
-    "from torchtext.vocab import Vocab\n",
-    "from collections import Counter\n",
-    "\n",
-    "# class NERModelWithAlpha(torch.nn.Module):\n",
-    "#     def __init__(self,):\n",
-    "#         super(NERModel, self).__init__()\n",
-    "#         self.emb = torch.nn.Embedding(23629,200)\n",
-    "#         self.fc1 = torch.nn.Linear(1200,9)       \n",
-    "\n",
-    "#     def forward(self, x):\n",
-    "#         x = self.emb(x)\n",
-    "#         x = x.reshape(1200) \n",
-    "#         x = self.fc1(x)\n",
-    "#         return x\n",
-    "\n",
-    "class NERModel(torch.nn.Module):\n",
-    "    def __init__(self,):\n",
-    "        super(NERModel, self).__init__()\n",
-    "        self.emb = torch.nn.Embedding(23628,200)\n",
-    "        self.fc1 = torch.nn.Linear(600,9)       \n",
-    "\n",
-    "    def forward(self, x):\n",
-    "        x = self.emb(x)\n",
-    "        x = x.reshape(600) \n",
-    "        x = self.fc1(x)\n",
-    "        return x\n",
-    "\n",
-    "def data_process(dt):\n",
-    "    return [ torch.tensor([vocab['<bos>']] +[vocab[token]  for token in  document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]\n",
-    "    \n",
-    "# def data_process(dt):\n",
-    "#     result = []\n",
-    "#     for document in dt:\n",
-    "#         sentence = [vocab['<bos>'],vocab['<alpha>']]\n",
-    "#         for token in document:\n",
-    "#             sentence += [vocab[token]]\n",
-    "#             sentence += [vocab['<alpha>'] if token.isalpha() else vocab['<notalpha>']]\n",
-    "#         sentence += [vocab['<eos>'],vocab['<alpha>']]\n",
-    "#         result.append(torch.tensor(sentence, dtype = torch.long))\n",
-    "#     return result\n",
-    "\n",
-    "def build_vocab(dataset):\n",
-    "    counter = Counter()\n",
-    "    for document in dataset:\n",
-    "        counter.update(document)\n",
-    "    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>']) #, '<alpha>', '<notalpha>'])\n",
-    "\n",
-    "def labels_process(dt):\n",
-    "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n",
-    "\n",
-    "def process(model, x):\n",
-    "    predicted = model(x)\n",
-    "    result = torch.argmax(predicted)\n",
-    "    return labels[result]\n",
-    "\n",
-    "def process_dataset(model, path):\n",
-    "    with open(path, 'r') as f:\n",
-    "        lines = f.readlines()\n",
-    "        X = [x.split() for x in lines]\n",
-    "    data_tokens_ids = data_process(X)\n",
-    "    results = []\n",
-    "    for i in range(len(data_tokens_ids)):\n",
-    "        line_results = []\n",
-    "        for j in range(1, len(data_tokens_ids[i]) - 1):\n",
-    "#         for j in range(2, len(data_tokens_ids[i]) - 3, 2):\n",
-    "            #x = data_tokens_ids[i][j-2: j+4].to(device_gpu)\n",
-    "            x = data_tokens_ids[i][j-1: j+2].to(device_cpu)\n",
-    "            label = process(model, x)\n",
-    "            line_results.append(label)\n",
-    "        results.append(line_results)\n",
-    "    return results\n",
-    "\n",
-    "# Przetwarzanie danych z wyjścia modelu (gdy B- i I- nie dotyczą tej samej etykiety)\n",
-    "def process_output(lines):\n",
-    "    result = []\n",
-    "    for line in lines:\n",
-    "        last_label = None\n",
-    "        new_line = []\n",
-    "        for label in line:\n",
-    "            if(label != \"O\" and label[0:2] == \"I-\"):\n",
-    "                if last_label == None or last_label == \"O\":\n",
-    "                    label = label.replace('I-', 'B-')\n",
-    "                else:\n",
-    "                    label = \"I-\" + last_label[2:]\n",
-    "            last_label = label\n",
-    "            new_line.append(label)\n",
-    "        result.append(\" \".join(new_line))\n",
-    "    return result\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "b2f73f9e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'] "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "2a94110d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if not os.path.isfile('train/train.tsv'):\n",
-    "    import lzma\n",
-    "    with lzma.open('train/train.tsv.xz', 'rb') as f_in:\n",
-    "        with open('train/train.tsv', 'wb') as f_out:\n",
-    "            shutil.copyfileobj(f_in, f_out)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "02b81af3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = pd.read_csv('train/train.tsv', sep='\\t', names=['iob', 'tokens'])\n",
-    "data[\"iob\"]=data[\"iob\"].apply(lambda x: [labels.index(y) for y in  x.split()])\n",
-    "data[\"tokens\"]=data[\"tokens\"].apply(lambda x: x.split())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "f005db98",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>iob</th>\n",
-       "      <th>tokens</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>[5, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 8, 0, 1, 0, ...</td>\n",
-       "      <td>[EU, rejects, German, call, to, boycott, Briti...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...</td>\n",
-       "      <td>[Rare, Hendrix, song, draft, sells, for, almos...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...</td>\n",
-       "      <td>[China, says, Taiwan, spoils, atmosphere, for,...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>[1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...</td>\n",
-       "      <td>[China, says, time, right, for, Taiwan, talks,...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>[3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...</td>\n",
-       "      <td>[German, July, car, registrations, up, 14.2, p...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>940</th>\n",
-       "      <td>[0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 1, 0, ...</td>\n",
-       "      <td>[CYCLING, -, BALLANGER, KEEPS, SPRINT, TITLE, ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>941</th>\n",
-       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...</td>\n",
-       "      <td>[CYCLING, -, WORLD, TRACK, CHAMPIONSHIP, RESUL...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>942</th>\n",
-       "      <td>[0, 0, 3, 0, 7, 0, 5, 0, 0, 1, 0, 1, 0, 0, 3, ...</td>\n",
-       "      <td>[SOCCER, -, FRENCH, DEFENDER, KOMBOUARE, JOINS...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>943</th>\n",
-       "      <td>[0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...</td>\n",
-       "      <td>[MOTORCYCLING, -, SAN, MARINO, GRAND, PRIX, PR...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>944</th>\n",
-       "      <td>[0, 0, 3, 4, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...</td>\n",
-       "      <td>[GOLF, -, BRITISH, MASTERS, THIRD, ROUND, SCOR...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>945 rows × 2 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                   iob  \\\n",
-       "0    [5, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 8, 0, 1, 0, ...   \n",
-       "1    [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...   \n",
-       "2    [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...   \n",
-       "3    [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...   \n",
-       "4    [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...   \n",
-       "..                                                 ...   \n",
-       "940  [0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 1, 0, ...   \n",
-       "941  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...   \n",
-       "942  [0, 0, 3, 0, 7, 0, 5, 0, 0, 1, 0, 1, 0, 0, 3, ...   \n",
-       "943  [0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...   \n",
-       "944  [0, 0, 3, 4, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...   \n",
-       "\n",
-       "                                                tokens  \n",
-       "0    [EU, rejects, German, call, to, boycott, Briti...  \n",
-       "1    [Rare, Hendrix, song, draft, sells, for, almos...  \n",
-       "2    [China, says, Taiwan, spoils, atmosphere, for,...  \n",
-       "3    [China, says, time, right, for, Taiwan, talks,...  \n",
-       "4    [German, July, car, registrations, up, 14.2, p...  \n",
-       "..                                                 ...  \n",
-       "940  [CYCLING, -, BALLANGER, KEEPS, SPRINT, TITLE, ...  \n",
-       "941  [CYCLING, -, WORLD, TRACK, CHAMPIONSHIP, RESUL...  \n",
-       "942  [SOCCER, -, FRENCH, DEFENDER, KOMBOUARE, JOINS...  \n",
-       "943  [MOTORCYCLING, -, SAN, MARINO, GRAND, PRIX, PR...  \n",
-       "944  [GOLF, -, BRITISH, MASTERS, THIRD, ROUND, SCOR...  \n",
-       "\n",
-       "[945 rows x 2 columns]"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "4a114973",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<torchtext.vocab.Vocab at 0x7ff2dd0edac0>"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "vocab = build_vocab(data['tokens'])\n",
-    "vocab"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "c666872d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "device_cpu = torch.device(\"cpu\")\n",
-    "ner_model = NERModel().to(device_cpu)\n",
-    "criterion = torch.nn.CrossEntropyLoss()\n",
-    "optimizer = torch.optim.Adam(ner_model.parameters())"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/Untitled1.ipynb
+++ b/Untitled1.ipynb
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/solution.py
+++ b/solution.py
@ -0,0 +1,186 @@
+import pandas as pd
+import numpy as np
+import csv
+import os.path
+import shutil
+import torch
+from tqdm import tqdm
+from itertools import islice
+from sklearn.model_selection import train_test_split
+from torchtext.vocab import Vocab
+from collections import Counter
+from nltk.tokenize import word_tokenize
+import gensim.downloader as api
+from gensim.models.word2vec import Word2Vec
+
+class NERModel(torch.nn.Module):
+    """Window-based tagger: embedding lookup followed by one linear layer.
+
+    The ids of ``window`` tokens are embedded, the embeddings are flattened
+    into a single vector and a linear layer maps that vector to one score
+    per label.
+
+    Args:
+        vocab_size: number of rows in the embedding table.
+        embedding_dim: size of each token embedding.
+        window: number of token ids ``forward`` expects.
+        num_labels: number of scores produced by ``forward``.
+
+    The defaults reproduce the sizes that were previously hard-coded
+    (a 23628 x 200 embedding table and a 600 -> 9 linear layer), so
+    ``NERModel()`` builds exactly the same network as before.
+    """
+
+    def __init__(self, vocab_size=23628, embedding_dim=200, window=3, num_labels=9):
+        super().__init__()
+        # Length of the flattened window; also the input width of fc1.
+        self.input_dim = window * embedding_dim
+        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
+        self.fc1 = torch.nn.Linear(self.input_dim, num_labels)
+
+    def forward(self, x):
+        """Return the unnormalised label scores for one window of token ids.
+
+        Args:
+            x: 1-D integer tensor holding ``window`` token ids.
+
+        Returns:
+            1-D tensor with ``num_labels`` scores.
+        """
+        x = self.emb(x)
+        # Flatten (window, embedding_dim) into one vector; a window of the
+        # wrong length fails here instead of being silently accepted.
+        x = x.reshape(self.input_dim)
+        return self.fc1(x)
+
+def process_output(lines):
+    """Repair raw label sequences so that every entity starts with a ``B-`` tag.
+
+    An ``I-`` label that opens a sequence or directly follows ``O`` becomes
+    the matching ``B-`` label.  An ``I-`` label that follows another entity
+    label takes over that label's entity type.  All other labels are left
+    untouched.
+
+    Args:
+        lines: iterable of label sequences, one sequence per sentence.
+
+    Returns:
+        A list with one space-joined label string per input sequence.
+    """
+    result = []
+    for line in lines:
+        last_label = None
+        new_line = []
+        for label in line:
+            if label.startswith("I-"):
+                if last_label is None or last_label == "O":
+                    # Nothing to continue: this token opens a new entity.
+                    label = "B-" + label[2:]
+                else:
+                    # Continue the running entity, keeping its type.
+                    label = "I-" + last_label[2:]
+            last_label = label
+            new_line.append(label)
+        # Join once per sentence; joining inside the token loop was unused
+        # work that made the function quadratic in sentence length.
+        result.append(" ".join(new_line))
+    return result
+
+def build_vocab(dataset):
+    """Count every token in ``dataset`` and wrap the counts in a ``Vocab``.
+
+    Args:
+        dataset: iterable of documents, each document an iterable of tokens.
+
+    Returns:
+        A ``torchtext.vocab.Vocab`` built from the token frequencies, with
+        ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>`` as special tokens.
+    """
+    token_counts = Counter()
+    for tokens in dataset:
+        token_counts.update(tokens)
+    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
+    return Vocab(token_counts, specials=special_tokens)
+
+def data_process(dt):
+    """Encode tokenised documents as id tensors framed by ``<bos>``/``<eos>``.
+
+    Token ids come from the module-level ``vocab``.
+
+    Args:
+        dt: iterable of documents, each document an iterable of tokens.
+
+    Returns:
+        A list with one ``torch.long`` tensor per document.
+    """
+    encoded = []
+    for document in dt:
+        ids = [vocab['<bos>']]
+        ids += [vocab[token] for token in document]
+        ids += [vocab['<eos>']]
+        encoded.append(torch.tensor(ids, dtype=torch.long))
+    return encoded
+
+def labels_process(dt):
+    """Turn label-id lists into tensors padded with a 0 on both ends.
+
+    Args:
+        dt: iterable of lists of integer label ids.
+
+    Returns:
+        A list with one ``torch.long`` tensor per input list; each tensor is
+        two elements longer than its source list.
+    """
+    padded = []
+    for document in dt:
+        framed = [0] + document + [0]
+        padded.append(torch.tensor(framed, dtype=torch.long))
+    return padded
+
+def predict(input_tokens, labels):
+    """Label every interior position of each sequence with ``ner_model``.
+
+    For each position ``j`` (first and last element excluded) the window
+    ``[j-1, j, j+1]`` is fed to the module-level ``ner_model`` on
+    ``device_gpu`` and the highest-scoring label is recorded.
+
+    Args:
+        input_tokens: list of 1-D tensors of token ids.
+        labels: list mapping a class index to its label string.
+
+    Returns:
+        A list of label lists; each inner list has ``len(sequence) - 2``
+        entries.
+    """
+    results = []
+
+    # Inference only: without no_grad() every forward pass records an
+    # autograd graph that is never used.
+    with torch.no_grad():
+        for sequence in input_tokens:
+            line_results = []
+            for j in range(1, len(sequence) - 1):
+                x = sequence[j - 1: j + 2].to(device_gpu)
+                predicted = ner_model(x.long())
+                # .item() gives a plain int for indexing the label list.
+                best = torch.argmax(predicted).item()
+                line_results.append(labels[best])
+            results.append(line_results)
+
+    return results
+
+# Training data: column "a" holds the space-separated labels, column "b" the
+# space-separated tokens of one document.
+train = pd.read_csv('train/train.tsv.xz', sep='\t', names=['a', 'b'])
+
+# A label's position in this list is its class id.
+labels = ['O', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
+train["a"] = train["a"].apply(lambda x: [labels.index(y) for y in x.split()])
+train["b"] = train["b"].apply(lambda x: x.split())
+
+vocab = build_vocab(train['b'])
+
+# Two flags per word (starts with an upper-case letter, starts with a digit),
+# flattened into one float tensor per sentence.  This section used to be
+# indented at module level, which is an IndentationError; it is also built
+# with a single torch.tensor call instead of one torch.cat per word.
+tensors = [
+    torch.tensor(
+        [flag for word in sent for flag in (word[0].isupper(), word[0].isdigit())],
+        dtype=torch.float32,
+    )
+    for sent in train["b"]
+]
+
+# Keep the historical name, but fall back to the CPU so the script still runs
+# on machines without CUDA.
+device_gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ner_model = NERModel().to(device_gpu)
+criterion = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(ner_model.parameters())
+
+train_labels = labels_process(train['a'])
+train_tokens_ids = data_process(train['b'])
+
+# Token ids followed by the per-word flags of the same sentence.
+train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]
+
+# Train for five epochs, one optimiser step per token, and report F1 and
+# accuracy measured on the training predictions of each epoch.
+for epoch in range(5):
+    acc_score = 0
+    prec_score = 0
+    selected_items = 0
+    recall_score = 0
+    relevant_items = 0
+    items_total = 0
+    ner_model.train()
+    for sentence, sentence_labels in zip(train_tensors, train_labels):
+        for j in range(1, len(sentence_labels) - 1):
+            # Window of three ids centred on position j, and its gold label.
+            X = sentence[j - 1: j + 2].to(device_gpu)
+            Y = sentence_labels[j: j + 1].to(device_gpu)
+
+            Y_predictions = ner_model(X.long())
+
+            # Compute the prediction once instead of once per comparison.
+            predicted = torch.argmax(Y_predictions).item()
+            gold = Y.item()
+
+            acc_score += int(predicted == gold)
+            # Class 0 is "O"; only non-"O" items count for precision/recall.
+            if predicted != 0:
+                selected_items += 1
+                if predicted == gold:
+                    prec_score += 1
+            if gold != 0:
+                relevant_items += 1
+                if predicted == gold:
+                    recall_score += 1
+
+            items_total += 1
+            optimizer.zero_grad()
+            loss = criterion(Y_predictions.unsqueeze(0), Y)
+            loss.backward()
+            optimizer.step()
+
+    # An epoch with no predicted or no gold entities (or no items at all)
+    # must report 0 rather than crash with ZeroDivisionError.
+    precision = prec_score / selected_items if selected_items else 0.0
+    recall = recall_score / relevant_items if relevant_items else 0.0
+    if precision + recall:
+        f1_score = (2 * precision * recall) / (precision + recall)
+    else:
+        f1_score = 0.0
+    accuracy = acc_score / items_total if items_total else 0.0
+    print(f'epoch: {epoch}')
+    print(f'f1: {f1_score}')
+    print(f'acc: {accuracy}')
+
+def create_tensors_list(data, column="a"):
+    """Build one tensor of first-character flags per sentence.
+
+    Every word contributes two consecutive values: whether its first
+    character is upper-case and whether it is a digit.
+
+    Args:
+        data: mapping or DataFrame whose ``column`` entry holds the
+            sentences, each sentence a list of non-empty words.
+        column: key of the sentence column; defaults to ``"a"``, the column
+            that used to be hard-coded.
+
+    Returns:
+        A list of 1-D ``torch.float32`` tensors, each of length
+        ``2 * len(sentence)``.
+    """
+    tensors = []
+
+    for sent in data[column]:
+        flags = []
+        for word in sent:
+            flags.append(word[0].isupper())
+            flags.append(word[0].isdigit())
+        # One tensor per sentence; growing it with torch.cat per word copied
+        # the whole tensor each time.  float32 matches what the concatenation
+        # with an empty default tensor used to produce.
+        tensors.append(torch.tensor(flags, dtype=torch.float32))
+
+    return tensors
+
+# Label the dev-0 and test-A inputs and write one line of labels per input line.
+#
+# predict() emits a label for every interior position of the tensor it is
+# given.  Appending the 2n first-character flags to the n token ids therefore
+# produced 3n labels for an n-token line, so predict() now receives only the
+# <bos> ... <eos> token ids and returns exactly one label per token.
+dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['a'])
+dev["a"] = dev["a"].apply(lambda x: x.split())
+
+dev_tokens_ids = data_process(dev["a"])
+
+results = predict(dev_tokens_ids, labels)
+results_processed = process_output(results)
+
+with open("dev-0/out.tsv", "w") as f:
+    f.writelines(line + "\n" for line in results_processed)
+
+test = pd.read_csv('test-A/in.tsv', sep='\t', names=['a'])
+test["a"] = test["a"].apply(lambda x: x.split())
+
+test_tokens_ids = data_process(test["a"])
+
+results = predict(test_tokens_ids, labels)
+results_processed = process_output(results)
+
+with open("test-A/out.tsv", "w") as f:
+    f.writelines(line + "\n" for line in results_processed)
+    
--- a/test-A/out.tsv
+++ b/test-A/out.tsv