diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..c654f3f --- /dev/null +++ b/main.ipynb @@ -0,0 +1,770 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "main.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "OY5VomOSCBez" + }, + "source": [ + "import numpy as np\n", + "import gensim\n", + "import torch\n", + "import pandas as pd\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from torchtext.vocab import Vocab\n", + "from collections import Counter\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "import lzma\n", + "import re\n", + "import itertools" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VXcowLY6HlNC" + }, + "source": [ + "class NeuralNetworkModel(torch.nn.Module):\n", + "\n", + " def __init__(self, output_size):\n", + " super(NeuralNetworkModel, self).__init__()\n", + " self.fc1 = torch.nn.Linear(10_000,len(train_tokens_ids))\n", + " self.softmax = torch.nn.Softmax(dim=0)\n", + " \n", + "\n", + " def forward(self, x):\n", + " x = self.fc1(x)\n", + " x = self.softmax(x)\n", + " return x" + ], + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "OXX_vPpTHhOq" + }, + "source": [ + "class NERModel(torch.nn.Module):\n", + "\n", + " def __init__(self,):\n", + " super(NERModel, self).__init__()\n", + " self.emb = torch.nn.Embedding(23627,200)\n", + " self.fc1 = torch.nn.Linear(600,9)\n", + "\n", + " def forward(self, x):\n", + " x = self.emb(x)\n", + " x = x.reshape(600) \n", + " x = self.fc1(x)\n", + " return x" + ], + "execution_count": 23, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "NNpGPta9C4TI" + }, + "source": [ + "def get_dataset(path):\n", + " data = lzma.open(path).read().decode('UTF-8').split('\\n')\n", + " return [line.split('\\t') for line in data][:-1]\n", + "\n", + "train_data = get_dataset('train.tsv.xz')\n", + "\n", + "tokens = []\n", + "ner_tags = []\n", + "\n", + "for i in train_data:\n", + " ner_tags.append(i[0].split())\n", + " tokens.append(i[1].split())\n", + "\n", + "ner_tags_set = list(set(itertools.chain(*ner_tags)))\n", + "\n", + "ner_tags_dictionary = {}\n", + "\n", + "for i in range(len(ner_tags_set)):\n", + " ner_tags_dictionary[ner_tags_set[i]] = i" + ], + "execution_count": 46, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vvOF0opUGEMN" + }, + "source": [ + "for i in range(len(ner_tags)):\n", + " for j in range(len(ner_tags[i])):\n", + " ner_tags[i][j] = ner_tags_dictionary[ner_tags[i][j]]\n", + "\n", + "def data_preprocessing(data):\n", + " return [ torch.tensor([vocab['']] +[vocab[token] for token in document ] + [vocab['']], dtype = torch.long) for document in data ]\n", + "\n", + "def labels_preprocessing(data):\n", + " return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in data ]\n", + "\n", + "def build_vocab(dataset):\n", + " counter = Counter()\n", + " for document in dataset:\n", + " counter.update(document)\n", + " return Vocab(counter, specials=['', '', '', ''])\n", + "\n", + "\n", + "vocab = build_vocab(tokens)\n", + "train_tokens_ids = data_preprocessing(tokens)\n", + "train_labels = labels_preprocessing(ner_tags)" + ], + "execution_count": 47, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 437 + }, + "id": "yoCYSZNeHJeT", + "outputId": "78acbcd4-ca6e-4702-8e91-d1a906b2e252" + }, + "source": [ + "nn_model = NeuralNetworkModel(len(train_tokens_ids))\n", + "train_tokens_ids[0][1:4]\n", + "\n", + "ner_model = NERModel()\n", + "ner_model(train_tokens_ids[0][1:4])\n", + "\n", + "criterion = torch.nn.CrossEntropyLoss()\n", + "optimizer = torch.optim.Adam(ner_model.parameters())\n", + "\n", + "for epoch in range(2):\n", + " loss_score = 0\n", + " acc_score = 0\n", + " prec_score = 0\n", + " selected_items = 0\n", + " recall_score = 0\n", + " relevant_items = 0\n", + " items_total = 0\n", + " nn_model.train()\n", + " for i in range(100):\n", + " for j in range(1, len(train_labels[i]) - 1):\n", + " \n", + " X = train_tokens_ids[i][j-1: j+2]\n", + " Y = train_labels[i][j: j+1]\n", + "\n", + " Y_predictions = ner_model(X)\n", + " \n", + " \n", + " acc_score += int(torch.argmax(Y_predictions) == Y)\n", + " \n", + " if torch.argmax(Y_predictions) != 0:\n", + " selected_items +=1\n", + " if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n", + " prec_score += 1\n", + " \n", + " if Y.item() != 0:\n", + " relevant_items +=1\n", + " if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n", + " recall_score += 1\n", + " \n", + " items_total += 1\n", + "\n", + " \n", + " optimizer.zero_grad()\n", + " loss = criterion(Y_predictions.unsqueeze(0), Y)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + "\n", + " loss_score += loss.item() \n", + " \n", + " precision = prec_score / selected_items\n", + " recall = recall_score / relevant_items\n", + " f1_score = (2*precision * recall) / (precision + recall)\n", + " display('epoch: ', epoch)\n", + " display('loss: ', loss_score / items_total)\n", + " display('acc: ', acc_score / items_total)\n", + " display('prec: ', precision)\n", + " display('recall: : ', recall)\n", + " display('f1: ', f1_score)" + ], + "execution_count": 27, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'epoch: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'loss: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.5326548681839177" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'acc: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.856584693173983" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'prec: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.8661894535910284" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'recall: : '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.8678875394472602" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'f1: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.8670376650982827" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'epoch: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "1" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'loss: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.28523138210252" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'acc: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.9227304068030338" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'prec: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.929291481534566" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'recall: : '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.9300468585636416" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'f1: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.9296690166089138" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "KOVSTjGWVuq9" + }, + "source": [ + "with open('dev-0/in.tsv', \"r\", encoding=\"utf-8\") as f:\n", + " dev_0_data = [line.rstrip() for line in f]\n", + " \n", + "dev_0_data = [i.split() for i in dev_0_data]\n", + "with open('dev-0/expected.tsv', \"r\", encoding=\"utf-8\") as f:\n", + " dev_0_tags = [line.rstrip() for line in f]\n", + " \n", + "dev_0_tags = [i.split() for i in dev_0_tags]\n", + "for i in range(len(dev_0_tags)):\n", + " for j in range(len(dev_0_tags[i])):\n", + " dev_0_tags[i][j] = ner_tags_dictionary[dev_0_tags[i][j]]\n", + "test_tokens_ids = data_preprocessing(dev_0_data)\n", + "test_labels = labels_preprocessing(dev_0_tags)\n" + ], + "execution_count": 41, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 192 + }, + "id": "Pt7sVRdhWCqC", + "outputId": "acd75e73-e26f-48f6-f36c-a96668822968" + }, + "source": [ + "result = []\n", + "\n", + "loss_score = 0\n", + "acc_score = 0\n", + "prec_score = 0\n", + "selected_items = 0\n", + "recall_score = 0\n", + "relevant_items = 0\n", + "items_total = 0\n", + "nn_model.eval()\n", + "\n", + "for i in range(len(test_tokens_ids)):\n", + " result.append([])\n", + " for j in range(1, len(test_labels[i]) - 1):\n", + "\n", + " X = test_tokens_ids[i][j-1: j+2]\n", + " Y = test_labels[i][j: j+1]\n", + "\n", + " Y_predictions = ner_model(X)\n", + "\n", + "\n", + " acc_score += int(torch.argmax(Y_predictions) == Y)\n", + "\n", + " if torch.argmax(Y_predictions) != 0:\n", + " selected_items +=1\n", + " if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n", + " prec_score += 1\n", + "\n", + " if Y.item() != 0:\n", + " relevant_items +=1\n", + " if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n", + " recall_score += 1\n", + "\n", + " items_total += 1\n", + " loss = criterion(Y_predictions.unsqueeze(0), Y)\n", + " loss_score += loss.item() \n", + " \n", + " result[i].append(int(torch.argmax(Y_predictions)))\n", + "\n", + "precision = prec_score / selected_items\n", + "recall = recall_score / relevant_items\n", + "f1_score = (2*precision * recall) / (precision + recall)\n", + "display('loss: ', loss_score / items_total)\n", + "display('acc: ', acc_score / items_total)\n", + "display('prec: ', precision)\n", + "display('recall: : ', recall)\n", + "display('f1: ', f1_score)" + ], + "execution_count": 42, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'loss: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.7647399755562154" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'acc: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.8461623270428695" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'prec: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.870708854926657" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'recall: : '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.8624788421673228" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'f1: '" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "0.8665743085080972" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "d-QCHMrycKwH" + }, + "source": [ + "with open('test-A/in.tsv', \"r\", encoding=\"utf-8\") as f:\n", + " test_data = [line.rstrip() for line in f]\n", + " \n", + "test_data = [i.split() for i in test_data]\n", + "test_tokens_ids = data_preprocessing(test_data)\n", + "result = []\n", + "\n", + "loss_score = 0\n", + "acc_score = 0\n", + "prec_score = 0\n", + "selected_items = 0\n", + "recall_score = 0\n", + "relevant_items = 0\n", + "items_total = 0\n", + "nn_model.eval()\n", + "\n", + "test_tokens_length = len(test_tokens_ids)\n", + "\n", + "for i in range(test_tokens_length):\n", + " result.append([])\n", + " for j in range(1, len(test_tokens_ids[i]) - 1):\n", + "\n", + " X = test_tokens_ids[i][j-1: j + 2]\n", + "\n", + " Y_predictions = ner_model(X)\n", + " result[i].append(int(torch.argmax(Y_predictions)))" + ], + "execution_count": 49, + "outputs": [] + } + ] +} \ No newline at end of file