Compare commits


5 Commits

SHA1 Message Date
17184e30e8 fix 2021-06-24 19:07:58 +02:00
142eed56c0 cos tam ("something") 2021-06-22 19:27:08 +02:00
c6aaaf6544 solution 2021-06-09 03:01:30 +02:00
d6b3d1c0d1 zmiana pc ("changed PC") 2021-06-09 00:19:16 +02:00
e26b491316 geval 2021-06-08 12:39:08 +02:00
11 changed files with 3124 additions and 0 deletions


@@ -0,0 +1,438 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e574fca4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import csv\n",
"import os.path\n",
"import shutil\n",
"import torch\n",
"from tqdm import tqdm\n",
"from itertools import islice\n",
"from sklearn.model_selection import train_test_split\n",
"from torchtext.vocab import Vocab\n",
"from collections import Counter\n",
"from nltk.tokenize import word_tokenize\n",
"import gensim.downloader as api\n",
"from gensim.models.word2vec import Word2Vec"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b476f295",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting gensim\n",
" Downloading gensim-4.0.1-cp38-cp38-win_amd64.whl (23.9 MB)\n",
"Requirement already satisfied: scipy>=0.18.1 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.6.2)\n",
"Collecting Cython==0.29.21\n",
" Downloading Cython-0.29.21-cp38-cp38-win_amd64.whl (1.7 MB)\n",
"Requirement already satisfied: numpy>=1.11.3 in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (from gensim) (1.20.1)\n",
"Collecting smart-open>=1.8.1\n",
" Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)\n",
"Installing collected packages: smart-open, Cython, gensim\n",
" Attempting uninstall: Cython\n",
" Found existing installation: Cython 0.29.23\n",
" Uninstalling Cython-0.29.23:\n",
" Successfully uninstalled Cython-0.29.23\n",
"Successfully installed Cython-0.29.21 gensim-4.0.1 smart-open-5.1.0\n"
]
}
],
"source": [
"!pip install gensim"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fbe3a657",
"metadata": {},
"outputs": [],
"source": [
"class NERModel(torch.nn.Module):\n",
"\n",
" def __init__(self,):\n",
" super(NERModel, self).__init__()\n",
" self.emb = torch.nn.Embedding(23628,200)\n",
" self.fc1 = torch.nn.Linear(600,9)\n",
" \n",
"\n",
" def forward(self, x):\n",
" x = self.emb(x)\n",
" x = x.reshape(600) \n",
" x = self.fc1(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3497a580",
"metadata": {},
"outputs": [],
"source": [
"def process_output(lines):\n",
" result = []\n",
" for line in lines:\n",
" last_label = None\n",
" new_line = []\n",
" for label in line:\n",
" if(label != \"O\" and label[0:2] == \"I-\"):\n",
" if last_label == None or last_label == \"O\":\n",
" label = label.replace('I-', 'B-')\n",
" else:\n",
" label = \"I-\" + last_label[2:]\n",
" last_label = label\n",
" new_line.append(label)\n",
" x = (\" \".join(new_line))\n",
" result.append(\" \".join(new_line))\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3e78d902",
"metadata": {},
"outputs": [],
"source": [
"def build_vocab(dataset):\n",
" counter = Counter()\n",
" for document in dataset:\n",
" counter.update(document)\n",
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ec8537cf",
"metadata": {},
"outputs": [],
"source": [
"def data_process(dt):\n",
" return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "847c958a",
"metadata": {},
"outputs": [],
"source": [
"def labels_process(dt):\n",
" return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "66bee163",
"metadata": {},
"outputs": [],
"source": [
"def predict(input_tokens, labels):\n",
"\n",
" results = []\n",
" \n",
" for i in range(len(input_tokens)):\n",
" line_results = []\n",
" for j in range(1, len(input_tokens[i]) - 1):\n",
" x = input_tokens[i][j-1: j+2].to(device_gpu)\n",
" predicted = ner_model(x.long())\n",
" result = torch.argmax(predicted)\n",
" label = labels[result]\n",
" line_results.append(label)\n",
" results.append(line_results)\n",
"\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "39046f3f",
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9b40a8b6",
"metadata": {},
"outputs": [],
"source": [
"labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'] \n",
"train[\"a\"]=train[\"a\"].apply(lambda x: [labels.index(y) for y in x.split()])\n",
"train[\"b\"]=train[\"b\"].apply(lambda x: x.split())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "02a12cbd",
"metadata": {},
"outputs": [],
"source": [
"vocab = build_vocab(train['b'])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "8cc6d19d",
"metadata": {},
"outputs": [],
"source": [
" tensors = []\n",
"\n",
" for sent in train[\"b\"]:\n",
" sent_tensor = torch.tensor(())\n",
" for word in sent:\n",
" temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
" sent_tensor = torch.cat((sent_tensor, temp))\n",
"\n",
" tensors.append(sent_tensor)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "690085f6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'NVIDIA GeForce RTX 2060'"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.cuda.get_device_name(0)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "64b2d751",
"metadata": {},
"outputs": [],
"source": [
"device_gpu = torch.device(\"cuda:0\")\n",
"ner_model = NERModel().to(device_gpu)\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(ner_model.parameters())"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "094d7e69",
"metadata": {},
"outputs": [],
"source": [
"train_labels = labels_process(train['a'])\n",
"train_tokens_ids = data_process(train['b'])\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "17291b41",
"metadata": {},
"outputs": [],
"source": [
"train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "045b7186",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch: 0\n",
"f1: 0.6373470953763748\n",
"acc: 0.9116419913061858\n",
"epoch: 1\n",
"f1: 0.7973076923076923\n",
"acc: 0.9540771782783307\n",
"epoch: 2\n",
"f1: 0.8640167364016735\n",
"acc: 0.9702287410511612\n",
"epoch: 3\n",
"f1: 0.9038441719055962\n",
"acc: 0.9793820591289644\n",
"epoch: 4\n",
"f1: 0.928903400400047\n",
"acc: 0.9850890978100043\n"
]
}
],
"source": [
"for epoch in range(5):\n",
" acc_score = 0\n",
" prec_score = 0\n",
" selected_items = 0\n",
" recall_score = 0\n",
" relevant_items = 0\n",
" items_total = 0\n",
" ner_model.train()\n",
" for i in range(len(train_labels)):\n",
" for j in range(1, len(train_labels[i]) - 1):\n",
" X = train_tensors[i][j - 1: j + 2].to(device_gpu)\n",
"\n",
" Y = train_labels[i][j: j + 1].to(device_gpu)\n",
"\n",
" Y_predictions = ner_model(X.long())\n",
"\n",
" acc_score += int(torch.argmax(Y_predictions) == Y)\n",
" if torch.argmax(Y_predictions) != 0:\n",
" selected_items += 1\n",
" if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
" prec_score += 1\n",
" if Y.item() != 0:\n",
" relevant_items += 1\n",
" if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
" recall_score += 1\n",
"\n",
" items_total += 1\n",
" optimizer.zero_grad()\n",
" loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" precision = prec_score / selected_items\n",
" recall = recall_score / relevant_items\n",
" f1_score = (2 * precision * recall) / (precision + recall)\n",
" print(f'epoch: {epoch}')\n",
" print(f'f1: {f1_score}')\n",
" print(f'acc: {acc_score / items_total}')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "f75aa5e2",
"metadata": {},
"outputs": [],
"source": [
"def create_tensors_list(data):\n",
" tensors = []\n",
"\n",
" for sent in data[\"a\"]:\n",
" sent_tensor = torch.tensor(())\n",
" for word in sent:\n",
" temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
" sent_tensor = torch.cat((sent_tensor, temp))\n",
"\n",
" tensors.append(sent_tensor)\n",
"\n",
" return tensors"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "49215802",
"metadata": {},
"outputs": [],
"source": [
"dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
"dev[\"a\"] = dev[\"a\"].apply(lambda x: x.split())\n",
"\n",
"dev_tokens_ids = data_process(dev[\"a\"])\n",
"\n",
"dev_extra_tensors = create_tensors_list(dev)\n",
"\n",
"dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]\n",
"\n",
"results = predict(dev_tensors, labels)\n",
"results_processed = process_output(results)\n",
"\n",
"with open(\"dev-0/out.tsv\", \"w\") as f:\n",
" for line in results_processed:\n",
" f.write(line + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "8c5b007e",
"metadata": {},
"outputs": [],
"source": [
"test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])\n",
"test[\"a\"] = test[\"a\"].apply(lambda x: x.split())\n",
"\n",
"test_tokens_ids = data_process(test[\"a\"])\n",
"\n",
"test_extra_tensors = create_tensors_list(test)\n",
"\n",
"test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]\n",
"\n",
"results = predict(test_tensors, labels)\n",
"results_processed = process_output(results)\n",
"\n",
"with open(\"test-A/out.tsv\", \"w\") as f:\n",
" for line in results_processed:\n",
" f.write(line + \"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
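Note on the notebook above: NERModel classifies one token at a time from a three-token context window. Each of the three token ids is embedded into 200 dimensions, the window is flattened into a single 600-dimensional feature vector, and a linear layer maps it to scores over the 9 BIO labels. A minimal shape check, as a sketch (the vocabulary size 23628 and the 9 labels come from the notebook; the token ids are made up):

import torch

emb = torch.nn.Embedding(23628, 200)  # one 200-dim vector per vocabulary entry
fc1 = torch.nn.Linear(600, 9)         # 3 tokens x 200 dims -> 9 label scores

window = torch.tensor([15, 4021, 7])  # previous, current, next token id (made up)
x = emb(window)                       # shape (3, 200)
x = x.reshape(600)                    # flatten the window into one feature vector
scores = fc1(x)                       # shape (9,): one logit per BIO label
print(scores.shape)                   # torch.Size([9])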


@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -0,0 +1,376 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bce0cfa7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"from os import sep\n",
"from nltk import word_tokenize\n",
"import pandas as pd\n",
"import torch\n",
"from TorchCRF import CRF\n",
"import gensim\n",
"from torch._C import device\n",
"from tqdm import tqdm\n",
"from torchtext.vocab import Vocab\n",
"from collections import Counter, OrderedDict\n",
"import spacy\n",
"\n",
"\n",
"from torch.utils.data import DataLoader\n",
"import numpy as np\n",
"from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
"import csv\n",
"import pickle\n",
"\n",
"import lzma\n",
"import re\n",
"import itertools"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "67ace382",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pytorch-crf in c:\\users\\grzyb\\anaconda3\\lib\\site-packages (0.7.2)\n"
]
}
],
"source": [
"!pip3 install pytorch-crf"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "adc9a4de",
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'torchcrf'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-3-2a643b4fc1bb>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 20\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mtorchcrf\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mCRF\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torchcrf'"
]
}
],
"source": [
"import numpy as np\n",
"import gensim\n",
"import torch\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"from torchtext.vocab import Vocab\n",
"from collections import Counter\n",
"\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"from tqdm.notebook import tqdm\n",
"\n",
"import torch\n",
"from torchcrf import CRF"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6695751c",
"metadata": {},
"outputs": [],
"source": [
"def build_vocab(dataset):\n",
" counter = Counter()\n",
" for document in dataset:\n",
" counter.update(document)\n",
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d247e4fe",
"metadata": {},
"outputs": [],
"source": [
"def data_process(dt, vocab):\n",
" return [torch.tensor([vocab[token] for token in document], dtype=torch.long) for document in dt]\n",
"\n",
"\n",
"def get_scores(y_true, y_pred):\n",
" acc_score = 0\n",
" tp = 0\n",
" fp = 0\n",
" selected_items = 0\n",
" relevant_items = 0\n",
" for p, t in zip(y_pred, y_true):\n",
" if p == t:\n",
" acc_score += 1\n",
" if p > 0 and p == t:\n",
" tp += 1\n",
" if p > 0:\n",
" selected_items += 1\n",
" if t > 0:\n",
" relevant_items += 1\n",
"\n",
" if selected_items == 0:\n",
" precision = 1.0\n",
" else:\n",
" precision = tp / selected_items\n",
"\n",
" if relevant_items == 0:\n",
" recall = 1.0\n",
" else:\n",
" recall = tp / relevant_items\n",
"\n",
" if precision + recall == 0.0:\n",
" f1 = 0.0\n",
" else:\n",
" f1 = 2 * precision * recall / (precision + recall)\n",
"\n",
" return precision, recall, f1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6061642",
"metadata": {},
"outputs": [],
"source": [
"def process_output(lines):\n",
" result = []\n",
" for line in lines:\n",
" last_label = None\n",
" new_line = []\n",
" for label in line:\n",
" if(label != \"O\" and label[0:2] == \"I-\"):\n",
" if last_label == None or last_label == \"O\":\n",
" label = label.replace('I-', 'B-')\n",
" else:\n",
" label = \"I-\" + last_label[2:]\n",
" last_label = label\n",
" new_line.append(label)\n",
" x = (\" \".join(new_line))\n",
" result.append(\" \".join(new_line))\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d7c4dd3",
"metadata": {},
"outputs": [],
"source": [
"class GRU(torch.nn.Module):\n",
" def __init__(self):\n",
" super(GRU, self).__init__()\n",
" self.emb = torch.nn.Embedding(len(vocab_x.itos),100)\n",
" self.dropout = torch.nn.Dropout(0.2)\n",
" self.rec = torch.nn.GRU(100, 256, 2, batch_first = True, bidirectional = True)\n",
" self.fc1 = torch.nn.Linear(2* 256 , 9)\n",
" \n",
" def forward(self, x):\n",
" emb = torch.relu(self.emb(x))\n",
" emb = self.dropout(emb) \n",
" gru_output, h_n = self.rec(emb) \n",
" out_weights = self.fc1(gru_output)\n",
" return out_weights"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd5e419d",
"metadata": {},
"outputs": [],
"source": [
"def dev_eval(model, crf, dev_tokens, dev_labels_tokens, vocab):\n",
" Y_true = []\n",
" Y_pred = []\n",
" model.eval()\n",
" crf.eval()\n",
" for i in tqdm(range(len(dev_labels_tokens))):\n",
" batch_tokens = dev_tokens[i].unsqueeze(0)\n",
" tags = list(dev_labels_tokens[i].numpy())\n",
" Y_true += tags\n",
"\n",
" Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
" Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)\n",
" Y_pred += [crf.decode(Y_batch_pred)[0]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c808bbd5",
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv('train/train.tsv', sep='\\t',\n",
" names=['labels', 'document'])\n",
"\n",
"Y_train = [y.split(sep=\" \") for y in train['labels'].values]\n",
"X_train = [x.split(sep=\" \") for x in train['document'].values]\n",
"\n",
"dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['document'])\n",
"exp = pd.read_csv('dev-0/expected.tsv', sep='\\t', names=['labels'])\n",
"X_dev = [x.split(sep=\" \") for x in dev['document'].values]\n",
"Y_dev = [y.split(sep=\" \") for y in exp['labels'].values]\n",
"\n",
"test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['document'])\n",
"X_test = test['document'].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79485c9a",
"metadata": {},
"outputs": [],
"source": [
"vocab_x = build_vocab(X_train)\n",
"vocab_y = build_vocab(Y_train)\n",
"train_tokens = data_process(X_train, vocab_x)\n",
"labels_tokens = data_process(Y_train, vocab_y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3726c82a",
"metadata": {},
"outputs": [],
"source": [
"torch.cuda.get_device_name(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f29e3b63",
"metadata": {},
"outputs": [],
"source": [
"device_gpu = torch.device(\"cuda:0\")\n",
"model = GRU()\n",
"crf = CRF(9)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c321d58",
"metadata": {},
"outputs": [],
"source": [
"mask = torch.ByteTensor([1, 1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05482a7c",
"metadata": {},
"outputs": [],
"source": [
"criterion = torch.nn.CrossEntropyLoss()\n",
"params = list(model.parameters()) + list(crf.parameters())\n",
"optimizer = torch.optim.Adam(params)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21a5282e",
"metadata": {},
"outputs": [],
"source": [
"for i in range(2):\n",
" crf.train()\n",
" model.train()\n",
" for i in tqdm(range(len(labels_tokens))):\n",
" batch_tokens = train_tokens[i].unsqueeze(0)\n",
" tags = labels_tokens[i].unsqueeze(1)\n",
"\n",
" predicted_tags = model(batch_tokens).squeeze(0).unsqueeze(1)\n",
"\n",
" optimizer.zero_grad()\n",
" loss = -crf(predicted_tags, tags)\n",
"\n",
" loss.backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cec14c35",
"metadata": {},
"outputs": [],
"source": [
"!pip3 install pytorch-crf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ee634f7",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torchcrf import CRF"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
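A note on the ModuleNotFoundError above: the package installed by `pip3 install pytorch-crf` is imported as lowercase torchcrf, while the separately distributed TorchCRF package (imported at the top of this notebook) uses the capitalized module name; the failing cell most likely ran in a kernel where pytorch-crf was not actually installed. A minimal sketch of the import that matches the pytorch-crf package:

import torch
from torchcrf import CRF  # module name provided by the pytorch-crf package

crf = CRF(num_tags=9)  # 9 BIO labels, matching the notebooks in this diff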

Program.ipynb (new file, 418 additions)

@@ -0,0 +1,418 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e574fca4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\grzyb\\anaconda3\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import csv\n",
"import os.path\n",
"import shutil\n",
"import torch\n",
"from tqdm import tqdm\n",
"from itertools import islice\n",
"from sklearn.model_selection import train_test_split\n",
"from torchtext.vocab import Vocab\n",
"from collections import Counter\n",
"from nltk.tokenize import word_tokenize\n",
"import gensim.downloader as api\n",
"from gensim.models.word2vec import Word2Vec"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fbe3a657",
"metadata": {},
"outputs": [],
"source": [
"class NERModel(torch.nn.Module):\n",
"\n",
" def __init__(self,):\n",
" super(NERModel, self).__init__()\n",
" self.emb = torch.nn.Embedding(23628,200)\n",
" self.fc1 = torch.nn.Linear(600,9)\n",
" \n",
"\n",
" def forward(self, x):\n",
" x = self.emb(x)\n",
" x = x.reshape(600) \n",
" x = self.fc1(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3497a580",
"metadata": {},
"outputs": [],
"source": [
"def process_output(lines):\n",
" result = []\n",
" for line in lines:\n",
" last_label = None\n",
" new_line = []\n",
" for label in line:\n",
" if(label != \"O\" and label[0:2] == \"I-\"):\n",
" if last_label == None or last_label == \"O\":\n",
" label = label.replace('I-', 'B-')\n",
" else:\n",
" label = \"I-\" + last_label[2:]\n",
" last_label = label\n",
" new_line.append(label)\n",
" x = (\" \".join(new_line))\n",
" result.append(\" \".join(new_line))\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3e78d902",
"metadata": {},
"outputs": [],
"source": [
"def build_vocab(dataset):\n",
" counter = Counter()\n",
" for document in dataset:\n",
" counter.update(document)\n",
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ec8537cf",
"metadata": {},
"outputs": [],
"source": [
"def data_process(dt):\n",
" return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "847c958a",
"metadata": {},
"outputs": [],
"source": [
"def labels_process(dt):\n",
" return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "66bee163",
"metadata": {},
"outputs": [],
"source": [
"def predict(input_tokens, labels):\n",
"\n",
" results = []\n",
" \n",
" for i in range(len(input_tokens)):\n",
" line_results = []\n",
" for j in range(1, len(input_tokens[i]) - 1):\n",
" x = input_tokens[i][j-1: j+2].to(device_gpu)\n",
" predicted = ner_model(x.long())\n",
" result = torch.argmax(predicted)\n",
" label = labels[result]\n",
" line_results.append(label)\n",
" results.append(line_results)\n",
"\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "39046f3f",
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv('train/train.tsv.xz', sep='\\t', names=['a', 'b'])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9b40a8b6",
"metadata": {},
"outputs": [],
"source": [
"labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'] \n",
"train[\"a\"]=train[\"a\"].apply(lambda x: [labels.index(y) for y in x.split()])\n",
"train[\"b\"]=train[\"b\"].apply(lambda x: x.split())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "02a12cbd",
"metadata": {},
"outputs": [],
"source": [
"vocab = build_vocab(train['b'])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8cc6d19d",
"metadata": {},
"outputs": [],
"source": [
" tensors = []\n",
"\n",
" for sent in train[\"b\"]:\n",
" sent_tensor = torch.tensor(())\n",
" for word in sent:\n",
" temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
" sent_tensor = torch.cat((sent_tensor, temp))\n",
"\n",
" tensors.append(sent_tensor)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "690085f6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'NVIDIA GeForce RTX 2060'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.cuda.get_device_name(0)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "64b2d751",
"metadata": {},
"outputs": [],
"source": [
"device_gpu = torch.device(\"cuda:0\")\n",
"ner_model = NERModel().to(device_gpu)\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(ner_model.parameters())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "094d7e69",
"metadata": {},
"outputs": [],
"source": [
"train_labels = labels_process(train['a'])\n",
"train_tokens_ids = data_process(train['b'])\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "17291b41",
"metadata": {},
"outputs": [],
"source": [
"train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "045b7186",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch: 0\n",
"f1: 0.6310260230881535\n",
"acc: 0.9099004714510215\n",
"epoch: 1\n",
"f1: 0.7977381727751791\n",
"acc: 0.9539025667888947\n",
"epoch: 2\n",
"f1: 0.8635445687583837\n",
"acc: 0.9699162783858546\n",
"epoch: 3\n",
"f1: 0.9047002002591589\n",
"acc: 0.9794417946385082\n",
"epoch: 4\n",
"f1: 0.9300697243387956\n",
"acc: 0.9852774944170274\n"
]
}
],
"source": [
"for epoch in range(5):\n",
" acc_score = 0\n",
" prec_score = 0\n",
" selected_items = 0\n",
" recall_score = 0\n",
" relevant_items = 0\n",
" items_total = 0\n",
" ner_model.train()\n",
" for i in range(len(train_labels)):\n",
" for j in range(1, len(train_labels[i]) - 1):\n",
" X = train_tensors[i][j - 1: j + 2].to(device_gpu)\n",
"\n",
" Y = train_labels[i][j: j + 1].to(device_gpu)\n",
"\n",
" Y_predictions = ner_model(X.long())\n",
"\n",
" acc_score += int(torch.argmax(Y_predictions) == Y)\n",
" if torch.argmax(Y_predictions) != 0:\n",
" selected_items += 1\n",
" if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
" prec_score += 1\n",
" if Y.item() != 0:\n",
" relevant_items += 1\n",
" if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
" recall_score += 1\n",
"\n",
" items_total += 1\n",
" optimizer.zero_grad()\n",
" loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" precision = prec_score / selected_items\n",
" recall = recall_score / relevant_items\n",
" f1_score = (2 * precision * recall) / (precision + recall)\n",
" print(f'epoch: {epoch}')\n",
" print(f'f1: {f1_score}')\n",
" print(f'acc: {acc_score / items_total}')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "f75aa5e2",
"metadata": {},
"outputs": [],
"source": [
"def create_tensors_list(data):\n",
" tensors = []\n",
"\n",
" for sent in data[\"a\"]:\n",
" sent_tensor = torch.tensor(())\n",
" for word in sent:\n",
" temp = torch.tensor([word[0].isupper(), word[0].isdigit()])\n",
" sent_tensor = torch.cat((sent_tensor, temp))\n",
"\n",
" tensors.append(sent_tensor)\n",
"\n",
" return tensors"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "49215802",
"metadata": {},
"outputs": [],
"source": [
"dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['a'])\n",
"dev[\"a\"] = dev[\"a\"].apply(lambda x: x.split())\n",
"\n",
"dev_tokens_ids = data_process(dev[\"a\"])\n",
"\n",
"dev_extra_tensors = create_tensors_list(dev)\n",
"\n",
"dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]\n",
"\n",
"results = predict(dev_tensors, labels)\n",
"results_processed = process_output(results)\n",
"\n",
"with open(\"dev-0/out.tsv\", \"w\") as f:\n",
" for line in results_processed:\n",
" f.write(line + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8c5b007e",
"metadata": {},
"outputs": [],
"source": [
"test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['a'])\n",
"test[\"a\"] = test[\"a\"].apply(lambda x: x.split())\n",
"\n",
"test_tokens_ids = data_process(test[\"a\"])\n",
"\n",
"test_extra_tensors = create_tensors_list(test)\n",
"\n",
"test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]\n",
"\n",
"results = predict(test_tensors, labels)\n",
"results_processed = process_output(results)\n",
"\n",
"with open(\"test-A/out.tsv\", \"w\") as f:\n",
" for line in results_processed:\n",
" f.write(line + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "000dd425",
"metadata": {},
"outputs": [],
"source": [
"model_path = \"seq_labeling.model\"\n",
"torch.save(ner_model.state_dict(), model_path)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
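The final cell of Program.ipynb saves only the parameters (the state_dict), not the module object, so restoring the model later means re-creating the architecture and loading the weights into it. A minimal loading sketch (it assumes the NERModel class from the notebook is in scope; map_location is needed when a GPU-trained checkpoint is loaded on a CPU-only machine):

import torch

model = NERModel()  # re-create the architecture first
model.load_state_dict(torch.load("seq_labeling.model", map_location="cpu"))
model.eval()  # disable training-only behaviour before prediction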

dev-0/out.tsv (new file, 215 additions)

File diff suppressed because one or more lines are too long

geval (new executable file)

Binary file not shown.

gru.ipynb (new file, 307 additions)

@@ -0,0 +1,307 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bce0cfa7",
"metadata": {},
"outputs": [],
"source": [
"from os import sep\n",
"from nltk import word_tokenize\n",
"import pandas as pd\n",
"import torch\n",
"from torchcrf import CRF\n",
"import gensim\n",
"from torch._C import device\n",
"from tqdm import tqdm\n",
"from torchtext.vocab import Vocab\n",
"from collections import Counter, OrderedDict\n",
"\n",
"\n",
"from torch.utils.data import DataLoader\n",
"import numpy as np\n",
"from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
"import csv\n",
"import pickle\n",
"\n",
"import lzma\n",
"import re\n",
"import itertools"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6695751c",
"metadata": {},
"outputs": [],
"source": [
"def build_vocab(dataset):\n",
" counter = Counter()\n",
" for document in dataset:\n",
" counter.update(document)\n",
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d247e4fe",
"metadata": {},
"outputs": [],
"source": [
"def data_process(dt, vocab):\n",
" return [torch.tensor([vocab[token] for token in document], dtype=torch.long) for document in dt]\n",
"\n",
"\n",
"def get_scores(y_true, y_pred):\n",
" acc_score = 0\n",
" tp = 0\n",
" fp = 0\n",
" selected_items = 0\n",
" relevant_items = 0\n",
" for p, t in zip(y_pred, y_true):\n",
" if p == t:\n",
" acc_score += 1\n",
" if p > 0 and p == t:\n",
" tp += 1\n",
" if p > 0:\n",
" selected_items += 1\n",
" if t > 0:\n",
" relevant_items += 1\n",
"\n",
" if selected_items == 0:\n",
" precision = 1.0\n",
" else:\n",
" precision = tp / selected_items\n",
"\n",
" if relevant_items == 0:\n",
" recall = 1.0\n",
" else:\n",
" recall = tp / relevant_items\n",
"\n",
" if precision + recall == 0.0:\n",
" f1 = 0.0\n",
" else:\n",
" f1 = 2 * precision * recall / (precision + recall)\n",
"\n",
" return precision, recall, f1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b6061642",
"metadata": {},
"outputs": [],
"source": [
"def process_output(lines):\n",
" result = []\n",
" for line in lines:\n",
" last_label = None\n",
" new_line = []\n",
" for label in line:\n",
" if(label != \"O\" and label[0:2] == \"I-\"):\n",
" if last_label == None or last_label == \"O\":\n",
" label = label.replace('I-', 'B-')\n",
" else:\n",
" label = \"I-\" + last_label[2:]\n",
" last_label = label\n",
" new_line.append(label)\n",
" x = (\" \".join(new_line))\n",
" result.append(\" \".join(new_line))\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3d7c4dd3",
"metadata": {},
"outputs": [],
"source": [
"class GRU(torch.nn.Module):\n",
" def __init__(self):\n",
" super(GRU, self).__init__()\n",
" self.emb = torch.nn.Embedding(len(vocab_x.itos),100)\n",
" self.dropout = torch.nn.Dropout(0.2)\n",
" self.rec = torch.nn.GRU(100, 256, 2, batch_first = True, bidirectional = True)\n",
" self.fc1 = torch.nn.Linear(2* 256 , 9)\n",
" \n",
" def forward(self, x):\n",
" emb = torch.relu(self.emb(x))\n",
" emb = self.dropout(emb) \n",
" gru_output, h_n = self.rec(emb) \n",
" out_weights = self.fc1(gru_output)\n",
" return out_weights"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "cd5e419d",
"metadata": {},
"outputs": [],
"source": [
"def dev_eval(model, crf, dev_tokens, dev_labels_tokens, vocab):\n",
" Y_true = []\n",
" Y_pred = []\n",
" model.eval()\n",
" crf.eval()\n",
" for i in tqdm(range(len(dev_labels_tokens))):\n",
" batch_tokens = dev_tokens[i].unsqueeze(0)\n",
" tags = list(dev_labels_tokens[i].numpy())\n",
" Y_true += tags\n",
"\n",
" Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
" Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)\n",
" Y_pred += [crf.decode(Y_batch_pred)[0]]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c808bbd5",
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv('train/train.tsv', sep='\\t',\n",
" names=['labels', 'document'])\n",
"\n",
"Y_train = [y.split(sep=\" \") for y in train['labels'].values]\n",
"X_train = [x.split(sep=\" \") for x in train['document'].values]\n",
"\n",
"dev = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['document'])\n",
"exp = pd.read_csv('dev-0/expected.tsv', sep='\\t', names=['labels'])\n",
"X_dev = [x.split(sep=\" \") for x in dev['document'].values]\n",
"Y_dev = [y.split(sep=\" \") for y in exp['labels'].values]\n",
"\n",
"test = pd.read_csv('test-A/in.tsv', sep='\\t', names=['document'])\n",
"X_test = test['document'].values"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "79485c9a",
"metadata": {},
"outputs": [],
"source": [
"vocab_x = build_vocab(X_train)\n",
"vocab_y = build_vocab(Y_train)\n",
"train_tokens = data_process(X_train, vocab_x)\n",
"labels_tokens = data_process(Y_train, vocab_y)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f29e3b63",
"metadata": {},
"outputs": [],
"source": [
"model = GRU()\n",
"crf = CRF(9)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "05482a7c",
"metadata": {},
"outputs": [],
"source": [
"criterion = torch.nn.CrossEntropyLoss()\n",
"params = list(model.parameters()) + list(crf.parameters())\n",
"optimizer = torch.optim.Adam(params)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "21a5282e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/945 [00:00<?, ?it/s]\n"
]
},
{
"ename": "ValueError",
"evalue": "expected last dimension of emissions is 10, got 9",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-14-6dc1a1c63d46>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mcrf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpredicted_tags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 887\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 888\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 889\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 890\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 891\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/torchcrf/__init__.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, emissions, tags, mask, reduction)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0mreduction\u001b[0m \u001b[0;32mis\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mnone\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0motherwise\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \"\"\"\n\u001b[0;32m---> 90\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0memissions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 91\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreduction\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'none'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'mean'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'token_mean'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'invalid reduction: {reduction}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/torchcrf/__init__.py\u001b[0m in \u001b[0;36m_validate\u001b[0;34m(self, emissions, tags, mask)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf'emissions must have dimension of 3, got {emissions.dim()}'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 148\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0memissions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_tags\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 149\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 150\u001b[0m \u001b[0;34mf'expected last dimension of emissions is {self.num_tags}, '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 151\u001b[0m f'got {emissions.size(2)}')\n",
"\u001b[0;31mValueError\u001b[0m: expected last dimension of emissions is 10, got 9"
]
}
],
"source": [
"for i in range(2):\n",
" crf.train()\n",
" model.train()\n",
" for i in tqdm(range(len(labels_tokens))):\n",
" batch_tokens = train_tokens[i].unsqueeze(0)\n",
" tags = labels_tokens[i].unsqueeze(1)\n",
"\n",
" predicted_tags = model(batch_tokens).squeeze(0).unsqueeze(1)\n",
"\n",
" optimizer.zero_grad()\n",
" loss = -crf(predicted_tags, tags)\n",
"\n",
" loss.backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "366ab1fe",
"metadata": {},
"outputs": [],
"source": [
"Y_pred = []\n",
"model.eval()\n",
"crf.eval()\n",
"for i in tqdm(range(len(test_tokens))):\n",
" batch_tokens = test_tokens[i].unsqueeze(0)\n",
"\n",
" Y_batch_pred = model(batch_tokens).squeeze(0).unsqueeze(1)\n",
" Y_pred += [crf.decode(Y_batch_pred)[0]]\n",
"\n",
"Y_pred_translate = translate(Y_pred, vocab)\n",
"return Y_pred_translate"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
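On the ValueError in the gru.ipynb training cell: torchcrf.CRF checks that the last dimension of the emissions tensor equals its num_tags, and with the default batch_first=False it expects emissions of shape (seq_length, batch_size, num_tags) and tags of shape (seq_length, batch_size). The captured output comes from a run where the CRF expected 10 tags while the GRU emits 9 logits; the two sizes must agree. A minimal sketch with consistent shapes:

import torch
from torchcrf import CRF  # pytorch-crf

num_tags = 9                                         # must equal the model's output width
crf = CRF(num_tags)                                  # default: batch_first=False

seq_len, batch = 4, 1
emissions = torch.randn(seq_len, batch, num_tags)    # (seq_length, batch_size, num_tags)
tags = torch.randint(0, num_tags, (seq_len, batch))  # valid tag indices in [0, num_tags)

loss = -crf(emissions, tags)  # forward() returns log-likelihood, so negate it for a loss
best = crf.decode(emissions)  # list with one best tag sequence per batch element
print(loss.item(), best[0])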

seq_labeling.model (new file)

Binary file not shown.

solution.py (new file, 189 additions)

@@ -0,0 +1,189 @@
import pandas as pd
import numpy as np
import csv
import os.path
import shutil
import torch
from tqdm import tqdm
from itertools import islice
from sklearn.model_selection import train_test_split
from torchtext.vocab import Vocab
from collections import Counter
from nltk.tokenize import word_tokenize
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec


class NERModel(torch.nn.Module):
    def __init__(self):
        super(NERModel, self).__init__()
        self.emb = torch.nn.Embedding(23628, 200)
        self.fc1 = torch.nn.Linear(600, 9)

    def forward(self, x):
        x = self.emb(x)
        x = x.reshape(600)
        x = self.fc1(x)
        return x


def process_output(lines):
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label != "O" and label[0:2] == "I-":
                if last_label is None or last_label == "O":
                    label = label.replace('I-', 'B-')
                else:
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result


def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])


def data_process(dt):
    return [torch.tensor([vocab['<bos>']] + [vocab[token] for token in document] + [vocab['<eos>']], dtype=torch.long) for document in dt]


def labels_process(dt):
    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]


def predict(input_tokens, labels):
    results = []
    for i in range(len(input_tokens)):
        line_results = []
        for j in range(1, len(input_tokens[i]) - 1):
            x = input_tokens[i][j - 1: j + 2].to(device_gpu)
            predicted = ner_model(x.long())
            result = torch.argmax(predicted)
            label = labels[result]
            line_results.append(label)
        results.append(line_results)
    return results


train = pd.read_csv('train/train.tsv.xz', sep='\t', names=['a', 'b'])
labels = ['O', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
train["a"] = train["a"].apply(lambda x: [labels.index(y) for y in x.split()])
train["b"] = train["b"].apply(lambda x: x.split())
vocab = build_vocab(train['b'])

tensors = []
for sent in train["b"]:
    sent_tensor = torch.tensor(())
    for word in sent:
        temp = torch.tensor([word[0].isupper(), word[0].isdigit()])
        sent_tensor = torch.cat((sent_tensor, temp))
    tensors.append(sent_tensor)

device_gpu = torch.device("cuda:0")
ner_model = NERModel().to(device_gpu)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())
train_labels = labels_process(train['a'])
train_tokens_ids = data_process(train['b'])
train_tensors = [torch.cat((token, tensors[i])) for i, token in enumerate(train_tokens_ids)]

for epoch in range(5):
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    ner_model.train()
    for i in range(len(train_labels)):
        for j in range(1, len(train_labels[i]) - 1):
            X = train_tensors[i][j - 1: j + 2].to(device_gpu)
            Y = train_labels[i][j: j + 1].to(device_gpu)
            Y_predictions = ner_model(X.long())
            acc_score += int(torch.argmax(Y_predictions) == Y)
            if torch.argmax(Y_predictions) != 0:
                selected_items += 1
            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
                prec_score += 1
            if Y.item() != 0:
                relevant_items += 1
            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
                recall_score += 1
            items_total += 1
            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()
    precision = prec_score / selected_items
    recall = recall_score / relevant_items
    f1_score = (2 * precision * recall) / (precision + recall)
    print(f'epoch: {epoch}')
    print(f'f1: {f1_score}')
    print(f'acc: {acc_score / items_total}')


def create_tensors_list(data):
    tensors = []
    for sent in data["a"]:
        sent_tensor = torch.tensor(())
        for word in sent:
            temp = torch.tensor([word[0].isupper(), word[0].isdigit()])
            sent_tensor = torch.cat((sent_tensor, temp))
        tensors.append(sent_tensor)
    return tensors


dev = pd.read_csv('dev-0/in.tsv', sep='\t', names=['a'])
dev["a"] = dev["a"].apply(lambda x: x.split())
dev_tokens_ids = data_process(dev["a"])
dev_extra_tensors = create_tensors_list(dev)
dev_tensors = [torch.cat((token, dev_extra_tensors[i])) for i, token in enumerate(dev_tokens_ids)]
results = predict(dev_tensors, labels)
results_processed = process_output(results)
with open("dev-0/out.tsv", "w") as f:
    for line in results_processed:
        f.write(line + "\n")

test = pd.read_csv('test-A/in.tsv', sep='\t', names=['a'])
test["a"] = test["a"].apply(lambda x: x.split())
test_tokens_ids = data_process(test["a"])
test_extra_tensors = create_tensors_list(test)
test_tensors = [torch.cat((token, test_extra_tensors[i])) for i, token in enumerate(test_tokens_ids)]
results = predict(test_tensors, labels)
results_processed = process_output(results)
with open("test-A/out.tsv", "w") as f:
    for line in results_processed:
        f.write(line + "\n")

model_path = "seq_labeling.model"
torch.save(ner_model.state_dict(), model_path)
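For reference, process_output post-processes raw per-token predictions into valid BIO sequences: an I- tag that follows O (or starts a sentence) is promoted to B-, and an I- tag is forced to continue the entity type of the preceding tag. A small usage example of the function defined above:

lines = [["I-PER", "I-PER", "O", "I-LOC"],
         ["I-ORG", "I-MISC"]]
print(process_output(lines))
# -> ['B-PER I-PER O B-LOC', 'B-ORG I-ORG']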

test-A/out.tsv (new file, 230 additions)

File diff suppressed because one or more lines are too long

train/train.tsv (new file, 945 additions)

File diff suppressed because one or more lines are too long