From 59826ad6707d577d69c3728582fc544cb7b4fe2d Mon Sep 17 00:00:00 2001 From: Michal Gulczynski Date: Sun, 26 May 2024 20:27:00 +0200 Subject: [PATCH] zad 3 --- README.md | 68 --- RNN.ipynb | 1078 ++++++++++++++++++++++++++++++++++++++++++++++++ config.txt | 1 - dev-0/out.tsv | 215 ++++++++++ test-A/out.tsv | 230 +++++++++++ 5 files changed, 1523 insertions(+), 69 deletions(-) delete mode 100644 README.md create mode 100644 RNN.ipynb delete mode 100644 config.txt create mode 100644 dev-0/out.tsv create mode 100644 test-A/out.tsv diff --git a/README.md b/README.md deleted file mode 100644 index 7a94838..0000000 --- a/README.md +++ /dev/null @@ -1,68 +0,0 @@ -CoNLL-2003 English Named Entity Recognition. -====================================================== - -NER challenge for CoNLL-2003 English. -Annotations were taken from [University of Antwerp](https://www.clips.uantwerpen.be/conll2003/ner/). -The English data is a collection of news wire articles from the [Reuters Corpus](https://trec.nist.gov/data/reuters/reuters.html), RCV1. - -Format of the train set ------------------------ - -The train set has just two columns separated by TABs: - -* the expected BIO labels, -* the docuemnt. - -Each line is a separate training item. Note that this is TSV format, -not CSV, double quotes are not interpreted in a special way! - -Preprocessing snippet located [here](https://git.applica.pl/snippets/18) - -End-of-lines inside documents were replaced with the '' tag. - -The train is compressed with the xz compressor, in order to see a -random sample of 10 training items, run: - - xzcat train/train.tsv.xz | shuf -n 10 | less -S - -(The `-S` disables line wrapping, press "q" to exit `less` browser.) - -Format of the test sets ------------------------ - -For the test sets, the input data is given in two files: the text in -`in.tsv` and the expected labels in `expected.tsv`. (The files have -`.tsv` extensions for consistency but actually they do not contain TABs.) 
- -To see the first 5 test items run: - - cat dev-0/in.tsv | paste dev-0/expected.tsv - | head -n 5 - -The `expected.tsv` file for the `test-A` test set is hidden and is not -available in the master branch. - - -Evaluation metrics ------------------- - -One evaluation metric is used: - -* BIO-F1 - -Directory structure -------------------- - -* `README.md` — this file -* `config.txt` — GEval configuration file -* `train/` — directory with training data -* `train/train.tsv.xz` — train set -* `dev-0/` — directory with dev (test) data (split preserved from CoNLL-2003) -* `dev-0/in.tsv` — input data for the dev set -* `dev-0/expected.tsv` — expected (reference) data for the dev set -* `test-A` — directory with test data -* `test-A/in.tsv` — input data for the test set -* `test-A/expected.tsv` — expected (reference) data for the test set (hidden from the developers, - not available in the `master` branch) - -Usually, there is no reason to change these files. - diff --git a/RNN.ipynb b/RNN.ipynb new file mode 100644 index 0000000..3cfa6b7 --- /dev/null +++ b/RNN.ipynb @@ -0,0 +1,1078 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "import torch\n", + "import pandas as pd\n", + "from torchtext.vocab import vocab\n", + "from sklearn.model_selection import train_test_split\n", + "from tqdm.notebook import tqdm\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CUDA nie jest dostępna. Model będzie uruchomiony na CPU.\n" + ] + } + ], + "source": [ + "if torch.cuda.is_available():\n", + " print(\"CUDA jest dostępna!\")\n", + " print(f\"Nazwa urządzenia: {torch.cuda.get_device_name(0)}\")\n", + " device = torch.device(\"cuda\")\n", + "else:\n", + " print(\"CUDA nie jest dostępna. 
# Special tokens reserved in the vocabulary.
# NOTE(review): in the reviewed copy all four entries were the same empty
# string, which made the unknown-word marker and both sentence-boundary
# markers one and the same key (and a duplicated specials list may be rejected
# by torchtext). The conventional names are used here - confirm against the
# original notebook.
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
SPECIAL_TOKENS = [UNK_TOKEN, PAD_TOKEN, BOS_TOKEN, EOS_TOKEN]


def build_vocab(dataset):
    """Build a torchtext vocabulary from tokenized documents.

    Args:
        dataset: Iterable of documents, each an iterable of token strings.

    Returns:
        The vocabulary object produced by ``torchtext.vocab.vocab`` from the
        token counts, with ``SPECIAL_TOKENS`` passed as ``specials``.
    """
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return vocab(counter, specials=SPECIAL_TOKENS)


train_vocab = build_vocab(X_train)
itos = train_vocab.get_itos()
# Lookups of tokens missing from the vocabulary fall back to the unknown token.
train_vocab.set_default_index(train_vocab[UNK_TOKEN])


def data_process(dt):
    """Convert tokenized documents into tensors of vocabulary indices.

    Every document is wrapped in the begin- and end-of-sequence markers, so
    each returned tensor is two elements longer than its document.

    Args:
        dt: Iterable of documents, each an iterable of token strings.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    bos_id = train_vocab[BOS_TOKEN]
    eos_id = train_vocab[EOS_TOKEN]
    return [
        torch.tensor(
            [bos_id] + [train_vocab[token] for token in document] + [eos_id],
            dtype=torch.long,
        )
        for document in dt
    ]
class LSTM(torch.nn.Module):
    """Sequence tagger: embedding -> ReLU -> single-layer LSTM -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table. When omitted, the
            size of the module-level ``train_vocab`` is used.
        num_labels: Number of scores produced per token. When omitted, the
            module-level ``num_tags`` is used.
        embedding_dim: Width of the token embeddings.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size=None, num_labels=None,
                 embedding_dim=100, hidden_size=256):
        super().__init__()
        # Fall back to the notebook globals so a bare ``LSTM()`` keeps working.
        if vocab_size is None:
            vocab_size = len(train_vocab.get_itos())
        if num_labels is None:
            num_labels = num_tags
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rec = torch.nn.LSTM(embedding_dim, hidden_size, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        """Return per-token label scores for a batch of token indices.

        Args:
            x: Integer tensor of token indices with the batch dimension first
                (``eval_model`` passes shape ``(1, seq_len)``).

        Returns:
            Unnormalised scores whose last dimension has size ``num_labels``.
        """
        emb = torch.relu(self.emb(x))
        lstm_output, _ = self.rec(emb)
        return self.fc1(lstm_output)


def get_scores(y_true, y_pred):
    """Compute token-level precision, recall and F1 over non-zero labels.

    Label index 0 (the "O" tag in this notebook's label list) is treated as
    "no entity". A prediction counts as a true positive only when it is
    non-zero and equal to the reference label at the same position. This is a
    per-token score, not an entity-span score.

    Args:
        y_true: Sequence of reference label indices.
        y_pred: Sequence of predicted label indices, aligned with ``y_true``.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Precision is 1.0 when
        nothing non-zero was predicted, and recall is 1.0 when the reference
        has no non-zero label.
    """
    tp = 0
    selected_items = 0
    relevant_items = 0

    for p, t in zip(y_pred, y_true):
        if p > 0:
            selected_items += 1
            if p == t:
                tp += 1
        if t > 0:
            relevant_items += 1

    precision = tp / selected_items if selected_items else 1.0
    recall = tp / relevant_items if relevant_items else 1.0

    if precision + recall == 0.0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1


def eval_model(dataset_tokens, dataset_labels, model):
    """Score ``model`` on a dataset, one document at a time.

    Args:
        dataset_tokens: Sequence of 1-D token-index tensors, one per document.
        dataset_labels: Sequence of 1-D label tensors aligned with
            ``dataset_tokens``.
        model: ``torch.nn.Module`` mapping a ``(1, seq_len)`` index tensor to
            per-token label scores.

    Returns:
        The ``(precision, recall, f1)`` tuple computed by ``get_scores`` over
        all tokens of all documents.
    """
    y_true = []
    y_pred = []

    # Evaluate in inference mode and put the model back the way it was, so a
    # call made in the middle of training does not change its state.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for scoring; skip building the autograd graph.
        with torch.no_grad():
            pairs = zip(dataset_tokens, dataset_labels)
            for tokens, tags in tqdm(pairs, total=len(dataset_labels)):
                # ``.cpu()`` first: the label tensors are created on ``device``
                # and cannot be converted directly when that device is CUDA.
                y_true.extend(tags.cpu().tolist())

                scores = model(tokens.unsqueeze(0)).squeeze(0)
                y_pred.extend(torch.argmax(scores, 1).cpu().tolist())
    finally:
        model.train(was_training)

    return get_scores(y_true, y_pred)