From 52b75b144e9c4bf4aade89ea514f4ccd2dbeaf27 Mon Sep 17 00:00:00 2001 From: s464863 Date: Thu, 30 May 2024 08:33:51 +0200 Subject: [PATCH] Initial commit --- Projekt.ipynb | 1402 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1402 insertions(+) create mode 100644 Projekt.ipynb diff --git a/Projekt.ipynb b/Projekt.ipynb new file mode 100644 index 0000000..2eae0e3 --- /dev/null +++ b/Projekt.ipynb @@ -0,0 +1,1402 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "## POS Tagging using LSTM" + ], + "metadata": { + "collapsed": false + }, + "id": "d03db3876ae84fdc" + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import torchtext\n", + "from torchtext.vocab import vocab\n", + "\n", + "from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report\n", + "\n", + "from tqdm.notebook import tqdm\n", + "\n", + "import datasets\n", + "\n", + "from collections import Counter" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T04:56:09.952503200Z", + "start_time": "2024-05-30T04:56:06.967530400Z" + } + }, + "id": "583c93622c61177b" + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "# Load the dataset\n", + "dataset = datasets.load_dataset('batterydata/pos_tagging')" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T04:56:22.604270500Z", + "start_time": "2024-05-30T04:56:14.602312200Z" + } + }, + "id": "9a73f4af39424a1f" + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "# Convert the dataset to pandas DataFrame\n", + "train_dataset = dataset['train']\n", + "test_dataset = 
# Method for building the vocabulary from DataFrame dataset
# Special tokens:
# <unk> - unknown token
# <pad> - padding token
# <bos> - beginning of sentence token
# <eos> - end of sentence token
def build_vocab(dataset):
    """Build a torchtext vocabulary from the ``words`` column of ``dataset``.

    :param dataset: pandas DataFrame whose ``words`` column holds one
        iterable of string tokens per row
    :return: the object returned by ``torchtext.vocab.vocab`` for the token
        counts, with the four special tokens registered
    """
    # Count every token occurrence across all rows
    counter = Counter()

    # Iterate the column directly; building a Series per row via iterrows()
    # is unnecessary when only one column is read.
    for tokens in dataset['words']:
        counter.update(tokens)

    # The four specials must be distinct strings: with four identical
    # empty-string entries the unknown/padding/bos/eos roles listed above
    # cannot be told apart.
    return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
+ "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:03:08.896211Z", + "start_time": "2024-05-30T05:03:08.891565300Z" + } + }, + "id": "d0ab581622dec851" + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [], + "source": [ + "# Build the vocabulary\n", + "v = build_vocab(df_train)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:03:23.852859100Z", + "start_time": "2024-05-30T05:03:23.410789500Z" + } + }, + "id": "cfac7f6325c6bc0a" + }, + { + "cell_type": "code", + "execution_count": 84, + "outputs": [ + { + "data": { + "text/plain": "24851" + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(v)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:35:56.342196300Z", + "start_time": "2024-05-30T05:35:56.326491100Z" + } + }, + "id": "2a599cdb42e1dd7e" + }, + { + "cell_type": "code", + "execution_count": 12, + "outputs": [], + "source": [ + "# Mapping from index to token\n", + "itos = v.get_itos()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:03:31.890910700Z", + "start_time": "2024-05-30T05:03:31.877808400Z" + } + }, + "id": "1669b13ea4c7e3d7" + }, + { + "cell_type": "code", + "execution_count": 13, + "outputs": [], + "source": [ + "# Set default index for unknown tokens\n", + "v.set_default_index(v[\"\"])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:03:37.066493400Z", + "start_time": "2024-05-30T05:03:37.058550900Z" + } + }, + "id": "4a5612b9816daf0d" + }, + { + "cell_type": "code", + "execution_count": 54, + "outputs": [], + "source": [ + "# Get unique POS tags\n", + "pos_tags = df_train['labels'].explode().unique().tolist()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:18:33.019728200Z", + "start_time": "2024-05-30T05:18:32.985438100Z" + 
# Method for vectorizing POS tags data using the POS tags mapping
def pos_tags_to_vec(data, boundary_idx=20, tag_to_idx=None):
    """Convert sequences of POS tags into 1-D ``torch.long`` tensors.

    Every sequence is wrapped with ``boundary_idx`` at both ends, so each
    label tensor is two elements longer than its tag sequence (the same
    length that ``text_to_vec`` produces by adding one token at each end of
    the word sequence).

    NOTE(review): the default ``20`` is kept only for backward
    compatibility. It is a plain label id, so if the tag mapping assigns 20
    to a real tag the boundary positions are indistinguishable from that
    tag, and if the mapping has fewer than 21 tags the id lies outside the
    label range. Callers should pass an id reserved for boundaries.

    :param data: iterable of tag sequences (one sequence per sentence)
    :param boundary_idx: label id placed before and after every sequence
    :param tag_to_idx: mapping from tag to integer id; defaults to the
        module-level ``label2idx``
    :return: list with one ``torch.long`` tensor per input sequence
    :raises KeyError: if a tag is missing from the mapping
    """
    # Fall back to the notebook-level mapping when none is supplied
    mapping = label2idx if tag_to_idx is None else tag_to_idx

    return [
        torch.tensor(
            [boundary_idx, *(mapping[tag] for tag in document), boundary_idx],
            dtype=torch.long,
        )
        for document in data
    ]
"execution_count": 97, + "outputs": [], + "source": [ + "# Vectorize the POS tags data (output)\n", + "y_train = pos_tags_to_vec(df_train['labels'])\n", + "y_test = pos_tags_to_vec(df_test['labels'])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:45:46.004430800Z", + "start_time": "2024-05-30T05:45:45.746219600Z" + } + }, + "id": "8255ae3faf474132" + }, + { + "cell_type": "markdown", + "source": [ + "## LSTM Models" + ], + "metadata": { + "collapsed": false + }, + "id": "add07dd1d8b699f9" + }, + { + "cell_type": "code", + "execution_count": 86, + "outputs": [], + "source": [ + "# Basic LSTM model\n", + "class LSTM(nn.Module):\n", + " def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):\n", + " super(LSTM, self).__init__()\n", + " \n", + " # Embedding layer\n", + " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", + " \n", + " # LSTM layer\n", + " self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)\n", + " \n", + " # Fully connected layer\n", + " self.fc = nn.Linear(hidden_dim, output_dim)\n", + " \n", + " self.relu = nn.ReLU()\n", + " \n", + " def forward(self, x):\n", + " # Embedding\n", + " embedding = self.relu(self.embedding(x))\n", + " \n", + " # LSTM\n", + " output, (hidden, cell) = self.lstm(embedding)\n", + " \n", + " # Fully connected\n", + " output = self.fc(output)\n", + " \n", + " return output" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:42:11.781180Z", + "start_time": "2024-05-30T05:42:11.762152200Z" + } + }, + "id": "d9d9e2b81dca3e47" + }, + { + "cell_type": "code", + "execution_count": 88, + "outputs": [], + "source": [ + "# LSTM model with dropout\n", + "class LSTMWithDropout(nn.Module):\n", + " def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout_prob=0.5):\n", + " super(LSTMWithDropout, self).__init__()\n", + " \n", + " # Embedding layer\n", + " self.embedding = nn.Embedding(vocab_size, 
class StackedLSTM(nn.Module):
    """Unidirectional LSTM tagger with ``num_layers`` stacked LSTM layers.

    Maps a batch of token-id sequences to one score vector of size
    ``output_dim`` per token position.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=2):
        super().__init__()

        # Token ids -> dense vectors
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # Stacked recurrent encoder; inputs are laid out batch-first
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

        # Per-position projection onto the output classes
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

        # Non-linearity applied to the embeddings before the LSTM
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return per-token scores for the token-id tensor ``x``."""
        activated = self.relu(self.embedding(x))

        # Only the top layer's per-step outputs are used; final states are dropped
        hidden_states, _ = self.lstm(activated)

        return self.fc(hidden_states)
"cell_type": "code", + "execution_count": 91, + "outputs": [], + "source": [ + "class BidirectionalLSTM(nn.Module):\n", + " def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):\n", + " super(BidirectionalLSTM, self).__init__()\n", + " \n", + " # Embedding layer\n", + " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", + " \n", + " # Bidirectional LSTM layer\n", + " self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)\n", + " \n", + " # Fully connected layer\n", + " self.fc = nn.Linear(hidden_dim * 2, output_dim)\n", + " \n", + " self.relu = nn.ReLU()\n", + " \n", + " def forward(self, x):\n", + " # Embedding\n", + " embedding = self.relu(self.embedding(x))\n", + " \n", + " # LSTM\n", + " output, (hidden, cell) = self.lstm(embedding)\n", + " \n", + " # Concatenate the outputs from both directions\n", + " output = self.fc(output)\n", + " \n", + " return output" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-05-30T05:42:59.916550600Z", + "start_time": "2024-05-30T05:42:59.908460800Z" + } + }, + "id": "211c084868ad07ac" + }, + { + "cell_type": "markdown", + "source": [ + "## Training and Evaluation Methods" + ], + "metadata": { + "collapsed": false + }, + "id": "4b3e5007817bcf05" + }, + { + "cell_type": "code", + "execution_count": 146, + "outputs": [], + "source": [ + "# Segeval evaluation\n", + "def evaluate_model(model, X_test, y_test):\n", + " \"\"\"\n", + " Method for evaluating the model\n", + " :param model: model\n", + " :param X: input data\n", + " :param y: output data \n", + " :return: dictionary with metrics values\n", + " \"\"\"\n", + " # Use GPU if available\n", + " device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + " \n", + " # Move the model to the device\n", + " model = model.to(device)\n", + " \n", + " # Move the data to the device\n", + " X = [x.to(device) for x in X_test]\n", + " y = [y.to(device) for y in y_test]\n", + " \n", + " 
import random

# Train model
def train(model, X_train, y_train, X_test, y_test, epochs = 5, seed=1234):
    """
    Method for training the model

    Trains with Adam and cross-entropy loss, one sequence per optimizer
    step, and prints the metrics returned by ``evaluate_model`` after every
    epoch. The model is put in training mode at the start of each epoch and
    in evaluation mode before each evaluation, so layers such as Dropout
    are inactive while metrics are computed. If ``epochs`` is at least 1,
    the model is therefore left in evaluation mode when this function
    returns.

    NOTE: the seeds are set inside this function, i.e. after ``model`` has
    been constructed, so they do not affect the model's initial weights.

    :param model: model
    :param X_train: input data for training (sequence of 1-D token-id tensors)
    :param y_train: output data for training (sequence of 1-D label tensors)
    :param X_test: input data for testing
    :param y_test: output data for testing
    :param epochs: number of epochs
    :param seed: seed applied to torch, random and numpy
    :raises ValueError: if X_train and y_train differ in length
    """
    # zip() below would silently drop unmatched samples, so fail loudly
    if len(X_train) != len(y_train):
        raise ValueError(
            f'X_train and y_train must have the same length, '
            f'got {len(X_train)} and {len(y_train)}'
        )

    # Seed for reproducibility
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # Use GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Move training to GPU
    model = model.to(device)
    X_train_device = [x.to(device) for x in X_train]
    y_train_device = [y.to(device) for y in y_train]
    X_test_device = [x.to(device) for x in X_test]
    y_test_device = [y.to(device) for y in y_test]

    # Training loop
    for epoch in range(epochs):
        # Re-enter training mode every epoch: the evaluation at the end of
        # the previous epoch switched the model to evaluation mode
        model.train()

        # total= keeps the progress bar sized, since zip() has no len()
        for inputs, targets in tqdm(zip(X_train_device, y_train_device), total=len(X_train_device)):
            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass (add a batch dimension of size 1)
            output = model(inputs.unsqueeze(0))

            # Calculate the loss (drop the batch dimension again)
            loss = criterion(output.squeeze(0), targets)

            # Backward pass
            loss.backward()

            # Update the weights
            optimizer.step()

        # Evaluate the model on the dev set with Dropout etc. disabled
        model.eval()
        metrics = evaluate_model(model, X_test_device, y_test_device)

        print(f'Epoch: {epoch+1}, Accuracy: {metrics["accuracy"]}, Precision: {metrics["precision"]}, Recall: {metrics["recall"]}, F1: {metrics["f1"]}')
"2024-05-30T05:51:54.889921400Z" + } + }, + "id": "a8abf9db9958387f" + }, + { + "cell_type": "code", + "execution_count": 109, + "outputs": [ + { + "data": { + "text/plain": " 0%| | 0/13054 [00:00