{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import torch\n", "from torchtext.vocab import vocab\n", "from sklearn.model_selection import train_test_split\n", "from tqdm.notebook import tqdm\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def load_datasets():\n", " train_data = pd.read_csv(\n", " \"train/train.tsv.xz\", compression=\"xz\", sep=\"\\t\", names=[\"Tag\", \"Sentence\"]\n", " )\n", " dev_data = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", names=[\"Sentence\"])\n", " dev_labels = pd.read_csv(\"dev-0/expected.tsv\", sep=\"\\t\", names=[\"Tag\"])\n", " test_data = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", names=[\"Sentence\"])\n", "\n", " return train_data, dev_data, dev_labels, test_data\n", "\n", "train_data, dev_data, dev_labels, test_data = load_datasets()\n", "\n", "train_sentences, val_sentences, train_tags, val_tags = train_test_split(\n", " train_data[\"Sentence\"], train_data[\"Tag\"], test_size=0.1, random_state=42\n", ")\n", "\n", "train_data = pd.DataFrame({\"Sentence\": train_sentences, \"Tag\": train_tags})\n", "val_data = pd.DataFrame({\"Sentence\": val_sentences, \"Tag\": val_tags})\n", "\n", "def tokenize_column(dataframe, column):\n", " return dataframe[column].apply(lambda x: x.split())\n", "\n", "train_data[\"tokens\"] = tokenize_column(train_data, \"Sentence\")\n", "train_data[\"tag_tokens\"] = tokenize_column(train_data, \"Tag\")\n", "val_data[\"tokens\"] = tokenize_column(val_data, \"Sentence\")\n", "val_data[\"tag_tokens\"] = tokenize_column(val_data, \"Tag\")\n", "dev_data[\"tokens\"] = tokenize_column(dev_data, \"Sentence\")\n", "dev_labels[\"tag_tokens\"] = tokenize_column(dev_labels, \"Tag\")\n", "test_data[\"tokens\"] = tokenize_column(test_data, \"Sentence\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def create_vocab(token_list):\n", " token_counter = Counter()\n", " for tokens in token_list:\n", " token_counter.update(tokens)\n", " return vocab(token_counter, specials=[\"\", \"\", \"\", \"\"])\n", "\n", "vocab_obj = create_vocab(train_data[\"tokens\"])\n", "\n", "vocab_obj.set_default_index(vocab_obj[\"\"])\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "def convert_to_tensor(token_lists, vocab_obj, device):\n", " return [\n", " torch.tensor(\n", " [vocab_obj[\"\"]] + [vocab_obj[token] for token in tokens] + [vocab_obj[\"\"]],\n", " dtype=torch.long,\n", " device=device,\n", " )\n", " for tokens in token_lists\n", " ]\n", "\n", "train_tensor = convert_to_tensor(train_data[\"tokens\"], vocab_obj, device)\n", "val_tensor = convert_to_tensor(val_data[\"tokens\"], vocab_obj, device)\n", "dev_tensor = convert_to_tensor(dev_data[\"tokens\"], vocab_obj, device)\n", "test_tensor = convert_to_tensor(test_data[\"tokens\"], vocab_obj, device)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "tag_list = [\"O\", \"B-PER\", \"I-PER\", \"B-ORG\", \"I-ORG\", \"B-LOC\", \"I-LOC\", \"B-MISC\", \"I-MISC\"]\n", "\n", "tag_to_index = {tag: idx for idx, tag in enumerate(tag_list)}\n", "\n", "def convert_tags_to_tensor(tag_tokens, tag_to_index, device):\n", " return [\n", " torch.tensor(\n", " [0] + [tag_to_index[tag] for tag in tags] + [0],\n", " dtype=torch.long,\n", " device=device,\n", " )\n", " for tags in tag_tokens\n", " ]\n", "\n", "train_tag_tensor = convert_tags_to_tensor(train_data[\"tag_tokens\"], tag_to_index, device)\n", "val_tag_tensor = convert_tags_to_tensor(val_data[\"tag_tokens\"], tag_to_index, device)\n", "dev_tag_tensor = convert_tags_to_tensor(dev_labels[\"tag_tokens\"], tag_to_index, device)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def calculate_metrics(true_labels, predicted_labels):\n", " true_positives = 0\n", " total_selected = 0\n", " total_relevant = 0\n", "\n", " for pred, true in zip(predicted_labels, true_labels):\n", " if pred == true:\n", " true_positives += 1\n", " if pred > 0:\n", " total_selected += 1\n", " if true > 0:\n", " total_relevant += 1\n", "\n", " precision = true_positives / total_selected if total_selected > 0 else 1.0\n", " recall = true_positives / total_relevant if total_relevant > 0 else 1.0\n", " f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0\n", "\n", " return precision, recall, f1_score\n", "\n", "max_tag_index = max(tag_to_index.values()) + 1" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "class BiLSTMModel(torch.nn.Module):\n", " def __init__(self, vocab_size, embed_size, hidden_size, num_layers, output_size):\n", " super(BiLSTMModel, self).__init__()\n", " self.embedding = torch.nn.Embedding(vocab_size, embed_size)\n", " self.lstm = torch.nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=True)\n", " self.fc = torch.nn.Linear(hidden_size * 2, output_size)\n", "\n", " def forward(self, x):\n", " embedded = torch.relu(self.embedding(x))\n", " lstm_out, _ = self.lstm(embedded)\n", " logits = self.fc(lstm_out)\n", " return logits\n", "\n", "model = BiLSTMModel(len(vocab_obj.get_itos()), 100, 100, 1, max_tag_index).to(device)\n", "loss_fn = torch.nn.CrossEntropyLoss()\n", "optimizer = torch.optim.Adam(model.parameters())\n", "\n", "def evaluate_model(tokens, labels, model):\n", " true_labels = []\n", " predicted_labels = []\n", " for i in tqdm(range(len(labels))):\n", " inputs = tokens[i].unsqueeze(0)\n", " true = list(labels[i].cpu().numpy())\n", " true_labels += true\n", "\n", " with torch.no_grad():\n", " logits = model(inputs).squeeze(0)\n", " predicted = torch.argmax(logits, dim=1)\n", " predicted_labels += list(predicted.cpu().numpy())\n", "\n", " return calculate_metrics(true_labels, predicted_labels)\n", "\n", "def predict_labels(tokens, model, tag_to_index):\n", " predictions = []\n", " index_to_tag = {v: k for k, v in tag_to_index.items()}\n", "\n", " for i in tqdm(range(len(tokens))):\n", " inputs = tokens[i].unsqueeze(0)\n", " with torch.no_grad():\n", " logits = model(inputs).squeeze(0)\n", " predicted = torch.argmax(logits, dim=1)\n", " tags = [index_to_tag[label.item()] for label in predicted[1:-1]]\n", " predictions.append(\" \".join(tags))\n", "\n", " return predictions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "EPOCHS = 10\n", "\n", "for epoch in range(EPOCHS):\n", " model.train()\n", " for i in tqdm(range(len(train_tag_tensor))):\n", " inputs = train_tensor[i].unsqueeze(0)\n", " targets = train_tag_tensor[i].unsqueeze(1)\n", "\n", " optimizer.zero_grad()\n", " outputs = model(inputs)\n", " loss = loss_fn(outputs.squeeze(0), targets.squeeze(1))\n", " loss.backward()\n", " optimizer.step()\n", "\n", " model.eval()\n", " print(evaluate_model(val_tensor, val_tag_tensor, model))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "evaluate_model(val_tensor, val_tag_tensor, model)\n", "evaluate_model(dev_tensor, dev_tag_tensor, model)\n", "\n", "dev_predictions = predict_labels(dev_tensor, model, tag_to_index)\n", "dev_predictions_df = pd.DataFrame(dev_predictions, columns=[\"Tag\"])\n", "dev_predictions_df.to_csv(\"dev-0/out.tsv\", index=False, header=False)\n", "\n", "test_predictions = predict_labels(test_tensor, model, tag_to_index)\n", "test_predictions_df = pd.DataFrame(test_predictions, columns=[\"Tag\"])\n", "test_predictions_df.to_csv(\"test-A/out.tsv\", index=False, header=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.19" } }, "nbformat": 4, "nbformat_minor": 2 }