259 lines
9.3 KiB
Plaintext
259 lines
9.3 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"import torch\n",
|
||
|
"from torchtext.vocab import vocab\n",
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"from tqdm.notebook import tqdm\n",
|
||
|
"from collections import Counter"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def load_datasets():\n",
"    \"\"\"Read the train/dev/test splits from disk and return them as DataFrames.\"\"\"\n",
"    train_data = pd.read_csv(\n",
"        \"train/train.tsv.xz\", compression=\"xz\", sep=\"\\t\", names=[\"Tag\", \"Sentence\"]\n",
"    )\n",
"    dev_data = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", names=[\"Sentence\"])\n",
"    dev_labels = pd.read_csv(\"dev-0/expected.tsv\", sep=\"\\t\", names=[\"Tag\"])\n",
"    test_data = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", names=[\"Sentence\"])\n",
"    return train_data, dev_data, dev_labels, test_data\n",
"\n",
"train_data, dev_data, dev_labels, test_data = load_datasets()\n",
"\n",
"# Hold out 10% of the training sentences for validation (fixed seed => reproducible split).\n",
"train_sentences, val_sentences, train_tags, val_tags = train_test_split(\n",
"    train_data[\"Sentence\"], train_data[\"Tag\"], test_size=0.1, random_state=42\n",
")\n",
"\n",
"train_data = pd.DataFrame({\"Sentence\": train_sentences, \"Tag\": train_tags})\n",
"val_data = pd.DataFrame({\"Sentence\": val_sentences, \"Tag\": val_tags})\n",
"\n",
"def tokenize_column(dataframe, column):\n",
"    \"\"\"Whitespace-tokenize every row of `column`; returns a Series of token lists.\"\"\"\n",
"    return dataframe[column].apply(str.split)\n",
"\n",
"# Pre-split every sentence (and gold tag string, where available) into token lists.\n",
"for frame, column, target in (\n",
"    (train_data, \"Sentence\", \"tokens\"),\n",
"    (train_data, \"Tag\", \"tag_tokens\"),\n",
"    (val_data, \"Sentence\", \"tokens\"),\n",
"    (val_data, \"Tag\", \"tag_tokens\"),\n",
"    (dev_data, \"Sentence\", \"tokens\"),\n",
"    (dev_labels, \"Tag\", \"tag_tokens\"),\n",
"    (test_data, \"Sentence\", \"tokens\"),\n",
"):\n",
"    frame[target] = tokenize_column(frame, column)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 17,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def create_vocab(token_list):\n",
"    \"\"\"Build a torchtext Vocab over all tokens, listing the special symbols first.\"\"\"\n",
"    # Counter over the flattened stream counts exactly what the old per-list\n",
"    # update loop did, in the same first-occurrence insertion order.\n",
"    token_counter = Counter(token for tokens in token_list for token in tokens)\n",
"    return vocab(token_counter, specials=[\"<unk>\", \"<pad>\", \"<bos>\", \"<eos>\"])\n",
|
||
|
"\n",
|
||
|
"# Vocabulary comes from the training tokens only, so out-of-vocabulary words in\n",
"# val/dev/test fall back to <unk> via the default index set below.\n",
"vocab_obj = create_vocab(train_data[\"tokens\"])\n",
"\n",
"vocab_obj.set_default_index(vocab_obj[\"<unk>\"])\n",
"\n",
"# Use the GPU when available; every tensor created below lives on this device.\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
||
|
"\n",
|
||
|
"def convert_to_tensor(token_lists, vocab_obj, device):\n",
"    \"\"\"Encode each token list as a LongTensor of vocab ids, framed by <bos>/<eos>.\"\"\"\n",
"    bos, eos = vocab_obj[\"<bos>\"], vocab_obj[\"<eos>\"]\n",
"    encoded = []\n",
"    for tokens in token_lists:\n",
"        ids = [bos] + [vocab_obj[token] for token in tokens] + [eos]\n",
"        encoded.append(torch.tensor(ids, dtype=torch.long, device=device))\n",
"    return encoded\n",
|
||
|
"\n",
|
||
|
"# Encode every split with the shared training vocabulary; tensors live on `device`.\n",
"train_tensor = convert_to_tensor(train_data[\"tokens\"], vocab_obj, device)\n",
"val_tensor = convert_to_tensor(val_data[\"tokens\"], vocab_obj, device)\n",
"dev_tensor = convert_to_tensor(dev_data[\"tokens\"], vocab_obj, device)\n",
"test_tensor = convert_to_tensor(test_data[\"tokens\"], vocab_obj, device)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 18,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# IOB2 tag inventory; index 0 (\"O\") is also used as the boundary label for the\n",
"# <bos>/<eos> positions when tag sequences are encoded below.\n",
"tag_list = [\"O\", \"B-PER\", \"I-PER\", \"B-ORG\", \"I-ORG\", \"B-LOC\", \"I-LOC\", \"B-MISC\", \"I-MISC\"]\n",
"\n",
"tag_to_index = {tag: idx for idx, tag in enumerate(tag_list)}\n",
|
||
|
"\n",
|
||
|
"def convert_tags_to_tensor(tag_tokens, tag_to_index, device, boundary_index=0):\n",
"    \"\"\"Encode each tag sequence as a LongTensor of tag indices.\n",
"\n",
"    Every sequence is framed with `boundary_index` (default 0, i.e. the \"O\"\n",
"    tag, previously a hard-coded magic number) so its length matches the\n",
"    token tensors, which carry <bos>/<eos> markers.\n",
"    \"\"\"\n",
"    return [\n",
"        torch.tensor(\n",
"            [boundary_index] + [tag_to_index[tag] for tag in tags] + [boundary_index],\n",
"            dtype=torch.long,\n",
"            device=device,\n",
"        )\n",
"        for tags in tag_tokens\n",
"    ]\n",
|
||
|
"\n",
|
||
|
"# Gold tags exist for train/val/dev only; test-A labels are what must be predicted.\n",
"train_tag_tensor = convert_tags_to_tensor(train_data[\"tag_tokens\"], tag_to_index, device)\n",
"val_tag_tensor = convert_tags_to_tensor(val_data[\"tag_tokens\"], tag_to_index, device)\n",
"dev_tag_tensor = convert_tags_to_tensor(dev_labels[\"tag_tokens\"], tag_to_index, device)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def calculate_metrics(true_labels, predicted_labels):\n",
"    \"\"\"Token-level precision, recall and F1 over entity tags.\n",
"\n",
"    A position counts as a true positive only when prediction and gold agree\n",
"    AND the gold tag is an entity tag (index > 0).  The previous version also\n",
"    counted correct \"O\" (index 0) predictions as hits, which let precision\n",
"    and recall exceed 1.0 because both denominators only count entity\n",
"    positions.\n",
"    \"\"\"\n",
"    true_positives = 0   # correctly predicted entity tags\n",
"    total_selected = 0   # positions the model tagged as an entity\n",
"    total_relevant = 0   # positions the gold data tags as an entity\n",
"\n",
"    for pred, true in zip(predicted_labels, true_labels):\n",
"        if pred == true and true > 0:\n",
"            true_positives += 1\n",
"        if pred > 0:\n",
"            total_selected += 1\n",
"        if true > 0:\n",
"            total_relevant += 1\n",
"\n",
"    precision = true_positives / total_selected if total_selected > 0 else 1.0\n",
"    recall = true_positives / total_relevant if total_relevant > 0 else 1.0\n",
"    f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0\n",
"\n",
"    return precision, recall, f1_score\n",
|
||
|
"\n",
|
||
|
"# Number of distinct tags (highest index + 1); used as the classifier's output size.\n",
"max_tag_index = max(tag_to_index.values()) + 1"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 20,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"class BiLSTMModel(torch.nn.Module):\n",
"    \"\"\"Embedding -> ReLU -> bidirectional LSTM -> per-token tag scores.\"\"\"\n",
"\n",
"    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, output_size):\n",
"        super().__init__()\n",
"        self.embedding = torch.nn.Embedding(vocab_size, embed_size)\n",
"        self.lstm = torch.nn.LSTM(\n",
"            embed_size, hidden_size, num_layers, batch_first=True, bidirectional=True\n",
"        )\n",
"        # Both LSTM directions are concatenated, hence hidden_size * 2 input features.\n",
"        self.fc = torch.nn.Linear(hidden_size * 2, output_size)\n",
"\n",
"    def forward(self, x):\n",
"        # x: (batch, seq) of token ids -> (batch, seq, output_size) raw logits.\n",
"        hidden_states, _ = self.lstm(torch.relu(self.embedding(x)))\n",
"        return self.fc(hidden_states)\n",
|
||
|
"\n",
|
||
|
"# 100-dim embeddings, one 100-unit BiLSTM layer, one output unit per tag.\n",
"model = BiLSTMModel(len(vocab_obj.get_itos()), 100, 100, 1, max_tag_index).to(device)\n",
"loss_fn = torch.nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
|
||
|
"\n",
|
||
|
"def evaluate_model(tokens, labels, model):\n",
"    \"\"\"Score the model sentence by sentence; returns (precision, recall, f1).\n",
"\n",
"    NOTE(review): the <bos>/<eos> boundary positions are included in the\n",
"    metrics, matching the original behaviour.\n",
"    \"\"\"\n",
"    gold = []\n",
"    hypothesis = []\n",
"    for sentence, tags in tqdm(zip(tokens, labels), total=len(labels)):\n",
"        gold += list(tags.cpu().numpy())\n",
"\n",
"        with torch.no_grad():\n",
"            logits = model(sentence.unsqueeze(0)).squeeze(0)\n",
"        hypothesis += list(torch.argmax(logits, dim=1).cpu().numpy())\n",
"\n",
"    return calculate_metrics(gold, hypothesis)\n",
|
||
|
"\n",
|
||
|
"def predict_labels(tokens, model, tag_to_index):\n",
"    \"\"\"Tag each encoded sentence; returns space-joined tag strings per sentence.\"\"\"\n",
"    index_to_tag = {index: tag for tag, index in tag_to_index.items()}\n",
"    predictions = []\n",
"\n",
"    for sentence in tqdm(tokens):\n",
"        with torch.no_grad():\n",
"            logits = model(sentence.unsqueeze(0)).squeeze(0)\n",
"        best = torch.argmax(logits, dim=1)\n",
"        # Positions 0 and -1 correspond to the <bos>/<eos> markers - drop them.\n",
"        predictions.append(\" \".join(index_to_tag[idx.item()] for idx in best[1:-1]))\n",
"\n",
"    return predictions"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"EPOCHS = 10\n",
"\n",
"for epoch in range(EPOCHS):\n",
"    model.train()\n",
"    # Online training (batch size 1): one optimizer step per sentence.\n",
"    for sentence, tags in tqdm(zip(train_tensor, train_tag_tensor), total=len(train_tag_tensor)):\n",
"        optimizer.zero_grad()\n",
"        logits = model(sentence.unsqueeze(0)).squeeze(0)\n",
"        loss = loss_fn(logits, tags)\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"\n",
"    # Report validation metrics after every epoch.\n",
"    model.eval()\n",
"    print(evaluate_model(val_tensor, val_tag_tensor, model))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Print the final metrics explicitly: only a cell's *last* expression is\n",
"# displayed, so the two bare evaluate_model(...) calls here used to be\n",
"# computed and then silently discarded.\n",
"print(\"validation:\", evaluate_model(val_tensor, val_tag_tensor, model))\n",
"print(\"dev-0:\", evaluate_model(dev_tensor, dev_tag_tensor, model))\n",
"\n",
"# Save predictions as one tag sequence per line (no header, no index).\n",
"dev_predictions = predict_labels(dev_tensor, model, tag_to_index)\n",
"dev_predictions_df = pd.DataFrame(dev_predictions, columns=[\"Tag\"])\n",
"dev_predictions_df.to_csv(\"dev-0/out.tsv\", index=False, header=False)\n",
"\n",
"test_predictions = predict_labels(test_tensor, model, tag_to_index)\n",
"test_predictions_df = pd.DataFrame(test_predictions, columns=[\"Tag\"])\n",
"test_predictions_df.to_csv(\"test-A/out.tsv\", index=False, header=False)"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.9.19"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|