RNN/rnn_2.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['<unk>', '<pad>', '<bos>', '<eos>', 'SOCCER', '-', 'POLISH', 'FIRST', 'DIVISION', 'RESULTS', '.', '</S>', 'WARSAW', '1996-08-24', 'Results', 'of', 'Polish', 'first', 'division', 'soccer', 'matches', 'on', 'Saturday', ':', 'Amica', 'Wronki', '3', 'Hutnik', 'Krakow', '0', 'Sokol', 'Tychy', '5', 'Lech', 'Poznan', 'Rakow', 'Czestochowa', '1', 'Stomil', 'Olsztyn', '4', 'Wisla', 'Gornik', 'Zabrze', 'Slask', 'Wroclaw', 'Odra', 'Wodzislaw', 'GKS', 'Katowice', 'Polonia', 'Warsaw', 'Zaglebie', 'Lubin', '2', 'LKS', 'Lodz', 'Legia', 'Belchatow', 'CRICKET', 'POLLOCK', 'CONCLUDES', 'WARWICKSHIRE', 'CAREER', 'WITH', 'FLOURISH', 'LONDON', '1996-08-25', 'South', 'African', 'fast', 'bowler', 'Shaun', 'Pollock', 'concluded', 'his', 'Warwickshire', 'career', 'with', 'a', 'flourish', 'Sunday', 'by', 'taking', 'the', 'final', 'three', 'wickets', 'during', 'county', \"'s\", 'league', 'victory', 'over', 'Worcestershire', ',', 'who', 'returns', 'home', 'Tuesday', 'for', 'an', 'ankle', 'operation', 'took', 'last', 'in', 'nine', 'balls', 'as', 'were', 'dismissed', '154', 'After', 'hour', 'interruption', 'rain', 'then', 'reached', 'adjusted', 'target', '109', '13', 'to', 'spare', 'and', 'record', 'their', 'fifth', 'win', 'six', 'games', 'are', 'currently', 'fourth', 'position', 'behind', 'Yorkshire', 'Nottinghamshire', 'Surrey', 'captain', 'David', 'Byas', 'completed', 'third', 'century', 'side', 'swept', 'clear', 'at', 'top', 'table', 'reaching', 'best', '111', 'not', 'out', 'against', 'Lancashire', 'total', '205', 'eight', 'from', '40', 'overs', 'looked', 'reasonable', 'before', 'put', 'attack', 'sword', 'collecting', 'runs', 'just', '100', 'sixes', 'fours', 'eventually', 'only', 'four', 'down', '7.5', 'CYCLING', 'BALLANGER', 'KEEPS', 'SPRINT', 'TITLE', 'IN', 'STYLE', 'Martin', 'Ayres', 'MANCHESTER', 'England', '1996-08-30', 'Felicia', 'Ballanger', 'France', 'confirmed', 'her', 'status', 'world', 'number', 'one', 'woman', 'sprinter', 'when', 'she', 'retained', 'title', 'cycling', 
'championships', 'Friday', 'beat', 'Germany', 'Annett', 'Neumann', '2-0', 'best-of-three', 'add', 'Olympic', 'gold', 'medal', 'won', 'July', 'also', 'place', 'sprint', 'Magali', 'Faure', 'defeating', 'ex-world', 'champion', 'Tanya', 'Dubnicoff', 'Canada', '25', 'will', 'be', 'aiming', 'complete', 'track', 'double', 'defends', '500', 'metres', 'time', 'trial', 'The', 'other', 'night', 'women', '24-kms', 'points', 'race', 'ended', 'success', 'reigning', 'Russia', 'Svetlana', 'Samokhalova', 'fought', 'off', 'spirited', 'challenge', 'American', 'Jane', 'Quigley', 'take', 'second', 'year', 'nation', 'have', 'two', 'riders', 'field', 'made', 'full', 'use', 'numerical', 'superiority', 'Goulnara', 'Fatkoullina', 'helped', 'build', 'unbeatable', 'lead', 'snatching', 'bronze', 'former', 'medallist', 'event', 'led', 'half', 'distance', '\"', 'I', 'went', 'so', 'close', 'this', 'but', 'having', 'certainly', 'gave', 'Russians', 'advantage', 'said', 'lapped', 'which', 'left', 'Ingrid', 'Haringa', 'Netherlands', 'seventh', 'despite', 'highest', 'score', 'Nathalie', 'Lancien', 'missed', 'winning', 'finished', 'disappointing', '10th', 'RUGBY', 'LEAGUE', 'Australian', 'rugby', 'standings', 'SYDNEY', '1996-08-26', 'premiership', 'after', 'played', 'weekend', '(', 'tabulate', 'under', 'drawn', 'lost', ')', 'Manly', '21', '17', '501', '181', '34', 'Brisbane', '16', '569', '257', '32', 'North', 'Sydney', '14', '560', '317', '30', 'City', '20', '487', '293', '29', 'Cronulla', '12', '6', '359', '258', '26', 'Canberra', '8', '502', '374', 'St', 'George', '421', '344', 'Newcastle', '11', '9', '416', '366', '23', 'Western', 'Suburbs', '382', '426', 'Auckland', '10', '406', '389', '22', 'Tigers', '309', '435', 'Parramatta', '388', '391', 'Bulldogs', '325', '356', 'Illawarra', '395', '432', 'Reds', '297', '398', 'Penrith', '339', '448', 'Queensland', '15', '266', '593', 'Gold', 'Coast', '351', '483', '304', '586', '210', '460', '--', 'Newsroom', '61-2', '9373-1800', 'TENNIS', 'AT', 'HAMLET', 
'CUP', 'COMMACK', 'New', 'York', 'Hamlet', 'Cup', 'tennis', 'tournament', 'prefix', 'denotes', 'seed
      "22154\n",
      "9\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a9e31a5d56f947bab11440afba13f2b2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/850 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2c08aba575f4435ca98af63d5dc96fc0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/95 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0.5546171171171171, 0.2832901926948519, 0.37502379592613744)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a042bbe87f404a0e8463a641335693b6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/850 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "11115dc17f3e4f9195a1b598a5cc1feb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/95 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0.5892077354959451, 0.543284440609721, 0.5653149783031572)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d0734c011bb341cd9352338fbc3be59c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/850 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c5af3a6e70d44efc9e36bce4874e18d9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/95 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0.5969561794804513, 0.6542996836353178, 0.6243139407244785)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "adbd98d4c11647f3bd348bca22b8e98d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/850 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "804fdc7cbd5c4d64877813c63c5339e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/95 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0.5954920019389239, 0.7066436583261432, 0.6463238195449165)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "adb5363844f74ef8b792704b71366b9b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/850 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "165c7d86d6614f29a6fa4ca4ce8b9c0f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/95 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0.5790436970944864, 0.727926373310325, 0.6450050968399592)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b58d5293d2624cceb1e71bb7aebbb915",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/95 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ca543149052943ac9dc1bb054ab08ec3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/215 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "064d06a3ae394f2698deffd4727a7ec0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/215 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dc5d4d0379a9489db22c8d3b727403ad",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/230 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import csv\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "from sklearn.model_selection import train_test_split\n",
    "from torch.utils.data import DataLoader, Dataset\n",
    "from torchtext.vocab import vocab\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "# Funkcja do wczytywania danych\n",
    "def load_datasets():\n",
    "    train_df = pd.read_csv(\n",
    "        \"train/train.tsv.xz\", compression=\"xz\", sep=\"\\t\", names=[\"Category\", \"Sentence\"]\n",
    "    )\n",
    "    dev_df = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", names=[\"Sentence\"])\n",
    "    dev_labels = pd.read_csv(\"dev-0/expected.tsv\", sep=\"\\t\", names=[\"Category\"])\n",
    "    test_df = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", names=[\"Sentence\"])\n",
    "    return train_df, dev_df, dev_labels, test_df\n",
    "\n",
    "train_df, dev_df, dev_labels, test_df = load_datasets()\n",
    "\n",
    "# Hold out 10% of the training rows for validation (fixed random_state).\n",
    "split_parts = train_test_split(\n",
    "    train_df[\"Sentence\"], train_df[\"Category\"], test_size=0.1, random_state=42\n",
    ")\n",
    "train_texts, val_texts, train_labels, val_labels = split_parts\n",
    "train_df = pd.DataFrame({\"Sentence\": train_texts, \"Category\": train_labels})\n",
    "val_df = pd.DataFrame({\"Sentence\": val_texts, \"Category\": val_labels})\n",
    "\n",
    "# Whitespace tokenisation of the sentences and of the label strings\n",
    "def split_on_whitespace(text):\n",
    "    \"\"\"Return the whitespace-separated pieces of text as a list.\"\"\"\n",
    "    return text.split()\n",
    "\n",
    "for frame in (train_df, test_df, val_df, dev_df):\n",
    "    frame[\"tokens\"] = frame[\"Sentence\"].apply(split_on_whitespace)\n",
    "\n",
    "train_df[\"label_tokens\"] = train_df[\"Category\"].apply(split_on_whitespace)\n",
    "val_df[\"label_tokens\"] = val_df[\"Category\"].apply(split_on_whitespace)\n",
    "# The dev labels come from a separate frame (dev-0/expected.tsv).\n",
    "dev_df[\"label_tokens\"] = dev_labels[\"Category\"].apply(split_on_whitespace)\n",
    "\n",
    "# Vocabulary construction\n",
    "def create_vocab(token_list):\n",
    "    \"\"\"Count every token and wrap the counts in a torchtext vocabulary.\n",
    "\n",
    "    Args:\n",
    "        token_list: iterable of token sequences (one sequence per text).\n",
    "\n",
    "    Returns:\n",
    "        The object returned by torchtext's vocab(), built from the token\n",
    "        counts with <unk>, <pad>, <bos> and <eos> passed as specials.\n",
    "    \"\"\"\n",
    "    frequencies = Counter(token for sentence in token_list for token in sentence)\n",
    "    return vocab(frequencies, specials=[\"<unk>\", \"<pad>\", \"<bos>\", \"<eos>\"])\n",
    "\n",
    "vocabulary = create_vocab(train_df[\"tokens\"])\n",
    "index_to_string = vocabulary.get_itos()\n",
    "# Show only a short preview: printing every entry floods the notebook output.\n",
    "print(index_to_string[:20])\n",
    "print(len(index_to_string))\n",
    "\n",
    "# Tokens that are not in the vocabulary are looked up as <unk>.\n",
    "vocabulary.set_default_index(vocabulary[\"<unk>\"])\n",
    "\n",
    "# Convert token sequences into index tensors\n",
    "def vectorize_data(data_tokens):\n",
    "    \"\"\"Encode each token sequence as a 1-D LongTensor of vocabulary indices.\n",
    "\n",
    "    Every sequence is wrapped in the <bos> and <eos> indices. The tensors are\n",
    "    created on the module-level device, the same device the label tensors\n",
    "    and the model are placed on, so the forward pass does not fail with a\n",
    "    device mismatch when CUDA is selected.\n",
    "\n",
    "    Args:\n",
    "        data_tokens: iterable of token lists.\n",
    "\n",
    "    Returns:\n",
    "        list of torch.Tensor, one per input sequence, each of length\n",
    "        len(tokens) + 2.\n",
    "    \"\"\"\n",
    "    bos_index = vocabulary[\"<bos>\"]\n",
    "    eos_index = vocabulary[\"<eos>\"]\n",
    "    return [\n",
    "        torch.tensor(\n",
    "            [bos_index] + [vocabulary[token] for token in tokens] + [eos_index],\n",
    "            dtype=torch.long,\n",
    "            device=device,\n",
    "        )\n",
    "        for tokens in data_tokens\n",
    "    ]\n",
    "\n",
    "def vectorize_labels(data_labels, label_map):\n",
    "    \"\"\"Encode each label sequence as a 1-D LongTensor on the module-level device.\n",
    "\n",
    "    A 0 is added before and after every sequence, which keeps the labels the\n",
    "    same length as token tensors that carry <bos> and <eos>.\n",
    "\n",
    "    Args:\n",
    "        data_labels: iterable of label-string lists.\n",
    "        label_map: mapping from label string to integer index; an unknown\n",
    "            label raises KeyError.\n",
    "\n",
    "    Returns:\n",
    "        list of torch.Tensor, one per input sequence.\n",
    "    \"\"\"\n",
    "    encoded = []\n",
    "    for labels in data_labels:\n",
    "        indices = [0, *(label_map[label] for label in labels), 0]\n",
    "        encoded.append(torch.tensor(indices, dtype=torch.long, device=device))\n",
    "    return encoded\n",
    "\n",
    "# Use the GPU when one is available, otherwise the CPU.\n",
    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
    "train_tokens = vectorize_data(train_df[\"tokens\"])\n",
    "test_tokens = vectorize_data(test_df[\"tokens\"])\n",
    "val_tokens = vectorize_data(val_df[\"tokens\"])\n",
    "dev_tokens = vectorize_data(dev_df[\"tokens\"])\n",
    "\n",
    "# Label inventory; a label's position in the list is its class index.\n",
    "label_list = [\"O\", \"B-PER\", \"I-PER\", \"B-ORG\", \"I-ORG\", \"B-LOC\", \"I-LOC\", \"B-MISC\", \"I-MISC\"]\n",
    "label_map = dict(zip(label_list, range(len(label_list))))\n",
    "\n",
    "train_label_tokens = vectorize_labels(train_df[\"label_tokens\"], label_map)\n",
    "val_label_tokens = vectorize_labels(val_df[\"label_tokens\"], label_map)\n",
    "dev_label_tokens = vectorize_labels(dev_df[\"label_tokens\"], label_map)\n",
    "\n",
    "# Funkcja do obliczania metryk\n",
    "def calculate_metrics(true_labels, pred_labels):\n",
    "    accuracy = 0\n",
    "    true_positive = 0\n",
    "    false_positive = 0\n",
    "    total_selected = 0\n",
    "    total_relevant = 0\n",
    "\n",
    "    for pred, true in zip(pred_labels, true_labels):\n",
    "        if pred == true:\n",
    "            accuracy += 1\n",
    "\n",
    "        if pred > 0 and pred == true:\n",
    "            true_positive += 1\n",
    "\n",
    "        if pred > 0:\n",
    "            total_selected += 1\n",
    "\n",
    "        if true > 0:\n",
    "            total_relevant += 1\n",
    "\n",
    "    precision = true_positive / total_selected if total_selected > 0 else 1.0\n",
    "    recall = true_positive / total_relevant if total_relevant > 0 else 1.0\n",
    "    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0\n",
    "\n",
    "    return precision, recall, f1_score\n",
    "\n",
    "# One output class per known label, regardless of which labels happen to\n",
    "# occur in the training split.\n",
    "num_classes = len(label_list)\n",
    "print(num_classes)\n",
    "\n",
    "# Definicja modelu LSTM\n",
    "class BiLSTM(nn.Module):\n",
    "    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes):\n",
    "        super(BiLSTM, self).__init__()\n",
    "        self.embedding = nn.Embedding(vocab_size, embed_dim)\n",
    "        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)\n",
    "        self.fc = nn.Linear(hidden_dim * 2, num_classes)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x_embed = torch.relu(self.embedding(x))\n",
    "        lstm_out, _ = self.lstm(x_embed)\n",
    "        logits = self.fc(lstm_out)\n",
    "        return logits\n",
    "\n",
    "# Seed torch before the weights are initialised so that repeated runs start\n",
    "# from the same parameters (same value as the train/validation split seed).\n",
    "torch.manual_seed(42)\n",
    "# Embedding size 100, hidden size 100 per direction, one LSTM layer.\n",
    "model = BiLSTM(len(index_to_string), 100, 100, 1, num_classes).to(device)\n",
    "loss_function = nn.CrossEntropyLoss()\n",
    "optimizer = optim.Adam(model.parameters())\n",
    "\n",
    "# Model evaluation\n",
    "def evaluate_model(data_tokens, data_labels, model):\n",
    "    \"\"\"Run the model over every sequence and score its predictions.\n",
    "\n",
    "    Inference runs under torch.no_grad() so no autograd graph is built. The\n",
    "    function does not change the model's train/eval mode; the caller is\n",
    "    expected to set it. All positions of each sequence are scored, including\n",
    "    the first and last ones.\n",
    "\n",
    "    Args:\n",
    "        data_tokens: list of 1-D index tensors, one per sequence.\n",
    "        data_labels: list of 1-D label tensors aligned with data_tokens.\n",
    "        model: module mapping a (1, seq_len) index tensor to\n",
    "            (1, seq_len, num_classes) logits.\n",
    "\n",
    "    Returns:\n",
    "        tuple: (precision, recall, f1_score) from calculate_metrics.\n",
    "    \"\"\"\n",
    "    true_labels = []\n",
    "    pred_labels = []\n",
    "    with torch.no_grad():\n",
    "        for i in tqdm(range(len(data_labels))):\n",
    "            tokens = data_tokens[i].unsqueeze(0)\n",
    "            true_labels.extend(data_labels[i].cpu().numpy())\n",
    "\n",
    "            pred_logits = model(tokens).squeeze(0)\n",
    "            # Highest-scoring class at every position.\n",
    "            pred_batch = torch.argmax(pred_logits, 1)\n",
    "            pred_labels.extend(pred_batch.cpu().numpy())\n",
    "\n",
    "    return calculate_metrics(true_labels, pred_labels)\n",
    "\n",
    "# Label prediction\n",
    "def predict_labels(data_tokens, model, label_map):\n",
    "    \"\"\"Predict one space-joined label string per input sequence.\n",
    "\n",
    "    Inference runs under torch.no_grad() so no autograd graph is built. The\n",
    "    function does not change the model's train/eval mode.\n",
    "\n",
    "    Args:\n",
    "        data_tokens: list of 1-D index tensors, one per sequence.\n",
    "        model: module mapping a (1, seq_len) index tensor to\n",
    "            (1, seq_len, num_classes) logits.\n",
    "        label_map: mapping from label string to class index; it is inverted\n",
    "            to turn predicted indices back into label strings.\n",
    "\n",
    "    Returns:\n",
    "        list of str: predicted labels for each sequence, with the first and\n",
    "        last positions removed, joined by single spaces.\n",
    "    \"\"\"\n",
    "    inv_label_map = {v: k for k, v in label_map.items()}\n",
    "    pred_labels = []\n",
    "\n",
    "    with torch.no_grad():\n",
    "        for i in tqdm(range(len(data_tokens))):\n",
    "            tokens = data_tokens[i].unsqueeze(0)\n",
    "            pred_logits = model(tokens).squeeze(0)\n",
    "            pred_batch = torch.argmax(pred_logits, 1)\n",
    "            pred_label_list = [inv_label_map[index] for index in pred_batch.tolist()]\n",
    "            # Drop the first and last positions (the sentinel slots).\n",
    "            pred_label_list = pred_label_list[1:-1]\n",
    "            pred_labels.append(\" \".join(pred_label_list))\n",
    "\n",
    "    return pred_labels\n",
    "\n",
    "# Training: one optimisation step per sequence, validation after every epoch\n",
    "EPOCHS = 5\n",
    "for epoch in range(EPOCHS):\n",
    "    model.train()\n",
    "    for i in tqdm(range(len(train_label_tokens))):\n",
    "        optimizer.zero_grad()\n",
    "\n",
    "        # Add a batch dimension for the forward pass and remove it again for the loss.\n",
    "        logits = model(train_tokens[i].unsqueeze(0)).squeeze(0)\n",
    "        targets = train_label_tokens[i]\n",
    "\n",
    "        loss = loss_function(logits, targets)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "    model.eval()\n",
    "    print(evaluate_model(val_tokens, val_label_tokens, model))\n",
    "\n",
    "# Final metrics on the validation and development sets. The results are\n",
    "# printed explicitly: a bare call in the middle of a cell displays nothing.\n",
    "print(evaluate_model(val_tokens, val_label_tokens, model))\n",
    "print(evaluate_model(dev_tokens, dev_label_tokens, model))\n",
    "\n",
    "# Write predictions for the dev and test inputs, one line of labels per input row\n",
    "dev_predictions = predict_labels(dev_tokens, model, label_map)\n",
    "dev_predictions_df = pd.DataFrame({\"Category\": dev_predictions})\n",
    "dev_predictions_df.to_csv(path_or_buf=\"dev-0/out.tsv\", header=False, index=False)\n",
    "\n",
    "test_predictions = predict_labels(test_tokens, model, label_map)\n",
    "test_predictions_df = pd.DataFrame({\"Category\": test_predictions})\n",
    "test_predictions_df.to_csv(path_or_buf=\"test-A/out.tsv\", header=False, index=False)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
first RNN commit 2024-05-27 14:39:47 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			"['<unk>', '<pad>', '<bos>', '<eos>', 'SOCCER', '-', 'POLISH', 'FIRST', 'DIVISION', 'RESULTS', '.', '</S>', 'WARSAW', '1996-08-24', 'Results', 'of', 'Polish', 'first', 'division', 'soccer', 'matches', 'on', 'Saturday', ':', 'Amica', 'Wronki', '3', 'Hutnik', 'Krakow', '0', 'Sokol', 'Tychy', '5', 'Lech', 'Poznan', 'Rakow', 'Czestochowa', '1', 'Stomil', 'Olsztyn', '4', 'Wisla', 'Gornik', 'Zabrze', 'Slask', 'Wroclaw', 'Odra', 'Wodzislaw', 'GKS', 'Katowice', 'Polonia', 'Warsaw', 'Zaglebie', 'Lubin', '2', 'LKS', 'Lodz', 'Legia', 'Belchatow', 'CRICKET', 'POLLOCK', 'CONCLUDES', 'WARWICKSHIRE', 'CAREER', 'WITH', 'FLOURISH', 'LONDON', '1996-08-25', 'South', 'African', 'fast', 'bowler', 'Shaun', 'Pollock', 'concluded', 'his', 'Warwickshire', 'career', 'with', 'a', 'flourish', 'Sunday', 'by', 'taking', 'the', 'final', 'three', 'wickets', 'during', 'county', \"'s\", 'league', 'victory', 'over', 'Worcestershire', ',', 'who', 'returns', 'home', 'Tuesday', 'for', 'an', 'ankle', 'operation', 'took', 'last', 'in', 'nine', 'balls', 'as', 'were', 'dismissed', '154', 'After', 'hour', 'interruption', 'rain', 'then', 'reached', 'adjusted', 'target', '109', '13', 'to', 'spare', 'and', 'record', 'their', 'fifth', 'win', 'six', 'games', 'are', 'currently', 'fourth', 'position', 'behind', 'Yorkshire', 'Nottinghamshire', 'Surrey', 'captain', 'David', 'Byas', 'completed', 'third', 'century', 'side', 'swept', 'clear', 'at', 'top', 'table', 'reaching', 'best', '111', 'not', 'out', 'against', 'Lancashire', 'total', '205', 'eight', 'from', '40', 'overs', 'looked', 'reasonable', 'before', 'put', 'attack', 'sword', 'collecting', 'runs', 'just', '100', 'sixes', 'fours', 'eventually', 'only', 'four', 'down', '7.5', 'CYCLING', 'BALLANGER', 'KEEPS', 'SPRINT', 'TITLE', 'IN', 'STYLE', 'Martin', 'Ayres', 'MANCHESTER', 'England', '1996-08-30', 'Felicia', 'Ballanger', 'France', 'confirmed', 'her', 'status', 'world', 'number', 'one', 'woman', 'sprinter', 'when', 'she', 'retained', 'title', 'cycling', 
'championships', 'Friday', 'beat', 'Germany', 'Annett', 'Neumann', '2-0', 'best-of-three', 'add', 'Olympic', 'gold', 'medal', 'won', 'July', 'also', 'place', 'sprint', 'Magali', 'Faure', 'defeating', 'ex-world', 'champion', 'Tanya', 'Dubnicoff', 'Canada', '25', 'will', 'be', 'aiming', 'complete', 'track', 'double', 'defends', '500', 'metres', 'time', 'trial', 'The', 'other', 'night', 'women', '24-kms', 'points', 'race', 'ended', 'success', 'reigning', 'Russia', 'Svetlana', 'Samokhalova', 'fought', 'off', 'spirited', 'challenge', 'American', 'Jane', 'Quigley', 'take', 'second', 'year', 'nation', 'have', 'two', 'riders', 'field', 'made', 'full', 'use', 'numerical', 'superiority', 'Goulnara', 'Fatkoullina', 'helped', 'build', 'unbeatable', 'lead', 'snatching', 'bronze', 'former', 'medallist', 'event', 'led', 'half', 'distance', '\"', 'I', 'went', 'so', 'close', 'this', 'but', 'having', 'certainly', 'gave', 'Russians', 'advantage', 'said', 'lapped', 'which', 'left', 'Ingrid', 'Haringa', 'Netherlands', 'seventh', 'despite', 'highest', 'score', 'Nathalie', 'Lancien', 'missed', 'winning', 'finished', 'disappointing', '10th', 'RUGBY', 'LEAGUE', 'Australian', 'rugby', 'standings', 'SYDNEY', '1996-08-26', 'premiership', 'after', 'played', 'weekend', '(', 'tabulate', 'under', 'drawn', 'lost', ')', 'Manly', '21', '17', '501', '181', '34', 'Brisbane', '16', '569', '257', '32', 'North', 'Sydney', '14', '560', '317', '30', 'City', '20', '487', '293', '29', 'Cronulla', '12', '6', '359', '258', '26', 'Canberra', '8', '502', '374', 'St', 'George', '421', '344', 'Newcastle', '11', '9', '416', '366', '23', 'Western', 'Suburbs', '382', '426', 'Auckland', '10', '406', '389', '22', 'Tigers', '309', '435', 'Parramatta', '388', '391', 'Bulldogs', '325', '356', 'Illawarra', '395', '432', 'Reds', '297', '398', 'Penrith', '339', '448', 'Queensland', '15', '266', '593', 'Gold', 'Coast', '351', '483', '304', '586', '210', '460', '--', 'Newsroom', '61-2', '9373-1800', 'TENNIS', 'AT', 'HAMLET', 
'CUP', 'COMMACK', 'New', 'York', 'Hamlet', 'Cup', 'tennis', 'tournament', 'prefix', 'denotes', 'seed
			`"22154\n",`
			`"9\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "a9e31a5d56f947bab11440afba13f2b2",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/850 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "2c08aba575f4435ca98af63d5dc96fc0",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/95 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"(0.5546171171171171, 0.2832901926948519, 0.37502379592613744)\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "a042bbe87f404a0e8463a641335693b6",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/850 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "11115dc17f3e4f9195a1b598a5cc1feb",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/95 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"(0.5892077354959451, 0.543284440609721, 0.5653149783031572)\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "d0734c011bb341cd9352338fbc3be59c",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/850 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "c5af3a6e70d44efc9e36bce4874e18d9",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/95 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"(0.5969561794804513, 0.6542996836353178, 0.6243139407244785)\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "adbd98d4c11647f3bd348bca22b8e98d",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/850 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "804fdc7cbd5c4d64877813c63c5339e4",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/95 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"(0.5954920019389239, 0.7066436583261432, 0.6463238195449165)\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "adb5363844f74ef8b792704b71366b9b",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/850 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "165c7d86d6614f29a6fa4ca4ce8b9c0f",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/95 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"(0.5790436970944864, 0.727926373310325, 0.6450050968399592)\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "b58d5293d2624cceb1e71bb7aebbb915",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/95 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "ca543149052943ac9dc1bb054ab08ec3",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/215 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "064d06a3ae394f2698deffd4727a7ec0",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/215 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "dc5d4d0379a9489db22c8d3b727403ad",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`" 0%\| \| 0/230 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`}`
			`],`
			`"source": [`
			`"import torch\n",`
			`"import torch.nn as nn\n",`
			`"import torch.optim as optim\n",`
			`"from torch.utils.data import DataLoader, Dataset\n",`
			`"from collections import Counter\n",`
			`"from sklearn.model_selection import train_test_split\n",`
			`"\n",`
			`"from torchtext.vocab import vocab\n",`
			`"from tqdm.notebook import tqdm\n",`
			`"import pandas as pd\n",`
			`"import numpy as np\n",`
			`"\n",`
			`"# Funkcja do wczytywania danych\n",`
			`"def load_datasets():\n",`
			`" train_df = pd.read_csv(\n",`
			`" \"train/train.tsv.xz\", compression=\"xz\", sep=\"\\t\", names=[\"Category\", \"Sentence\"]\n",`
			`" )\n",`
			`" dev_df = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", names=[\"Sentence\"])\n",`
			`" dev_labels = pd.read_csv(\"dev-0/expected.tsv\", sep=\"\\t\", names=[\"Category\"])\n",`
			`" test_df = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", names=[\"Sentence\"])\n",`
			`" return train_df, dev_df, dev_labels, test_df\n",`
			`"\n",`
			`"train_df, dev_df, dev_labels, test_df = load_datasets()\n",`
			`"train_texts, val_texts, train_labels, val_labels = train_test_split(\n",`
			`" train_df[\"Sentence\"], train_df[\"Category\"], test_size=0.1, random_state=42\n",`
			`")\n",`
			`"train_df = pd.DataFrame({\"Sentence\": train_texts, \"Category\": train_labels})\n",`
			`"val_df = pd.DataFrame({\"Sentence\": val_texts, \"Category\": val_labels})\n",`
			`"\n",`
			`"# Tokenizacja danych\n",`
			`"train_df[\"tokens\"] = train_df[\"Sentence\"].apply(lambda x: x.split())\n",`
			`"train_df[\"label_tokens\"] = train_df[\"Category\"].apply(lambda x: x.split())\n",`
			`"test_df[\"tokens\"] = test_df[\"Sentence\"].apply(lambda x: x.split())\n",`
			`"val_df[\"tokens\"] = val_df[\"Sentence\"].apply(lambda x: x.split())\n",`
			`"val_df[\"label_tokens\"] = val_df[\"Category\"].apply(lambda x: x.split())\n",`
			`"dev_df[\"tokens\"] = dev_df[\"Sentence\"].apply(lambda x: x.split())\n",`
			`"dev_df[\"label_tokens\"] = dev_labels[\"Category\"].apply(lambda x: x.split())\n",`
			`"\n",`
			`"# Budowanie słownika\n",`
			`"def create_vocab(token_list):\n",`
			`" count = Counter()\n",`
			`" for tokens in token_list:\n",`
			`" count.update(tokens)\n",`
			`" return vocab(count, specials=[\"<unk>\", \"<pad>\", \"<bos>\", \"<eos>\"])\n",`
			`"\n",`
			`"vocabulary = create_vocab(train_df[\"tokens\"])\n",`
			`"index_to_string = vocabulary.get_itos()\n",`
			`"print(index_to_string)\n",`
			`"print(len(index_to_string))\n",`
			`"\n",`
			`"vocabulary.set_default_index(vocabulary[\"<unk>\"])\n",`
			`"\n",`
			`"# Przetwarzanie danych na wektory\n",`
			`"def vectorize_data(data_tokens):\n",`
			`" return [\n",`
			`" torch.tensor([vocabulary[\"<bos>\"]] + [vocabulary[token] for token in tokens] + [vocabulary[\"<eos>\"]], dtype=torch.long)\n",`
			`" for tokens in data_tokens\n",`
			`" ]\n",`
			`"\n",`
			`"def vectorize_labels(data_labels, label_map):\n",`
			`" return [\n",`
			`" torch.tensor([0] + [label_map[label] for label in labels] + [0], dtype=torch.long, device=device)\n",`
			`" for labels in data_labels\n",`
			`" ]\n",`
			`"\n",`
			`"# Use the GPU when one is available, otherwise the CPU.\n",`
			`"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",`
			`"\n",`
			`"# Index-encode the tokens of every split (train, test, val, dev - in that order).\n",`
			`"train_tokens, test_tokens, val_tokens, dev_tokens = (\n",`
			`"    vectorize_data(frame[\"tokens\"]) for frame in (train_df, test_df, val_df, dev_df)\n",`
			`")\n",`
			`"\n",`
			`"# Tag inventory: a tag's position in this list is its class index.\n",`
			`"label_list = [\"O\", \"B-PER\", \"I-PER\", \"B-ORG\", \"I-ORG\", \"B-LOC\", \"I-LOC\", \"B-MISC\", \"I-MISC\"]\n",`
			`"label_map = dict(zip(label_list, range(len(label_list))))\n",`
			`"\n",`
			`"# Index-encode the tag sequences of the labelled splits.\n",`
			`"train_label_tokens, val_label_tokens, dev_label_tokens = (\n",`
			`"    vectorize_labels(frame[\"label_tokens\"], label_map) for frame in (train_df, val_df, dev_df)\n",`
			`")\n",`
			`"\n",`
			`"# Metric computation\n",`
			`"def calculate_metrics(true_labels, pred_labels):\n",`
			`"    \"\"\"Compute precision, recall and F1 over the non-zero label positions.\n",`
			`"\n",`
			`"    A prediction is a true positive when it is non-zero and equals the gold\n",`
			`"    label. Precision divides by the number of non-zero predictions, recall\n",`
			`"    by the number of non-zero gold labels.\n",`
			`"\n",`
			`"    Args:\n",`
			`"        true_labels: sequence of gold label indices.\n",`
			`"        pred_labels: sequence of predicted label indices, aligned with\n",`
			`"            ``true_labels`` (pairs are formed with ``zip``).\n",`
			`"\n",`
			`"    Returns:\n",`
			`"        Tuple ``(precision, recall, f1_score)``. Precision and recall are 1.0\n",`
			`"        when their denominator is zero; F1 is 0.0 when precision + recall is 0.\n",`
			`"    \"\"\"\n",`
			`"    true_positive = 0\n",`
			`"    total_selected = 0\n",`
			`"    total_relevant = 0\n",`
			`"\n",`
			`"    for pred, true in zip(pred_labels, true_labels):\n",`
			`"        if pred > 0:\n",`
			`"            total_selected += 1\n",`
			`"            if pred == true:\n",`
			`"                true_positive += 1\n",`
			`"        if true > 0:\n",`
			`"            total_relevant += 1\n",`
			`"\n",`
			`"    precision = true_positive / total_selected if total_selected > 0 else 1.0\n",`
			`"    recall = true_positive / total_relevant if total_relevant > 0 else 1.0\n",`
			`"    denominator = precision + recall\n",`
			`"    f1_score = 2 * precision * recall / denominator if denominator > 0 else 0.0\n",`
			`"\n",`
			`"    return precision, recall, f1_score\n",`
			`"\n",`
			`"# Flat list of the class indices used in the training split; the lookup\n",`
			`"# raises KeyError if the data contains a tag missing from label_map.\n",`
			`"label_indices = [label_map[label] for labels in train_df[\"label_tokens\"] for label in labels]\n",`
			`"# Size the output layer from the full tag inventory rather than from the\n",`
			`"# largest index that happens to occur in the training split: a tag absent\n",`
			`"# from training would otherwise shrink the model, and empty data would\n",`
			`"# make max() raise.\n",`
			`"num_classes = len(label_map)\n",`
			`"print(num_classes)\n",`
			`"\n",`
			`"# BiLSTM model definition\n",`
			`"class BiLSTM(nn.Module):\n",`
			`"    \"\"\"Bidirectional LSTM tagger that emits one logit vector per input position.\"\"\"\n",`
			`"\n",`
			`"    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes):\n",`
			`"        super().__init__()\n",`
			`"        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)\n",`
			`"        self.lstm = nn.LSTM(\n",`
			`"            input_size=embed_dim,\n",`
			`"            hidden_size=hidden_dim,\n",`
			`"            num_layers=num_layers,\n",`
			`"            batch_first=True,\n",`
			`"            bidirectional=True,\n",`
			`"        )\n",`
			`"        # Both directions are concatenated, so the classifier sees 2 * hidden_dim features.\n",`
			`"        self.fc = nn.Linear(in_features=hidden_dim * 2, out_features=num_classes)\n",`
			`"\n",`
			`"    def forward(self, x):\n",`
			`"        \"\"\"Return logits for index tensor ``x`` (batch-first, as the LSTM is configured).\"\"\"\n",`
			`"        activated = self.embedding(x).relu()\n",`
			`"        sequence_features = self.lstm(activated)[0]\n",`
			`"        return self.fc(sequence_features)\n",`
			`"\n",`
			`"# One-layer BiLSTM with 100-dimensional embeddings and hidden states.\n",`
			`"model = BiLSTM(\n",`
			`"    vocab_size=len(index_to_string),\n",`
			`"    embed_dim=100,\n",`
			`"    hidden_dim=100,\n",`
			`"    num_layers=1,\n",`
			`"    num_classes=num_classes,\n",`
			`")\n",`
			`"model = model.to(device)\n",`
			`"loss_function = nn.CrossEntropyLoss()\n",`
			`"# Adam with its default hyper-parameters.\n",`
			`"optimizer = optim.Adam(model.parameters())\n",`
			`"\n",`
			`"# Model evaluation\n",`
			`"def evaluate_model(data_tokens, data_labels, model):\n",`
			`"    \"\"\"Run ``model`` over every sequence and score its predictions.\n",`
			`"\n",`
			`"    Args:\n",`
			`"        data_tokens: list of 1-D index tensors, one per sequence.\n",`
			`"        data_labels: list of 1-D label tensors aligned with ``data_tokens``.\n",`
			`"        model: module mapping a (1, seq) index tensor to (1, seq, classes) logits.\n",`
			`"\n",`
			`"    Returns:\n",`
			`"        The ``(precision, recall, f1_score)`` tuple from ``calculate_metrics``,\n",`
			`"        computed over all positions of all sequences.\n",`
			`"    \"\"\"\n",`
			`"    true_labels = []\n",`
			`"    pred_labels = []\n",`
			`"    # Feed inputs on whatever device the model's weights live on.\n",`
			`"    first_param = next(model.parameters(), None)\n",`
			`"\n",`
			`"    # Inference only: no autograd graph needs to be recorded per sentence.\n",`
			`"    with torch.no_grad():\n",`
			`"        for i in tqdm(range(len(data_labels))):\n",`
			`"            tokens = data_tokens[i].unsqueeze(0)\n",`
			`"            if first_param is not None:\n",`
			`"                tokens = tokens.to(first_param.device)\n",`
			`"            true_labels += data_labels[i].tolist()\n",`
			`"\n",`
			`"            pred_logits = model(tokens).squeeze(0)\n",`
			`"            pred_labels += torch.argmax(pred_logits, 1).tolist()\n",`
			`"\n",`
			`"    return calculate_metrics(true_labels, pred_labels)\n",`
			`"\n",`
			`"# Label prediction\n",`
			`"def predict_labels(data_tokens, model, label_map):\n",`
			`"    \"\"\"Predict a tag string for every sequence.\n",`
			`"\n",`
			`"    Args:\n",`
			`"        data_tokens: list of 1-D index tensors, one per sequence.\n",`
			`"        model: module mapping a (1, seq) index tensor to (1, seq, classes) logits.\n",`
			`"        label_map: mapping from tag name to class index.\n",`
			`"\n",`
			`"    Returns:\n",`
			`"        A list with one string per sequence: the predicted tag names joined\n",`
			`"        by single spaces, with the first and last positions dropped.\n",`
			`"    \"\"\"\n",`
			`"    pred_labels = []\n",`
			`"    inv_label_map = {index: name for name, index in label_map.items()}\n",`
			`"    # Feed inputs on whatever device the model's weights live on.\n",`
			`"    first_param = next(model.parameters(), None)\n",`
			`"\n",`
			`"    # Inference only: no autograd graph needs to be recorded per sentence.\n",`
			`"    with torch.no_grad():\n",`
			`"        for i in tqdm(range(len(data_tokens))):\n",`
			`"            tokens = data_tokens[i].unsqueeze(0)\n",`
			`"            if first_param is not None:\n",`
			`"                tokens = tokens.to(first_param.device)\n",`
			`"            pred_logits = model(tokens).squeeze(0)\n",`
			`"            pred_batch = torch.argmax(pred_logits, 1)\n",`
			`"            pred_label_list = [inv_label_map[index] for index in pred_batch.tolist()]\n",`
			`"            # Strip the two boundary positions added around every sequence.\n",`
			`"            pred_label_list = pred_label_list[1:-1]\n",`
			`"            pred_labels.append(\" \".join(pred_label_list))\n",`
			`"\n",`
			`"    return pred_labels\n",`
			`"\n",`
			`"# Model training: one optimizer step per sentence (batch size 1).\n",`
			`"EPOCHS = 5\n",`
			`"for epoch in range(EPOCHS):\n",`
			`"    model.train()\n",`
			`"    for i in tqdm(range(len(train_label_tokens))):\n",`
			`"        # Inputs must sit on the same device as the model and the labels.\n",`
			`"        tokens = train_tokens[i].unsqueeze(0).to(device)\n",`
			`"        true_labels = train_label_tokens[i]\n",`
			`"\n",`
			`"        optimizer.zero_grad()\n",`
			`"        pred_labels = model(tokens)\n",`
			`"        # Logits are (1, seq, classes); drop the batch axis to pair them with the (seq,) labels.\n",`
			`"        loss = loss_function(pred_labels.squeeze(0), true_labels)\n",`
			`"        loss.backward()\n",`
			`"        optimizer.step()\n",`
			`"\n",`
			`"    # Report (precision, recall, f1) on the validation split after every epoch.\n",`
			`"    model.eval()\n",`
			`"    print(evaluate_model(val_tokens, val_label_tokens, model))\n",`
			`"\n",`
			`"# Evaluation on the validation and development sets.\n",`
			`"# The results are printed explicitly: a bare expression in the middle of a\n",`
			`"# cell is not displayed, so the metrics would otherwise be discarded.\n",`
			`"model.eval()\n",`
			`"print(\"validation (precision, recall, f1):\", evaluate_model(val_tokens, val_label_tokens, model))\n",`
			`"print(\"dev-0 (precision, recall, f1):\", evaluate_model(dev_tokens, dev_label_tokens, model))\n",`
			`"\n",`
			`"# Write one line of space-separated predicted tags per sentence for the dev and test sets.\n",`
			`"dev_predictions = predict_labels(dev_tokens, model, label_map)\n",`
			`"dev_predictions_df = pd.DataFrame(dev_predictions, columns=[\"Category\"])\n",`
			`"dev_predictions_df.to_csv(\"dev-0/out.tsv\", index=False, header=False)\n",`
			`"\n",`
			`"test_predictions = predict_labels(test_tokens, model, label_map)\n",`
			`"test_predictions_df = pd.DataFrame(test_predictions, columns=[\"Category\"])\n",`
			`"test_predictions_df.to_csv(\"test-A/out.tsv\", index=False, header=False)\n"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "myenv",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.11.0"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`