From 6579b7add362f997525adbd41f3b9414d9bd8ae0 Mon Sep 17 00:00:00 2001 From: "KB94\\KamBor" Date: Fri, 12 Jan 2024 22:56:13 +0100 Subject: [PATCH] initial version --- Projekt.ipynb | 394 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 394 insertions(+) create mode 100644 Projekt.ipynb diff --git a/Projekt.ipynb b/Projekt.ipynb new file mode 100644 index 0000000..e29eda5 --- /dev/null +++ b/Projekt.ipynb @@ -0,0 +1,394 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "c:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for dair-ai/emotion contains custom code which must be executed to correctly load the dataset. 
You can inspect the repository content at https://hf.co/datasets/dair-ai/emotion\n", + "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", + "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", + " warnings.warn(\n", + "Downloading builder script: 100%|██████████| 3.97k/3.97k [00:00 46\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 47\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[0;32m 49\u001b[0m average_loss \u001b[38;5;241m=\u001b[39m total_loss \u001b[38;5;241m/\u001b[39m \u001b[38;5;28mlen\u001b[39m(train_loader)\n", + "File \u001b[1;32mc:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\torch\\_tensor.py:492\u001b[0m, in \u001b[0;36mTensor.backward\u001b[1;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[0;32m 482\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 483\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[0;32m 484\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[0;32m 485\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 490\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[0;32m 491\u001b[0m )\n\u001b[1;32m--> 492\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[0;32m 494\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\torch\\autograd\\__init__.py:251\u001b[0m, in \u001b[0;36mbackward\u001b[1;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[0;32m 246\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[0;32m 248\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[0;32m 249\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[0;32m 250\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[1;32m--> 251\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[0;32m 252\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 253\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 254\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 255\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 256\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 257\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 258\u001b[0m \u001b[43m 
\u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 259\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "# Fine-tune bert-base-uncased for 6-way emotion classification on\n",
+    "# dair-ai/emotion, save the model, and report metrics on the test split.\n",
+    "import torch\n",
+    "from torch.optim import AdamW\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "from transformers import BertTokenizer, BertForSequenceClassification\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('dair-ai/emotion')\n",
+    "\n",
+    "print(dataset)\n",
+    "\n",
+    "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
+    "model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)  # 6 = number of classes (label ids 0-5)\n",
+    "\n",
+    "max_len = 128\n",
+    "\n",
+    "train_data = dataset['train']\n",
+    "train_encodings = tokenizer(train_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
+    "train_labels = torch.tensor(train_data['label'])\n",
+    "\n",
+    "train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\n",
+    "train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
+    "\n",
+    "# Metrics are computed on the test split; scoring the training batches would\n",
+    "# only show how well the model fits data it has already seen.\n",
+    "test_data = dataset['test']\n",
+    "test_encodings = tokenizer(test_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
+    "test_labels = torch.tensor(test_data['label'])\n",
+    "\n",
+    "test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)\n",
+    "test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
+    "\n",
+    "# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0\n",
+    "# keeps the default of the class it replaces.\n",
+    "optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)\n",
+    "\n",
+    "num_epochs = 3\n",
+    "for epoch in range(num_epochs):\n",
+    "    model.train()\n",
+    "    total_loss = 0.0\n",
+    "\n",
+    "    for input_ids, attention_mask, labels in train_loader:\n",
+    "        optimizer.zero_grad()\n",
+    "\n",
+    "        # Passing labels makes the model return the classification loss itself.\n",
+    "        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
+    "        loss = outputs.loss\n",
+    "        total_loss += loss.item()\n",
+    "\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "\n",
+    "    average_loss = total_loss / len(train_loader)\n",
+    "    print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}')\n",
+    "\n",
+    "model.save_pretrained('emotion_model')\n",
+    "\n",
+    "model.eval()\n",
+    "all_labels = []\n",
+    "all_predictions = []\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    for input_ids, attention_mask, labels in test_loader:\n",
+    "        outputs = model(input_ids, attention_mask=attention_mask)\n",
+    "        # Highest-scoring class per example.\n",
+    "        predictions = torch.argmax(outputs.logits, dim=1)\n",
+    "\n",
+    "        all_labels.extend(labels.tolist())\n",
+    "        all_predictions.extend(predictions.tolist())\n",
+    "\n",
+    "accuracy = accuracy_score(all_labels, all_predictions)\n",
+    "classification_report_str = classification_report(all_labels, all_predictions)\n",
+    "\n",
+    "print(f'Accuracy: {accuracy}')\n",
+    "print('Classification Report:')\n",
+    "print(classification_report_str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fine-tune gpt2 with a sequence-classification head on dair-ai/emotion,\n",
+    "# save the model, and report metrics on the test split.\n",
+    "import torch\n",
+    "from torch.optim import AdamW\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "from transformers import GPT2Tokenizer, GPT2ForSequenceClassification\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('dair-ai/emotion')\n",
+    "\n",
+    "print(dataset)\n",
+    "\n",
+    "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n",
+    "# GPT-2 ships without a padding token; reuse EOS so padding='max_length' works.\n",
+    "tokenizer.pad_token = tokenizer.eos_token\n",
+    "model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=6)  # 6 = number of classes (label ids 0-5)\n",
+    "# The classification head needs pad_token_id to find the last real token of\n",
+    "# each sequence when the batch size is larger than 1.\n",
+    "model.config.pad_token_id = tokenizer.pad_token_id\n",
+    "\n",
+    "max_len = 128\n",
+    "\n",
+    "train_data = dataset['train']\n",
+    "train_encodings = tokenizer(train_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
+    "train_labels = torch.tensor(train_data['label'])\n",
+    "\n",
+    "train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\n",
+    "train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
+    "\n",
+    "# Metrics are computed on the test split; scoring the training batches would\n",
+    "# only show how well the model fits data it has already seen.\n",
+    "test_data = dataset['test']\n",
+    "test_encodings = tokenizer(test_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
+    "test_labels = torch.tensor(test_data['label'])\n",
+    "\n",
+    "test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)\n",
+    "test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
+    "\n",
+    "# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0\n",
+    "# keeps the default of the class it replaces.\n",
+    "optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)\n",
+    "\n",
+    "num_epochs = 3\n",
+    "for epoch in range(num_epochs):\n",
+    "    model.train()\n",
+    "    total_loss = 0.0\n",
+    "\n",
+    "    for input_ids, attention_mask, labels in train_loader:\n",
+    "        optimizer.zero_grad()\n",
+    "\n",
+    "        # Passing labels makes the model return the classification loss itself.\n",
+    "        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
+    "        loss = outputs.loss\n",
+    "        total_loss += loss.item()\n",
+    "\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "\n",
+    "    average_loss = total_loss / len(train_loader)\n",
+    "    print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}')\n",
+    "\n",
+    "model.save_pretrained('emotion_gpt2_model')\n",
+    "\n",
+    "model.eval()\n",
+    "all_labels = []\n",
+    "all_predictions = []\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    for input_ids, attention_mask, labels in test_loader:\n",
+    "        outputs = model(input_ids, attention_mask=attention_mask)\n",
+    "        # Highest-scoring class per example.\n",
+    "        predictions = torch.argmax(outputs.logits, dim=1)\n",
+    "\n",
+    "        all_labels.extend(labels.tolist())\n",
+    "        all_predictions.extend(predictions.tolist())\n",
+    "\n",
+    "accuracy = accuracy_score(all_labels, all_predictions)\n",
+    "classification_report_str = classification_report(all_labels, all_predictions)\n",
+    "\n",
+    "print(f'Accuracy: {accuracy}')\n",
+    "print('Classification Report:')\n",
+    "print(classification_report_str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fine-tune t5-small to generate the emotion name for each text, save the\n",
+    "# model, and report metrics on the test split.\n",
+    "import torch\n",
+    "from torch.optim import AdamW\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('dair-ai/emotion')\n",
+    "\n",
+    "print(dataset)\n",
+    "\n",
+    "tokenizer = T5Tokenizer.from_pretrained('t5-small')\n",
+    "model = T5ForConditionalGeneration.from_pretrained('t5-small')\n",
+    "\n",
+    "max_len = 128\n",
+    "\n",
+    "train_data = dataset['train']\n",
+    "# T5 is a sequence-to-sequence model: its training labels are the token ids of\n",
+    "# the target text (here the emotion name), not integer class ids.\n",
+    "label_names = train_data.features['label'].names\n",
+    "label_to_id = {name: index for index, name in enumerate(label_names)}\n",
+    "\n",
+    "train_encodings = tokenizer(train_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
+    "train_labels = tokenizer([label_names[label] for label in train_data['label']], padding=True, return_tensors='pt')['input_ids']\n",
+    "# Generation budget at evaluation time: the longest tokenized label name.\n",
+    "max_target_len = train_labels.shape[1]\n",
+    "# -100 marks padding positions so the loss ignores them.\n",
+    "train_labels[train_labels == tokenizer.pad_token_id] = -100\n",
+    "\n",
+    "train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\n",
+    "train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
+    "\n",
+    "# Metrics are computed on the test split; its labels stay integer class ids\n",
+    "# because they are only compared against decoded predictions.\n",
+    "test_data = dataset['test']\n",
+    "test_encodings = tokenizer(test_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
+    "test_labels = torch.tensor(test_data['label'])\n",
+    "\n",
+    "test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)\n",
+    "test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
+    "\n",
+    "# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0\n",
+    "# keeps the default of the class it replaces.\n",
+    "optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)\n",
+    "\n",
+    "num_epochs = 3\n",
+    "for epoch in range(num_epochs):\n",
+    "    model.train()\n",
+    "    total_loss = 0.0\n",
+    "\n",
+    "    for input_ids, attention_mask, labels in train_loader:\n",
+    "        optimizer.zero_grad()\n",
+    "\n",
+    "        # Passing labels makes the model return the sequence-to-sequence loss itself.\n",
+    "        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
+    "        loss = outputs.loss\n",
+    "        total_loss += loss.item()\n",
+    "\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "\n",
+    "    average_loss = total_loss / len(train_loader)\n",
+    "    print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}')\n",
+    "\n",
+    "model.save_pretrained('emotion_t5_model')\n",
+    "\n",
+    "model.eval()\n",
+    "all_labels = []\n",
+    "all_predictions = []\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    for input_ids, attention_mask, labels in test_loader:\n",
+    "        generated = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_target_len)\n",
+    "        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)\n",
+    "        # Output that is not exactly a label name is scored as class -1 (always wrong).\n",
+    "        predictions = [label_to_id.get(text.strip(), -1) for text in decoded]\n",
+    "\n",
+    "        all_labels.extend(labels.tolist())\n",
+    "        all_predictions.extend(predictions)\n",
+    "\n",
+    "accuracy = accuracy_score(all_labels, all_predictions)\n",
+    "classification_report_str = classification_report(all_labels, all_predictions)\n",
+    "\n",
+    "print(f'Accuracy: {accuracy}')\n",
+    "print('Classification Report:')\n",
+    "print(classification_report_str)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fine-tune google/flan-t5-small to generate the emotion name for each\n",
+    "# 'emotion: ...' prompt, save the model, and report metrics on the test split.\n",
+    "import torch\n",
+    "from torch.optim import AdamW\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset('dair-ai/emotion')\n",
+    "\n",
+    "print(dataset)\n",
+    "\n",
+    "tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-small')\n",
+    "model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-small')\n",
+    "\n",
+    "max_len = 128\n",
+    "\n",
+    "train_data = dataset['train']\n",
+    "# T5 is a sequence-to-sequence model: its training labels are the token ids of\n",
+    "# the target text (here the emotion name), not integer class ids.\n",
+    "label_names = train_data.features['label'].names\n",
+    "label_to_id = {name: index for index, name in enumerate(label_names)}\n",
+    "\n",
+    "train_prompts = [f'emotion: {text}' for text in train_data['text']]\n",
+    "\n",
+    "train_encodings = tokenizer(train_prompts, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
+    "train_labels = tokenizer([label_names[label] for label in train_data['label']], padding=True, return_tensors='pt')['input_ids']\n",
+    "# Generation budget at evaluation time: the longest tokenized label name.\n",
+    "max_target_len = train_labels.shape[1]\n",
+    "# -100 marks padding positions so the loss ignores them.\n",
+    "train_labels[train_labels == tokenizer.pad_token_id] = -100\n",
+    "\n",
+    "train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\n",
+    "train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
+    "\n",
+    "# Metrics are computed on the test split, using the same prompt format; its\n",
+    "# labels stay integer class ids for comparison with decoded predictions.\n",
+    "test_data = dataset['test']\n",
+    "test_prompts = [f'emotion: {text}' for text in test_data['text']]\n",
+    "test_encodings = tokenizer(test_prompts, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
+    "test_labels = torch.tensor(test_data['label'])\n",
+    "\n",
+    "test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)\n",
+    "test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)\n",
+    "\n",
+    "# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0\n",
+    "# keeps the default of the class it replaces.\n",
+    "optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)\n",
+    "\n",
+    "num_epochs = 3\n",
+    "for epoch in range(num_epochs):\n",
+    "    model.train()\n",
+    "    total_loss = 0.0\n",
+    "\n",
+    "    for input_ids, attention_mask, labels in train_loader:\n",
+    "        optimizer.zero_grad()\n",
+    "\n",
+    "        # Passing labels makes the model return the sequence-to-sequence loss itself.\n",
+    "        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
+    "        loss = outputs.loss\n",
+    "        total_loss += loss.item()\n",
+    "\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "\n",
+    "    average_loss = total_loss / len(train_loader)\n",
+    "    print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}')\n",
+    "\n",
+    "# Save the model\n",
+    "model.save_pretrained('emotion_flant5_model')\n",
+    "\n",
+    "# Evaluate the model\n",
+    "model.eval()\n",
+    "all_labels = []\n",
+    "all_predictions = []\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    for input_ids, attention_mask, labels in test_loader:\n",
+    "        generated = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_target_len)\n",
+    "        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)\n",
+    "        # Output that is not exactly a label name is scored as class -1 (always wrong).\n",
+    "        predictions = [label_to_id.get(text.strip(), -1) for text in decoded]\n",
+    "\n",
+    "        all_labels.extend(labels.tolist())\n",
+    "        all_predictions.extend(predictions)\n",
+    "\n",
+    "# Compute the evaluation metrics\n",
+    "accuracy = accuracy_score(all_labels, all_predictions)\n",
+    "classification_report_str = classification_report(all_labels, all_predictions)\n",
+    "\n",
+    "print(f'Accuracy: {accuracy}')\n",
+    "print('Classification Report:')\n",
+    "print(classification_report_str)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}