initial version

KB94\KamBor 2024-01-12 22:56:13 +01:00
parent 33a704ce62
commit 6579b7add3

Projekt.ipynb (new file, 394 lines added)

@@ -0,0 +1,394 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"c:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for dair-ai/emotion contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/dair-ai/emotion\n",
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
" warnings.warn(\n",
"Downloading builder script: 100%|██████████| 3.97k/3.97k [00:00<?, ?B/s]\n",
"Downloading metadata: 100%|██████████| 3.28k/3.28k [00:00<?, ?B/s]\n",
"Downloading readme: 100%|██████████| 8.78k/8.78k [00:00<?, ?B/s]\n",
"Downloading data: 100%|██████████| 592k/592k [00:00<00:00, 7.52MB/s]\n",
"Downloading data: 100%|██████████| 74.0k/74.0k [00:00<00:00, 4.35MB/s]\n",
"Downloading data: 100%|██████████| 74.9k/74.9k [00:00<00:00, 5.10MB/s]\n",
"Generating train split: 100%|██████████| 16000/16000 [00:00<00:00, 86118.15 examples/s]\n",
"Generating validation split: 100%|██████████| 2000/2000 [00:00<00:00, 70724.89 examples/s]\n",
"Generating test split: 100%|██████████| 2000/2000 [00:00<00:00, 69908.56 examples/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['text', 'label'],\n",
" num_rows: 16000\n",
" })\n",
" validation: Dataset({\n",
" features: ['text', 'label'],\n",
" num_rows: 2000\n",
" })\n",
" test: Dataset({\n",
" features: ['text', 'label'],\n",
" num_rows: 2000\n",
" })\n",
"})\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]\n",
"c:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\huggingface_hub\\file_download.py:149: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\KamBo\\.cache\\huggingface\\hub\\models--bert-base-uncased. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n",
"vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 983kB/s]\n",
"tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.40MB/s]\n",
"config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] \n",
"model.safetensors: 100%|██████████| 440M/440M [00:11<00:00, 37.8MB/s] \n",
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"c:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\optimization.py:429: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 46\u001b[0m\n\u001b[0;32m 43\u001b[0m loss \u001b[38;5;241m=\u001b[39m outputs\u001b[38;5;241m.\u001b[39mloss\n\u001b[0;32m 44\u001b[0m total_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mitem()\n\u001b[1;32m---> 46\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 47\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[0;32m 49\u001b[0m average_loss \u001b[38;5;241m=\u001b[39m total_loss \u001b[38;5;241m/\u001b[39m \u001b[38;5;28mlen\u001b[39m(train_loader)\n",
"File \u001b[1;32mc:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\torch\\_tensor.py:492\u001b[0m, in \u001b[0;36mTensor.backward\u001b[1;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[0;32m 482\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 483\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[0;32m 484\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[0;32m 485\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 490\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[0;32m 491\u001b[0m )\n\u001b[1;32m--> 492\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[0;32m 494\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\KamBo\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\torch\\autograd\\__init__.py:251\u001b[0m, in \u001b[0;36mbackward\u001b[1;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[0;32m 246\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[0;32m 248\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[0;32m 249\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[0;32m 250\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[1;32m--> 251\u001b[0m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[0;32m 252\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 253\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 254\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 255\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 256\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 257\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 258\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 259\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import BertTokenizer, BertForSequenceClassification, AdamW\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset('dair-ai/emotion')\n",
"\n",
"print(dataset)\n",
"\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
"model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6) # 6 - liczba klas (0-5)\n",
"\n",
"\n",
"max_len = 128 \n",
"\n",
"train_data = dataset['train']\n",
"train_encodings = tokenizer(train_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
"train_labels = torch.tensor(train_data['label'])\n",
"\n",
"train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\n",
"train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=2e-5)\n",
"\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"\n",
"num_epochs = 3\n",
"for epoch in range(num_epochs):\n",
" model.train()\n",
" total_loss = 0.0\n",
"\n",
" for input_ids, attention_mask, labels in train_loader:\n",
" optimizer.zero_grad()\n",
"\n",
" outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
" loss = outputs.loss\n",
" total_loss += loss.item()\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" average_loss = total_loss / len(train_loader)\n",
" print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}')\n",
"\n",
"model.save_pretrained('emotion_model')\n",
"\n",
"model.eval()\n",
"all_labels = []\n",
"all_predictions = []\n",
"\n",
"with torch.no_grad():\n",
" for input_ids, attention_mask, labels in train_loader:\n",
" outputs = model(input_ids, attention_mask=attention_mask)\n",
" predictions = torch.argmax(outputs.logits, dim=1)\n",
"\n",
" all_labels.extend(labels.numpy())\n",
" all_predictions.extend(predictions.numpy())\n",
"\n",
"accuracy = accuracy_score(all_labels, all_predictions)\n",
"classification_report_str = classification_report(all_labels, all_predictions)\n",
"\n",
"print(f'Accuracy: {accuracy}')\n",
"print('Classification Report:')\n",
"print(classification_report_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset('dair-ai/emotion')\n",
"\n",
"print(dataset)\n",
"\n",
"tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n",
"model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=6) # 6 - liczba klas (0-5)\n",
"\n",
"max_len = 128\n",
"\n",
"train_data = dataset['train']\n",
"train_encodings = tokenizer(train_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
"train_labels = torch.tensor(train_data['label'])\n",
"\n",
"train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\n",
"train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=2e-5)\n",
"\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"\n",
"num_epochs = 3\n",
"for epoch in range(num_epochs):\n",
" model.train()\n",
" total_loss = 0.0\n",
"\n",
" for input_ids, attention_mask, labels in train_loader:\n",
" optimizer.zero_grad()\n",
"\n",
" outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
" loss = outputs.loss\n",
" total_loss += loss.item()\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" average_loss = total_loss / len(train_loader)\n",
" print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}')\n",
"\n",
"model.save_pretrained('emotion_gpt2_model')\n",
"\n",
"model.eval()\n",
"all_labels = []\n",
"all_predictions = []\n",
"\n",
"with torch.no_grad():\n",
" for input_ids, attention_mask, labels in train_loader:\n",
" outputs = model(input_ids, attention_mask=attention_mask)\n",
" predictions = torch.argmax(outputs.logits, dim=1)\n",
"\n",
" all_labels.extend(labels.numpy())\n",
" all_predictions.extend(predictions.numpy())\n",
"\n",
"accuracy = accuracy_score(all_labels, all_predictions)\n",
"classification_report_str = classification_report(all_labels, all_predictions)\n",
"\n",
"print(f'Accuracy: {accuracy}')\n",
"print('Classification Report:')\n",
"print(classification_report_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset('dair-ai/emotion')\n",
"\n",
"print(dataset)\n",
"\n",
"tokenizer = T5Tokenizer.from_pretrained('t5-small')\n",
"model = T5ForConditionalGeneration.from_pretrained('t5-small')\n",
"\n",
"max_len = 128\n",
"\n",
"train_data = dataset['train']\n",
"train_encodings = tokenizer(train_data['text'], truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
"train_labels = torch.tensor(train_data['label'])\n",
"\n",
"train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\n",
"train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=2e-5)\n",
"\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"\n",
"num_epochs = 3\n",
"for epoch in range(num_epochs):\n",
" model.train()\n",
" total_loss = 0.0\n",
"\n",
" for input_ids, attention_mask, labels in train_loader:\n",
" optimizer.zero_grad()\n",
"\n",
" outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
" loss = outputs.loss\n",
" total_loss += loss.item()\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" average_loss = total_loss / len(train_loader)\n",
" print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}')\n",
"\n",
"model.save_pretrained('emotion_t5_model')\n",
"\n",
"model.eval()\n",
"all_labels = []\n",
"all_predictions = []\n",
"\n",
"with torch.no_grad():\n",
" for input_ids, attention_mask, labels in train_loader:\n",
" outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
" predictions = torch.argmax(outputs.logits, dim=1)\n",
"\n",
" all_labels.extend(labels.numpy())\n",
" all_predictions.extend(predictions.numpy())\n",
"\n",
"accuracy = accuracy_score(all_labels, all_predictions)\n",
"classification_report_str = classification_report(all_labels, all_predictions)\n",
"\n",
"print(f'Accuracy: {accuracy}')\n",
"print('Classification Report:')\n",
"print(classification_report_str)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset('dair-ai/emotion')\n",
"\n",
"print(dataset)\n",
"\n",
"tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-small')\n",
"model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-small')\n",
"\n",
"max_len = 128\n",
"\n",
"train_data = dataset['train']\n",
"train_labels = train_data['label']\n",
"\n",
"train_prompts = [f'emotion: {text}' for text in train_data['text']]\n",
"\n",
"train_encodings = tokenizer(train_prompts, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')\n",
"train_labels = torch.tensor(train_labels)\n",
"\n",
"train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\n",
"train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=2e-5)\n",
"\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"\n",
"num_epochs = 3\n",
"for epoch in range(num_epochs):\n",
" model.train()\n",
" total_loss = 0.0\n",
"\n",
" for input_ids, attention_mask, labels in train_loader:\n",
" optimizer.zero_grad()\n",
"\n",
" outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
" loss = outputs.loss\n",
" total_loss += loss.item()\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" average_loss = total_loss / len(train_loader)\n",
" print(f'Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}')\n",
"\n",
"# Zapisz model\n",
"model.save_pretrained('emotion_flant5_model')\n",
"\n",
"# Ewaluacja modelu\n",
"model.eval()\n",
"all_labels = []\n",
"all_predictions = []\n",
"\n",
"with torch.no_grad():\n",
" for input_ids, attention_mask, labels in train_loader:\n",
" outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=1)\n",
" predictions = torch.argmax(outputs, dim=1)\n",
"\n",
" all_labels.extend(labels.numpy())\n",
" all_predictions.extend(predictions.numpy())\n",
"\n",
"# Oblicz metryki ewaluacyjne\n",
"accuracy = accuracy_score(all_labels, all_predictions)\n",
"classification_report_str = classification_report(all_labels, all_predictions)\n",
"\n",
"print(f'Accuracy: {accuracy}')\n",
"print('Classification Report:')\n",
"print(classification_report_str)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}