diff --git a/SDMockup.ipynb b/SDMockup.ipynb index 917dbe7..ecae5dc 100644 --- a/SDMockup.ipynb +++ b/SDMockup.ipynb @@ -1,5 +1,89 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\macty\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tensorflow_addons\\utils\\tfa_eol_msg.py:23: UserWarning: \n", + "\n", + "TensorFlow Addons (TFA) has ended development and introduction of new features.\n", + "TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n", + "Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). \n", + "\n", + "For more information see: https://github.com/tensorflow/addons/issues/2807 \n", + "\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.keras.models import load_model\n", + "import tensorflow_addons as tfa\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\macty\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "c:\\Users\\macty\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\generation_utils.py:24: FutureWarning: Importing `GenerationMixin` from `src/transformers/generation_utils.py` is deprecated and will be removed in Transformers v5. 
Import as `from transformers import GenerationMixin` instead.\n", + " warnings.warn(\n", + "c:\\Users\\macty\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\generation_tf_utils.py:24: FutureWarning: Importing `TFGenerationMixin` from `src/transformers/generation_tf_utils.py` is deprecated and will be removed in Transformers v5. Import as `from transformers import TFGenerationMixin` instead.\n", + " warnings.warn(\n", + "loading file vocab.txt from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\vocab.txt\n", + "loading file added_tokens.json from cache at None\n", + "loading file special_tokens_map.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\special_tokens_map.json\n", + "loading file tokenizer_config.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\tokenizer_config.json\n", + "loading configuration file config.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\config.json\n", + "Model config BertConfig {\n", + " \"_name_or_path\": \"dkleczek/bert-base-polish-uncased-v1\",\n", + " \"architectures\": [\n", + " \"BertForMaskedLM\",\n", + " \"BertForPreTraining\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"classifier_dropout\": null,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"max_position_embeddings\": 512,\n", + " \"model_type\": \"bert\",\n", + " \"num_attention_heads\": 12,\n", + " 
class NLU:
    """Dialog-act recogniser that wraps the fine-tuned BERT classifier.

    The class relies on names created in earlier notebook cells:
    ``tokenizer`` (a ``BertTokenizer``), ``loaded_model`` (the Keras model
    loaded from ``'model'``) and the ``tf`` / ``np`` imports.
    """

    # Maps an output column index to the name of the dialog act.
    # NOTE(review): the order has to match the label columns used when the
    # model was trained - confirm against the training code.
    CLASSES = (
        "ack", "affirm", "bye", "hello", "help", "negate", "null", "repeat",
        "reqalts", "reqmore", "restart", "silence", "thankyou", "confirm",
        "deny", "inform", "request",
    )

    # A class is reported when its score is strictly greater than this value.
    THRESHOLD = 0.5

    def __init__(self, text: str):
        """Store the utterance; ``act`` stays empty until ``get_dialog_act``."""
        self.text = text
        self.act = ""

    @classmethod
    def _acts_from_scores(cls, scores) -> list:
        """Translate a score matrix into a flat list of act names.

        Args:
            scores: array-like of per-class scores, one row per utterance
                (a 1-D input is treated as a single row).

        Returns:
            Names of every class whose score exceeds ``THRESHOLD``, row by
            row.  A row without any score above the threshold contributes
            its single best-scoring class instead of nothing.
        """
        scores = np.atleast_2d(np.asarray(scores))
        if scores.size == 0:
            return []
        hits = scores > cls.THRESHOLD
        # Fallback for rows where nothing passed the threshold: mark the
        # arg-max column.  The previous code called max() on the 2-D array,
        # which produced a float row and crashed in np.where(...)[1].
        silent_rows = np.flatnonzero(~hits.any(axis=1))
        hits[silent_rows, scores[silent_rows].argmax(axis=1)] = True
        return [cls.CLASSES[column] for column in np.nonzero(hits)[1]]

    def get_dialog_act(self):
        """Classify ``self.text`` and return the list of predicted acts.

        The result is also stored in ``self.act``.
        """
        encoded = tokenizer.batch_encode_plus(
            [self.text], padding=True, truncation=True, return_tensors='tf'
        )
        dataset = tf.data.Dataset.from_tensor_slices({
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'token_type_ids': encoded['token_type_ids'],
        }).batch(2)
        # predict() yields a mapping of output name -> score array
        # (the recorded notebook output shows a single 'logits' entry).
        predictions = loaded_model.predict(dataset)
        acts = []
        for scores in predictions.values():
            acts.extend(self._acts_from_scores(scores))
        self.act = acts
        return self.act
def __init__(self, nlu: NLU):
    """Initialise the tracker from an already analysed utterance.

    Args:
        nlu: the NLU object for the current utterance; its ``act`` and
            ``text`` attributes are copied.
    """
    self.slots = []
    # Copy the recognised act itself, not the whole NLU object: the rest of
    # the class was written against ``nlu.act`` and was not changed.
    self.act = nlu.act
    self.text = nlu.text
\n", + "\n", + "For more information see: https://github.com/tensorflow/addons/issues/2807 \n", + "\n", + " warnings.warn(\n" + ] + } + ], "source": [ "import pickle\n", "import pandas as pd\n", @@ -16,88 +32,48 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Model config BertConfig {\n", - " \"_name_or_path\": \"dkleczek/bert-base-polish-uncased-v1\",\n", - " \"architectures\": [\n", - " \"BertForMaskedLM\",\n", - " \"BertForPreTraining\"\n", - " ],\n", - " \"attention_probs_dropout_prob\": 0.1,\n", - " \"classifier_dropout\": null,\n", - " \"hidden_act\": \"gelu\",\n", - " \"hidden_dropout_prob\": 0.1,\n", - " \"hidden_size\": 768,\n", - " \"id2label\": {\n", - " \"0\": \"LABEL_0\",\n", - " \"1\": \"LABEL_1\",\n", - " \"2\": \"LABEL_2\",\n", - " \"3\": \"LABEL_3\",\n", - " \"4\": \"LABEL_4\",\n", - " \"5\": \"LABEL_5\",\n", - " \"6\": \"LABEL_6\",\n", - " \"7\": \"LABEL_7\",\n", - " \"8\": \"LABEL_8\",\n", - " \"9\": \"LABEL_9\",\n", - " \"10\": \"LABEL_10\",\n", - " \"11\": \"LABEL_11\",\n", - " \"12\": \"LABEL_12\",\n", - " \"13\": \"LABEL_13\",\n", - " \"14\": \"LABEL_14\",\n", - " \"15\": \"LABEL_15\",\n", - " \"16\": \"LABEL_16\"\n", - " },\n", - " \"initializer_range\": 0.02,\n", - " \"intermediate_size\": 3072,\n", - " \"label2id\": {\n", - " \"LABEL_0\": 0,\n", - " \"LABEL_1\": 1,\n", - " \"LABEL_10\": 10,\n", - " \"LABEL_11\": 11,\n", - " \"LABEL_12\": 12,\n", - " \"LABEL_13\": 13,\n", - " \"LABEL_14\": 14,\n", - " \"LABEL_15\": 15,\n", - " \"LABEL_16\": 16,\n", - " \"LABEL_2\": 2,\n", - " \"LABEL_3\": 3,\n", - " \"LABEL_4\": 4,\n", - " \"LABEL_5\": 5,\n", - " \"LABEL_6\": 6,\n", - " \"LABEL_7\": 7,\n", - " \"LABEL_8\": 8,\n", - " \"LABEL_9\": 9\n", - " },\n", - " \"layer_norm_eps\": 1e-12,\n", - " \"max_position_embeddings\": 512,\n", - " \"model_type\": \"bert\",\n", - " \"num_attention_heads\": 12,\n", - " 
\"num_hidden_layers\": 12,\n", - " \"output_past\": true,\n", - " \"pad_token_id\": 0,\n", - " \"position_embedding_type\": \"absolute\",\n", - " \"transformers_version\": \"4.28.1\",\n", - " \"type_vocab_size\": 2,\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 60000\n", - "}\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "loaded_model = tf.keras.models.load_model('model')" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: \"tf_bert_for_sequence_classification\"\n", + "_________________________________________________________________\n", + " Layer (type) Output Shape Param # \n", + "=================================================================\n", + " bert (Custom>TFBertMainLaye multiple 132121344 \n", + " r) \n", + " \n", + " dropout_37 (Dropout) multiple 0 \n", + " \n", + " classifier (Dense) multiple 13073 \n", + " \n", + "=================================================================\n", + "Total params: 132,134,417\n", + "Trainable params: 132,134,417\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n" + ] + } + ], + "source": [ + "loaded_model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -106,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -116,13 +92,19 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ + "c:\\Users\\macty\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "c:\\Users\\macty\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\generation_utils.py:24: FutureWarning: Importing `GenerationMixin` from `src/transformers/generation_utils.py` is deprecated and will be removed in Transformers v5. Import as `from transformers import GenerationMixin` instead.\n", + " warnings.warn(\n", + "c:\\Users\\macty\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\transformers\\generation_tf_utils.py:24: FutureWarning: Importing `TFGenerationMixin` from `src/transformers/generation_tf_utils.py` is deprecated and will be removed in Transformers v5. Import as `from transformers import TFGenerationMixin` instead.\n", + " warnings.warn(\n", "loading file vocab.txt from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\vocab.txt\n", "loading file added_tokens.json from cache at None\n", "loading file special_tokens_map.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\special_tokens_map.json\n", @@ -165,28 +147,14 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:6 out of the last 8 calls to .predict_function at 0x00000247C45EE2A0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. 
def predict(text):
    """Run the loaded classifier on one utterance.

    The text is tokenised with the notebook-level ``tokenizer``, wrapped in a
    ``tf.data.Dataset`` and passed to ``loaded_model.predict``; whatever that
    call returns is handed back unchanged.
    """
    batch = tokenizer.batch_encode_plus([text], padding=True, truncation=True, return_tensors='tf')
    feature_names = ('input_ids', 'attention_mask', 'token_type_ids')
    features = {name: batch[name] for name in feature_names}
    return loaded_model.predict(tf.data.Dataset.from_tensor_slices(features).batch(2))
"execution_count": 98, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classes = [\"ack\",\"affirm\",\"bye\",\"hello\",\"help\",\"negate\",\"null\",\"repeat\",\"reqalts\",\"reqmore\",\"restart\",\"silence\",\"thankyou\",\"confirm\",\"deny\",\"inform\",\"request\"]" ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "true_acts = acts.drop(acts.columns[0],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "true= true_acts.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "results = abs(predicted_classes-true)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "23" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "all=results.size" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "not_predicted = results.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "accuracy = (all-not_predicted)/all" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\macty\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1609: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. 
# Binarise the model scores: 1 where a class score is above 0.5, else 0.
# ``predictions`` comes from ``loaded_model.predict(dataset)`` earlier in the file.
for prediction in predictions:
    predicted_classes = (predictions[prediction] > 0.5).astype("int32")
classes = ["ack","affirm","bye","hello","help","negate","null","repeat","reqalts","reqmore","restart","silence","thankyou","confirm","deny","inform","request"]

# Ground truth: every column of ``acts`` except the first one.
true_acts = acts.drop(acts.columns[0], axis=1)
true = true_acts.to_numpy()

# Element-wise accuracy over the whole label matrix: each cell where the
# prediction differs from the truth counts as one miss.
results = abs(predicted_classes - true)
total = results.size  # was named ``all``, which shadowed the builtin
not_predicted = results.sum()
accuracy = (total - not_predicted) / total

from sklearn.metrics import f1_score
# zero_division=0 yields the same scores as the default but without the
# UndefinedMetricWarning for labels that have no true or predicted samples.
micro_f1 = f1_score(true, predicted_classes, average='micro', zero_division=0)
macro_f1 = f1_score(true, predicted_classes, average='macro', zero_division=0)


# The previous version concatenated a str with a set literal
# (f"..." + {value}), which raises TypeError before anything is printed.
print(f"Accuracy: {accuracy}")
print(f"micro f1 score : {micro_f1}")
print(f"macro f1 score : {macro_f1}")