{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.models import load_model\n",
    "import tensorflow_addons as tfa\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Model config BertConfig {\n",
      "  \"_name_or_path\": \"dkleczek/bert-base-polish-uncased-v1\",\n",
      "  \"architectures\": [\n",
      "    \"BertForMaskedLM\",\n",
      "    \"BertForPreTraining\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"id2label\": {\n",
      "    \"0\": \"LABEL_0\",\n",
      "    \"1\": \"LABEL_1\",\n",
      "    \"2\": \"LABEL_2\",\n",
      "    \"3\": \"LABEL_3\",\n",
      "    \"4\": \"LABEL_4\",\n",
      "    \"5\": \"LABEL_5\",\n",
      "    \"6\": \"LABEL_6\",\n",
      "    \"7\": \"LABEL_7\",\n",
      "    \"8\": \"LABEL_8\",\n",
      "    \"9\": \"LABEL_9\",\n",
      "    \"10\": \"LABEL_10\",\n",
      "    \"11\": \"LABEL_11\",\n",
      "    \"12\": \"LABEL_12\",\n",
      "    \"13\": \"LABEL_13\",\n",
      "    \"14\": \"LABEL_14\",\n",
      "    \"15\": \"LABEL_15\",\n",
      "    \"16\": \"LABEL_16\"\n",
      "  },\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"label2id\": {\n",
      "    \"LABEL_0\": 0,\n",
      "    \"LABEL_1\": 1,\n",
      "    \"LABEL_10\": 10,\n",
      "    \"LABEL_11\": 11,\n",
      "    \"LABEL_12\": 12,\n",
      "    \"LABEL_13\": 13,\n",
      "    \"LABEL_14\": 14,\n",
      "    \"LABEL_15\": 15,\n",
      "    \"LABEL_16\": 16,\n",
      "    \"LABEL_2\": 2,\n",
      "    \"LABEL_3\": 3,\n",
      "    \"LABEL_4\": 4,\n",
      "    \"LABEL_5\": 5,\n",
      "    \"LABEL_6\": 6,\n",
      "    \"LABEL_7\": 7,\n",
      "    \"LABEL_8\": 8,\n",
      "    \"LABEL_9\": 9\n",
      "  },\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"model_type\": \"bert\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.28.1\",\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 60000\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# restore the fine-tuned Keras model from the local 'model' directory\n",
    "loaded_model = tf.keras.models.load_model('model')"
   ]
  },
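  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check on the restored model (a minimal sketch; `summary()` is standard Keras and assumes nothing beyond the load above): the layer listing should show the BERT encoder plus the 17-way classification head implied by the `id2label` entries in the config dump."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# expect a BERT backbone followed by a 17-unit output layer\n",
    "loaded_model.summary()"
   ]
  },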
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "acts = pd.read_csv('user_acts_one_hot.csv', index_col=\"Unnamed: 0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep only the utterance text and the one-hot act columns\n",
    "acts = acts.drop([\"Agent\", \"Act\"], axis=1)"
   ]
  },
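  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick look at the frame (a minimal sketch, assuming the CSV layout the later cells rely on): a `text` column holding the utterance plus one 0/1 column per dialogue act."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# expect a 'text' column plus one 0/1 column per dialogue act\n",
    "acts.head()"
   ]
  },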
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading file vocab.txt from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\vocab.txt\n",
      "loading file added_tokens.json from cache at None\n",
      "loading file special_tokens_map.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\special_tokens_map.json\n",
      "loading file tokenizer_config.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\tokenizer_config.json\n",
      "loading configuration file config.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\config.json\n",
      "Model config BertConfig {\n",
      "  \"_name_or_path\": \"dkleczek/bert-base-polish-uncased-v1\",\n",
      "  \"architectures\": [\n",
      "    \"BertForMaskedLM\",\n",
      "    \"BertForPreTraining\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"model_type\": \"bert\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.28.1\",\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 60000\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# import only what is used instead of the wildcard 'from transformers import *'\n",
    "from transformers import BertTokenizer\n",
    "tokenizer = BertTokenizer.from_pretrained(\"dkleczek/bert-base-polish-uncased-v1\")"
   ]
  },
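  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What the tokenizer emits for a single utterance (a minimal sketch; the Polish sample sentence is illustrative only): a `BatchEncoding` with `input_ids`, `token_type_ids` and `attention_mask`, here as TensorFlow tensors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# encode one example utterance; the sample text is illustrative only\n",
    "sample = tokenizer(\"dzień dobry\", return_tensors='tf')\n",
    "print(sample.keys())\n",
    "print(sample['input_ids'])"
   ]
  },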
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:6 out of the last 8 calls to <function Model.make_predict_function.<locals>.predict_function at 0x00000247C45EE2A0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n",
      "80/80 [==============================] - 14s 160ms/step\n",
      "{'logits': array([[0.0429822 , 0.07436842, 0.06289113, ..., 0.07107946, 0.22445329,\n",
      "        0.17556868],\n",
      "       [0.05423082, 0.04940203, 0.08606787, ..., 0.06320965, 0.09646532,\n",
      "        0.85783374],\n",
      "       [0.02925512, 0.04107895, 0.04539371, ..., 0.04229825, 0.891557  ,\n",
      "        0.05482448],\n",
      "       ...,\n",
      "       [0.07066443, 0.06370321, 0.08790383, ..., 0.08178279, 0.10815965,\n",
      "        0.16227055],\n",
      "       [0.04984152, 0.03513726, 0.06702502, ..., 0.04850706, 0.08503693,\n",
      "        0.10317416],\n",
      "       [0.1308529 , 0.0802078 , 0.8544387 , ..., 0.08336826, 0.08602922,\n",
      "        0.08140229]], dtype=float32)}\n"
     ]
    }
   ],
   "source": [
    "# tokenize every utterance and batch the encodings for inference\n",
    "input_data = acts[\"text\"].tolist()\n",
    "encoded_input = tokenizer.batch_encode_plus(input_data, padding=True, truncation=True, return_tensors='tf')\n",
    "dataset = tf.data.Dataset.from_tensor_slices({\n",
    "    'input_ids': encoded_input['input_ids'],\n",
    "    'attention_mask': encoded_input['attention_mask'],\n",
    "    'token_type_ids': encoded_input['token_type_ids']\n",
    "}).batch(2)\n",
    "\n",
    "# make predictions and show the raw output dict\n",
    "predictions = loaded_model.predict(dataset)\n",
    "print(predictions)"
   ]
  },
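  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The output key is named `logits`, yet the printed values already lie in (0, 1), which suggests the saved head ends in a sigmoid. If a variant of the model returned raw logits instead, they would need squashing before the 0.5 threshold below; a minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# only needed if the model emits raw logits rather than probabilities\n",
    "probs = tf.math.sigmoid(predictions['logits']).numpy()"
   ]
  },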
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "# threshold each per-class score at 0.5 to obtain multi-hot predictions\n",
    "predicted_classes = (predictions[\"logits\"] > 0.5).astype(\"int32\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 1],\n",
       "       [0, 0, 0, ..., 0, 1, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 1, ..., 0, 0, 0]])"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predicted_classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dialogue-act names, one per output column of the classifier\n",
    "classes = [\"ack\", \"affirm\", \"bye\", \"hello\", \"help\", \"negate\", \"null\", \"repeat\", \"reqalts\", \"reqmore\", \"restart\", \"silence\", \"thankyou\", \"confirm\", \"deny\", \"inform\", \"request\"]"
   ]
  },
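  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Mapping the multi-hot rows back to act names (a minimal sketch, assuming `classes[i]` corresponds to column `i` of the one-hot labels in `user_acts_one_hot.csv`, which this notebook does not verify):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# turn each multi-hot row into the list of predicted act names\n",
    "# assumes classes[i] matches column i of the one-hot labels\n",
    "predicted_labels = [\n",
    "    [classes[i] for i, flag in enumerate(row) if flag == 1]\n",
    "    for row in predicted_classes\n",
    "]\n",
    "predicted_labels[:5]"
   ]
  }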
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}