{ "cells": [ { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "import pandas as pd\n", "import tensorflow as tf\n", "from tensorflow.keras.models import load_model\n", "import tensorflow_addons as tfa\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Model config BertConfig {\n", " \"_name_or_path\": \"dkleczek/bert-base-polish-uncased-v1\",\n", " \"architectures\": [\n", " \"BertForMaskedLM\",\n", " \"BertForPreTraining\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"classifier_dropout\": null,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\",\n", " \"1\": \"LABEL_1\",\n", " \"2\": \"LABEL_2\",\n", " \"3\": \"LABEL_3\",\n", " \"4\": \"LABEL_4\",\n", " \"5\": \"LABEL_5\",\n", " \"6\": \"LABEL_6\",\n", " \"7\": \"LABEL_7\",\n", " \"8\": \"LABEL_8\",\n", " \"9\": \"LABEL_9\",\n", " \"10\": \"LABEL_10\",\n", " \"11\": \"LABEL_11\",\n", " \"12\": \"LABEL_12\",\n", " \"13\": \"LABEL_13\",\n", " \"14\": \"LABEL_14\",\n", " \"15\": \"LABEL_15\",\n", " \"16\": \"LABEL_16\"\n", " },\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"label2id\": {\n", " \"LABEL_0\": 0,\n", " \"LABEL_1\": 1,\n", " \"LABEL_10\": 10,\n", " \"LABEL_11\": 11,\n", " \"LABEL_12\": 12,\n", " \"LABEL_13\": 13,\n", " \"LABEL_14\": 14,\n", " \"LABEL_15\": 15,\n", " \"LABEL_16\": 16,\n", " \"LABEL_2\": 2,\n", " \"LABEL_3\": 3,\n", " \"LABEL_4\": 4,\n", " \"LABEL_5\": 5,\n", " \"LABEL_6\": 6,\n", " \"LABEL_7\": 7,\n", " \"LABEL_8\": 8,\n", " \"LABEL_9\": 9\n", " },\n", " \"layer_norm_eps\": 1e-12,\n", " \"max_position_embeddings\": 512,\n", " \"model_type\": \"bert\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 0,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.28.1\",\n", " \"type_vocab_size\": 2,\n", " \"use_cache\": true,\n", " \"vocab_size\": 60000\n", "}\n", "\n" ] } ], "source": [ "loaded_model = tf.keras.models.load_model('model')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "acts=pd.read_csv('user_acts_one_hot.csv', index_col=\"Unnamed: 0\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "acts=acts.drop([\"Agent\"],axis=1)\n", "acts=acts.drop([\"Act\"],axis=1)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "loading file vocab.txt from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\vocab.txt\n", "loading file added_tokens.json from cache at None\n", "loading file special_tokens_map.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\special_tokens_map.json\n", "loading file tokenizer_config.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\tokenizer_config.json\n", "loading configuration file config.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\config.json\n", "Model config BertConfig {\n", " \"_name_or_path\": \"dkleczek/bert-base-polish-uncased-v1\",\n", " \"architectures\": [\n", " \"BertForMaskedLM\",\n", " \"BertForPreTraining\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"classifier_dropout\": null,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-12,\n", " \"max_position_embeddings\": 512,\n", " \"model_type\": \"bert\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 0,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.28.1\",\n", " \"type_vocab_size\": 2,\n", " \"use_cache\": true,\n", " \"vocab_size\": 60000\n", "}\n", "\n" ] } ], "source": [ "from transformers import *\n", "tokenizer = BertTokenizer.from_pretrained(\"dkleczek/bert-base-polish-uncased-v1\")\n" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:6 out of the last 8 calls to .predict_function at 0x00000247C45EE2A0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n", "80/80 [==============================] - 14s 160ms/step\n", "{'logits': array([[0.0429822 , 0.07436842, 0.06289113, ..., 0.07107946, 0.22445329,\n", " 0.17556868],\n", " [0.05423082, 0.04940203, 0.08606787, ..., 0.06320965, 0.09646532,\n", " 0.85783374],\n", " [0.02925512, 0.04107895, 0.04539371, ..., 0.04229825, 0.891557 ,\n", " 0.05482448],\n", " ...,\n", " [0.07066443, 0.06370321, 0.08790383, ..., 0.08178279, 0.10815965,\n", " 0.16227055],\n", " [0.04984152, 0.03513726, 0.06702502, ..., 0.04850706, 0.08503693,\n", " 0.10317416],\n", " [0.1308529 , 0.0802078 , 0.8544387 , ..., 0.08336826, 0.08602922,\n", " 0.08140229]], dtype=float32)}\n" ] } ], "source": [ "input_data = acts[\"text\"].tolist()\n", "encoded_input = tokenizer.batch_encode_plus(input_data, padding=True, truncation=True, return_tensors='tf')\n", "dataset = tf.data.Dataset.from_tensor_slices({\n", " 'input_ids': encoded_input['input_ids'],\n", " 'attention_mask': encoded_input['attention_mask'],\n", " 'token_type_ids': encoded_input['token_type_ids']\n", "}).batch(2)\n", "\n", "# make predictions\n", "predictions = loaded_model.predict(dataset)\n" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "for prediction in predictions:\n", " predicted_classes = (predictions[prediction]> 0.5).astype(\"int32\")\n" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 1],\n", " [0, 0, 0, ..., 0, 1, 0],\n", " ...,\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 1, ..., 0, 0, 0]])" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predicted_classes" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "classes = [\"ack\",\"affirm\",\"bye\",\"hello\",\"help\",\"negate\",\"null\",\"repeat\",\"reqalts\",\"reqmore\",\"restart\",\"silence\",\"thankyou\",\"confirm\",\"deny\",\"inform\",\"request\"]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }