{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.models import load_model\n",
    "import tensorflow_addons as tfa\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Model config BertConfig {\n",
      "  \"_name_or_path\": \"dkleczek/bert-base-polish-uncased-v1\",\n",
      "  \"architectures\": [\n",
      "    \"BertForMaskedLM\",\n",
      "    \"BertForPreTraining\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"id2label\": {\n",
      "    \"0\": \"LABEL_0\",\n",
      "    \"1\": \"LABEL_1\",\n",
      "    \"2\": \"LABEL_2\",\n",
      "    \"3\": \"LABEL_3\",\n",
      "    \"4\": \"LABEL_4\",\n",
      "    \"5\": \"LABEL_5\",\n",
      "    \"6\": \"LABEL_6\",\n",
      "    \"7\": \"LABEL_7\",\n",
      "    \"8\": \"LABEL_8\",\n",
      "    \"9\": \"LABEL_9\",\n",
      "    \"10\": \"LABEL_10\",\n",
      "    \"11\": \"LABEL_11\",\n",
      "    \"12\": \"LABEL_12\",\n",
      "    \"13\": \"LABEL_13\",\n",
      "    \"14\": \"LABEL_14\",\n",
      "    \"15\": \"LABEL_15\",\n",
      "    \"16\": \"LABEL_16\"\n",
      "  },\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"label2id\": {\n",
      "    \"LABEL_0\": 0,\n",
      "    \"LABEL_1\": 1,\n",
      "    \"LABEL_10\": 10,\n",
      "    \"LABEL_11\": 11,\n",
      "    \"LABEL_12\": 12,\n",
      "    \"LABEL_13\": 13,\n",
      "    \"LABEL_14\": 14,\n",
      "    \"LABEL_15\": 15,\n",
      "    \"LABEL_16\": 16,\n",
      "    \"LABEL_2\": 2,\n",
      "    \"LABEL_3\": 3,\n",
      "    \"LABEL_4\": 4,\n",
      "    \"LABEL_5\": 5,\n",
      "    \"LABEL_6\": 6,\n",
      "    \"LABEL_7\": 7,\n",
      "    \"LABEL_8\": 8,\n",
      "    \"LABEL_9\": 9\n",
      "  },\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"model_type\": \"bert\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.28.1\",\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 60000\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# restore the fine-tuned Keras model from the local 'model' directory\n",
    "loaded_model = tf.keras.models.load_model('model')"
   ]
  },
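  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check on the restored model (a minimal sketch; `summary()` is standard Keras and assumes nothing beyond the load above): the layer listing should show the BERT encoder plus the 17-way classification head implied by the `id2label` entries in the config dump."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# expect a BERT backbone followed by a 17-unit output layer\n",
    "loaded_model.summary()"
   ]
  },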
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "acts = pd.read_csv('user_acts_one_hot.csv', index_col=\"Unnamed: 0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# keep only the utterance text and the one-hot act columns\n",
    "acts = acts.drop([\"Agent\", \"Act\"], axis=1)"
   ]
  },
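  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick look at the frame (a minimal sketch, assuming the CSV layout the later cells rely on): a `text` column holding the utterance plus one 0/1 column per dialogue act."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# expect a 'text' column plus one 0/1 column per dialogue act\n",
    "acts.head()"
   ]
  },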
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "loading file vocab.txt from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\vocab.txt\n",
      "loading file added_tokens.json from cache at None\n",
      "loading file special_tokens_map.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\special_tokens_map.json\n",
      "loading file tokenizer_config.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\tokenizer_config.json\n",
      "loading configuration file config.json from cache at C:\\Users\\macty/.cache\\huggingface\\hub\\models--dkleczek--bert-base-polish-uncased-v1\\snapshots\\62be9821055981deafb23f217b68cc41f38cdb76\\config.json\n",
      "Model config BertConfig {\n",
      "  \"_name_or_path\": \"dkleczek/bert-base-polish-uncased-v1\",\n",
      "  \"architectures\": [\n",
      "    \"BertForMaskedLM\",\n",
      "    \"BertForPreTraining\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"max_position_embeddings\": 512,\n",
      "  \"model_type\": \"bert\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.28.1\",\n",
      "  \"type_vocab_size\": 2,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 60000\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# import only what is used instead of the wildcard 'from transformers import *'\n",
    "from transformers import BertTokenizer\n",
    "tokenizer = BertTokenizer.from_pretrained(\"dkleczek/bert-base-polish-uncased-v1\")"
   ]
  },
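  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What the tokenizer emits for a single utterance (a minimal sketch; the Polish sample sentence is illustrative only): a `BatchEncoding` with `input_ids`, `token_type_ids` and `attention_mask`, here as TensorFlow tensors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# encode one example utterance; the sample text is illustrative only\n",
    "sample = tokenizer(\"dzień dobry\", return_tensors='tf')\n",
    "print(sample.keys())\n",
    "print(sample['input_ids'])"
   ]
  },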
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:6 out of the last 8 calls to <function Model.make_predict_function.<locals>.predict_function at 0x00000247C45EE2A0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n",
      "80/80 [==============================] - 14s 160ms/step\n",
      "{'logits': array([[0.0429822 , 0.07436842, 0.06289113, ..., 0.07107946, 0.22445329,\n",
      "        0.17556868],\n",
      "       [0.05423082, 0.04940203, 0.08606787, ..., 0.06320965, 0.09646532,\n",
      "        0.85783374],\n",
      "       [0.02925512, 0.04107895, 0.04539371, ..., 0.04229825, 0.891557  ,\n",
      "        0.05482448],\n",
      "       ...,\n",
      "       [0.07066443, 0.06370321, 0.08790383, ..., 0.08178279, 0.10815965,\n",
      "        0.16227055],\n",
      "       [0.04984152, 0.03513726, 0.06702502, ..., 0.04850706, 0.08503693,\n",
      "        0.10317416],\n",
      "       [0.1308529 , 0.0802078 , 0.8544387 , ..., 0.08336826, 0.08602922,\n",
      "        0.08140229]], dtype=float32)}\n"
     ]
    }
   ],
   "source": [
    "# tokenize every utterance and batch the encodings for inference\n",
    "input_data = acts[\"text\"].tolist()\n",
    "encoded_input = tokenizer.batch_encode_plus(input_data, padding=True, truncation=True, return_tensors='tf')\n",
    "dataset = tf.data.Dataset.from_tensor_slices({\n",
    "    'input_ids': encoded_input['input_ids'],\n",
    "    'attention_mask': encoded_input['attention_mask'],\n",
    "    'token_type_ids': encoded_input['token_type_ids']\n",
    "}).batch(2)\n",
    "\n",
    "# make predictions and show the raw output dict\n",
    "predictions = loaded_model.predict(dataset)\n",
    "print(predictions)"
   ]
  },
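  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The output key is named `logits`, yet the printed values already lie in (0, 1), which suggests the saved head ends in a sigmoid. If a variant of the model returned raw logits instead, they would need squashing before the 0.5 threshold below; a minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# only needed if the model emits raw logits rather than probabilities\n",
    "probs = tf.math.sigmoid(predictions['logits']).numpy()"
   ]
  },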
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "# threshold each per-class score at 0.5 to obtain multi-hot predictions\n",
    "predicted_classes = (predictions[\"logits\"] > 0.5).astype(\"int32\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 1],\n",
       "       [0, 0, 0, ..., 0, 1, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 1, ..., 0, 0, 0]])"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predicted_classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dialogue-act names, one per output column of the classifier\n",
    "classes = [\"ack\", \"affirm\", \"bye\", \"hello\", \"help\", \"negate\", \"null\", \"repeat\", \"reqalts\", \"reqmore\", \"restart\", \"silence\", \"thankyou\", \"confirm\", \"deny\", \"inform\", \"request\"]"
   ]
  },
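  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Mapping the multi-hot rows back to act names (a minimal sketch, assuming `classes[i]` corresponds to column `i` of the one-hot labels in `user_acts_one_hot.csv`, which this notebook does not verify):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# turn each multi-hot row into the list of predicted act names\n",
    "# assumes classes[i] matches column i of the one-hot labels\n",
    "predicted_labels = [\n",
    "    [classes[i] for i, flag in enumerate(row) if flag == 1]\n",
    "    for row in predicted_classes\n",
    "]\n",
    "predicted_labels[:5]"
   ]
  }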
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}