requirements analysis update
This commit is contained in: parent 0647105fc0, commit 8633cd534e
583 .ipynb_checkpoints/DL_Chatbot_ver_1_0-checkpoint.ipynb Normal file
@@ -0,0 +1,583 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HxtCFj1hfXw6"
   },
   "source": [
    "# 0. Installing and importing modules"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "enDE5aTIgN-v"
   },
   "source": [
    "##### 0.1. General"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "D7_8XDfpfH-X"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting tflearn==0.5 (from -r requirements.txt (line 1))\n",
      "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e7/3c/0b156d08ef3d4e2a8009ecab2af1ad2e304f6fb99562b6271c68a74a4397/tflearn-0.5.0.tar.gz (107kB)\n",
      "\u001b[K |████████████████████████████████| 112kB 1.7MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting tensorflow (from -r requirements.txt (line 2))\n",
      "\u001b[?25l Downloading https://files.pythonhosted.org/packages/70/dc/e8c5e7983866fa4ef3fd619faa35f660b95b01a2ab62b3884f038ccab542/tensorflow-2.4.1-cp37-cp37m-manylinux2010_x86_64.whl (394.3MB)\n",
      "\u001b[K |█████████████████████▍ | 263.6MB 2.0MB/s eta 0:01:06"
     ]
    }
   ],
   "source": [
    "!pip install -r requirements.txt --user\n",
    "!pip list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "GOGs4hL6fwwK"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import tflearn\n",
    "import tensorflow\n",
    "import random\n",
    "import json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Mr0ZD1L2gCWw"
   },
   "source": [
    "##### 0.2. English Stemmer: https://www.nltk.org/_modules/nltk/stem/lancaster.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jy4-9guXgBY3"
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "\n",
    "nltk.download('punkt')\n",
    "from nltk.stem.lancaster import LancasterStemmer\n",
    "stemmer_en = LancasterStemmer()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "uPpcNQa_ggUl"
   },
   "source": [
    "##### 0.3. Polish Stemmer **(Target)**: https://pypi.org/project/pystempel/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "XBpvJXn1gBDi"
   },
   "outputs": [],
   "source": [
    "from stempel import StempelStemmer\n",
    "\n",
    "stemmer_pl = StempelStemmer.default() #maybe the \".polimorf()\" version is better?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Lg_3MO_3hQV_"
   },
   "source": [
    "# 1. Loading the **.json** files with the word base"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BzBo1657hn3w"
   },
   "source": [
    "##### 1.1. Target Polish word base for training the model (10 response types - PL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "jKsIW7hHhepB",
    "outputId": "09ba1cb1-bb0e-44ee-9d28-017209902934"
   },
   "outputs": [],
   "source": [
    "with open(\"intents_pl.json\", encoding='utf-8') as file:\n",
    "    data_pl = json.load(file)\n",
    "\n",
    "print(data_pl)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "g94eHpqshoat"
   },
   "source": [
    "##### 1.2. Shortened word base (4 response types - PL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "gJbm_CtRhNOK",
    "outputId": "157196fc-6a25-4a70-aca3-9d886c743f6c"
   },
   "outputs": [],
   "source": [
    "with open(\"intents_pl_short.json\", encoding='utf-8') as file:\n",
    "    data_pl_short = json.load(file)\n",
    "\n",
    "print(data_pl_short)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HjkIUMy2ho6C"
   },
   "source": [
    "##### 1.3. Test English word base (6 response types - EN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "vW5FyoRqhfIc",
    "outputId": "378d8894-9c9c-46be-ade1-b6491f095179"
   },
   "outputs": [],
   "source": [
    "with open(\"intents_en.json\", encoding='utf-8') as file:\n",
    "    data_en = json.load(file)\n",
    "\n",
    "print(data_en)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "4BnsOkqqjBlr"
   },
   "source": [
    "# 2. Preparing the data for model training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "gy6p55-DjLyY"
   },
   "outputs": [],
   "source": [
    "words = []\n",
    "labels = []\n",
    "docs_x = []\n",
    "docs_y = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "XxZX-JQA5zjL"
   },
   "source": [
    "##### 2.1 Building an array with all possible user inputs (+ labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "nBUKwSr_kVSd"
   },
   "outputs": [],
   "source": [
    "for intent in data_pl_short[\"intents\"]: #loop over the whole json\n",
    "    for pattern in intent[\"patterns\"]: #loop over every possible example of user input\n",
    "        wrds = nltk.word_tokenize(pattern) #tokenizing every word\n",
    "        words.extend(wrds) #add every single tokenized word\n",
    "        docs_x.append(wrds) #add the whole tokenized sentence\n",
    "        docs_y.append(intent[\"tag\"]) #pattern x corresponds to tag y; needed to tie a word to its response\n",
    "\n",
    "    if intent[\"tag\"] not in labels:\n",
    "        labels.append(intent[\"tag\"]) #add the tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "wOyP5lbikV1e"
   },
   "outputs": [],
   "source": [
    "words = [stemmer_pl.stem(w.lower()) for w in words if w not in \"?\"] #stemming -> bring each word to its \"root\" form; only the stemmed version of the word matters to us\n",
    "words = sorted(list(set(words))) #sorting\n",
    "\n",
    "labels = sorted(labels) #sorting\n",
    "\n",
    "training = []\n",
    "output = []\n",
    "\n",
    "out_empty = [0 for _ in range(len(labels))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Variable preview\n",
    "print(f\"Words:\\n{words}\")\n",
    "print(f\"labels:\\n{labels}\")\n",
    "print(f\"docs_y:\\n{docs_y}\")\n",
    "print(f\"docs_x:\\n{docs_x}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "WewUeunf5_Za"
   },
   "source": [
    "##### 2.2. Assigning words to a given category (e.g. \"Cześć\" to Greetings)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1Q43_qtZ6KNP"
   },
   "source": [
    "For data_pl_short there are only 4 response types. \"Cześć\", which gets assigned to the \"greeting\" label, will end up with the final output \"1000\" (i.e. the one-hot vector [1, 0, 0, 0]) if the \"greetings\" label is the first of the choices."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "doFER5OS7CC_"
   },
   "source": [
    "It is also worth adding that a neural network does not accept raw text. This is the main reason why we map words to categories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8FDKrjpjkYsE"
   },
   "outputs": [],
   "source": [
    "for x, doc in enumerate(docs_x): #go through all tokenized sentences\n",
    "    bag = []\n",
    "\n",
    "    wrds = [stemmer_pl.stem(w) for w in doc] #stem every word of the given sentence\n",
    "\n",
    "    for w in words:\n",
    "        if w in wrds:\n",
    "            bag.append(1) #this word exists\n",
    "        else:\n",
    "            bag.append(0) #it does not exist\n",
    "\n",
    "    output_row = out_empty[:] #a copy\n",
    "    output_row[labels.index(docs_y[x])] = 1\n",
    "\n",
    "    training.append(bag) #add the new phrase converted into a binary sequence\n",
    "    output.append(output_row)\n",
    "\n",
    "training = np.array(training) #training set\n",
    "output = np.array(output) #output set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "cJKUjbkC72-f",
    "outputId": "7e2bff96-78ce-49ff-b27b-eee77752228d"
   },
   "outputs": [],
   "source": [
    "len(training) #for pl_short we have 44 training patterns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Kx43VDgS7-yN",
    "outputId": "4fa6f6fe-dc58-4e76-bb26-38c1784ab79c"
   },
   "outputs": [],
   "source": [
    "len(output[0]) #each of which can be assigned to one of 4 categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(training)\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "yCFKTbjZ12wh"
   },
   "source": [
    "# 3. The model and its training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "MDA435sI1-Xl"
   },
   "outputs": [],
   "source": [
    "training = np.array(training) #type conversion for the neural network\n",
    "output = np.array(output) #type conversion for the neural network"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "SvBURQCc3PBj"
   },
   "source": [
    "##### 3.1. Creating the DNN and initializing the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "XaQJh1aG2hbj",
    "outputId": "80420df0-3a78-4583-9563-2165e968713d"
   },
   "outputs": [],
   "source": [
    "tensorflow.compat.v1.reset_default_graph() #reset just in case (honestly not sure how essential this line is)\n",
    "\n",
    "net = tflearn.input_data(shape=[None, len(training[0])]) #input layer\n",
    "net = tflearn.fully_connected(net, 8) #8 neurons for the hidden layer\n",
    "net = tflearn.fully_connected(net, 8) #8 neurons for the hidden layer\n",
    "#net = tflearn.fully_connected(net, 8) #8 neurons for the hidden layer\n",
    "net = tflearn.fully_connected(net, len(output[0]), activation=\"softmax\") #len(output[0]) neurons for the output layer + softmax as the best fit for this type of data\n",
    "net = tflearn.regression(net)\n",
    "\n",
    "model = tflearn.DNN(net)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Ktd1OcBa3PmQ"
   },
   "source": [
    "##### 3.2. Model Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "REzkJL_r2hwl",
    "outputId": "7ab2b0c5-944f-4e22-d478-1e35b41f87db"
   },
   "outputs": [],
   "source": [
    "model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)\n",
    "\n",
    "#Saving the model\n",
    "#model.save(\"model.tflearn\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "G-L6TV_63iYs"
   },
   "source": [
    "# 4. User Input"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "c6UvIrWu-a38"
   },
   "source": [
    "##### 4.1 The **\"bag_of_words(s, words)\"** function for stemming your sentence and encoding it in binary form"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "1IQyV1v33lC7"
   },
   "outputs": [],
   "source": [
    "def bag_of_words(s, words):\n",
    "    bag = [0 for _ in range(len(words))]\n",
    "\n",
    "    s_words = nltk.word_tokenize(s)\n",
    "    s_words = [stemmer_pl.stem(word.lower()) for word in s_words]\n",
    "\n",
    "    for se in s_words:\n",
    "        for i, w in enumerate(words):\n",
    "            if w == se:\n",
    "                bag[i] = 1\n",
    "    return np.array(bag)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "rXq-wj-F-5DE"
   },
   "source": [
    "##### 4.2 The **\"chat()\"** function for talking to the bot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Je6OSZ679-KL"
   },
   "outputs": [],
   "source": [
    "def chat():\n",
    "    print(\"Możesz rozpocząć rozmowę z Botem! (type quit to stop)\")\n",
    "    while True: #keep the conversation going\n",
    "        inp = input(\"Ty: \")\n",
    "        if inp.lower() == \"quit\": #type quit to leave the loop\n",
    "            break\n",
    "\n",
    "        result = model.predict([bag_of_words(inp, words)]) #predict with the trained model\n",
    "        result_index = np.argmax(result)\n",
    "        tag = labels[result_index]\n",
    "\n",
    "        for tg in data_pl_short[\"intents\"]: #find the matching tag for the sentence\n",
    "            if tg['tag'] == tag:\n",
    "                responses = tg['responses']\n",
    "\n",
    "        print(random.choice(responses)) #print a random response from the matched response set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ifvjglbO_SEA"
   },
   "source": [
    "# 5. Talking to the bot!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "VZf_aCUM-Amm",
    "outputId": "9e3fcf7b-b9b3-47b0-acb5-48214f07f363"
   },
   "outputs": [],
   "source": [
    "chat()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "DL_Chatbot_ver_1_0.ipynb",
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}