{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "HxtCFj1hfXw6" }, "source": [ "# 0. Instalacja i importowanie modułów" ] }, { "cell_type": "markdown", "metadata": { "id": "enDE5aTIgN-v" }, "source": [ "##### 0.1. Ogólne" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "D7_8XDfpfH-X" }, "outputs": [], "source": [ "!pip install -r requirements.txt --user\n", "!pip list" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "GOGs4hL6fwwK" }, "outputs": [], "source": [ "import numpy as np\n", "import tflearn\n", "import tensorflow\n", "import random\n", "import json" ] }, { "cell_type": "markdown", "metadata": { "id": "Mr0ZD1L2gCWw" }, "source": [ "##### 0.2. Angielski Stemmer: https://www.nltk.org/_modules/nltk/stem/lancaster.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jy4-9guXgBY3" }, "outputs": [], "source": [ "import nltk\n", "\n", "nltk.download('punkt')\n", "from nltk.stem.lancaster import LancasterStemmer\n", "stemmer_en = LancasterStemmer()" ] }, { "cell_type": "markdown", "metadata": { "id": "uPpcNQa_ggUl" }, "source": [ "##### 0.3. Polski Stemmer **(Docelowy)**: https://pypi.org/project/pystempel/" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XBpvJXn1gBDi" }, "outputs": [], "source": [ "from stempel import StempelStemmer\n", "\n", "stemmer_pl = StempelStemmer.default() #może wersja \".polimorf()\" jest lepsza?" ] }, { "cell_type": "markdown", "metadata": { "id": "Lg_3MO_3hQV_" }, "source": [ "# 1. Załadowanie plików **.json** z bazą słów" ] }, { "cell_type": "markdown", "metadata": { "id": "BzBo1657hn3w" }, "source": [ "##### 1.1. 
# %% [markdown]
# # 1. Loading the **.json** files with the word base
# ##### 1.1. Target base of Polish phrases used to train the model (10 response kinds - PL)

# %%
def load_intents(path):
    """Read a UTF-8 encoded JSON file and return the parsed content.

    Parameters
    ----------
    path : str
        Path of the JSON file, resolved against the current working directory.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    with open(path, encoding='utf-8') as file:
        return json.load(file)

# %%
data_pl = load_intents("intents_pl.json")

print(data_pl)

# %% [markdown]
# ##### 1.2. Shortened word base (4 response kinds - PL)

# %%
data_pl_short = load_intents("intents_pl_short.json")

print(data_pl_short)

# %% [markdown]
# ##### 1.3. English test word base (6 response kinds - EN)

# %%
data_en = load_intents("intents_en.json")

print(data_en)
# %% [markdown]
# # 2. Preparing the data for model training
# ##### 2.1 Building the list of all example user inputs (plus their labels)

# %%
# The containers are created in the same cell that fills them, so running
# this cell again rebuilds them instead of appending every pattern twice.
raw_tokens = []  # every token of every pattern, not yet stemmed
labels = []      # distinct intent tags
docs_x = []      # one list of tokens per pattern
docs_y = []      # tag of the pattern stored at the same index in docs_x

for intent in data_pl_short["intents"]:
    tag = intent["tag"]
    for pattern in intent["patterns"]:
        tokens = nltk.word_tokenize(pattern)
        raw_tokens.extend(tokens)
        docs_x.append(tokens)
        docs_y.append(tag)  # ties the pattern to the responses of its intent

    if tag not in labels:
        labels.append(tag)

# %%
# Vocabulary = sorted, de-duplicated, lower-cased stems of all tokens but "?".
# It is derived from raw_tokens (never from `words` itself), so re-running
# this cell does not stem already stemmed words.
# NOTE(review): pystempel annotates stem() as Optional[str]; a None result
# would make sorted() raise TypeError, hence the filter - confirm.
stems = (stemmer_pl.stem(token.lower()) for token in raw_tokens if token != "?")
words = sorted({stem for stem in stems if stem is not None})

labels = sorted(labels)

# Kept for the cells further down that still expect these names to exist.
training = []
output = []

out_empty = [0] * len(labels)  # all-zero template with one slot per label

# %%
# Preview of the prepared variables
print(f"Words:\n{words}")
print(f"labels:\n{labels}")
print(f"docs_y:\n{docs_y}")
print(f"docs_x:\n{docs_x}")
# %% [markdown]
# ##### 2.2. Assigning words to a category (e.g. "Cześć" to Greetings)
#
# data_pl_short has only 4 response kinds. "Cześć", which is assigned to the
# "greeting" label, gets the final output "1000" if "greeting" is the first
# label to choose from.
#
# A neural network does not accept text. That is the main reason why the
# words are mapped to categories.

# %%
# Rows are collected in local lists: `training` and `output` are rebound to
# arrays at the end, so appending to them directly would fail on a re-run.
training_rows = []
output_rows = []

for doc, tag in zip(docs_x, docs_y):
    # Lower-case before stemming, exactly as done for the vocabulary in
    # `words` and in bag_of_words() at prediction time; otherwise a
    # capitalised token may stem differently and be encoded as 0.
    doc_stems = {stemmer_pl.stem(token.lower()) for token in doc}

    # 1 where the vocabulary word occurs in this pattern, 0 elsewhere.
    training_rows.append([1 if w in doc_stems else 0 for w in words])

    # One-hot row marking the label of this pattern.
    output_row = [0] * len(labels)
    output_row[labels.index(tag)] = 1
    output_rows.append(output_row)

training = np.array(training_rows)  # training set, one row per pattern
output = np.array(output_rows)      # expected outputs, one row per pattern

assert len(training) == len(output) == len(docs_x)

# %%
len(training)  # number of training patterns (one bag-of-words row each)

# %%
len(output[0])  # number of categories a pattern can be assigned to

# %%
print(training)
print(output)
# %% [markdown]
# # 3. The model and its training

# %%
# Convert to numpy arrays before handing the data to the network.
training = np.array(training)
output = np.array(output)

# %% [markdown]
# ##### 3.1. Building the DNN and initialising the model

# %%
# Reset the default TensorFlow graph before defining the layers.
tensorflow.compat.v1.reset_default_graph()

n_inputs = len(training[0])   # width of one training row
n_classes = len(output[0])    # width of one output row

net = tflearn.input_data(shape=[None, n_inputs])  # input layer
for hidden_units in (8, 8):  # two hidden layers with 8 neurons each
    net = tflearn.fully_connected(net, hidden_units)
# Output layer: one neuron per class, softmax activation.
net = tflearn.fully_connected(net, n_classes, activation="softmax")
net = tflearn.regression(net)

model = tflearn.DNN(net)

# %% [markdown]
# ##### 3.2. Training the model

# %%
model.fit(
    training,
    output,
    n_epoch=1000,
    batch_size=8,
    show_metric=True,
)

# Saving the model (left disabled)
#model.save("model.tflearn")
# %% [markdown]
# # 4. User input
# ##### 4.1 Function **"bag_of_words(s, words)"**: stems a sentence and gives it a binary form

# %%
def bag_of_words(s, words, stemmer=None, tokenize=None):
    """Encode sentence `s` as a 0/1 vector over the vocabulary `words`.

    Parameters
    ----------
    s : str
        Sentence typed by the user.
    words : sequence of str
        Vocabulary of stems; position i of the result refers to ``words[i]``.
    stemmer : object with a ``stem(str)`` method, optional
        Stemmer applied to every lower-cased token. Defaults to the
        notebook-level ``stemmer_pl``.
    tokenize : callable(str) -> iterable of str, optional
        Splits the sentence into tokens. Defaults to ``nltk.word_tokenize``.

    Returns
    -------
    numpy.ndarray
        Array of ``len(words)`` entries: 1 where the vocabulary stem equals
        the stem of some token of `s`, 0 otherwise.
    """
    if stemmer is None:
        stemmer = stemmer_pl
    if tokenize is None:
        tokenize = nltk.word_tokenize

    # A set makes each vocabulary lookup constant-time and replaces the
    # nested scan over (tokens x vocabulary).
    sentence_stems = {stemmer.stem(token.lower()) for token in tokenize(s)}
    return np.array([1 if w in sentence_stems else 0 for w in words])

# %% [markdown]
# ##### 4.2 Function **"chat()"**: the conversation with the bot

# %%
def _responses_for_tag(tag, intents):
    """Return the ``responses`` of the last intent tagged `tag`, or None if there is none."""
    found = None
    for intent in intents:
        if intent['tag'] == tag:
            found = intent['responses']
    return found


def chat():
    """Run an interactive console conversation until the user types "quit".

    Each non-empty input is encoded with ``bag_of_words``, classified by the
    notebook-level ``model``, mapped to a tag through ``labels`` and answered
    with a random response of that tag taken from ``data_pl_short``.
    """
    print("Możesz rozpocząć rozmowę z Botem! (type quit to stop)")
    while True:
        inp = input("Ty: ")
        text = inp.strip()
        if text.lower() == "quit":  # surrounding whitespace must not block quitting
            break
        if not text:
            continue  # nothing to classify

        # Predict with the trained model and take the best scoring label.
        result = model.predict([bag_of_words(inp, words)])
        tag = labels[np.argmax(result)]

        responses = _responses_for_tag(tag, data_pl_short["intents"])
        if responses:
            print(random.choice(responses))
        else:
            # No intent (or no response) for the predicted tag: say so instead
            # of failing on an unbound name or repeating the previous answer.
            print("Nie rozumiem. Spróbuj zapytać inaczej.")

# %% [markdown]
# # 5. Talking to the bot!
# %%
# Start the interactive conversation loop; typing "quit" ends it.
chat()