s434695 8633cd534e requirements analysis update

2021-03-27 12:51:50 +01:00

15 KiB

Raw Blame History

0. Instalacja i importowanie modułów

0.1. Ogólne

!pip install -r requirements.txt --user
!pip list

Collecting tflearn==0.5 (from -r requirements.txt (line 1))
[?25l  Downloading https://files.pythonhosted.org/packages/e7/3c/0b156d08ef3d4e2a8009ecab2af1ad2e304f6fb99562b6271c68a74a4397/tflearn-0.5.0.tar.gz (107kB)
[K     |████████████████████████████████| 112kB 1.7MB/s eta 0:00:01
[?25hCollecting tensorflow (from -r requirements.txt (line 2))
[?25l  Downloading https://files.pythonhosted.org/packages/70/dc/e8c5e7983866fa4ef3fd619faa35f660b95b01a2ab62b3884f038ccab542/tensorflow-2.4.1-cp37-cp37m-manylinux2010_x86_64.whl (394.3MB)
[K     |█████████████████████▍          | 263.6MB 2.0MB/s eta 0:01:06    |▉                               | 10.3MB 2.0MB/s eta 0:03:11     |██                              | 24.7MB 2.3MB/s eta 0:02:38     |██▏                             | 26.4MB 2.1MB/s eta 0:02:59     |███▍                            | 42.2MB 1.8MB/s eta 0:03:14     |█████████▊                      | 120.4MB 2.7MB/s eta 0:01:42     |██████████▉                     | 133.4MB 2.4MB/s eta 0:01:49     |███████████████▍                | 190.0MB 3.0MB/s eta 0:01:08     |█████████████████               | 209.0MB 2.5MB/s eta 0:01:15     |██████████████████▌             | 227.7MB 2.8MB/s eta 0:01:00     |██████████████████▉             | 232.4MB 2.6MB/s eta 0:01:03     |███████████████████▊            | 242.5MB 3.2MB/s eta 0:00:47     |███████████████████▊            | 242.9MB 3.2MB/s eta 0:00:47

import numpy as np
import tflearn
import tensorflow
import random
import json

0.2. Angielski Stemmer: https://www.nltk.org/_modules/nltk/stem/lancaster.html

# NLTK provides the tokenizer (nltk.word_tokenize) used further down and the
# English Lancaster stemmer.
import nltk

# Fetch the 'punkt' resource at run time (presumably the tokenizer data that
# nltk.word_tokenize relies on - confirm against the NLTK docs).
nltk.download('punkt')
from nltk.stem.lancaster import LancasterStemmer
# English stemmer instance; the cells shown here only use the Polish stemmer.
stemmer_en = LancasterStemmer()

0.3. Polski Stemmer (Docelowy): https://pypi.org/project/pystempel/

from stempel import StempelStemmer

# Polish stemmer shared by the vocabulary building, the training-set encoding
# and the user-input encoding below.
stemmer_pl = StempelStemmer.default() # open question from the author: would the ".polimorf()" variant be better?

1. Załadowanie plików .json z bazą słów

1.1. Docelowa baza słów polskich do nauki modelu (10 rodzajów odp - PL)

# Full Polish intent base: read the UTF-8 file and parse it as JSON.
with open("intents_pl.json", encoding="utf-8") as file:
    data_pl = json.loads(file.read())

# Dump the parsed structure for a quick visual check.
print(data_pl)

1.2. Skrócona baza słów (4 rodzaje odp - PL)

# Shortened Polish intent base (the one the model below is trained on):
# read the UTF-8 file and parse it as JSON.
with open("intents_pl_short.json", encoding="utf-8") as file:
    data_pl_short = json.loads(file.read())

# Dump the parsed structure for a quick visual check.
print(data_pl_short)

1.3. Testowa baza słów angielskich (6 rodzajów odp - EN)

# English test intent base: read the UTF-8 file and parse it as JSON.
with open("intents_en.json", encoding="utf-8") as file:
    data_en = json.loads(file.read())

# Dump the parsed structure for a quick visual check.
print(data_en)

2. Przygotowanie danych do nauki modelu

# Empty accumulators for the data-preparation step:
#   words  - pool of tokens that becomes the vocabulary
#   labels - distinct intent tags
#   docs_x - tokenized patterns
#   docs_y - tag of each pattern, index-aligned with docs_x
words, labels, docs_x, docs_y = [], [], [], []

2.1 Stworzenie tablicy ze wszystkimi możliwymi inputami użytkownika (+ labele)

# Walk the intents file and collect tokens, tokenized patterns and tags.
for intent in data_pl_short["intents"]:  # every intent in the JSON
    for pattern in intent["patterns"]:  # every example user utterance of that intent
        wrds = nltk.word_tokenize(pattern)  # split the utterance into tokens
        words.extend(wrds)  # flat pool of tokens for the vocabulary
        docs_x.append(wrds)  # the whole tokenized sentence ...
        docs_y.append(intent["tag"])  # ... and the tag it belongs to (same index)

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

# Build the vocabulary: lower-case and stem every token, skip bare question
# marks, de-duplicate and sort. Only the stemmed form matters for matching.
# The filter is an explicit equality test; the previous `w not in "?"` was a
# substring check against the one-character string "?".
# NOTE(review): stem() is assumed to possibly return None for tokens it cannot
# reduce - confirm against the pystempel docs. Such entries are dropped here
# because None cannot be sorted together with strings.
words = sorted({
    stem
    for stem in (stemmer_pl.stem(w.lower()) for w in words if w != "?")
    if stem is not None
})

labels = sorted(labels)  # fixed label order; defines the one-hot positions

# Containers for the encoded data set: `training` receives one bag-of-words
# row per pattern, `output` the matching label row.
training, output = [], []

# All-zero label template with one slot per label; copied for every row.
out_empty = [0] * len(labels)

# Preview of the prepared variables
print(f"Words:\n{words}")
print(f"labels:\n{labels}")
print(f"docs_y:\n{docs_y}")
print(f"docs_x:\n{docs_x}")

2.2. Przypisywanie słów do danej kategorii (np. "Cześć" do "greeting")

W przypadku data_pl_short są tylko 4 rodzaje odpowiedzi. "Cześć", które zostanie przypisane do labela "greeting", będzie miało końcowy output w formie "1000" (czyli wektor [1, 0, 0, 0]), jeżeli label "greeting" jest pierwszy na liście labeli.

Warto też dodać, że sieć neuronowa nie przyjmuje tekstu. To jest główny powód, dla którego przypisujemy słowa do kategorii w postaci liczbowej.

# Encode every tokenized pattern as a binary bag-of-words row plus a one-hot
# label row.
for x, doc in enumerate(docs_x):
    # Stem the pattern's tokens. Lower-casing first keeps this consistent with
    # how the vocabulary and the user input are stemmed; without it,
    # capitalized tokens would not match their vocabulary entry.
    wrds = [stemmer_pl.stem(w.lower()) for w in doc]

    # 1 where the vocabulary word occurs in this pattern, 0 otherwise.
    bag = [1 if w in wrds else 0 for w in words]

    output_row = out_empty[:]  # copy, so the shared template stays all zeros
    output_row[labels.index(docs_y[x])] = 1  # mark this pattern's tag

    training.append(bag)
    output.append(output_row)

training = np.array(training)  # training set: one row per pattern
output = np.array(output)  # label set: one one-hot row per pattern

len(training) # row count of the training matrix, one row per pattern (author's note: 44 for pl_short)

len(output[0]) # width of a label row, i.e. the number of categories (author's note: 4)

# Show the encoded matrices for a visual check.
print(training)
print(output)

3. Model i jego ćwiczenie

# Convert both sets to NumPy arrays, the type handed to the network below.
training, output = np.array(training), np.array(output)

3.1. Stworzenie DLN i inicjacja modelu

# Reset TensorFlow's default graph before building the network (added by the
# original author as a precaution).
tensorflow.compat.v1.reset_default_graph()

n_features = len(training[0])  # length of one bag-of-words row
n_classes = len(output[0])  # length of one label row

net = tflearn.input_data(shape=[None, n_features])  # input layer
# Two hidden layers with 8 neurons each (a third one was tried by the author
# and left disabled).
for _ in range(2):
    net = tflearn.fully_connected(net, 8)
# Output layer: one neuron per category, softmax activation.
net = tflearn.fully_connected(net, n_classes, activation="softmax")
net = tflearn.regression(net)  # called with tflearn's default settings

model = tflearn.DNN(net)

3.2. Trening Modelu

# Fit the network on the encoded patterns.
model.fit(
    training,
    output,
    n_epoch=1000,
    batch_size=8,
    show_metric=True,
)

# Saving the model (left disabled by the author)
#model.save("model.tflearn")

4. Input Użytkownika

4.1 Funkcja "bag_of_words(s, words)" do stemmowania twojego zdania, i przypisania mu formy binarnej

def bag_of_words(s, words):
    """Encode sentence *s* as a binary bag-of-words vector over *words*.

    The sentence is tokenized with NLTK and every token is lower-cased and
    stemmed with the Polish stemmer. Position ``i`` of the result is 1 when
    ``words[i]`` equals one of those stems and 0 otherwise.

    Args:
        s: Raw sentence typed by the user.
        words: Vocabulary of stems; fixes the length and order of the vector.

    Returns:
        A NumPy array with one 0/1 entry per vocabulary item.
    """
    sentence_stems = [
        stemmer_pl.stem(token.lower()) for token in nltk.word_tokenize(s)
    ]
    return np.array([1 if entry in sentence_stems else 0 for entry in words])

4.2 Funkcja "chat()" do rozmowy z botem

def chat():
    """Run an interactive console conversation with the trained bot.

    Reads lines from standard input until the user types "quit" (in any
    letter case). Every other line is encoded with ``bag_of_words``, passed
    to the model, and the label with the highest score selects the intent
    from which a random response is printed.
    """
    print("Możesz rozpocząć rozmowę z Botem! (type quit to stop)")
    while True:  # keep the conversation going until the user quits
        user_text = input("Ty: ")
        if user_text.lower() == "quit":
            return

        # Predict with the trained model and pick the best-scoring label.
        scores = model.predict([bag_of_words(user_text, words)])
        tag = labels[np.argmax(scores)]

        # Look up the responses of the intent carrying the predicted tag.
        for intent in data_pl_short["intents"]:
            if intent['tag'] == tag:
                responses = intent['responses']

        # Answer with a random response from that intent.
        print(random.choice(responses))

5. Rozmowa z botem!

# Start the interactive session; blocks on console input until the user types "quit".
chat()

15 KiB Raw Blame History Unescape Escape