Uczenie_Glebokie/Projekt/project.ipynb


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow.keras.utils as ku
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.models import load_model
with open('pan-tadeusz.txt', encoding="utf8") as f:
    data_pan_tadeusz = f.read()
with open('SI_data.txt', encoding="utf8") as f:
    data_SI = f.read()
def create_corpus(data):
    # lower-case the raw text and split it into non-empty, stripped lines
    corpus = data.lower().split("\n")
    corpus = [element.strip() for element in corpus if element]
    return corpus
corpus_pan_tadeusz = create_corpus(data_pan_tadeusz)[:4000]
corpus_SI = create_corpus(data_SI)
corpus = corpus_pan_tadeusz + corpus_SI
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)
def create_input_sequences(corpus):
    # build prefix n-grams: a token list [t0..tn] contributes the sequences
    # [t0,t1], [t0,t1,t2], ..., whose last token is the word to predict
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]

        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences
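To see what this builds, here is a quick check on a hypothetical token list (the ids are made up):
demo_line = [5, 21, 9, 80]  # hypothetical token ids for one line
print([demo_line[:i + 1] for i in range(1, len(demo_line))])
# -> [[5, 21], [5, 21, 9], [5, 21, 9, 80]]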
input_sequences_pan_tadeusz = create_input_sequences(corpus_pan_tadeusz)
input_sequences_SI = create_input_sequences(corpus_SI)
input_sequences = create_input_sequences(corpus)
max_sequence_len = max([len(x) for x in input_sequences])
def create_predictors_label(input_sequences, max_sequence_len):
    # pre-pad every sequence to a common length, then split off the
    # last token of each row as the (one-hot) label to predict
    input_sequences = np.array(pad_sequences(input_sequences,
                                             maxlen=max_sequence_len,
                                             padding='pre'))
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words+1)
    return predictors, label
predictors_pan_tadeusz, label_pan_tadeusz = create_predictors_label(input_sequences_pan_tadeusz, max_sequence_len)
predictors_SI, label_SI = create_predictors_label(input_sequences_SI, max_sequence_len)
predictors, label = create_predictors_label(input_sequences, max_sequence_len)
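A minimal sketch of the padding and split, reusing the made-up sequence from above:
demo = np.array(pad_sequences([[5, 21, 9]], maxlen=5, padding='pre'))
print(demo[:, :-1], demo[:, -1])
# -> [[ 0  0  5 21]] [9]   (9 then becomes a one-hot label)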
# model = Sequential()
# model.add(Embedding(total_words + 1, 100,
#                     input_length=max_sequence_len - 1))
# model.add(Bidirectional(LSTM(150, return_sequences=True)))
# model.add(Dropout(0.2))
# model.add(LSTM(100))
# # note the parentheses: the original expression `total_words+1/2`
# # evaluates to total_words + 0.5, which is why the summary below
# # shows a 10727-unit layer rather than a 5364-unit one
# model.add(Dense((total_words + 1) // 2, activation='relu',
#                 kernel_regularizer=regularizers.l2(0.01)))
# model.add(Dense(total_words + 1, activation='softmax'))
# model.compile(loss='categorical_crossentropy',
#               optimizer='adam', metrics=['accuracy'])
# print(model.summary())
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_2 (Embedding)     (None, 75, 100)           1072800   
                                                                 
 bidirectional_2 (Bidirecti  (None, 75, 300)           301200    
 onal)                                                           
                                                                 
 dropout_2 (Dropout)         (None, 75, 300)           0         
                                                                 
 lstm_5 (LSTM)               (None, 100)               160400    
                                                                 
 dense_4 (Dense)             (None, 10727)             1083427   
                                                                 
 dense_5 (Dense)             (None, 10728)             115089984 
                                                                 
=================================================================
Total params: 117707811 (449.02 MB)
Trainable params: 117707811 (449.02 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
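The parameter counts above can be checked by hand; this quick sanity check (vocabulary size read off the summary) also confirms that the penultimate Dense layer was built with 10727 units, i.e. with the unparenthesized total_words+1/2:
V = 10728                                        # total_words + 1, per the summary
assert V * 100 == 1072800                        # embedding_2
assert 2 * 4 * 150 * (100 + 150 + 1) == 301200   # bidirectional_2 (LSTM 150, both directions)
assert 4 * 100 * (300 + 100 + 1) == 160400       # lstm_5
assert 10727 * (100 + 1) == 1083427              # dense_4 (10727 units)
assert V * (10727 + 1) == 115089984              # dense_5 softmax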
model = load_model('my_model.h5')

history = model.fit(predictors_pan_tadeusz, label_pan_tadeusz, epochs=1, verbose=1)
744/744 [==============================] - 1501s 2s/step - loss: 1.4722 - accuracy: 0.7626
history = model.fit(predictors_SI, label_SI, epochs=3, verbose=1)
Epoch 1/3
55/55 [==============================] - 98s 2s/step - loss: 4.6245 - accuracy: 0.2131
Epoch 2/3
55/55 [==============================] - 97s 2s/step - loss: 3.9096 - accuracy: 0.2921
Epoch 3/3
55/55 [==============================] - 111s 2s/step - loss: 3.4379 - accuracy: 0.3603
history = model.fit(predictors, label, epochs=1, verbose=1)
799/799 [==============================] - 1105s 1s/step - loss: 1.7071 - accuracy: 0.7451
model.save('my_model.h5')
def predict(text, next_words=25):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]
        # reverse lookup: Tokenizer keeps an index -> word map
        output_word = tokenizer.index_word.get(int(predicted), "")
        text += " " + output_word
    return text
predict("CNN", 24)
'CNN «wielmożni nieruchomi głowę lecz weźmiem na świat ich umiała się wtłoczyć na końcu które w w chleba gałeczki sieci neuronowych i zdolność do generowania'
predict("GANy", 17)
'GANy i w dawnej surowości prawidłach wychował zakazy żołnierszczyzny na sklepieniu sieci neuronowych w w przetwarzaniu języka naturalnego'
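The duplicated words in both samples ("w w") are a typical artifact of greedy argmax decoding. A common remedy is to sample from the softmax with a temperature; the variant below is a sketch of that idea, not part of the original notebook:
def predict_sampled(text, next_words=25, temperature=0.8):
    # like predict(), but samples the next word instead of taking argmax
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        probs = model.predict(token_list, verbose=0)[0]
        # sharpen (T < 1) or flatten (T > 1) the distribution, then renormalize
        logits = np.log(probs + 1e-9) / temperature
        probs = np.exp(logits) / np.sum(np.exp(logits))
        predicted = np.random.choice(len(probs), p=probs)
        text += " " + tokenizer.index_word.get(int(predicted), "")
    return text
predict_sampled("CNN", 24)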