import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow.keras.utils as ku
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.models import load_model  # stay within tf.keras; mixing it with standalone keras can break model loading
# Read both corpora: Mickiewicz's "Pan Tadeusz" and a text about artificial intelligence (SI).
with open('pan-tadeusz.txt', encoding="utf8") as f:
    data_pan_tadeusz = f.read()
with open('SI_data.txt', encoding="utf8") as f:
    data_SI = f.read()
def create_corpus(data):
    # Lowercase the text and split it into stripped, non-empty lines.
    corpus = data.lower().split("\n")
    corpus = [element.strip() for element in corpus if element.strip()]
    return corpus
corpus_pan_tadeusz = create_corpus(data_pan_tadeusz)[:4000]
corpus_SI = create_corpus(data_SI)
corpus = corpus_pan_tadeusz + corpus_SI
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)
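# Quick sanity check (illustrative, not part of the original run): word_index is
# 1-based, which is why the model layers below size everything as total_words + 1,
# leaving index 0 free for the pre-padding.
print(total_words)                             # vocabulary size across both corpora
print(list(tokenizer.word_index.items())[:5])  # a few (word, index) pairs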
def create_input_sequences(corpus):
    # Expand every line into its n-gram prefixes: the first i tokens serve as
    # context and the (i+1)-th token becomes the training target.
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)
    return input_sequences
input_sequences_pan_tadeusz = create_input_sequences(corpus_pan_tadeusz)
input_sequences_SI = create_input_sequences(corpus_SI)
input_sequences = create_input_sequences(corpus)
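# How the n-gram expansion works (a toy illustration, not from the corpus): a line
# tokenized as [7, 2, 9, 4] yields the prefixes [7, 2], [7, 2, 9], [7, 2, 9, 4];
# during training, each prefix predicts its own last token.
toy = [7, 2, 9, 4]
print([toy[:i + 1] for i in range(1, len(toy))])  # [[7, 2], [7, 2, 9], [7, 2, 9, 4]]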
max_sequence_len = max([len(x) for x in input_sequences])
def create_predictors_label(input_sequences, max_sequence_len):
    # Pre-pad every n-gram to a common length, then split off the last token
    # as the prediction target and one-hot encode it over the vocabulary.
    input_sequences = np.array(pad_sequences(input_sequences,
                                             maxlen=max_sequence_len,
                                             padding='pre'))
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words + 1)
    return predictors, label
predictors_pan_tadeusz, label_pan_tadeusz = create_predictors_label(input_sequences_pan_tadeusz, max_sequence_len)
predictors_SI, label_SI = create_predictors_label(input_sequences_SI, max_sequence_len)
predictors, label = create_predictors_label(input_sequences, max_sequence_len)
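# Shape check (a sketch; exact numbers depend on the loaded files): every row of
# predictors is a pre-padded prefix of max_sequence_len - 1 tokens, and label is
# its one-hot-encoded next word.
print(predictors.shape)  # (num_sequences, max_sequence_len - 1)
print(label.shape)       # (num_sequences, total_words + 1)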
# model = Sequential()
# model.add(Embedding(total_words + 1, 100,
#                     input_length=max_sequence_len - 1))
# model.add(Bidirectional(LSTM(150, return_sequences=True)))
# model.add(Dropout(0.2))
# # Intermediate projection of roughly half the vocabulary; the original
# # expression total_words+1/2 hit operator precedence, which is why the
# # summary below shows a near-vocabulary-sized layer (10727 units).
# model.add(Dense((total_words + 1) // 2, activation='relu',
#                 kernel_regularizer=regularizers.l2(0.01)))
# model.add(Dense(total_words + 1, activation='softmax'))
# model.compile(loss='categorical_crossentropy',
#               optimizer='adam', metrics=['accuracy'])
# print(model.summary())
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, 75, 100) 1072800 bidirectional_2 (Bidirecti (None, 75, 300) 301200 onal) dropout_2 (Dropout) (None, 75, 300) 0 lstm_5 (LSTM) (None, 100) 160400 dense_4 (Dense) (None, 10727) 1083427 dense_5 (Dense) (None, 10728) 115089984 ================================================================= Total params: 117707811 (449.02 MB) Trainable params: 117707811 (449.02 MB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________ None
model = load_model('my_model.h5')
history = model.fit(predictors_pan_tadeusz, label_pan_tadeusz, epochs=1, verbose=1)
744/744 [==============================] - 1501s 2s/step - loss: 1.4722 - accuracy: 0.7626
history = model.fit(predictors_SI, label_SI, epochs=3, verbose=1)
Epoch 1/3
55/55 [==============================] - 98s 2s/step - loss: 4.6245 - accuracy: 0.2131
Epoch 2/3
55/55 [==============================] - 97s 2s/step - loss: 3.9096 - accuracy: 0.2921
Epoch 3/3
55/55 [==============================] - 111s 2s/step - loss: 3.4379 - accuracy: 0.3603
history = model.fit(predictors, label, epochs=1, verbose=1)
799/799 [==============================] - 1105s 1s/step - loss: 1.7071 - accuracy: 0.7451
model.save('my_model.h5')
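# Note (a sketch, not in the original notebook): my_model.h5 stores the architecture
# and weights only; predict() below also needs the fitted tokenizer and
# max_sequence_len, so persisting the tokenizer alongside the model is a common choice.
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)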
def predict(text, next_words=25):
    # Greedy generation: repeatedly encode the running text, pad it to the
    # model's input length, and append the most probable next word.
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
        # index_word is the reverse of word_index; index 0 (padding) maps to no word
        output_word = tokenizer.index_word.get(predicted, "")
        text += " " + output_word
    return text
predict("CNN", 24)
'CNN «wielmożni nieruchomi głowę lecz weźmiem na świat ich umiała się wtłoczyć na końcu które w w chleba gałeczki sieci neuronowych i zdolność do generowania'
predict("GANy", 17)
'GANy i w dawnej surowości prawidłach wychował zakazy żołnierszczyzny na sklepieniu sieci neuronowych w w przetwarzaniu języka naturalnego'