Uczenie_Glebokie/Projekt/project.ipynb


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow.keras.utils as ku
from wordcloud import WordCloud
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.models import load_model
with open('pan-tadeusz.txt', encoding="utf8") as f:
    data_pan_tadeusz = f.read()
with open('SI_data.txt', encoding="utf8") as f:
    data_SI = f.read()
def create_corpus(data):
    # lower-case the raw text and split it into non-empty, stripped lines
    corpus = data.lower().split("\n")
    corpus = [element.strip() for element in corpus if element]
    return corpus
corpus_pan_tadeusz = create_corpus(data_pan_tadeusz)[:4000]
corpus_SI = create_corpus(data_SI)
corpus = corpus_pan_tadeusz + corpus_SI
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)
def create_input_sequences(corpus):
    # build prefix n-grams: a token list [t0..tn] contributes the sequences
    # [t0,t1], [t0,t1,t2], ..., whose last token is the word to predict
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]

        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences
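To see what this builds, here is a quick check on a hypothetical token list (the ids are made up):
demo_line = [5, 21, 9, 80]  # hypothetical token ids for one line
print([demo_line[:i + 1] for i in range(1, len(demo_line))])
# -> [[5, 21], [5, 21, 9], [5, 21, 9, 80]]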
input_sequences_pan_tadeusz = create_input_sequences(corpus_pan_tadeusz)
input_sequences_SI = create_input_sequences(corpus_SI)
input_sequences = create_input_sequences(corpus)
max_sequence_len = max([len(x) for x in input_sequences])
def create_predictors_label(input_sequences, max_sequence_len):
    # pre-pad every sequence to a common length, then split off the
    # last token of each row as the (one-hot) label to predict
    input_sequences = np.array(pad_sequences(input_sequences,
                                             maxlen=max_sequence_len,
                                             padding='pre'))
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words+1)
    return predictors, label
predictors_pan_tadeusz, label_pan_tadeusz = create_predictors_label(input_sequences_pan_tadeusz, max_sequence_len)
predictors_SI, label_SI = create_predictors_label(input_sequences_SI, max_sequence_len)
predictors, label = create_predictors_label(input_sequences, max_sequence_len)
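A minimal sketch of the padding and split, reusing the made-up sequence from above:
demo = np.array(pad_sequences([[5, 21, 9]], maxlen=5, padding='pre'))
print(demo[:, :-1], demo[:, -1])
# -> [[ 0  0  5 21]] [9]   (9 then becomes a one-hot label)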
# model = Sequential()
# model.add(Embedding(total_words + 1, 100,
#                     input_length=max_sequence_len - 1))
# model.add(Bidirectional(LSTM(150, return_sequences=True)))
# model.add(Dropout(0.2))
# model.add(LSTM(100))
# # note the parentheses: the original expression `total_words+1/2`
# # evaluates to total_words + 0.5, which is why the summary below
# # shows a 10727-unit layer rather than a 5364-unit one
# model.add(Dense((total_words + 1) // 2, activation='relu',
#                 kernel_regularizer=regularizers.l2(0.01)))
# model.add(Dense(total_words + 1, activation='softmax'))
# model.compile(loss='categorical_crossentropy',
#               optimizer='adam', metrics=['accuracy'])
# print(model.summary())
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_2 (Embedding)     (None, 75, 100)           1072800   
                                                                 
 bidirectional_2 (Bidirecti  (None, 75, 300)           301200    
 onal)                                                           
                                                                 
 dropout_2 (Dropout)         (None, 75, 300)           0         
                                                                 
 lstm_5 (LSTM)               (None, 100)               160400    
                                                                 
 dense_4 (Dense)             (None, 10727)             1083427   
                                                                 
 dense_5 (Dense)             (None, 10728)             115089984 
                                                                 
=================================================================
Total params: 117707811 (449.02 MB)
Trainable params: 117707811 (449.02 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
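The parameter counts above can be checked by hand; this quick sanity check (vocabulary size read off the summary) also confirms that the penultimate Dense layer was built with 10727 units, i.e. with the unparenthesized total_words+1/2:
V = 10728                                        # total_words + 1, per the summary
assert V * 100 == 1072800                        # embedding_2
assert 2 * 4 * 150 * (100 + 150 + 1) == 301200   # bidirectional_2 (LSTM 150, both directions)
assert 4 * 100 * (300 + 100 + 1) == 160400       # lstm_5
assert 10727 * (100 + 1) == 1083427              # dense_4 (10727 units)
assert V * (10727 + 1) == 115089984              # dense_5 softmax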
model = load_model('my_model.h5')

history = model.fit(predictors_pan_tadeusz, label_pan_tadeusz, epochs=1, verbose=1)
744/744 [==============================] - 1501s 2s/step - loss: 1.4722 - accuracy: 0.7626
history = model.fit(predictors_SI, label_SI, epochs=3, verbose=1)
Epoch 1/3
55/55 [==============================] - 98s 2s/step - loss: 4.6245 - accuracy: 0.2131
Epoch 2/3
55/55 [==============================] - 97s 2s/step - loss: 3.9096 - accuracy: 0.2921
Epoch 3/3
55/55 [==============================] - 111s 2s/step - loss: 3.4379 - accuracy: 0.3603
history = model.fit(predictors, label, epochs=1, verbose=1)
799/799 [==============================] - 1105s 1s/step - loss: 1.7071 - accuracy: 0.7451
model.save('my_model.h5')
def predict(text, next_words=25):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]
        # reverse lookup: Tokenizer keeps an index -> word map
        output_word = tokenizer.index_word.get(int(predicted), "")
        text += " " + output_word
    return text
predict("CNN", 24)
'CNN «wielmożni nieruchomi głowę lecz weźmiem na świat ich umiała się wtłoczyć na końcu które w w chleba gałeczki sieci neuronowych i zdolność do generowania'
predict("GANy", 17)
'GANy i w dawnej surowości prawidłach wychował zakazy żołnierszczyzny na sklepieniu sieci neuronowych w w przetwarzaniu języka naturalnego'
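The duplicated words in both samples ("w w") are a typical artifact of greedy argmax decoding. A common remedy is to sample from the softmax with a temperature; the variant below is a sketch of that idea, not part of the original notebook:
def predict_sampled(text, next_words=25, temperature=0.8):
    # like predict(), but samples the next word instead of taking argmax
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        probs = model.predict(token_list, verbose=0)[0]
        # sharpen (T < 1) or flatten (T > 1) the distribution, then renormalize
        logits = np.log(probs + 1e-9) / temperature
        probs = np.exp(logits) / np.sum(np.exp(logits))
        predicted = np.random.choice(len(probs), p=probs)
        text += " " + tokenizer.index_word.get(int(predicted), "")
    return text
predict_sampled("CNN", 24)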