{ "cells": [ { "cell_type": "code", "execution_count": 11, "metadata": { "id": "sMvlO4r-2-dQ" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import tensorflow.keras.utils as ku\n", "from wordcloud import WordCloud\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.optimizers import Adam\n", "from tensorflow.keras import regularizers\n", "from keras.models import load_model" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "Ib8MIaQ33Kqk" }, "outputs": [], "source": [ "data_pan_tadeusz = open('pan-tadeusz.txt', encoding=\"utf8\").read()\n", "data_SI = open('SI_data.txt', encoding=\"utf8\").read()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "wquLVQVj5Tdx" }, "outputs": [], "source": [ "def create_corpus(data):\n", " corpus = data.lower().split(\"\\n\")\n", " corpus = [element.strip() for element in corpus if element]\n", " return corpus" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "ZFiZmIeX8Ifi" }, "outputs": [], "source": [ "corpus_pan_tadeusz = create_corpus(data_pan_tadeusz)[:4000]\n", "corpus_SI = create_corpus(data_SI)\n", "corpus = corpus_pan_tadeusz + corpus_SI" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "2zw0S_vw8Ksf" }, "outputs": [], "source": [ "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(corpus)\n", "total_words = len(tokenizer.word_index)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "VHXURJSO7fBk" }, "outputs": [], "source": [ "def create_input_sequences(corpus):\n", " input_sequences = []\n", " for line in corpus:\n", " token_list = tokenizer.texts_to_sequences([line])[0]\n", "\n", " for i in range(1, len(token_list)):\n", " n_gram_sequence = token_list[:i+1]\n", " input_sequences.append(n_gram_sequence)\n", " return input_sequences" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "id": "Jl3rcom57ptg" }, "outputs": [], "source": [ "input_sequences_pan_tadeusz = create_input_sequences(corpus_pan_tadeusz)\n", "input_sequences_SI = create_input_sequences(corpus_SI)\n", "input_sequences = create_input_sequences(corpus)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "id": "5_ah83de7yfc" }, "outputs": [], "source": [ "max_sequence_len = max([len(x) for x in input_sequences])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "xyz8co7B8SJa" }, "outputs": [], "source": [ "def create_predictors_label(input_sequences, max_sequence_len):\n", " input_sequences = np.array(pad_sequences(input_sequences,\n", " maxlen=max_sequence_len,\n", " padding='pre'))\n", " predictors, label = input_sequences[:, :-1], input_sequences[:, -1]\n", " label = ku.to_categorical(label, num_classes=total_words+1)\n", " return predictors, label" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "id": "pn8D_IT97BWy" }, "outputs": [], "source": [ "predictors_pan_tadeusz, label_pan_tadeusz = create_predictors_label(input_sequences_pan_tadeusz, max_sequence_len)\n", "predictors_SI, label_SI = create_predictors_label(input_sequences_SI, max_sequence_len)\n", "predictors, label = create_predictors_label(input_sequences, max_sequence_len)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { 
"colab": { "base_uri": "https://localhost:8080/" }, "id": "j6gmo0fd8Tvq", "outputId": "a17d4649-9916-42f6-f7dd-75dbeb0dcbd2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_2\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " embedding_2 (Embedding) (None, 75, 100) 1072800 \n", " \n", " bidirectional_2 (Bidirecti (None, 75, 300) 301200 \n", " onal) \n", " \n", " dropout_2 (Dropout) (None, 75, 300) 0 \n", " \n", " lstm_5 (LSTM) (None, 100) 160400 \n", " \n", " dense_4 (Dense) (None, 10727) 1083427 \n", " \n", " dense_5 (Dense) (None, 10728) 115089984 \n", " \n", "=================================================================\n", "Total params: 117707811 (449.02 MB)\n", "Trainable params: 117707811 (449.02 MB)\n", "Non-trainable params: 0 (0.00 Byte)\n", "_________________________________________________________________\n", "None\n" ] } ], "source": [ "# model = Sequential()\n", "# model.add(Embedding(total_words+1, 100,\n", "# input_length=max_sequence_len-1))\n", "# model.add(Bidirectional(LSTM(150, return_sequences=True)))\n", "# model.add(Dropout(0.2))\n", "# model.add(LSTM(100))\n", "# model.add(Dense(total_words+1/2, activation='relu',\n", "# kernel_regularizer=regularizers.l2(0.01)))\n", "# model.add(Dense(total_words+1, activation='softmax'))\n", "# model.compile(loss='categorical_crossentropy',\n", "# optimizer='adam', metrics=['accuracy'])\n", "# print(model.summary())" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From C:\\Users\\Pawel\\anaconda3\\Lib\\site-packages\\keras\\src\\backend.py:1398: The name tf.executing_eagerly_outside_functions is deprecated. 
{ "cell_type": "code", "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UdZmXNVS8aJk", "outputId": "3664d91a-a866-4bee-d6fc-4c320d68f118" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "744/744 [==============================] - 1501s 2s/step - loss: 1.4722 - accuracy: 0.7626\n" ] } ], "source": [ "# Fine-tune for one epoch on the Pan Tadeusz sequences only\n", "history = model.fit(predictors_pan_tadeusz, label_pan_tadeusz, epochs=1, verbose=1)" ] },
{ "cell_type": "code", "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ykyvfDET-PdY", "outputId": "71835132-bd74-4feb-c272-815fa05f8661" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/3\n", "55/55 [==============================] - 98s 2s/step - loss: 4.6245 - accuracy: 0.2131\n", "Epoch 2/3\n", "55/55 [==============================] - 97s 2s/step - loss: 3.9096 - accuracy: 0.2921\n", "Epoch 3/3\n", "55/55 [==============================] - 111s 2s/step - loss: 3.4379 - accuracy: 0.3603\n" ] } ], "source": [ "# Fine-tune for three epochs on the SI sequences only\n", "history = model.fit(predictors_SI, label_SI, epochs=3, verbose=1)" ] },
{ "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "799/799 [==============================] - 1105s 1s/step - loss: 1.7071 - accuracy: 0.7451\n" ] } ], "source": [ "# Fine-tune for one epoch on the combined corpus\n", "history = model.fit(predictors, label, epochs=1, verbose=1)" ] },
{ "cell_type": "code", "execution_count": 49, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HYnWu0yWRA0l", "outputId": "604a19b7-028f-4ac9-a562-67a35965f53d" }, "outputs": [], "source": [ "# Persist the updated weights\n", "model.save('my_model.h5')" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": { "id": "bWWKKkKk8d3i" }, "outputs": [], "source": [ "def predict(text, next_words=25):\n", "    # Greedy generation: repeatedly pick the most likely next word and append it\n", "    for _ in range(next_words):\n", "        token_list = tokenizer.texts_to_sequences([text])[0]\n", "        token_list = pad_sequences(\n", "            [token_list], maxlen=max_sequence_len-1,\n", "            padding='pre')\n", "        predicted = np.argmax(model.predict(token_list,\n", "                                            verbose=0), axis=-1)\n", "        output_word = \"\"\n", "        for word, index in tokenizer.word_index.items():\n", "            if index == predicted:\n", "                output_word = word\n", "                break\n", "\n", "        text += \" \" + output_word\n", "    return text" ] },
{ "cell_type": "code", "execution_count": 56, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 53 }, "id": "bMcMgTh3-EkL", "outputId": "1423e627-4e33-4c41-af41-3a88a53a3b38" }, "outputs": [ { "data": { "text/plain": [ "'CNN «wielmożni nieruchomi głowę lecz weźmiem na świat ich umiała się wtłoczyć na końcu które w w chleba gałeczki sieci neuronowych i zdolność do generowania'" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict(\"CNN\", 24)" ] },
{ "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'GANy i w dawnej surowości prawidłach wychował zakazy żołnierszczyzny na sklepieniu sieci neuronowych w w przetwarzaniu języka naturalnego'" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict(\"GANy\", 17)" ] }
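, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional variant (not part of the original notebook): instead of always taking the\n", "# argmax, sample the next word from the predicted distribution with a temperature,\n", "# which usually yields less repetitive text. Assumes model, tokenizer and\n", "# max_sequence_len from the cells above.\n", "def predict_sampled(text, next_words=25, temperature=0.8):\n", "    for _ in range(next_words):\n", "        token_list = tokenizer.texts_to_sequences([text])[0]\n", "        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')\n", "        probs = model.predict(token_list, verbose=0)[0].astype('float64')\n", "        probs[0] = 0.0  # never sample the padding index\n", "        scaled = np.log(probs + 1e-9) / temperature\n", "        probs = np.exp(scaled) / np.sum(np.exp(scaled))\n", "        next_index = int(np.random.choice(len(probs), p=probs))\n", "        text += ' ' + tokenizer.index_word.get(next_index, '')\n", "    return text\n", "\n", "# predict_sampled('CNN', 24)" ] }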
"codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 1 }