408 lines
11 KiB
Plaintext
408 lines
11 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {
|
|
"id": "sMvlO4r-2-dQ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import tensorflow.keras.utils as ku\n",
|
|
"from wordcloud import WordCloud\n",
|
|
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
|
|
"from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional\n",
|
|
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
|
|
"from tensorflow.keras.models import Sequential\n",
|
|
"from tensorflow.keras.optimizers import Adam\n",
|
|
"from tensorflow.keras import regularizers\n",
|
|
"from keras.models import load_model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"id": "Ib8MIaQ33Kqk"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Read both training texts. The context managers close each file as soon as\n",
"# its contents are in memory instead of leaving the handles open.\n",
"with open('pan-tadeusz.txt', encoding=\"utf8\") as text_file:\n",
"    data_pan_tadeusz = text_file.read()\n",
"with open('SI_data.txt', encoding=\"utf8\") as text_file:\n",
"    data_SI = text_file.read()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"id": "wquLVQVj5Tdx"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"def create_corpus(data):\n",
"    \"\"\"Turn raw text into a list of cleaned, non-empty lines.\n",
"\n",
"    The text is lower-cased and split on newline characters, and every line\n",
"    is stripped of surrounding whitespace. Lines that are empty after\n",
"    stripping are dropped, so whitespace-only lines never show up as ''.\n",
"\n",
"    Args:\n",
"        data: The whole document as a single string.\n",
"\n",
"    Returns:\n",
"        list[str]: The cleaned lines in their original order.\n",
"    \"\"\"\n",
"    # Strip first, filter second: filtering on the raw line would keep\n",
"    # lines that consist only of spaces or tabs.\n",
"    stripped_lines = (line.strip() for line in data.lower().split(\"\\n\"))\n",
"    return [line for line in stripped_lines if line]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {
|
|
"id": "ZFiZmIeX8Ifi"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Pan Tadeusz is capped at its first 4000 cleaned lines; the SI text is used in full.\n",
"corpus_pan_tadeusz = create_corpus(data=data_pan_tadeusz)[:4000]\n",
"corpus_SI = create_corpus(data=data_SI)\n",
"# Combined corpus: the Pan Tadeusz lines first, followed by the SI lines.\n",
"corpus = [*corpus_pan_tadeusz, *corpus_SI]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {
|
|
"id": "2zw0S_vw8Ksf"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Build the vocabulary from the combined corpus.\n",
"tokenizer = Tokenizer()\n",
"tokenizer.fit_on_texts(texts=corpus)\n",
"# Number of distinct words; create_predictors_label uses total_words+1 classes.\n",
"total_words = len(tokenizer.word_index)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"id": "VHXURJSO7fBk"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"def create_input_sequences(corpus):\n",
"    \"\"\"Expand every corpus line into its n-gram prefixes of token ids.\n",
"\n",
"    Each line is converted to token ids with the notebook-level `tokenizer`;\n",
"    for a line with k tokens the prefixes of length 2, 3, ..., k are emitted.\n",
"    Lines that yield fewer than two tokens contribute nothing.\n",
"\n",
"    Args:\n",
"        corpus: Iterable of text lines.\n",
"\n",
"    Returns:\n",
"        list[list[int]]: All prefixes, in corpus order.\n",
"    \"\"\"\n",
"    input_sequences = []\n",
"    for token_list in tokenizer.texts_to_sequences(corpus):\n",
"        input_sequences.extend(\n",
"            token_list[:end] for end in range(2, len(token_list) + 1)\n",
"        )\n",
"    return input_sequences"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"metadata": {
|
|
"id": "Jl3rcom57ptg"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# n-gram prefixes for each corpus on its own and for the combined corpus.\n",
"input_sequences_pan_tadeusz = create_input_sequences(corpus=corpus_pan_tadeusz)\n",
"input_sequences_SI = create_input_sequences(corpus=corpus_SI)\n",
"input_sequences = create_input_sequences(corpus=corpus)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"metadata": {
|
|
"id": "5_ah83de7yfc"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Length of the longest n-gram in the combined corpus; every dataset below is padded to it.\n",
"max_sequence_len = max(map(len, input_sequences))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {
|
|
"id": "xyz8co7B8SJa"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"def create_predictors_label(input_sequences, max_sequence_len):\n",
"    \"\"\"Pad the n-gram sequences and split them into inputs and one-hot targets.\n",
"\n",
"    Sequences are padded at the front ('pre') to `max_sequence_len`. The last\n",
"    column of the padded matrix is the word to predict; all preceding columns\n",
"    are the model input. Targets are one-hot encoded over `total_words+1`\n",
"    classes, where `total_words` is the notebook-level vocabulary size.\n",
"\n",
"    Args:\n",
"        input_sequences: List of token-id lists.\n",
"        max_sequence_len: Length every sequence is padded to.\n",
"\n",
"    Returns:\n",
"        tuple: (predictors, label) - the input columns and the one-hot targets.\n",
"    \"\"\"\n",
"    padded = np.array(\n",
"        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')\n",
"    )\n",
"    predictors = padded[:, :-1]\n",
"    next_word_ids = padded[:, -1]\n",
"    one_hot_label = ku.to_categorical(next_word_ids, num_classes=total_words+1)\n",
"    return predictors, one_hot_label"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {
|
|
"id": "pn8D_IT97BWy"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# All three datasets share max_sequence_len, so their predictor widths are identical.\n",
"predictors_pan_tadeusz, label_pan_tadeusz = create_predictors_label(\n",
"    input_sequences=input_sequences_pan_tadeusz, max_sequence_len=max_sequence_len)\n",
"predictors_SI, label_SI = create_predictors_label(\n",
"    input_sequences=input_sequences_SI, max_sequence_len=max_sequence_len)\n",
"predictors, label = create_predictors_label(\n",
"    input_sequences=input_sequences, max_sequence_len=max_sequence_len)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "j6gmo0fd8Tvq",
|
|
"outputId": "a17d4649-9916-42f6-f7dd-75dbeb0dcbd2"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Model: \"sequential_2\"\n",
|
|
"_________________________________________________________________\n",
|
|
" Layer (type) Output Shape Param # \n",
|
|
"=================================================================\n",
|
|
" embedding_2 (Embedding) (None, 75, 100) 1072800 \n",
|
|
" \n",
|
|
" bidirectional_2 (Bidirecti (None, 75, 300) 301200 \n",
|
|
" onal) \n",
|
|
" \n",
|
|
" dropout_2 (Dropout) (None, 75, 300) 0 \n",
|
|
" \n",
|
|
" lstm_5 (LSTM) (None, 100) 160400 \n",
|
|
" \n",
|
|
" dense_4 (Dense) (None, 10727) 1083427 \n",
|
|
" \n",
|
|
" dense_5 (Dense) (None, 10728) 115089984 \n",
|
|
" \n",
|
|
"=================================================================\n",
|
|
"Total params: 117707811 (449.02 MB)\n",
|
|
"Trainable params: 117707811 (449.02 MB)\n",
|
|
"Non-trainable params: 0 (0.00 Byte)\n",
|
|
"_________________________________________________________________\n",
|
|
"None\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
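"# NOTE(review): reference copy of the architecture; the notebook loads the\n",
"# trained network from 'my_model.h5' instead of rebuilding it here.\n",
"# NOTE(review): in the first Dense layer below, `total_words+1/2` evaluates as\n",
"# `total_words + 0.5`, not `(total_words+1)/2`; the stored summary output\n",
"# (10727 units vs. 10728 in the softmax layer) is consistent with that reading.\n",
"# Confirm the intended width before re-enabling this cell.\n",
"# model = Sequential()\n",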
|
|
"# model.add(Embedding(total_words+1, 100,\n",
|
|
"# input_length=max_sequence_len-1))\n",
|
|
"# model.add(Bidirectional(LSTM(150, return_sequences=True)))\n",
|
|
"# model.add(Dropout(0.2))\n",
|
|
"# model.add(LSTM(100))\n",
|
|
"# model.add(Dense(total_words+1/2, activation='relu',\n",
|
|
"# kernel_regularizer=regularizers.l2(0.01)))\n",
|
|
"# model.add(Dense(total_words+1, activation='softmax'))\n",
|
|
"# model.compile(loss='categorical_crossentropy',\n",
|
|
"# optimizer='adam', metrics=['accuracy'])\n",
|
|
"# print(model.summary())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"WARNING:tensorflow:From C:\\Users\\Pawel\\anaconda3\\Lib\\site-packages\\keras\\src\\backend.py:1398: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead.\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Restore the network from the same file the save cell further down writes to.\n",
"model = load_model(filepath='my_model.h5')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "UdZmXNVS8aJk",
|
|
"outputId": "3664d91a-a866-4bee-d6fc-4c320d68f118"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"744/744 [==============================] - 1501s 2s/step - loss: 1.4722 - accuracy: 0.7626\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# One epoch on the Pan Tadeusz n-grams only; `history` is reassigned by each fit cell.\n",
"history = model.fit(x=predictors_pan_tadeusz, y=label_pan_tadeusz, epochs=1, verbose=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "ykyvfDET-PdY",
|
|
"outputId": "71835132-bd74-4feb-c272-815fa05f8661"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 1/3\n",
|
|
"55/55 [==============================] - 98s 2s/step - loss: 4.6245 - accuracy: 0.2131\n",
|
|
"Epoch 2/3\n",
|
|
"55/55 [==============================] - 97s 2s/step - loss: 3.9096 - accuracy: 0.2921\n",
|
|
"Epoch 3/3\n",
|
|
"55/55 [==============================] - 111s 2s/step - loss: 3.4379 - accuracy: 0.3603\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Three epochs on the SI n-grams only; `history` is reassigned by each fit cell.\n",
"history = model.fit(x=predictors_SI, y=label_SI, epochs=3, verbose=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"799/799 [==============================] - 1105s 1s/step - loss: 1.7071 - accuracy: 0.7451\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# One epoch on the combined corpus; `history` is reassigned by each fit cell.\n",
"history = model.fit(x=predictors, y=label, epochs=1, verbose=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "HYnWu0yWRA0l",
|
|
"outputId": "604a19b7-028f-4ac9-a562-67a35965f53d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"# Overwrites the same file that the load_model cell reads.\n",
"model.save(filepath='my_model.h5')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {
|
|
"id": "bWWKKkKk8d3i"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"def predict(text, next_words=25):\n",
"    \"\"\"Extend `text` by repeatedly appending the model's most likely next word.\n",
"\n",
"    Uses the notebook-level `tokenizer`, `model` and `max_sequence_len`. On\n",
"    every step the current text is tokenised, padded at the front to\n",
"    `max_sequence_len-1`, and the index with the highest predicted score is\n",
"    mapped back to a word.\n",
"\n",
"    Args:\n",
"        text: Seed text to continue.\n",
"        next_words: Maximum number of words to append.\n",
"\n",
"    Returns:\n",
"        str: The seed text followed by the generated words, separated by\n",
"        single spaces. Generation stops early if the predicted index has no\n",
"        word, so the result never ends in dangling spaces.\n",
"    \"\"\"\n",
"    for _ in range(next_words):\n",
"        token_list = tokenizer.texts_to_sequences([text])[0]\n",
"        padded_tokens = pad_sequences(\n",
"            [token_list], maxlen=max_sequence_len-1, padding='pre')\n",
"        scores = model.predict(padded_tokens, verbose=0)\n",
"        # A single input row gives a single prediction; use it as a plain int.\n",
"        predicted_index = int(np.argmax(scores, axis=-1)[0])\n",
"        # index_word is the tokenizer's own reverse mapping, so the whole\n",
"        # word_index does not have to be scanned for every generated word.\n",
"        output_word = tokenizer.index_word.get(predicted_index)\n",
"        if not output_word:\n",
"            # No word for this index (Keras reserves index 0, which has no\n",
"            # word). Appending would only add a space and the next step would\n",
"            # see the same tokens again, so stop here.\n",
"            break\n",
"        text += \" \" + output_word\n",
"    return text"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 56,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 53
|
|
},
|
|
"id": "bMcMgTh3-EkL",
|
|
"outputId": "1423e627-4e33-4c41-af41-3a88a53a3b38"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'CNN «wielmożni nieruchomi głowę lecz weźmiem na świat ich umiała się wtłoczyć na końcu które w w chleba gałeczki sieci neuronowych i zdolność do generowania'"
|
|
]
|
|
},
|
|
"execution_count": 56,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Continue the seed 'CNN' with up to 24 generated words.\n",
"predict(\"CNN\", next_words=24)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 55,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'GANy i w dawnej surowości prawidłach wychował zakazy żołnierszczyzny na sklepieniu sieci neuronowych w w przetwarzaniu języka naturalnego'"
|
|
]
|
|
},
|
|
"execution_count": 55,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Continue the seed 'GANy' with up to 17 generated words.\n",
"predict(\"GANy\", next_words=17)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"accelerator": "GPU",
|
|
"colab": {
|
|
"gpuType": "T4",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|