{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Word2Vec" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import bibliotek" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "from gensim.models import KeyedVectors\n", "from gensim.utils import simple_preprocess\n", "import pandas as pd\n", "import numpy as np\n", "from keras.models import Sequential\n", "from keras.layers import Dense\n", "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Wczytanie danych" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
0Mindaugas Budzinauskas wierzy w odbudowę formy...
1Przyjmujący reprezentacji Polski wrócił do PGE...
2FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...
3Aleksander Filipiak: Czuję się dobrze w nowym ...
4Victoria Carl i Aleksiej Czerwotkin mistrzami ...
\n", "
" ], "text/plain": [ " Text\n", "0 Mindaugas Budzinauskas wierzy w odbudowę formy...\n", "1 Przyjmujący reprezentacji Polski wrócił do PGE...\n", "2 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...\n", "3 Aleksander Filipiak: Czuję się dobrze w nowym ...\n", "4 Victoria Carl i Aleksiej Czerwotkin mistrzami ..." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
0ATP Sztokholm: Juergen Zopp wykorzystał szansę...
1Krowicki z reprezentacją kobiet aż do igrzysk ...
2Wielki powrót Łukasza Kubota Odradza się zawsz...
3Marcel Hirscher wygrał ostatni slalom gigant m...
4Polki do Czarnogóry z pełnią zaangażowania. Sy...
\n", "
" ], "text/plain": [ " Text\n", "0 ATP Sztokholm: Juergen Zopp wykorzystał szansę...\n", "1 Krowicki z reprezentacją kobiet aż do igrzysk ...\n", "2 Wielki powrót Łukasza Kubota Odradza się zawsz...\n", "3 Marcel Hirscher wygrał ostatni slalom gigant m...\n", "4 Polki do Czarnogóry z pełnią zaangażowania. Sy..." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
0Mundial 2018. Były reprezentant Anglii trenere...
1Liga Mistrzyń: Podopieczne Kima Rasmussena bli...
2Wyczerpujące treningi biegowe Justyny Kowalczy...
3Mundial 2018. Zagraniczne media zareagowały na...
4BCL. Artur Gronek: Musimy grać twardziej. Pope...
\n", "
" ], "text/plain": [ " Text\n", "0 Mundial 2018. Były reprezentant Anglii trenere...\n", "1 Liga Mistrzyń: Podopieczne Kima Rasmussena bli...\n", "2 Wyczerpujące treningi biegowe Justyny Kowalczy...\n", "3 Mundial 2018. Zagraniczne media zareagowały na...\n", "4 BCL. Artur Gronek: Musimy grać twardziej. Pope..." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Label
01
11
20
31
40
\n", "
" ], "text/plain": [ " Label\n", "0 1\n", "1 1\n", "2 0\n", "3 1\n", "4 0" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Label
01
11
20
31
41
\n", "
" ], "text/plain": [ " Label\n", "0 1\n", "1 1\n", "2 0\n", "3 1\n", "4 1" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data_train = pd.read_csv('train/train.tsv', sep=\"\\t\", names=[\"Text\"], usecols=[1])\n", "data_test = pd.read_csv('test-A/in.tsv', sep=\"\\t\", names=[\"Text\"])\n", "data_dev = pd.read_csv('dev-0/in.tsv', sep=\"\\t\", names=[\"Text\"])\n", "\n", "labels_train = pd.read_csv('train/train.tsv', sep='\\t', header=None, names=['Label'], usecols=[0])\n", "labels_dev = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None, names=['Label'])\n", "\n", "display(data_train.head())\n", "display(data_test.head())\n", "display(data_dev.head())\n", "display(labels_train.head())\n", "display(labels_dev.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Załadowanie wektorów Word2Vec" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "W2V_model = KeyedVectors.load('fasttext_100_3_polish.bin')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Funkcja przekształcania tekstu na wektory" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "def text_to_vector(text, word2vec, vector_size):\n", " words = simple_preprocess(text)\n", " text_vector = np.zeros(vector_size)\n", " word_count = 0\n", " for word in words:\n", " if word in word2vec.wv:\n", " text_vector += word2vec.wv[word]\n", " word_count += 1\n", " if word_count > 0:\n", " text_vector /= word_count\n", " return text_vector" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dostosowanie formatu danych do modelu" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# Zamiana tekstów na wektory\n", "train_vectors = np.array([text_to_vector(text, W2V_model, 100) for text in data_train['Text']])\n", "dev_vectors = np.array([text_to_vector(text, W2V_model, 100) for text in data_dev['Text']])\n", "test_vectors = np.array([text_to_vector(text, W2V_model, 100) for text in data_test['Text']])\n", "\n", "# Zamiana etykiet na liczby\n", "label_encoder = LabelEncoder()\n", "train_labels_enc = label_encoder.fit_transform(labels_train['Label'])\n", "dev_labels_enc = label_encoder.transform(labels_dev['Label'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stworzenie modelu" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\nkaro\\AppData\\Roaming\\Python\\Python311\\site-packages\\keras\\src\\layers\\core\\dense.py:86: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n", " super().__init__(activity_regularizer=activity_regularizer, **kwargs)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 897us/step - accuracy: 0.9072 - loss: 0.2176 - val_accuracy: 0.9563 - val_loss: 0.1158\n", "Epoch 2/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 847us/step - accuracy: 0.9524 - loss: 0.1215 - val_accuracy: 0.9574 - val_loss: 0.1047\n", "Epoch 3/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 858us/step - accuracy: 0.9581 - loss: 0.1080 - val_accuracy: 0.9618 - val_loss: 0.0956\n", "Epoch 4/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 839us/step - accuracy: 0.9610 - loss: 0.1008 - val_accuracy: 0.9648 - val_loss: 0.0949\n", "Epoch 5/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 850us/step - accuracy: 0.9640 - loss: 0.0951 - val_accuracy: 0.9547 - val_loss: 0.1071\n", "Epoch 6/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 872us/step - accuracy: 0.9643 - loss: 0.0928 - val_accuracy: 0.9631 - val_loss: 0.0913\n", "Epoch 7/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 845us/step - accuracy: 0.9654 - loss: 0.0886 - val_accuracy: 0.9659 - val_loss: 0.0911\n", "Epoch 8/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 841us/step - accuracy: 0.9669 - loss: 0.0860 - val_accuracy: 0.9642 - val_loss: 0.0889\n", "Epoch 9/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 853us/step - accuracy: 0.9666 - loss: 0.0856 - val_accuracy: 0.9642 - val_loss: 0.0855\n", "Epoch 10/10\n", "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 841us/step - accuracy: 0.9676 - loss: 0.0821 - val_accuracy: 0.9666 - val_loss: 0.0883\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Stworzenie modelu\n", "model = Sequential()\n", "model.add(Dense(128, input_dim=100, activation='relu'))\n", "model.add(Dense(64, activation='relu'))\n", "model.add(Dense(1, activation='sigmoid'))\n", "\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "\n", "# Trening modelu\n", "model.fit(train_vectors, train_labels_enc, epochs=10, batch_size=32, validation_data=(dev_vectors, dev_labels_enc))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Predykcja i zapis danych wyjścowych" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step\n", "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 638us/step\n" ] } ], "source": [ "# Predykcje dla danych walidacyjnych\n", "dev_predictions = model.predict(dev_vectors)\n", "dev_predictions = (dev_predictions > 0.5).astype(int)\n", "\n", "# Predykcje dla danych testowych\n", "test_predictions = model.predict(test_vectors)\n", "test_predictions = (test_predictions > 0.5).astype(int)\n", "\n", "# Zapisanie wyników do plików\n", "pd.DataFrame(dev_predictions).to_csv('dev-0/out.tsv', sep='\\t', index=False, header=False)\n", "pd.DataFrame(test_predictions).to_csv('test-A/out.tsv', sep='\\t', index=False, header=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 2 }