{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Importy" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": true },
   "outputs": [],
   "source": [
    "import gzip\n",
    "import math\n",
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from gensim.models import KeyedVectors\n",
    "from keras.layers import Dense, Dropout, Input\n",
    "from keras.models import Sequential\n",
    "from keras.regularizers import l2\n",
    "from keras.utils import set_random_seed\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Konfiguracja" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "# Seed passed to keras.utils.set_random_seed before the model is built, so\n",
    "# that weight initialisation, dropout and batch shuffling are repeatable.\n",
    "SEED = 42\n",
    "\n",
    "# Input / output locations, relative to the notebook's working directory.\n",
    "TRAIN_PATH = 'train/train.tsv.gz'\n",
    "DEV_IN_PATH = 'dev-0/in.tsv'\n",
    "DEV_EXPECTED_PATH = 'dev-0/expected.tsv'\n",
    "DEV_OUT_PATH = 'dev-0/out.tsv'\n",
    "TEST_IN_PATH = 'test-A/in.tsv'\n",
    "TEST_OUT_PATH = 'test-A/out.tsv'\n",
    "WORD2VEC_PATH = 'word2vec_100_3_polish.bin'\n",
    "\n",
    "# Network and training hyper-parameters.\n",
    "HIDDEN_UNITS = (256, 128)\n",
    "DROPOUT_RATE = 0.5\n",
    "L2_PENALTY = 0.001\n",
    "EPOCHS = 35\n",
    "BATCH_SIZE = 32\n",
    "# Sigmoid outputs strictly above this value are written out as class 1.\n",
    "DECISION_THRESHOLD = 0.5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Wczytywanie oraz czyszczenie danych" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "_DIGITS_RE = re.compile(r'\\d+')\n",
    "_PUNCTUATION_RE = re.compile(r'[^\\w\\s]')\n",
    "_WHITESPACE_RE = re.compile(r'\\s+')\n",
    "\n",
    "\n",
    "def load_and_filter_data(file_path):\n",
    "    \"\"\"Read a gzipped, tab-separated ``label<TAB>text`` file into a DataFrame.\n",
    "\n",
    "    Lines that do not split into exactly two tab-separated fields are skipped.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with a ``label`` column (parsed with ``int``) and a ``text`` column.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the first field of a two-field line is not an integer.\n",
    "    \"\"\"\n",
    "    records = []\n",
    "    with gzip.open(file_path, 'rt', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            parts = line.strip().split('\\t')\n",
    "            if len(parts) == 2:\n",
    "                records.append((int(parts[0]), parts[1]))\n",
    "    return pd.DataFrame(records, columns=['label', 'text'])\n",
    "\n",
    "\n",
    "def load_and_filter_tsv(file_path):\n",
    "    \"\"\"Read a one-document-per-line text file into a DataFrame with a ``text`` column.\n",
    "\n",
    "    Exactly one row is produced per input line, in file order, so row i of the\n",
    "    result corresponds to line i of the file. A line that contains a tab is kept\n",
    "    whole instead of being dropped: dropping it would shift every later row\n",
    "    against the label file and make the written predictions shorter than the input.\n",
    "    \"\"\"\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        texts = [line.strip() for line in f]\n",
    "    return pd.DataFrame({'text': texts})\n",
    "\n",
    "\n",
    "def load_labels(file_path):\n",
    "    \"\"\"Read one integer label per line and return them as a NumPy array.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a line (including a blank one) is not an integer.\n",
    "    \"\"\"\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        return np.array([int(line.strip()) for line in f])\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Lower-case ``text`` and remove digits and punctuation.\n",
    "\n",
    "    Punctuation is removed before whitespace is collapsed, and the result is\n",
    "    stripped, so the output has single spaces only and cleaning an already\n",
    "    cleaned string returns it unchanged.\n",
    "    \"\"\"\n",
    "    text = text.lower()\n",
    "    text = _DIGITS_RE.sub('', text)\n",
    "    text = _PUNCTUATION_RE.sub('', text)\n",
    "    return _WHITESPACE_RE.sub(' ', text).strip()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Wczytywanie danych treningowych oraz testowych" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "train_data = load_and_filter_data(TRAIN_PATH)\n",
    "train_data['text'] = train_data['text'].apply(clean_text)\n",
    "dev_data = load_and_filter_tsv(DEV_IN_PATH)\n",
    "dev_data['text'] = dev_data['text'].apply(clean_text)\n",
    "dev_labels = load_labels(DEV_EXPECTED_PATH)\n",
    "test_data = load_and_filter_tsv(TEST_IN_PATH)\n",
    "test_data['text'] = test_data['text'].apply(clean_text)\n",
    "\n",
    "# Validation inputs and labels are matched purely by line position.\n",
    "assert len(dev_data) == len(dev_labels), 'dev-0 inputs and labels differ in length'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Wczytywanie modelu word2vec" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "# KeyedVectors.load reads gensim's own pickle-based save format, so\n",
    "# WORD2VEC_PATH must only ever point at a file from a trusted source.\n",
    "word2vec_model = KeyedVectors.load(WORD2VEC_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Przekształcenie danych na wektory" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "def text_to_vector(text, model):\n",
    "    \"\"\"Average the embeddings of the whitespace-separated words of ``text``.\n",
    "\n",
    "    Words that are not in ``model`` are ignored. If none of the words is known\n",
    "    (or ``text`` is empty), a zero vector of length ``model.vector_size`` is returned.\n",
    "    \"\"\"\n",
    "    word_vecs = [model[word] for word in text.split() if word in model]\n",
    "    if not word_vecs:\n",
    "        return np.zeros(model.vector_size)\n",
    "    return np.mean(word_vecs, axis=0)\n",
    "\n",
    "\n",
    "def texts_to_matrix(texts, model):\n",
    "    \"\"\"Apply ``text_to_vector`` to every text and stack the results, one row per text.\"\"\"\n",
    "    return np.array([text_to_vector(text, model) for text in texts])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "X_train = texts_to_matrix(train_data['text'], word2vec_model)\n",
    "y_train = np.array(train_data['label'])\n",
    "X_dev = texts_to_matrix(dev_data['text'], word2vec_model)\n",
    "X_test = texts_to_matrix(test_data['text'], word2vec_model)\n",
    "\n",
    "assert X_train.shape[0] == y_train.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Model" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "# Seed here (not only once at the top) so that re-running just the model and\n",
    "# training cells starts from the same initial weights every time.\n",
    "set_random_seed(SEED)\n",
    "\n",
    "# An explicit Input layer replaces Dense(..., input_dim=...), which Keras\n",
    "# rejects with a UserWarning for Sequential models.\n",
    "model = Sequential([\n",
    "    Input(shape=(X_train.shape[1],)),\n",
    "    Dense(HIDDEN_UNITS[0], activation='relu', kernel_regularizer=l2(L2_PENALTY)),\n",
    "    Dropout(DROPOUT_RATE),\n",
    "    Dense(HIDDEN_UNITS[1], activation='relu', kernel_regularizer=l2(L2_PENALTY)),\n",
    "    Dropout(DROPOUT_RATE),\n",
    "    Dense(1, activation='sigmoid'),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "\n",
    "# verbose=2 logs one plain line per epoch instead of a progress bar; binding\n",
    "# the History object to a name keeps its repr out of the cell output.\n",
    "history = model.fit(\n",
    "    X_train,\n",
    "    y_train,\n",
    "    epochs=EPOCHS,\n",
    "    batch_size=BATCH_SIZE,\n",
    "    validation_data=(X_dev, dev_labels),\n",
    "    verbose=2,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Ewaluacja modelu na zbiorze walidacyjnym" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "# NOTE: a recorded run of the earlier, unseeded version of this notebook\n",
    "# printed an accuracy of 0.9419 here.\n",
    "loss, accuracy = model.evaluate(X_dev, dev_labels, verbose=0)\n",
    "print(f'Accuracy on validation set: {accuracy}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Predykcja na danych walidacyjnych oraz testowych" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "# Raw sigmoid outputs; kept under their own names so the thresholding cell\n",
    "# below can be re-run without re-predicting.\n",
    "dev_probabilities = model.predict(X_dev, verbose=0)\n",
    "test_probabilities = model.predict(X_test, verbose=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "collapsed": false },
   "source": [ "### Zapis wyników" ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "dev_predictions = (dev_probabilities > DECISION_THRESHOLD).astype(int)\n",
    "test_predictions = (test_probabilities > DECISION_THRESHOLD).astype(int)\n",
    "\n",
    "pd.DataFrame(dev_predictions).to_csv(DEV_OUT_PATH, sep='\\t', header=False, index=False)\n",
    "pd.DataFrame(test_predictions).to_csv(TEST_OUT_PATH, sep='\\t', header=False, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}