{ "cells": [ { "cell_type": "code", "execution_count": 11, "id": "ac13a243-5c5f-4896-86f9-6d0a89e3a7e4", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import string\n", "import nltk\n", "import tensorflow.keras as tf\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "from gensim.models import Word2Vec\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense\n", "from tensorflow.keras.models import Sequential" ] }, { "cell_type": "markdown", "id": "4fcd50b4-0c45-4430-bd9a-3b363fa5ef53", "metadata": {}, "source": [ "## Pobieranie danych i preprocessing " ] }, { "cell_type": "code", "execution_count": 12, "id": "a3bff1c2-bc03-417a-85de-3962c7a64794", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5447\n", "Mindaugas Budzinauskas wierzy w odbudowę formy Kevina Johnsona. Czy ktoś opuści Polpharmę? Mindaugas Budzinauskas w rozmowie z WP SportoweFakty opowiada o transferze Kevina Johnsona, ewentualnych odejściach z Polpharmy i kolejnym meczu PLK z Anwilem. - Potrzebowaliśmy takiego gracza, jak Johnson - podkreśla szkoleniowiec starogardzian. 1\n", "Mundial 2018. Były reprezentant Anglii trenerem Filipin Po niemal trzech latach przerwy Terry Butcher powraca na ławkę trenerską. Były obrońca reprezentacji Anglii został nowym selekcjonerem Filipin. 1\n", "(98132,) \n", "(5452,) \n", "(5447,) \n" ] } ], "source": [ "data = pd.read_csv(r'C:\\Users\\obses\\olympic-games-medals-19862018\\sport-text-classification-ball-ISI-public\\train\\train.tsv', sep='\\t', header=None)\n", "polish_stopwords = pd.read_csv(r'C:\\Users\\obses\\olympic-games-medals-19862018\\sport-text-classification-ball-ISI-public\\train\\polish.stopwords.txt', header=None)\n", "\n", "X_test = pd.read_csv(r'C:\\Users\\obses\\olympic-games-medals-19862018\\sport-text-classification-ball-ISI-public\\dev-0\\in.tsv', sep='\\t', header=None)\n", "Y_test = pd.read_csv(r'C:\\Users\\obses\\olympic-games-medals-19862018\\sport-text-classification-ball-ISI-public\\dev-0\\expected.tsv', sep='\\t', header=None)\n", "X_A_test = pd.read_csv(r'C:\\Users\\obses\\olympic-games-medals-19862018\\sport-text-classification-ball-ISI-public\\test-A\\in.tsv', sep='\\t', header=None)\n", "\n", "X_train = data[data.columns[1]]\n", "Y_train = data[data.columns[0]]\n", "X_A_test = X_A_test[X_A_test.columns[0]]\n", "\n", "print(len(X_A_test))\n", "\n", "X_test = X_test[X_test.columns[0]]\n", "Y_test = Y_test[Y_test.columns[0]]\n", "\n", "print(X_train[0], Y_train[0])\n", "print(X_test[0], Y_test[0])\n", "\n", "def preprocess(text):\n", " text = text.lower()\n", " text = ''.join([word for word in text if word not in string.punctuation])\n", " tokens = word_tokenize(text)\n", " tokens = [word for word in tokens if word not in polish_stopwords]\n", " return ' '.join(tokens)\n", "\n", "X_train = X_train.apply(preprocess)\n", "print(X_train.shape, type(X_train))\n", "\n", "X_test = X_test.apply(preprocess)\n", "print(X_test.shape, type(X_test))\n", "\n", "X_A_test = X_A_test.apply(preprocess)\n", "print(X_A_test.shape, type(X_A_test))" ] }, { "cell_type": "code", "execution_count": 13, "id": "37652a35-e672-470d-9884-27c3b2b9f6c5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "fen 9 zapowiedź walki róża gumienna vs katarzyna posiadała wideo podczas fight exclusive night 9 zmierzą się również kobiety w walce pań na zasadach k1 rękawice skrzyżują róża gumienna i katarzyna posiadała\n" ] } ], "source": [ "# Train the Word2Vec model\n", "print(X_train[2])\n", "sentences = [sentence.split() for sentence in X_train]\n", "w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4)" ] }, { "cell_type": "code", "execution_count": 14, "id": "922f8b39-db22-49d7-801a-d6298b085959", "metadata": {}, "outputs": [], "source": [ "# Tokenize the text data\n", "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(X_train)\n", "tokenizer2 = Tokenizer()\n", "tokenizer2.fit_on_texts(X_test)\n", "tokenizer3 = Tokenizer()\n", "tokenizer3.fit_on_texts(X_A_test)" ] }, { "cell_type": "code", "execution_count": 15, "id": "ae8bed11-9fac-42e8-8f34-91d1e85d6c2f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[4185, 4360, 1275, 1, 22420, 1084, 3211, 6765, 132, 2718, 1879, 5966, 4185, 4360, 1, 561, 2, 73, 88, 2322, 8, 3405, 3211, 6765, 17260, 44122, 2, 1950, 4, 541, 17, 195, 2, 2645, 12760, 1408, 3430, 31, 3076, 1607, 712, 11581]\n", "[164, 27, 96, 364, 1368, 570, 6057, 9, 1124, 165, 386, 1125, 9744, 9745, 5092, 2, 6058, 14853, 96, 1369, 63, 1368, 58, 134, 4400, 6057]\n" ] } ], "source": [ "X_train = tokenizer.texts_to_sequences(X_train)\n", "X_test = tokenizer2.texts_to_sequences(X_test)\n", "X_A_test = tokenizer3.texts_to_sequences(X_A_test)\n", "print(X_train[0])\n", "print(X_test[0])" ] }, { "cell_type": "code", "execution_count": 16, "id": "b629c46d-abdf-4f93-9c5c-bcf8dfb84868", "metadata": {}, "outputs": [], "source": [ "vocab_size = len(tokenizer.word_index) + 1" ] }, { "cell_type": "code", "execution_count": 17, "id": "31a5843c-6bac-401e-b48d-9af88ab8f7b5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 988 4063 1471 114 5 8198 7541 3 618 12447 4040 119\n", " 1887 397 8988 228 326 13865 52570 4063 1783 7311 13 1\n", " 190 1615 46 19 43 5467 12448 5 2 2167 464 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0]\n", "[ 4 90 1497 694 28 8 180 1 9758 1 105 87 5 1086\n", " 420 297 694 4 21 369 1 3109 304 180 13 23 4407 28\n", " 8 1271 87 5 817 1 9758 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0]\n" ] } ], "source": [ "# Pad the sequences to a fixed length\n", "max_length = 100\n", "X_train = pad_sequences(X_train, maxlen=max_length, padding='post')\n", "X_test = pad_sequences(X_test, maxlen=max_length, padding='post')\n", "X_A_test = pad_sequences(X_A_test, maxlen=max_length, padding='post')\n", "print(X_train[10])\n", "print(X_test[10])" ] }, { "cell_type": "code", "execution_count": 18, "id": "d41d15cc-8d3f-4b8a-a09b-e0bcc4f74b22", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(109453, 100)\n" ] } ], "source": [ "# Create a weight matrix for the embedding layer\n", "embedding_matrix = np.zeros((vocab_size, 100))\n", "for word, i in tokenizer.word_index.items():\n", " if word in w2v_model.wv:\n", " embedding_matrix[i] = w2v_model.wv[word]\n", "print(embedding_matrix.shape)" ] }, { "cell_type": "code", "execution_count": 19, "id": "5d0ca2d1-2826-475c-a0e3-8c716e174aaf", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\obses\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\core\\embedding.py:86: UserWarning: Argument `input_length` is deprecated. Just remove it.\n", " warnings.warn(\n" ] } ], "source": [ "from keras.initializers import Constant\n", "from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout\n", "model = Sequential()\n", "\n", "# Define the model\n", "model = Sequential()\n", "\n", "# Embedding layer\n", "model.add(Embedding(\n", " input_dim=vocab_size,\n", " output_dim=100,\n", " embeddings_initializer=Constant(embedding_matrix),\n", " input_length=max_length,\n", " trainable=True # Allow fine-tuning of embeddings\n", "))\n", "\n", "# Add Convolutional layer\n", "model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))\n", "model.add(GlobalMaxPooling1D())\n", "model.add(Dropout(0.5)) #\n", "\n", "model.add(Conv1D(filters=128, kernel_size=5, activation='relu')) #\n", "model.add(GlobalMaxPooling1D()) #\n", "model.add(Dropout(0.5)) #\n", "\n", "# Additional Dense layer with Dropout for regularization\n", "model.add(Dense(units=64, activation='relu'))\n", "model.add(Dropout(0.5))\n", "\n", "# Output layer\n", "model.add(Dense(units=1, activation='sigmoid'))\n", "\n", "# Compile the model\n", "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": 23, "id": "96b7acc7-49ab-4855-929d-8dd3fbbbe760", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(98132, 100) (98132,) (5452, 100) (5452,)\n" ] } ], "source": [ "print(X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)" ] }, { "cell_type": "code", "execution_count": 27, "id": "57022ed2-8d2d-4bbe-bdc6-489d99cda59b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/3\n", "\u001b[1m24533/24533\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1753s\u001b[0m 71ms/step - accuracy: 0.9830 - loss: 0.0604 - val_accuracy: 0.5477 - val_loss: 2.0434\n", "Epoch 2/3\n", "\u001b[1m24533/24533\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1747s\u001b[0m 71ms/step - accuracy: 0.9874 - loss: 0.0461 - val_accuracy: 0.5655 - val_loss: 1.8298\n", "Epoch 3/3\n", "\u001b[1m24533/24533\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1740s\u001b[0m 71ms/step - accuracy: 0.9894 - loss: 0.0409 - val_accuracy: 0.5921 - val_loss: 3.2375\n" ] } ], "source": [ "history = model.fit(X_train, Y_train, epochs=3, batch_size=4, validation_data=(X_test, Y_test) )" ] }, { "cell_type": "code", "execution_count": 33, "id": "e11dd1ad-29ba-4e31-b262-4eb220ebd526", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 2ms/step - accuracy: 0.5817 - loss: 3.4503\n", "Test Accuracy: 59.21%\n", "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 2ms/step\n", "[1 0 1 ... 1 1 1]\n", "[1 0 1 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 0 1]\n", "0.5920763022743947\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from sklearn import metrics\n", "loss, accuracy = model.evaluate(X_test, Y_test)\n", "print(f'Test Accuracy: {accuracy*100:.2f}%')\n", "predictions = model.predict(X_test)\n", "\n", "binary_predictions = (predictions > 0.5).astype(int)\n", "\n", "binary_predictions = binary_predictions.flatten()\n", "\n", "print(binary_predictions)\n", "\n", "print(binary_predictions[:20])\n", "results = metrics.accuracy_score(Y_test, binary_predictions)\n", "print(results)\n", "\n", "# Plot training history\n", "import matplotlib.pyplot as plt\n", "\n", "# Plot training & validation accuracy values\n", "plt.figure(figsize=(12, 4))\n", "plt.subplot(1, 2, 1)\n", "plt.plot(history.history['accuracy'])\n", "plt.plot(history.history['val_accuracy'])\n", "plt.title('Model accuracy')\n", "plt.ylabel('Accuracy')\n", "plt.xlabel('Epoch')\n", "plt.legend(['Train', 'Validation'], loc='upper left')\n", "\n", "# Plot training & validation loss values\n", "plt.subplot(1, 2, 2)\n", "plt.plot(history.history['loss'])\n", "plt.plot(history.history['val_loss'])\n", "plt.title('Model loss')\n", "plt.ylabel('Loss')\n", "plt.xlabel('Epoch')\n", "plt.legend(['Train', 'Validation'], loc='upper left')\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 34, "id": "b126f1f0-1077-4e86-a71c-e351addad912", "metadata": {}, "outputs": [], "source": [ "filename = \"out.tsv\"\n", "\n", "# Open the file in write mode\n", "with open(filename, 'w') as file:\n", " # Loop through each element in the array\n", " for value in binary_predictions:\n", " # Write each element on a new line\n", " file.write(f\"{value}\\n\")" ] }, { "cell_type": "code", "execution_count": 35, "id": "676f55b8-c1bc-44f9-8486-b1fd84c83992", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TSV file 'out.tsv' created successfully.\n" ] } ], "source": [ "print(f\"TSV file '{filename}' created successfully.\")" ] }, { "cell_type": "code", "execution_count": 36, "id": "a57c39a7-baeb-4060-8cc0-78f79a71167b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 2ms/step\n", "5447\n", "[1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1] 5447\n" ] } ], "source": [ "predictions2 = model.predict(X_A_test)\n", "print(len(X_A_test))\n", "binary_predictions2 = (predictions2 > 0.5).astype(int)\n", "\n", "binary_predictions2 = binary_predictions2.flatten()\n", "\n", "print(binary_predictions2[:20], len(binary_predictions2))\n" ] }, { "cell_type": "code", "execution_count": 37, "id": "2b05df7c-8d0f-4d3e-9411-b690c69c0baf", "metadata": {}, "outputs": [], "source": [ "filename = \"outA.tsv\"\n", "\n", "# Open the file in write mode\n", "with open(filename, 'w') as file:\n", " # Loop through each element in the array\n", " for value in binary_predictions2:\n", " # Write each element on a new line\n", " file.write(f\"{value}\\n\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 }