{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Binary text classification with word2vec features\n",
    "\n",
    "Trains word2vec embeddings on the `dev-0` and `test-A` texts, averages them into one vector per document, and fits a small feed-forward network on the `dev-0` labels. Predictions are written to `dev-0/out.tsv` and `test-A/out.tsv`.\n",
    "\n",
    "Run with **Restart Kernel → Run All**; every cell depends only on the cells above it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "from gensim.models import Word2Vec\n",
    "from gensim.utils import simple_preprocess\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tensorflow.keras.callbacks import EarlyStopping\n",
    "from tensorflow.keras.layers import Dense\n",
    "from tensorflow.keras.models import Sequential"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input / output locations, relative to the notebook's working directory.\n",
    "DEV_IN = 'dev-0/in.tsv'\n",
    "DEV_EXPECTED = 'dev-0/expected.tsv'\n",
    "DEV_OUT = 'dev-0/out.tsv'\n",
    "TEST_IN = 'test-A/in.tsv'\n",
    "TEST_OUT = 'test-A/out.tsv'\n",
    "W2V_MODEL_PATH = 'word2vec.model'\n",
    "\n",
    "# Reproducibility and hyperparameters.\n",
    "SEED = 42\n",
    "VECTOR_SIZE = 100\n",
    "VAL_FRACTION = 0.2\n",
    "MAX_EPOCHS = 1000  # upper bound only; early stopping normally ends training sooner\n",
    "EARLY_STOPPING_PATIENCE = 20\n",
    "BATCH_SIZE = 32\n",
    "THRESHOLD = 0.5  # sigmoid output above this value is predicted as class 1\n",
    "\n",
    "# Seeds Python's `random`, NumPy and TensorFlow in one call.\n",
    "tf.keras.utils.set_random_seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_corpus(filepaths):\n",
    "    \"\"\"Tokenize every line of the given files for word2vec training.\n",
    "\n",
    "    Args:\n",
    "        filepaths: Iterable of paths to UTF-8 text files, one document per line.\n",
    "\n",
    "    Returns:\n",
    "        A list with one token list per input line, in file order.\n",
    "    \"\"\"\n",
    "    corpus = []\n",
    "    for filepath in filepaths:\n",
    "        with open(filepath, 'r', encoding='utf8') as file:\n",
    "            corpus.extend(simple_preprocess(line) for line in file)\n",
    "    return corpus\n",
    "\n",
    "\n",
    "def vectorize_text(text, model):\n",
    "    \"\"\"Represent a text as the mean of its in-vocabulary word vectors.\n",
    "\n",
    "    Args:\n",
    "        text: Raw document text.\n",
    "        model: Trained Word2Vec model.\n",
    "\n",
    "    Returns:\n",
    "        A 1-D array of length ``model.vector_size``; all zeros when no token\n",
    "        of the text is in the model's vocabulary.\n",
    "    \"\"\"\n",
    "    vectors = [model.wv[word] for word in simple_preprocess(text) if word in model.wv]\n",
    "    if vectors:\n",
    "        return np.mean(vectors, axis=0)\n",
    "    # Match the embedding dtype so stacking documents never silently upcasts.\n",
    "    return np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)\n",
    "\n",
    "\n",
    "def load_data(filepath):\n",
    "    \"\"\"Read a UTF-8 text file into a list of whitespace-stripped lines.\n",
    "\n",
    "    Blank lines are kept as empty strings, so the result has exactly one\n",
    "    entry per line of the file.\n",
    "    \"\"\"\n",
    "    with open(filepath, 'r', encoding='utf8') as file:\n",
    "        return [line.strip() for line in file]\n",
    "\n",
    "\n",
    "def save_predictions(predictions, filepath):\n",
    "    \"\"\"Write one predicted label per line to ``filepath``.\n",
    "\n",
    "    Args:\n",
    "        predictions: 1-D sequence of labels, or a 2-D array whose first\n",
    "            column holds the labels (the shape Keras ``predict`` returns).\n",
    "        filepath: Destination file; overwritten if it exists.\n",
    "    \"\"\"\n",
    "    labels = np.asarray(predictions)\n",
    "    if labels.ndim > 1:\n",
    "        labels = labels[:, 0]\n",
    "    with open(filepath, 'w', encoding='utf8') as file:\n",
    "        file.writelines(f'{label}\\n' for label in labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train word2vec\n",
    "\n",
    "The embeddings are trained on the unlabeled texts of both `dev-0` and `test-A`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus = prepare_corpus([DEV_IN, TEST_IN])\n",
    "\n",
    "# NOTE: `seed` fixes the initial vectors, but with workers > 1 thread scheduling\n",
    "# can still cause small run-to-run differences (see the gensim Word2Vec docs).\n",
    "w2v_model = Word2Vec(\n",
    "    sentences=corpus,\n",
    "    vector_size=VECTOR_SIZE,\n",
    "    window=5,\n",
    "    min_count=1,\n",
    "    workers=4,\n",
    "    seed=SEED,\n",
    ")\n",
    "w2v_model.save(W2V_MODEL_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load texts and labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_texts = load_data(DEV_IN)\n",
    "test_texts = load_data(TEST_IN)\n",
    "\n",
    "dev_vectors = np.array([vectorize_text(text, w2v_model) for text in dev_texts])\n",
    "test_vectors = np.array([vectorize_text(text, w2v_model) for text in test_texts])\n",
    "\n",
    "dev_labels_df = pd.read_csv(DEV_EXPECTED, sep='\\t', header=None)\n",
    "dev_labels = dev_labels_df[0].values\n",
    "\n",
    "# Features and labels are matched purely by row position, so a length\n",
    "# mismatch would silently pair texts with the wrong labels.\n",
    "assert len(dev_vectors) == len(dev_labels), (\n",
    "    f'{DEV_IN} has {len(dev_vectors)} rows but {DEV_EXPECTED} has {len(dev_labels)} labels'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    dev_vectors, dev_labels, test_size=VAL_FRACTION, random_state=SEED\n",
    ")\n",
    "\n",
    "model_nn = Sequential([\n",
    "    Dense(64, activation='relu'),\n",
    "    Dense(32, activation='relu'),\n",
    "    Dense(1, activation='sigmoid'),\n",
    "])\n",
    "model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "# Stop once validation loss stops improving and roll back to the best weights,\n",
    "# instead of always running MAX_EPOCHS and keeping an overfitted final epoch.\n",
    "early_stopping = EarlyStopping(\n",
    "    monitor='val_loss',\n",
    "    patience=EARLY_STOPPING_PATIENCE,\n",
    "    restore_best_weights=True,\n",
    ")\n",
    "history = model_nn.fit(\n",
    "    X_train,\n",
    "    y_train,\n",
    "    epochs=MAX_EPOCHS,\n",
    "    batch_size=BATCH_SIZE,\n",
    "    validation_data=(X_val, y_val),\n",
    "    callbacks=[early_stopping],\n",
    "    verbose=0,\n",
    ")\n",
    "print(f'Trained for {len(history.epoch)} epochs')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict and save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_probabilities = model_nn.predict(dev_vectors, verbose=0)\n",
    "test_probabilities = model_nn.predict(test_vectors, verbose=0)\n",
    "\n",
    "# Convert sigmoid outputs to hard 0/1 labels.\n",
    "dev_predictions = (dev_probabilities > THRESHOLD).astype(int)\n",
    "test_predictions = (test_probabilities > THRESHOLD).astype(int)\n",
    "\n",
    "save_predictions(dev_predictions, DEV_OUT)\n",
    "save_predictions(test_predictions, TEST_OUT)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate\n",
    "\n",
    "Two figures are reported:\n",
    "\n",
    "- **Validation accuracy** uses only the held-out part of `dev-0`. Those rows were not used to fit the weights, but they did drive early stopping, so treat the number as slightly optimistic.\n",
    "- **Accuracy / classification report** cover all of `dev-0` (the contents of `dev-0/out.tsv`). Most of those rows were used for training, so this overstates how well the model generalizes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_predictions = (model_nn.predict(X_val, verbose=0) > THRESHOLD).astype(int).ravel()\n",
    "val_accuracy = accuracy_score(y_val, val_predictions)\n",
    "print(f'Validation accuracy (held-out {VAL_FRACTION:.0%} of dev-0): {val_accuracy}')\n",
    "\n",
    "# Compare in memory; these are exactly the labels written to DEV_OUT.\n",
    "dev_pred_labels = dev_predictions.ravel()\n",
    "expected_labels = dev_labels\n",
    "\n",
    "accuracy = accuracy_score(expected_labels, dev_pred_labels)\n",
    "report = classification_report(expected_labels, dev_pred_labels)\n",
    "\n",
    "print(f'Accuracy: {accuracy}')\n",
    "print('Classification Report:')\n",
    "print(report)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}