First Word2Vec commit

2024-05-19 19:15:33 +02:00 · 2024-05-19 19:15:33 +02:00 · 73ca11f9d1
commit 73ca11f9d1
8 changed files with 125532 additions and 0 deletions
--- a/Word2Vec2.ipynb
+++ b/Word2Vec2.ipynb
@ -0,0 +1,150 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 676us/step\n",
+      "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 541us/step\n",
+      "Accuracy: 0.9394717534849596\n",
+      "Classification Report:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.92      0.92      0.92      1983\n",
+      "           1       0.95      0.95      0.95      3469\n",
+      "\n",
+      "    accuracy                           0.94      5452\n",
+      "   macro avg       0.93      0.93      0.93      5452\n",
+      "weighted avg       0.94      0.94      0.94      5452\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from gensim.models import Word2Vec\n",
+    "from gensim.utils import simple_preprocess\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "import tensorflow as tf\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "\n",
+    "# Build the tokenised corpus used to train word2vec\n",
+    "def prepare_corpus(filepaths):\n",
+    "    \"\"\"Tokenise every line of every file in ``filepaths``.\n",
+    "\n",
+    "    Each file is opened as UTF-8 text and each of its lines is passed through\n",
+    "    ``simple_preprocess``. The token lists are returned in reading order,\n",
+    "    one list per input line.\n",
+    "    \"\"\"\n",
+    "    sentences = []\n",
+    "    for path in filepaths:\n",
+    "        with open(path, 'r', encoding=\"utf8\") as handle:\n",
+    "            # Consumed inside the ``with`` so the file is still open while reading\n",
+    "            sentences.extend(simple_preprocess(row) for row in handle)\n",
+    "    return sentences\n",
+    "\n",
+    "# Turn a text into a single vector using word2vec\n",
+    "def vectorize_text(text, model):\n",
+    "    \"\"\"Return the mean word2vec embedding of ``text``.\n",
+    "\n",
+    "    The text is tokenised with ``simple_preprocess``; tokens that are not in\n",
+    "    ``model.wv`` are skipped. If no token has a vector, a zero vector of\n",
+    "    length ``model.vector_size`` is returned instead.\n",
+    "    \"\"\"\n",
+    "    tokens = simple_preprocess(text)\n",
+    "    vectors = [model.wv[word] for word in tokens if word in model.wv]\n",
+    "    if vectors:\n",
+    "        return np.mean(vectors, axis=0)\n",
+    "    # Match the dtype of the model's own vectors: a default float64 zero vector\n",
+    "    # would upcast the whole matrix once the per-text rows are stacked.\n",
+    "    return np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)\n",
+    "\n",
+    "# Read a text file into a list of lines\n",
+    "def load_data(filepath):\n",
+    "    \"\"\"Return every line of the UTF-8 file ``filepath`` as a list of strings.\n",
+    "\n",
+    "    Leading and trailing whitespace (including the newline) is stripped from\n",
+    "    each line; blank lines are kept as empty strings.\n",
+    "    \"\"\"\n",
+    "    with open(filepath, 'r', encoding=\"utf8\") as handle:\n",
+    "        return [row.strip() for row in handle]\n",
+    "\n",
+    "# Build the corpus and train the word2vec model\n",
+    "corpus = prepare_corpus(['dev-0/in.tsv', 'test-A/in.tsv'])\n",
+    "# Hyper-parameters kept together so they are easy to find and tune\n",
+    "w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)\n",
+    "w2v_model = Word2Vec(sentences=corpus, **w2v_params)\n",
+    "w2v_model.save(\"word2vec.model\")\n",
+    "\n",
+    "# Load the raw texts\n",
+    "dev_texts, test_texts = (\n",
+    "    load_data(path) for path in ('dev-0/in.tsv', 'test-A/in.tsv')\n",
+    ")\n",
+    "\n",
+    "# Embed every text as one row of a matrix\n",
+    "dev_vectors, test_vectors = (\n",
+    "    np.array([vectorize_text(doc, w2v_model) for doc in docs])\n",
+    "    for docs in (dev_texts, test_texts)\n",
+    ")\n",
+    "\n",
+    "# Load the labels for the dev data (first column of the headerless TSV)\n",
+    "dev_labels_df = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)\n",
+    "dev_labels = dev_labels_df.iloc[:, 0].to_numpy()\n",
+    "\n",
+    "# Split the dev data into training and validation sets (80% / 20%)\n",
+    "X_train, X_val, y_train, y_val = train_test_split(\n",
+    "    dev_vectors,\n",
+    "    dev_labels,\n",
+    "    test_size=0.2,\n",
+    "    random_state=42,\n",
+    ")\n",
+    "\n",
+    "# Build the neural network: two ReLU hidden layers and a sigmoid output unit\n",
+    "model_nn = Sequential()\n",
+    "model_nn.add(Dense(64, activation='relu'))\n",
+    "model_nn.add(Dense(32, activation='relu'))\n",
+    "model_nn.add(Dense(1, activation='sigmoid'))\n",
+    "\n",
+    "model_nn.compile(\n",
+    "    optimizer='adam',\n",
+    "    loss='binary_crossentropy',\n",
+    "    metrics=['accuracy'],\n",
+    ")\n",
+    "\n",
+    "# Train with validation. Without a stopping rule the fit always runs all 1000\n",
+    "# epochs and the validation data is never acted upon, so the network overfits\n",
+    "# the training split. Stop once validation loss has not improved for 10 epochs\n",
+    "# and keep the weights from the best epoch.\n",
+    "early_stopping = tf.keras.callbacks.EarlyStopping(\n",
+    "    monitor='val_loss',\n",
+    "    patience=10,\n",
+    "    restore_best_weights=True,\n",
+    ")\n",
+    "history = model_nn.fit(\n",
+    "    X_train,\n",
+    "    y_train,\n",
+    "    epochs=1000,  # upper bound; early stopping normally ends training sooner\n",
+    "    batch_size=32,\n",
+    "    validation_data=(X_val, y_val),\n",
+    "    callbacks=[early_stopping],\n",
+    "    verbose=0,\n",
+    ")\n",
+    "\n",
+    "# Predict probabilities for the dev and test sets, then threshold at 0.5\n",
+    "# to obtain binary classes (0 or 1)\n",
+    "dev_predictions = (model_nn.predict(dev_vectors) > 0.5).astype(int)\n",
+    "test_predictions = (model_nn.predict(test_vectors) > 0.5).astype(int)\n",
+    "\n",
+    "# Write predictions to files\n",
+    "def save_predictions(predictions, filepath):\n",
+    "    \"\"\"Write the first element of each prediction to ``filepath``, one per line.\n",
+    "\n",
+    "    The file is opened for writing as UTF-8, so any existing content is\n",
+    "    replaced.\n",
+    "    \"\"\"\n",
+    "    with open(filepath, 'w', encoding=\"utf8\") as handle:\n",
+    "        handle.writelines(f\"{row[0]}\\n\" for row in predictions)\n",
+    "\n",
+    "save_predictions(dev_predictions, 'dev-0/out.tsv')\n",
+    "save_predictions(test_predictions, 'test-A/out.tsv')\n",
+    "\n",
+    "# Read the written dev predictions back and load the \"expected\" labels\n",
+    "dev_pred_labels = pd.read_csv('dev-0/out.tsv', header=None).values.flatten()\n",
+    "expected_labels = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None).values.flatten()\n",
+    "\n",
+    "# model_nn was fitted on X_train, which is 80% of dev-0, so a score over the\n",
+    "# whole dev set mostly measures memorised training rows. It is kept only as a\n",
+    "# clearly labelled secondary number.\n",
+    "full_dev_accuracy = accuracy_score(expected_labels, dev_pred_labels)\n",
+    "\n",
+    "# Headline metrics come from the held-out validation split only\n",
+    "val_pred_labels = (model_nn.predict(X_val) > 0.5).astype(int).flatten()\n",
+    "accuracy = accuracy_score(y_val, val_pred_labels)\n",
+    "report = classification_report(y_val, val_pred_labels)\n",
+    "\n",
+    "# Show the accuracy and the classification report\n",
+    "print('Held-out validation split (rows of dev-0 not used for fitting)')\n",
+    "print(f'Accuracy: {accuracy}')\n",
+    "print('Classification Report:')\n",
+    "print(report)\n",
+    "print(f'Accuracy on all of dev-0 (includes training rows, optimistic): {full_dev_accuracy}')\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/train.tsv
+++ b/train/train.tsv
--- a/word2vec.model
+++ b/word2vec.model