First Word2Vec commit

2024-05-19 19:15:33 +02:00 · 2024-05-19 19:15:33 +02:00 · 73ca11f9d1
commit 73ca11f9d1
8 changed files with 125532 additions and 0 deletions
--- a/Word2Vec2.ipynb
+++ b/Word2Vec2.ipynb
@ -0,0 +1,150 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 676us/step\n",
      "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 541us/step\n",
      "Accuracy: 0.9394717534849596\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.92      0.92      0.92      1983\n",
      "           1       0.95      0.95      0.95      3469\n",
      "\n",
      "    accuracy                           0.94      5452\n",
      "   macro avg       0.93      0.93      0.93      5452\n",
      "weighted avg       0.94      0.94      0.94      5452\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from gensim.models import Word2Vec\n",
    "from gensim.utils import simple_preprocess\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Dense\n",
    "\n",
    "# Build the tokenized corpus used to train word2vec\n",
    "def prepare_corpus(filepaths):\n",
    "    \"\"\"Tokenize every line of every file in `filepaths`.\n",
    "\n",
    "    Each line goes through `simple_preprocess` and becomes one token list;\n",
    "    the lists are returned in file order, then line order.\n",
    "    \"\"\"\n",
    "    sentences = []\n",
    "    for path in filepaths:\n",
    "        with open(path, 'r', encoding=\"utf8\") as handle:\n",
    "            sentences.extend(simple_preprocess(row) for row in handle)\n",
    "    return sentences\n",
    "\n",
    "# Turn a text into a single vector using word2vec\n",
    "def vectorize_text(text, model):\n",
    "    \"\"\"Return the mean word2vec vector of the tokens in `text`.\n",
    "\n",
    "    Tokens come from `simple_preprocess`; tokens missing from `model.wv` are\n",
    "    skipped. If no token is found in the model, a zero vector of length\n",
    "    `model.vector_size` is returned instead.\n",
    "    \"\"\"\n",
    "    known = [model.wv[token] for token in simple_preprocess(text) if token in model.wv]\n",
    "    if not known:\n",
    "        return np.zeros(model.vector_size)\n",
    "    return np.mean(known, axis=0)\n",
    "\n",
    "# Load text data, one entry per line\n",
    "def load_data(filepath):\n",
    "    \"\"\"Read `filepath` as UTF-8 and return its lines with surrounding whitespace stripped.\"\"\"\n",
    "    with open(filepath, 'r', encoding=\"utf8\") as handle:\n",
    "        return [row.strip() for row in handle]\n",
    "\n",
    "# Seed Python, NumPy and TensorFlow so repeated runs are comparable\n",
    "tf.keras.utils.set_random_seed(42)\n",
    "\n",
    "# Prepare the corpus and train the word2vec model\n",
    "corpus = prepare_corpus(['dev-0/in.tsv', 'test-A/in.tsv'])\n",
    "# NOTE: `seed` pins the initial vectors; with workers > 1 thread scheduling may\n",
    "# still cause small run-to-run differences in the trained embeddings\n",
    "w2v_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4, seed=42)\n",
    "w2v_model.save(\"word2vec.model\")\n",
    "\n",
    "# Load the texts\n",
    "dev_texts = load_data('dev-0/in.tsv')\n",
    "test_texts = load_data('test-A/in.tsv')\n",
    "\n",
    "# Convert the texts to vectors\n",
    "dev_vectors = np.array([vectorize_text(text, w2v_model) for text in dev_texts])\n",
    "test_vectors = np.array([vectorize_text(text, w2v_model) for text in test_texts])\n",
    "\n",
    "# Load the labels for the dev data\n",
    "dev_labels_df = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None)\n",
    "dev_labels = dev_labels_df[0].values\n",
    "\n",
    "# Split the dev data into training and validation sets\n",
    "X_train, X_val, y_train, y_val = train_test_split(dev_vectors, dev_labels, test_size=0.2, random_state=42)\n",
    "\n",
    "# Build the neural network\n",
    "model_nn = Sequential([\n",
    "    Dense(64, activation='relu'),\n",
    "    Dense(32, activation='relu'),\n",
    "    Dense(1, activation='sigmoid')\n",
    "])\n",
    "\n",
    "model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "# Train with validation; stop once val_loss stops improving instead of always\n",
    "# running all 1000 epochs, and keep the weights from the best epoch\n",
    "early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)\n",
    "history = model_nn.fit(\n",
    "    X_train, y_train,\n",
    "    epochs=1000, batch_size=32,\n",
    "    validation_data=(X_val, y_val),\n",
    "    callbacks=[early_stopping],\n",
    "    verbose=0\n",
    ")\n",
    "\n",
    "# Predictions for the dev and test sets\n",
    "dev_predictions = model_nn.predict(dev_vectors)\n",
    "test_predictions = model_nn.predict(test_vectors)\n",
    "\n",
    "# Convert the predicted probabilities to binary classes (0 or 1)\n",
    "dev_predictions = (dev_predictions > 0.5).astype(int)\n",
    "test_predictions = (test_predictions > 0.5).astype(int)\n",
    "\n",
    "# Write predictions to a file, one label per line\n",
    "def save_predictions(predictions, filepath):\n",
    "    \"\"\"Write the first element of each row of `predictions` to `filepath` (UTF-8), one per line.\"\"\"\n",
    "    with open(filepath, 'w', encoding=\"utf8\") as handle:\n",
    "        handle.writelines(f\"{row[0]}\\n\" for row in predictions)\n",
    "\n",
    "save_predictions(dev_predictions, 'dev-0/out.tsv')\n",
    "save_predictions(test_predictions, 'test-A/out.tsv')\n",
    "\n",
    "# Compare the written predictions with the \"expected\" file.\n",
    "# dev_vectors includes the rows the network was fitted on (X_train), so this\n",
    "# full-dev figure is optimistic and is reported only as a file sanity check.\n",
    "dev_pred_labels = pd.read_csv('dev-0/out.tsv', header=None).values.flatten()\n",
    "expected_labels = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None).values.flatten()\n",
    "full_dev_accuracy = accuracy_score(expected_labels, dev_pred_labels)\n",
    "\n",
    "# Headline metrics come from the held-out validation split only, i.e. rows\n",
    "# that were never used to update the network weights\n",
    "val_pred_labels = (model_nn.predict(X_val) > 0.5).astype(int).flatten()\n",
    "accuracy = accuracy_score(y_val, val_pred_labels)\n",
    "report = classification_report(y_val, val_pred_labels)\n",
    "\n",
    "# Show the accuracy and the classification report\n",
    "print(f'Accuracy: {accuracy}')\n",
    "print('Classification Report:')\n",
    "print(report)\n",
    "print(f'Full dev-0 accuracy (includes training rows): {full_dev_accuracy}')\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/train.tsv
+++ b/train/train.tsv
--- a/word2vec.model
+++ b/word2vec.model