word2vec_dl/word2vec.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from gensim.models import Word2Vec\n",
    "from gensim.utils import simple_preprocess\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_0_in = \"./sport-text-classification-ball-ISI-public/dev-0/in.tsv\"\n",
    "test_A_in = \"./sport-text-classification-ball-ISI-public/test-A/in.tsv\"\n",
    "\n",
    "dev_0_out = \"./sport-text-classification-ball-ISI-public/dev-0/out.tsv\"\n",
    "test_A_out = \"./sport-text-classification-ball-ISI-public/test-A/out.tsv\"\n",
    "\n",
    "train = \"./sport-text-classification-ball-ISI-public/train/train.tsv\"\n",
    "expected = \"./sport-text-classification-ball-ISI-public/dev-0/expected.tsv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_corpus(file_list):\n",
    "    documents = []\n",
    "    for file in file_list:\n",
    "        with open(file, 'r', encoding=\"utf8\") as f:\n",
    "            for line in f:\n",
    "                processed_line = simple_preprocess(line)\n",
    "                documents.append(processed_line)\n",
    "    return documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def text_to_vector(text, model):\n",
    "    tokens = simple_preprocess(text)\n",
    "    word_vectors = [model.wv[token] for token in tokens if token in model.wv]\n",
    "    if word_vectors:\n",
    "        return np.mean(word_vectors, axis=0)\n",
    "    else:\n",
    "        return np.zeros(model.vector_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_text(filepath):\n",
    "    lines = []\n",
    "    with open(filepath, 'r', encoding=\"utf8\") as file:\n",
    "        for line in file:\n",
    "            lines.append(line.strip())\n",
    "    return lines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_predictions(predictions, filepath):\n",
    "    with open(filepath, 'w', encoding=\"utf8\") as file:\n",
    "        for prediction in predictions:\n",
    "            file.write(f\"{prediction[0]}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = build_corpus([dev_0_in, test_A_in])\n",
    "w2v_model = Word2Vec(sentences=documents, vector_size=100, window=5, min_count=1, workers=4)\n",
    "w2v_model.save(\"word2vec.model\")\n",
    "\n",
    "dev_texts = read_text(dev_0_in)\n",
    "test_texts = read_text(test_A_in)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_features = np.array([text_to_vector(text, w2v_model) for text in dev_texts])\n",
    "test_features = np.array([text_to_vector(text, w2v_model) for text in test_texts])\n",
    "\n",
    "dev_labels = pd.read_csv(expected, sep='\\t', header=None).values.flatten()\n",
    "X_train, X_valid, y_train, y_valid = train_test_split(dev_features, dev_labels, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoka [100/1000], Loss: 0.3149, Validation Loss: 0.3540\n",
      "Epoka [200/1000], Loss: 0.2778, Validation Loss: 0.3339\n",
      "Epoka [300/1000], Loss: 0.2638, Validation Loss: 0.3201\n",
      "Epoka [400/1000], Loss: 0.2511, Validation Loss: 0.3047\n",
      "Epoka [500/1000], Loss: 0.2408, Validation Loss: 0.2913\n",
      "Epoka [600/1000], Loss: 0.2321, Validation Loss: 0.2807\n",
      "Epoka [700/1000], Loss: 0.2243, Validation Loss: 0.2718\n",
      "Epoka [800/1000], Loss: 0.2182, Validation Loss: 0.2654\n",
      "Epoka [900/1000], Loss: 0.2136, Validation Loss: 0.2605\n",
      "Epoka [1000/1000], Loss: 0.2101, Validation Loss: 0.2573\n"
     ]
    }
   ],
   "source": [
    "X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n",
    "y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)\n",
    "X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)\n",
    "y_valid_tensor = torch.tensor(y_valid, dtype=torch.float32).unsqueeze(1)\n",
    "dev_features_tensor = torch.tensor(dev_features, dtype=torch.float32)\n",
    "test_features_tensor = torch.tensor(test_features, dtype=torch.float32)\n",
    "\n",
    "class SimpleNN(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(SimpleNN, self).__init__()\n",
    "        self.fc1 = nn.Linear(100, 64)\n",
    "        self.fc2 = nn.Linear(64, 32)\n",
    "        self.fc3 = nn.Linear(32, 1)\n",
    "        self.relu = nn.ReLU()\n",
    "        self.sigmoid = nn.Sigmoid()\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.relu(self.fc1(x))\n",
    "        x = self.relu(self.fc2(x))\n",
    "        x = self.sigmoid(self.fc3(x))\n",
    "        return x\n",
    "\n",
    "model = SimpleNN()\n",
    "criterion = nn.BCELoss()\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
    "\n",
    "num_epochs = 1000\n",
    "batch_size = 32\n",
    "for epoch in range(num_epochs):\n",
    "    model.train()\n",
    "    optimizer.zero_grad()\n",
    "    \n",
    "    outputs = model(X_train_tensor)\n",
    "    loss = criterion(outputs, y_train_tensor)\n",
    "    \n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "    \n",
    "    if (epoch+1) % 100 == 0:\n",
    "        model.eval()\n",
    "        with torch.no_grad():\n",
    "            valid_outputs = model(X_valid_tensor)\n",
    "            valid_loss = criterion(valid_outputs, y_valid_tensor)\n",
    "            print(f'Epoka [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {valid_loss.item():.4f}')\n",
    "\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    dev_predictions_raw = model(dev_features_tensor).numpy()\n",
    "    test_predictions_raw = model(test_features_tensor).numpy()\n",
    "\n",
    "dev_predictions = (dev_predictions_raw > 0.5).astype(int)\n",
    "test_predictions = (test_predictions_raw > 0.5).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_predictions(dev_predictions, dev_0_out)\n",
    "save_predictions(test_predictions,test_A_out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dokładność: 0.8995\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.88      0.84      0.86      1983\n",
      "           1       0.91      0.93      0.92      3469\n",
      "\n",
      "    accuracy                           0.90      5452\n",
      "   macro avg       0.89      0.89      0.89      5452\n",
      "weighted avg       0.90      0.90      0.90      5452\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv(dev_0_out, header=None).values.flatten()\n",
    "\n",
    "accuracy = accuracy_score(dev_labels, df)\n",
    "report = classification_report(dev_labels, df)\n",
    "\n",
    "print(f\"Dokładność: {accuracy:.4f}\")\n",
    "print(report)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
first commit 2024-09-27 04:17:42 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import pandas as pd\n",`
			`"import numpy as np\n",`
			`"from gensim.models import Word2Vec\n",`
			`"from gensim.utils import simple_preprocess\n",`
			`"from sklearn.metrics import accuracy_score, classification_report\n",`
			`"from sklearn.model_selection import train_test_split\n",`
			`"from sklearn.metrics import accuracy_score, classification_report\n",`
			`"import torch\n",`
			`"import torch.nn as nn\n",`
			`"import torch.optim as optim"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"dev_0_in = \"./sport-text-classification-ball-ISI-public/dev-0/in.tsv\"\n",`
			`"test_A_in = \"./sport-text-classification-ball-ISI-public/test-A/in.tsv\"\n",`
			`"\n",`
			`"dev_0_out = \"./sport-text-classification-ball-ISI-public/dev-0/out.tsv\"\n",`
			`"test_A_out = \"./sport-text-classification-ball-ISI-public/test-A/out.tsv\"\n",`
			`"\n",`
			`"train = \"./sport-text-classification-ball-ISI-public/train/train.tsv\"\n",`
			`"expected = \"./sport-text-classification-ball-ISI-public/dev-0/expected.tsv\""`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def build_corpus(file_list):\n",`
			`" documents = []\n",`
			`" for file in file_list:\n",`
			`" with open(file, 'r', encoding=\"utf8\") as f:\n",`
			`" for line in f:\n",`
			`" processed_line = simple_preprocess(line)\n",`
			`" documents.append(processed_line)\n",`
			`" return documents"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 4,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def text_to_vector(text, model):\n",`
			`" tokens = simple_preprocess(text)\n",`
			`" word_vectors = [model.wv[token] for token in tokens if token in model.wv]\n",`
			`" if word_vectors:\n",`
			`" return np.mean(word_vectors, axis=0)\n",`
			`" else:\n",`
			`" return np.zeros(model.vector_size)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def read_text(filepath):\n",`
			`" lines = []\n",`
			`" with open(filepath, 'r', encoding=\"utf8\") as file:\n",`
			`" for line in file:\n",`
			`" lines.append(line.strip())\n",`
			`" return lines"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def save_predictions(predictions, filepath):\n",`
			`" with open(filepath, 'w', encoding=\"utf8\") as file:\n",`
			`" for prediction in predictions:\n",`
			`" file.write(f\"{prediction[0]}\\n\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 9,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"documents = build_corpus([dev_0_in, test_A_in])\n",`
			`"w2v_model = Word2Vec(sentences=documents, vector_size=100, window=5, min_count=1, workers=4)\n",`
			`"w2v_model.save(\"word2vec.model\")\n",`
			`"\n",`
			`"dev_texts = read_text(dev_0_in)\n",`
			`"test_texts = read_text(test_A_in)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 14,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"dev_features = np.array([text_to_vector(text, w2v_model) for text in dev_texts])\n",`
			`"test_features = np.array([text_to_vector(text, w2v_model) for text in test_texts])\n",`
			`"\n",`
			`"dev_labels = pd.read_csv(expected, sep='\\t', header=None).values.flatten()\n",`
			`"X_train, X_valid, y_train, y_valid = train_test_split(dev_features, dev_labels, test_size=0.2, random_state=42)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 24,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Epoka [100/1000], Loss: 0.3149, Validation Loss: 0.3540\n",`
			`"Epoka [200/1000], Loss: 0.2778, Validation Loss: 0.3339\n",`
			`"Epoka [300/1000], Loss: 0.2638, Validation Loss: 0.3201\n",`
			`"Epoka [400/1000], Loss: 0.2511, Validation Loss: 0.3047\n",`
			`"Epoka [500/1000], Loss: 0.2408, Validation Loss: 0.2913\n",`
			`"Epoka [600/1000], Loss: 0.2321, Validation Loss: 0.2807\n",`
			`"Epoka [700/1000], Loss: 0.2243, Validation Loss: 0.2718\n",`
			`"Epoka [800/1000], Loss: 0.2182, Validation Loss: 0.2654\n",`
			`"Epoka [900/1000], Loss: 0.2136, Validation Loss: 0.2605\n",`
			`"Epoka [1000/1000], Loss: 0.2101, Validation Loss: 0.2573\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n",`
			`"y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)\n",`
			`"X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)\n",`
			`"y_valid_tensor = torch.tensor(y_valid, dtype=torch.float32).unsqueeze(1)\n",`
			`"dev_features_tensor = torch.tensor(dev_features, dtype=torch.float32)\n",`
			`"test_features_tensor = torch.tensor(test_features, dtype=torch.float32)\n",`
			`"\n",`
			`"class SimpleNN(nn.Module):\n",`
			`" def __init__(self):\n",`
			`" super(SimpleNN, self).__init__()\n",`
			`" self.fc1 = nn.Linear(100, 64)\n",`
			`" self.fc2 = nn.Linear(64, 32)\n",`
			`" self.fc3 = nn.Linear(32, 1)\n",`
			`" self.relu = nn.ReLU()\n",`
			`" self.sigmoid = nn.Sigmoid()\n",`
			`"\n",`
			`" def forward(self, x):\n",`
			`" x = self.relu(self.fc1(x))\n",`
			`" x = self.relu(self.fc2(x))\n",`
			`" x = self.sigmoid(self.fc3(x))\n",`
			`" return x\n",`
			`"\n",`
			`"model = SimpleNN()\n",`
			`"criterion = nn.BCELoss()\n",`
			`"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",`
			`"\n",`
			`"num_epochs = 1000\n",`
			`"batch_size = 32\n",`
			`"for epoch in range(num_epochs):\n",`
			`" model.train()\n",`
			`" optimizer.zero_grad()\n",`
			`" \n",`
			`" outputs = model(X_train_tensor)\n",`
			`" loss = criterion(outputs, y_train_tensor)\n",`
			`" \n",`
			`" loss.backward()\n",`
			`" optimizer.step()\n",`
			`" \n",`
			`" if (epoch+1) % 100 == 0:\n",`
			`" model.eval()\n",`
			`" with torch.no_grad():\n",`
			`" valid_outputs = model(X_valid_tensor)\n",`
			`" valid_loss = criterion(valid_outputs, y_valid_tensor)\n",`
			`" print(f'Epoka [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {valid_loss.item():.4f}')\n",`
			`"\n",`
			`"model.eval()\n",`
			`"with torch.no_grad():\n",`
			`" dev_predictions_raw = model(dev_features_tensor).numpy()\n",`
			`" test_predictions_raw = model(test_features_tensor).numpy()\n",`
			`"\n",`
			`"dev_predictions = (dev_predictions_raw > 0.5).astype(int)\n",`
			`"test_predictions = (test_predictions_raw > 0.5).astype(int)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 20,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"save_predictions(dev_predictions, dev_0_out)\n",`
			`"save_predictions(test_predictions,test_A_out)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 23,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Dokładność: 0.8995\n",`
			`" precision recall f1-score support\n",`
			`"\n",`
			`" 0 0.88 0.84 0.86 1983\n",`
			`" 1 0.91 0.93 0.92 3469\n",`
			`"\n",`
			`" accuracy 0.90 5452\n",`
			`" macro avg 0.89 0.89 0.89 5452\n",`
			`"weighted avg 0.90 0.90 0.90 5452\n",`
			`"\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"df = pd.read_csv(dev_0_out, header=None).values.flatten()\n",`
			`"\n",`
			`"accuracy = accuracy_score(dev_labels, df)\n",`
			`"report = classification_report(dev_labels, df)\n",`
			`"\n",`
			`"print(f\"Dokładność: {accuracy:.4f}\")\n",`
			`"print(report)"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.12.6"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`