first commit

2024-09-27 04:17:42 +02:00 · 2024-09-27 04:17:42 +02:00 · 7f82b14c1e
commit 7f82b14c1e
10 changed files with 125645 additions and 0 deletions
--- a/sport-text-classification-ball-ISI-public/config.txt
+++ b/sport-text-classification-ball-ISI-public/config.txt
@ -0,0 +1 @@
+--metric Likelihood --metric Accuracy --precision 5
--- a/sport-text-classification-ball-ISI-public/dev-0/expected.tsv
+++ b/sport-text-classification-ball-ISI-public/dev-0/expected.tsv
--- a/sport-text-classification-ball-ISI-public/dev-0/in.tsv
+++ b/sport-text-classification-ball-ISI-public/dev-0/in.tsv
--- a/sport-text-classification-ball-ISI-public/dev-0/out.tsv
+++ b/sport-text-classification-ball-ISI-public/dev-0/out.tsv
--- a/sport-text-classification-ball-ISI-public/test-A/in.tsv
+++ b/sport-text-classification-ball-ISI-public/test-A/in.tsv
--- a/sport-text-classification-ball-ISI-public/test-A/out.tsv
+++ b/sport-text-classification-ball-ISI-public/test-A/out.tsv
--- a/sport-text-classification-ball-ISI-public/train/train.tsv
+++ b/sport-text-classification-ball-ISI-public/train/train.tsv
--- a/sport-text-classification-ball-ISI-public/train/train.tsv.gz
+++ b/sport-text-classification-ball-ISI-public/train/train.tsv.gz
--- a/word2vec.ipynb
+++ b/word2vec.ipynb
@ -0,0 +1,262 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from gensim.models import Word2Vec\n",
+    "from gensim.utils import simple_preprocess\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.optim as optim"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Input text files; the helper functions below read them line by line.\n",
+    "dev_0_in = \"./sport-text-classification-ball-ISI-public/dev-0/in.tsv\"\n",
+    "test_A_in = \"./sport-text-classification-ball-ISI-public/test-A/in.tsv\"\n",
+    "\n",
+    "# Output files that receive the predicted labels.\n",
+    "dev_0_out = \"./sport-text-classification-ball-ISI-public/dev-0/out.tsv\"\n",
+    "test_A_out = \"./sport-text-classification-ball-ISI-public/test-A/out.tsv\"\n",
+    "\n",
+    "# NOTE(review): `train` is defined here but no other visible cell reads it.\n",
+    "train = \"./sport-text-classification-ball-ISI-public/train/train.tsv\"\n",
+    "# Gold labels for dev-0; loaded with pandas further down.\n",
+    "expected = \"./sport-text-classification-ball-ISI-public/dev-0/expected.tsv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_corpus(file_list):\n",
+    "    \"\"\"Tokenize every line of every listed file into one corpus.\n",
+    "\n",
+    "    Each line is passed through ``simple_preprocess``; the result is a list\n",
+    "    holding one token list per input line, in the order the files are given.\n",
+    "    \"\"\"\n",
+    "    corpus = []\n",
+    "    for path in file_list:\n",
+    "        with open(path, 'r', encoding=\"utf8\") as handle:\n",
+    "            corpus.extend(simple_preprocess(row) for row in handle)\n",
+    "    return corpus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def text_to_vector(text, model):\n",
+    "    \"\"\"Embed ``text`` as the mean of its in-vocabulary word vectors.\n",
+    "\n",
+    "    Tokens produced by ``simple_preprocess`` that are not in ``model.wv`` are\n",
+    "    skipped. When no token is known, a zero vector of length\n",
+    "    ``model.vector_size`` is returned instead.\n",
+    "    \"\"\"\n",
+    "    known = [model.wv[tok] for tok in simple_preprocess(text) if tok in model.wv]\n",
+    "    if not known:\n",
+    "        return np.zeros(model.vector_size)\n",
+    "    return np.mean(known, axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_text(filepath):\n",
+    "    \"\"\"Return the lines of a UTF-8 file with surrounding whitespace stripped.\"\"\"\n",
+    "    with open(filepath, 'r', encoding=\"utf8\") as file:\n",
+    "        return [raw.strip() for raw in file]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def save_predictions(predictions, filepath):\n",
+    "    \"\"\"Write the first element of each prediction to ``filepath``, one per line.\"\"\"\n",
+    "    with open(filepath, 'w', encoding=\"utf8\") as file:\n",
+    "        # Lazy generator: rows are formatted and written one at a time.\n",
+    "        file.writelines(f\"{item[0]}\\n\" for item in predictions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train Word2Vec on the tokenized dev-0 and test-A inputs and save it to disk.\n",
+    "# NOTE(review): with workers=4 the learned vectors are presumably not identical\n",
+    "# between runs; confirm against the gensim documentation if reproducibility matters.\n",
+    "documents = build_corpus([dev_0_in, test_A_in])\n",
+    "w2v_model = Word2Vec(sentences=documents, vector_size=100, window=5, min_count=1, workers=4)\n",
+    "w2v_model.save(\"word2vec.model\")\n",
+    "\n",
+    "# Stripped raw lines of both inputs, kept in file order for feature extraction.\n",
+    "dev_texts = read_text(dev_0_in)\n",
+    "test_texts = read_text(test_A_in)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One averaged Word2Vec vector per input line.\n",
+    "dev_features = np.array([text_to_vector(text, w2v_model) for text in dev_texts])\n",
+    "test_features = np.array([text_to_vector(text, w2v_model) for text in test_texts])\n",
+    "\n",
+    "# Gold labels for dev-0, flattened to a 1-D array.\n",
+    "dev_labels = pd.read_csv(expected, sep='\\t', header=None).values.flatten()\n",
+    "# Hold out 20% of dev-0 for validation; the classifier below is fit on the other 80%.\n",
+    "X_train, X_valid, y_train, y_valid = train_test_split(dev_features, dev_labels, test_size=0.2, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoka [100/1000], Loss: 0.3149, Validation Loss: 0.3540\n",
+      "Epoka [200/1000], Loss: 0.2778, Validation Loss: 0.3339\n",
+      "Epoka [300/1000], Loss: 0.2638, Validation Loss: 0.3201\n",
+      "Epoka [400/1000], Loss: 0.2511, Validation Loss: 0.3047\n",
+      "Epoka [500/1000], Loss: 0.2408, Validation Loss: 0.2913\n",
+      "Epoka [600/1000], Loss: 0.2321, Validation Loss: 0.2807\n",
+      "Epoka [700/1000], Loss: 0.2243, Validation Loss: 0.2718\n",
+      "Epoka [800/1000], Loss: 0.2182, Validation Loss: 0.2654\n",
+      "Epoka [900/1000], Loss: 0.2136, Validation Loss: 0.2605\n",
+      "Epoka [1000/1000], Loss: 0.2101, Validation Loss: 0.2573\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Convert the NumPy arrays to float32 tensors. Labels get a trailing dimension\n",
+    "# via unsqueeze(1) so they line up with the single-unit output of the network.\n",
+    "X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n",
+    "y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)\n",
+    "X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)\n",
+    "y_valid_tensor = torch.tensor(y_valid, dtype=torch.float32).unsqueeze(1)\n",
+    "# Full dev-0 and test-A feature matrices, used for the final predictions.\n",
+    "dev_features_tensor = torch.tensor(dev_features, dtype=torch.float32)\n",
+    "test_features_tensor = torch.tensor(test_features, dtype=torch.float32)\n",
+    "\n",
+    "class SimpleNN(nn.Module):\n",
+    "    \"\"\"Feed-forward binary classifier: input -> 64 -> 32 -> 1 with a sigmoid output.\n",
+    "\n",
+    "    Args:\n",
+    "        input_dim: Length of each input feature vector. Defaults to 100, the\n",
+    "            Word2Vec ``vector_size`` used in this notebook, so ``SimpleNN()``\n",
+    "            keeps working unchanged.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, input_dim=100):\n",
+    "        super().__init__()\n",
+    "        # Attribute names are kept so existing state_dict keys still match.\n",
+    "        self.fc1 = nn.Linear(input_dim, 64)\n",
+    "        self.fc2 = nn.Linear(64, 32)\n",
+    "        self.fc3 = nn.Linear(32, 1)\n",
+    "        self.relu = nn.ReLU()\n",
+    "        self.sigmoid = nn.Sigmoid()\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        \"\"\"Map a ``(N, input_dim)`` batch to ``(N, 1)`` sigmoid activations.\"\"\"\n",
+    "        x = self.relu(self.fc1(x))\n",
+    "        x = self.relu(self.fc2(x))\n",
+    "        x = self.sigmoid(self.fc3(x))\n",
+    "        return x\n",
+    "\n",
+    "# Seed PyTorch before the model is built so the weight initialisation is the\n",
+    "# same on every run (same seed value as the train/validation split).\n",
+    "torch.manual_seed(42)\n",
+    "\n",
+    "model = SimpleNN()\n",
+    "criterion = nn.BCELoss()\n",
+    "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
+    "\n",
+    "num_epochs = 1000\n",
+    "# NOTE: batch_size is not used below; each epoch is one full-batch gradient step.\n",
+    "batch_size = 32\n",
+    "for epoch in range(num_epochs):\n",
+    "    model.train()\n",
+    "    optimizer.zero_grad()\n",
+    "\n",
+    "    outputs = model(X_train_tensor)\n",
+    "    loss = criterion(outputs, y_train_tensor)\n",
+    "\n",
+    "    loss.backward()\n",
+    "    optimizer.step()\n",
+    "\n",
+    "    # Report training and validation loss every 100 epochs.\n",
+    "    if (epoch+1) % 100 == 0:\n",
+    "        model.eval()\n",
+    "        with torch.no_grad():\n",
+    "            valid_outputs = model(X_valid_tensor)\n",
+    "            valid_loss = criterion(valid_outputs, y_valid_tensor)\n",
+    "            print(f'Epoka [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {valid_loss.item():.4f}')\n",
+    "\n",
+    "# Final inference on the complete dev-0 and test-A feature matrices.\n",
+    "# dev_features includes the rows the model was trained on, so metrics computed\n",
+    "# from dev_predictions are not a held-out estimate.\n",
+    "model.eval()\n",
+    "with torch.no_grad():\n",
+    "    dev_predictions_raw = model(dev_features_tensor).numpy()\n",
+    "    test_predictions_raw = model(test_features_tensor).numpy()\n",
+    "\n",
+    "# Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels.\n",
+    "dev_predictions = (dev_predictions_raw > 0.5).astype(int)\n",
+    "test_predictions = (test_predictions_raw > 0.5).astype(int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Persist the thresholded 0/1 predictions, one label per line.\n",
+    "save_predictions(dev_predictions, dev_0_out)\n",
+    "save_predictions(test_predictions,test_A_out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dokładność: 0.8995\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.88      0.84      0.86      1983\n",
+      "           1       0.91      0.93      0.92      3469\n",
+      "\n",
+      "    accuracy                           0.90      5452\n",
+      "   macro avg       0.89      0.89      0.89      5452\n",
+      "weighted avg       0.90      0.90      0.90      5452\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Re-read the saved dev-0 predictions (despite its name, `df` is a flat NumPy array).\n",
+    "df = pd.read_csv(dev_0_out, header=None).values.flatten()\n",
+    "\n",
+    "# NOTE(review): dev_labels covers all of dev-0, including the 80% used to fit the\n",
+    "# classifier, so these scores are not a held-out estimate.\n",
+    "accuracy = accuracy_score(dev_labels, df)\n",
+    "report = classification_report(dev_labels, df)\n",
+    "\n",
+    "print(f\"Dokładność: {accuracy:.4f}\")\n",
+    "print(report)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/word2vec.model
+++ b/word2vec.model
				`@ -0,0 +1 @@`
				`--metric Likelihood --metric Accuracy --precision 5`