{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sport text classification: Word2Vec document vectors + feed-forward classifier\n",
    "\n",
    "Pipeline implemented below:\n",
    "\n",
    "1. Train a Word2Vec model on the `dev-0` and `test-A` input texts.\n",
    "2. Represent every document as the mean of its word vectors.\n",
    "3. Fit a small feed-forward network on an 80/20 split of `dev-0` (labels come from `expected.tsv`).\n",
    "4. Write predictions for `dev-0` and `test-A` to their `out.tsv` files and report accuracy.\n",
    "\n",
    "Outputs were cleared after the last code change; use **Restart Kernel → Run All** to regenerate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "from gensim.models import Word2Vec\n",
    "from gensim.utils import simple_preprocess\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_0_in = \"./sport-text-classification-ball-ISI-public/dev-0/in.tsv\"\n",
    "test_A_in = \"./sport-text-classification-ball-ISI-public/test-A/in.tsv\"\n",
    "\n",
    "dev_0_out = \"./sport-text-classification-ball-ISI-public/dev-0/out.tsv\"\n",
    "test_A_out = \"./sport-text-classification-ball-ISI-public/test-A/out.tsv\"\n",
    "\n",
    "# `train` is defined for completeness but is not read anywhere in this notebook.\n",
    "train = \"./sport-text-classification-ball-ISI-public/train/train.tsv\"\n",
    "expected = \"./sport-text-classification-ball-ISI-public/dev-0/expected.tsv\"\n",
    "\n",
    "SEED = 42          # shared by the train/validation split and the PyTorch RNG\n",
    "VECTOR_SIZE = 100  # Word2Vec embedding size; also the classifier's input width"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_corpus(file_list):\n",
    "    \"\"\"Tokenize every line of the given files with gensim's `simple_preprocess`.\n",
    "\n",
    "    Args:\n",
    "        file_list: iterable of paths to UTF-8 text files.\n",
    "\n",
    "    Returns:\n",
    "        A list with one token list per input line, in file order.\n",
    "    \"\"\"\n",
    "    documents = []\n",
    "    for path in file_list:\n",
    "        with open(path, 'r', encoding=\"utf8\") as f:\n",
    "            documents.extend(simple_preprocess(line) for line in f)\n",
    "    return documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def text_to_vector(text, model):\n",
    "    \"\"\"Embed `text` as the mean of the Word2Vec vectors of its known tokens.\n",
    "\n",
    "    Args:\n",
    "        text: raw document string.\n",
    "        model: trained gensim `Word2Vec` model.\n",
    "\n",
    "    Returns:\n",
    "        1-D array of length `model.vector_size`; all zeros when no token of\n",
    "        `text` is present in the model's vocabulary.\n",
    "    \"\"\"\n",
    "    tokens = simple_preprocess(text)\n",
    "    word_vectors = [model.wv[token] for token in tokens if token in model.wv]\n",
    "    if not word_vectors:\n",
    "        # float32 matches the dtype the features are converted to before training.\n",
    "        return np.zeros(model.vector_size, dtype=np.float32)\n",
    "    return np.mean(word_vectors, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_text(filepath):\n",
    "    \"\"\"Return the lines of a UTF-8 text file with surrounding whitespace stripped.\"\"\"\n",
    "    with open(filepath, 'r', encoding=\"utf8\") as file:\n",
    "        return [line.strip() for line in file]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_predictions(predictions, filepath):\n",
    "    \"\"\"Write one prediction per line to `filepath` (UTF-8).\n",
    "\n",
    "    Args:\n",
    "        predictions: iterable of indexable rows; only the first element of\n",
    "            each row is written (e.g. an array of shape (n, 1)).\n",
    "        filepath: destination path; an existing file is overwritten.\n",
    "    \"\"\"\n",
    "    with open(filepath, 'w', encoding=\"utf8\") as file:\n",
    "        for prediction in predictions:\n",
    "            file.write(f\"{prediction[0]}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Word2Vec embeddings\n",
    "\n",
    "The embedding model is trained on the unlabeled `dev-0` and `test-A` inputs only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = build_corpus([dev_0_in, test_A_in])\n",
    "# NOTE: per the gensim documentation, training with workers > 1 is not bit-for-bit\n",
    "# reproducible, so the embeddings may differ slightly between runs.\n",
    "w2v_model = Word2Vec(sentences=documents, vector_size=VECTOR_SIZE, window=5, min_count=1, workers=4)\n",
    "w2v_model.save(\"word2vec.model\")\n",
    "\n",
    "dev_texts = read_text(dev_0_in)\n",
    "test_texts = read_text(test_A_in)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Features, labels and train/validation split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_features = np.array([text_to_vector(text, w2v_model) for text in dev_texts])\n",
    "test_features = np.array([text_to_vector(text, w2v_model) for text in test_texts])\n",
    "\n",
    "dev_labels = pd.read_csv(expected, sep='\\t', header=None).values.flatten()\n",
    "assert len(dev_labels) == len(dev_features), \"dev-0 inputs and expected labels differ in length\"\n",
    "\n",
    "X_train, X_valid, y_train, y_valid = train_test_split(dev_features, dev_labels, test_size=0.2, random_state=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n",
    "y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)\n",
    "X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)\n",
    "y_valid_tensor = torch.tensor(y_valid, dtype=torch.float32).unsqueeze(1)\n",
    "dev_features_tensor = torch.tensor(dev_features, dtype=torch.float32)\n",
    "test_features_tensor = torch.tensor(test_features, dtype=torch.float32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SimpleNN(nn.Module):\n",
    "    \"\"\"Three-layer feed-forward network with a sigmoid output.\n",
    "\n",
    "    Layer widths are input_dim -> 64 -> 32 -> 1; the output lies in (0, 1)\n",
    "    and is trained with `nn.BCELoss` below.\n",
    "\n",
    "    Args:\n",
    "        input_dim: size of each input feature vector (defaults to 100).\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, input_dim=100):\n",
    "        super(SimpleNN, self).__init__()\n",
    "        self.fc1 = nn.Linear(input_dim, 64)\n",
    "        self.fc2 = nn.Linear(64, 32)\n",
    "        self.fc3 = nn.Linear(32, 1)\n",
    "        self.relu = nn.ReLU()\n",
    "        self.sigmoid = nn.Sigmoid()\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.relu(self.fc1(x))\n",
    "        x = self.relu(self.fc2(x))\n",
    "        x = self.sigmoid(self.fc3(x))\n",
    "        return x"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "torch.manual_seed(SEED)  # fix the weight initialisation before the model is built\n",
    "\n",
    "model = SimpleNN(input_dim=VECTOR_SIZE)\n",
    "criterion = nn.BCELoss()\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
    "\n",
    "num_epochs = 1000\n",
    "log_every = 100\n",
    "\n",
    "# Full-batch training: every epoch is a single optimizer step on the whole training split.\n",
    "for epoch in range(num_epochs):\n",
    "    model.train()  # re-enable training mode; the logging branch switches to eval mode\n",
    "    optimizer.zero_grad()\n",
    "\n",
    "    outputs = model(X_train_tensor)\n",
    "    loss = criterion(outputs, y_train_tensor)\n",
    "\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "    if (epoch + 1) % log_every == 0:\n",
    "        model.eval()\n",
    "        with torch.no_grad():\n",
    "            valid_outputs = model(X_valid_tensor)\n",
    "            valid_loss = criterion(valid_outputs, y_valid_tensor)\n",
    "        print(f'Epoka [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {valid_loss.item():.4f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    dev_predictions_raw = model(dev_features_tensor).numpy()\n",
    "    test_predictions_raw = model(test_features_tensor).numpy()\n",
    "    valid_predictions_raw = model(X_valid_tensor).numpy()\n",
    "\n",
    "# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class predictions.\n",
    "dev_predictions = (dev_predictions_raw > 0.5).astype(int)\n",
    "test_predictions = (test_predictions_raw > 0.5).astype(int)\n",
    "valid_predictions = (valid_predictions_raw > 0.5).astype(int).flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_predictions(dev_predictions, dev_0_out)\n",
    "save_predictions(test_predictions, test_A_out)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation\n",
    "\n",
    "Two scores are reported:\n",
    "\n",
    "- **Full `dev-0`**, read back from the saved `out.tsv`. 80% of these examples were used to fit the classifier, so this score is optimistic.\n",
    "- **Held-out validation split** (20% of `dev-0`), which the classifier did not see while fitting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_predictions_saved = pd.read_csv(dev_0_out, header=None).values.flatten()\n",
    "\n",
    "accuracy = accuracy_score(dev_labels, dev_predictions_saved)\n",
    "report = classification_report(dev_labels, dev_predictions_saved)\n",
    "\n",
    "print(f\"Dokładność: {accuracy:.4f}\")\n",
    "print(report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_accuracy = accuracy_score(y_valid, valid_predictions)\n",
    "\n",
    "print(f\"Held-out validation accuracy: {valid_accuracy:.4f}\")\n",
    "print(classification_report(y_valid, valid_predictions))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}