263 lines
8.2 KiB
Plaintext
263 lines
8.2 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"import numpy as np\n",
|
||
|
"from gensim.models import Word2Vec\n",
|
||
|
"from gensim.utils import simple_preprocess\n",
|
||
|
"from sklearn.metrics import accuracy_score, classification_report\n",
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"from sklearn.metrics import accuracy_score, classification_report\n",
|
||
|
"import torch\n",
|
||
|
"import torch.nn as nn\n",
|
||
|
"import torch.optim as optim"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Root directory of the dataset checkout. Every path below is derived from it,
# so pointing the notebook at another location means editing this one line.
DATA_DIR = "./sport-text-classification-ball-ISI-public"

# Input texts of the two splits that get predictions.
dev_0_in = f"{DATA_DIR}/dev-0/in.tsv"
test_A_in = f"{DATA_DIR}/test-A/in.tsv"

# Files that receive the predicted labels (one value per line).
dev_0_out = f"{DATA_DIR}/dev-0/out.tsv"
test_A_out = f"{DATA_DIR}/test-A/out.tsv"

# `train` is defined for completeness but is not read by the visible cells;
# `expected` holds the reference labels for dev-0.
train = f"{DATA_DIR}/train/train.tsv"
expected = f"{DATA_DIR}/dev-0/expected.tsv"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def build_corpus(file_list):
    """Tokenize every line of every file into one flat list of token lists.

    Args:
        file_list: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A list with one entry per input line, ordered by file and then by
        line; each entry is the result of `simple_preprocess` for that line.
    """
    corpus = []
    for path in file_list:
        with open(path, 'r', encoding="utf8") as handle:
            # The generator is fully consumed while the file is still open.
            corpus.extend(simple_preprocess(raw_line) for raw_line in handle)
    return corpus
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def text_to_vector(text, model):
    """Embed a text as the mean of the vectors of its known tokens.

    Args:
        text: Raw string; it is tokenized with `simple_preprocess`.
        model: Object exposing `wv` (membership test and lookup by token) and
            `vector_size`; the notebook passes a trained Word2Vec model.

    Returns:
        The element-wise mean of the vectors found in `model.wv`, or a zero
        vector of length `model.vector_size` when none of the tokens is found.
    """
    known_vectors = []
    for token in simple_preprocess(text):
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # Guard first: averaging an empty list would not yield a usable vector.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def read_text(filepath):
    """Read a UTF-8 text file into a list of whitespace-stripped lines.

    Args:
        filepath: Path of the file to read.

    Returns:
        One string per line of the file, in order, with leading and trailing
        whitespace (including the newline) removed. Blank lines are kept as
        empty strings.
    """
    with open(filepath, 'r', encoding="utf8") as file:
        return [line.strip() for line in file]
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def save_predictions(predictions, filepath):
    """Write the first element of each prediction to a file, one per line.

    Args:
        predictions: Iterable of indexable items (the notebook passes an
            array with one single-element row per example); only index 0 of
            each item is written.
        filepath: Destination path; the file is created or overwritten and
            encoded as UTF-8.
    """
    with open(filepath, 'w', encoding="utf8") as file:
        file.writelines(f"{prediction[0]}\n" for prediction in predictions)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# The embedding corpus is made of the input texts of both splits that get
# predictions (dev-0 and test-A); no labels are involved at this stage.
documents = build_corpus([dev_0_in, test_A_in])

# A fixed seed together with a single worker thread keeps the learned vectors
# repeatable between runs; with several workers the order in which threads
# consume the corpus varies. Identical results across interpreter launches
# additionally require a fixed PYTHONHASHSEED.
w2v_model = Word2Vec(
    sentences=documents,
    vector_size=100,  # must match the input width of the classifier
    window=5,
    min_count=1,      # keep every token, even those seen only once
    workers=1,
    seed=42,
)
w2v_model.save("word2vec.model")

# Raw (untokenized) lines; they are tokenized again when they are embedded.
dev_texts = read_text(dev_0_in)
test_texts = read_text(test_A_in)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
def _embed_all(texts):
    """Return an array with one averaged word vector per entry of `texts`."""
    return np.array([text_to_vector(text, w2v_model) for text in texts])


dev_features = _embed_all(dev_texts)
test_features = _embed_all(test_texts)

# Reference labels for dev-0, read without a header row and flattened to 1-D.
dev_labels = pd.read_csv(expected, sep='\t', header=None).to_numpy().flatten()

# Only dev-0 has labels here, so it is divided 80/20 into the part used to
# fit the network and the part used to monitor the validation loss.
X_train, X_valid, y_train, y_valid = train_test_split(
    dev_features,
    dev_labels,
    test_size=0.2,
    random_state=42,
)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 24,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Epoka [100/1000], Loss: 0.3149, Validation Loss: 0.3540\n",
|
||
|
"Epoka [200/1000], Loss: 0.2778, Validation Loss: 0.3339\n",
|
||
|
"Epoka [300/1000], Loss: 0.2638, Validation Loss: 0.3201\n",
|
||
|
"Epoka [400/1000], Loss: 0.2511, Validation Loss: 0.3047\n",
|
||
|
"Epoka [500/1000], Loss: 0.2408, Validation Loss: 0.2913\n",
|
||
|
"Epoka [600/1000], Loss: 0.2321, Validation Loss: 0.2807\n",
|
||
|
"Epoka [700/1000], Loss: 0.2243, Validation Loss: 0.2718\n",
|
||
|
"Epoka [800/1000], Loss: 0.2182, Validation Loss: 0.2654\n",
|
||
|
"Epoka [900/1000], Loss: 0.2136, Validation Loss: 0.2605\n",
|
||
|
"Epoka [1000/1000], Loss: 0.2101, Validation Loss: 0.2573\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
def _float_tensor(array):
    """Build a float32 tensor from `array`."""
    return torch.tensor(array, dtype=torch.float32)


def _target_column(labels):
    """Build a float32 tensor from 1-D `labels` and add a trailing axis of size 1."""
    return _float_tensor(labels).unsqueeze(1)


# Features keep their shape; targets get the extra axis so that they line up
# with the single-column output of the network when the loss is computed.
X_train_tensor = _float_tensor(X_train)
y_train_tensor = _target_column(y_train)
X_valid_tensor = _float_tensor(X_valid)
y_valid_tensor = _target_column(y_valid)
dev_features_tensor = _float_tensor(dev_features)
test_features_tensor = _float_tensor(test_features)
|
||
|
"\n",
|
||
class SimpleNN(nn.Module):
    """Small feed-forward network for binary classification.

    Layout: ``input_dim -> 64 -> 32 -> 1`` with ReLU after the two hidden
    layers and a sigmoid on the output, so `forward` yields one value between
    0 and 1 per input row (suitable for `nn.BCELoss`).

    Args:
        input_dim: Number of features per example. Defaults to 100, the
            embedding size used in this notebook, so `SimpleNN()` keeps
            building the same network as before.
    """

    def __init__(self, input_dim=100):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to one sigmoid output per row."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x
|
||
|
"\n",
|
||
# Seed PyTorch's global generator before the network is created so that the
# initial weights are the same on every run of this cell.
torch.manual_seed(42)

model = SimpleNN()
# The network already ends in a sigmoid, hence BCELoss on probabilities.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 1000
log_every = 100  # report train/validation loss once per this many epochs

# Full-batch training: each epoch is exactly one optimizer step computed on
# the whole training split, so no batch size is involved.
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    loss.backward()
    optimizer.step()

    if (epoch + 1) % log_every == 0:
        # Validation loss is computed without tracking gradients.
        model.eval()
        with torch.no_grad():
            valid_outputs = model(X_valid_tensor)
            valid_loss = criterion(valid_outputs, y_valid_tensor)
        print(f'Epoka [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {valid_loss.item():.4f}')
|
||
|
"\n",
|
||
def _binarize(probabilities, threshold=0.5):
    """Return 1 where a probability is strictly above `threshold`, else 0."""
    return (probabilities > threshold).astype(int)


# Inference only: evaluation mode and no gradient tracking.
# Note that the dev-0 features passed here include the rows the network was
# fitted on, not just the validation part.
model.eval()
with torch.no_grad():
    dev_predictions_raw, test_predictions_raw = (
        model(features).numpy()
        for features in (dev_features_tensor, test_features_tensor)
    )

dev_predictions = _binarize(dev_predictions_raw)
test_predictions = _binarize(test_predictions_raw)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 20,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
# Write the predicted labels of each split to its own output file.
for predictions, out_path in (
    (dev_predictions, dev_0_out),
    (test_predictions, test_A_out),
):
    save_predictions(predictions, out_path)
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 23,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Dokładność: 0.8995\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.88 0.84 0.86 1983\n",
|
||
|
" 1 0.91 0.93 0.92 3469\n",
|
||
|
"\n",
|
||
|
" accuracy 0.90 5452\n",
|
||
|
" macro avg 0.89 0.89 0.89 5452\n",
|
||
|
"weighted avg 0.90 0.90 0.90 5452\n",
|
||
|
"\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
# Score what was actually written to disk: the dev-0 predictions are read
# back from the output file (no header row) and flattened to 1-D.
# Note: these metrics cover the whole dev-0 set, 80% of which was used to fit
# the network, so they are not a held-out estimate.
df = pd.read_csv(dev_0_out, header=None).to_numpy().flatten()

accuracy = accuracy_score(dev_labels, df)
report = classification_report(dev_labels, df)

print(f"Dokładność: {accuracy:.4f}")
print(report)
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.12.6"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|