first commit

This commit is contained in:
arek 2024-09-27 04:17:42 +02:00
commit 7f82b14c1e
10 changed files with 125645 additions and 0 deletions

View File

@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --precision 5

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

262
word2vec.ipynb Normal file
View File

@ -0,0 +1,262 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from gensim.models import Word2Vec\n",
"from gensim.utils import simple_preprocess\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Labelled training data (path only; no cell in this notebook reads it).\n",
"train = \"./sport-text-classification-ball-ISI-public/train/train.tsv\"\n",
"\n",
"# dev-0 split: input texts, gold labels, and the file predictions are saved to.\n",
"dev_0_in = \"./sport-text-classification-ball-ISI-public/dev-0/in.tsv\"\n",
"expected = \"./sport-text-classification-ball-ISI-public/dev-0/expected.tsv\"\n",
"dev_0_out = \"./sport-text-classification-ball-ISI-public/dev-0/out.tsv\"\n",
"\n",
"# test-A split: input texts and the file predictions are saved to.\n",
"test_A_in = \"./sport-text-classification-ball-ISI-public/test-A/in.tsv\"\n",
"test_A_out = \"./sport-text-classification-ball-ISI-public/test-A/out.tsv\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def build_corpus(file_list):\n",
"    \"\"\"Tokenise every line of the given files into one list of token lists.\n",
"\n",
"    Args:\n",
"        file_list: Iterable of paths to UTF-8 encoded text files.\n",
"\n",
"    Returns:\n",
"        list: One ``simple_preprocess`` result per input line, in the order\n",
"        the files and their lines were read.\n",
"    \"\"\"\n",
"    corpus = []\n",
"    for path in file_list:\n",
"        with open(path, 'r', encoding=\"utf8\") as handle:\n",
"            # The generator is consumed here, while the file is still open.\n",
"            corpus.extend(simple_preprocess(row) for row in handle)\n",
"    return corpus"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def text_to_vector(text, model):\n",
"    \"\"\"Embed a text as the mean of the vectors of its in-vocabulary tokens.\n",
"\n",
"    Args:\n",
"        text: Raw text; it is tokenised with ``simple_preprocess``.\n",
"        model: Object exposing ``wv`` (membership test and token lookup) and\n",
"            ``vector_size``.\n",
"\n",
"    Returns:\n",
"        The element-wise mean of the found token vectors, or a zero vector of\n",
"        length ``model.vector_size`` when no token is in ``model.wv``.\n",
"    \"\"\"\n",
"    known_vectors = []\n",
"    for word in simple_preprocess(text):\n",
"        if word in model.wv:\n",
"            known_vectors.append(model.wv[word])\n",
"\n",
"    # Guard first: texts without any known token fall back to all zeros.\n",
"    if not known_vectors:\n",
"        return np.zeros(model.vector_size)\n",
"    return np.mean(known_vectors, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def read_text(filepath):\n",
"    \"\"\"Read a UTF-8 text file into a list of whitespace-stripped lines.\n",
"\n",
"    Args:\n",
"        filepath: Path of the file to read.\n",
"\n",
"    Returns:\n",
"        list[str]: One entry per line, with leading and trailing whitespace\n",
"        removed; blank lines are kept as empty strings.\n",
"    \"\"\"\n",
"    with open(filepath, 'r', encoding=\"utf8\") as source:\n",
"        return [row.strip() for row in source]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def save_predictions(predictions, filepath):\n",
"    \"\"\"Write one predicted label per line to ``filepath`` as UTF-8.\n",
"\n",
"    Any existing file at ``filepath`` is overwritten.\n",
"\n",
"    Args:\n",
"        predictions: Iterable of predictions. An item may be a bare scalar\n",
"            (as yielded by a 1-D array) or an indexable row such as a row of\n",
"            an ``(n, 1)`` array; for rows only element ``[0]`` is written.\n",
"        filepath: Destination path.\n",
"    \"\"\"\n",
"    with open(filepath, 'w', encoding=\"utf8\") as file:\n",
"        for prediction in predictions:\n",
"            # Zero-dimensional items cannot be indexed, so write them as-is;\n",
"            # anything with at least one axis contributes its first element.\n",
"            label = prediction[0] if np.ndim(prediction) > 0 else prediction\n",
"            file.write(f\"{label}\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Raw (stripped) lines of both splits; they are embedded in a later cell.\n",
"dev_texts, test_texts = read_text(dev_0_in), read_text(test_A_in)\n",
"\n",
"# Fit word2vec on the tokenised dev-0 and test-A inputs, then persist the\n",
"# model in the working directory.\n",
"documents = build_corpus([dev_0_in, test_A_in])\n",
"w2v_model = Word2Vec(\n",
"    sentences=documents,\n",
"    vector_size=100,\n",
"    window=5,\n",
"    min_count=1,\n",
"    workers=4,\n",
")\n",
"w2v_model.save(\"word2vec.model\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def _embed_all(texts):\n",
"    \"\"\"Return an array holding the ``text_to_vector`` result of every text.\"\"\"\n",
"    return np.array([text_to_vector(text, w2v_model) for text in texts])\n",
"\n",
"\n",
"dev_features = _embed_all(dev_texts)\n",
"test_features = _embed_all(test_texts)\n",
"\n",
"# Gold labels for dev-0, flattened into a 1-D array.\n",
"expected_frame = pd.read_csv(expected, sep='\\t', header=None)\n",
"dev_labels = expected_frame.values.flatten()\n",
"\n",
"# Hold out 20% of dev-0 for validation; the fixed seed keeps the split stable.\n",
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
"    dev_features,\n",
"    dev_labels,\n",
"    test_size=0.2,\n",
"    random_state=42,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoka [100/1000], Loss: 0.3149, Validation Loss: 0.3540\n",
"Epoka [200/1000], Loss: 0.2778, Validation Loss: 0.3339\n",
"Epoka [300/1000], Loss: 0.2638, Validation Loss: 0.3201\n",
"Epoka [400/1000], Loss: 0.2511, Validation Loss: 0.3047\n",
"Epoka [500/1000], Loss: 0.2408, Validation Loss: 0.2913\n",
"Epoka [600/1000], Loss: 0.2321, Validation Loss: 0.2807\n",
"Epoka [700/1000], Loss: 0.2243, Validation Loss: 0.2718\n",
"Epoka [800/1000], Loss: 0.2182, Validation Loss: 0.2654\n",
"Epoka [900/1000], Loss: 0.2136, Validation Loss: 0.2605\n",
"Epoka [1000/1000], Loss: 0.2101, Validation Loss: 0.2573\n"
]
}
],
"source": [
"# Float32 tensors for the network; labels get a trailing axis so they line up\n",
"# with the model's (n, 1) output when the loss is computed.\n",
"X_train_tensor = torch.tensor(X_train, dtype=torch.float32)\n",
"y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)\n",
"X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)\n",
"y_valid_tensor = torch.tensor(y_valid, dtype=torch.float32).unsqueeze(1)\n",
"dev_features_tensor = torch.tensor(dev_features, dtype=torch.float32)\n",
"test_features_tensor = torch.tensor(test_features, dtype=torch.float32)\n",
"\n",
"class SimpleNN(nn.Module):\n",
"    \"\"\"Three-layer feed-forward binary classifier with a sigmoid output.\n",
"\n",
"    Args:\n",
"        input_dim: Width of the input feature vectors. Defaults to 100, the\n",
"            width this network was originally hard-coded for.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, input_dim=100):\n",
"        super().__init__()\n",
"        self.fc1 = nn.Linear(input_dim, 64)\n",
"        self.fc2 = nn.Linear(64, 32)\n",
"        self.fc3 = nn.Linear(32, 1)\n",
"        self.relu = nn.ReLU()\n",
"        self.sigmoid = nn.Sigmoid()\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Map a batch of feature vectors to one sigmoid output per row.\"\"\"\n",
"        x = self.relu(self.fc1(x))\n",
"        x = self.relu(self.fc2(x))\n",
"        x = self.sigmoid(self.fc3(x))\n",
"        return x\n",
"\n",
"# Seed PyTorch before the weights are created so the initialisation is the\n",
"# same on every run; 42 mirrors the random_state of the train/validation split.\n",
"torch.manual_seed(42)\n",
"\n",
"# Take the input width from the data instead of repeating the embedding size.\n",
"model = SimpleNN(input_dim=X_train_tensor.shape[1])\n",
"criterion = nn.BCELoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
"\n",
"# Each epoch is a single full-batch gradient step over the training tensors.\n",
"num_epochs = 1000\n",
"for epoch in range(num_epochs):\n",
"    model.train()\n",
"    optimizer.zero_grad()\n",
"\n",
"    outputs = model(X_train_tensor)\n",
"    loss = criterion(outputs, y_train_tensor)\n",
"\n",
"    loss.backward()\n",
"    optimizer.step()\n",
"\n",
"    # Every 100 epochs, report the training loss next to the validation loss.\n",
"    if (epoch+1) % 100 == 0:\n",
"        model.eval()\n",
"        with torch.no_grad():\n",
"            valid_outputs = model(X_valid_tensor)\n",
"            valid_loss = criterion(valid_outputs, y_valid_tensor)\n",
"            print(f'Epoka [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {valid_loss.item():.4f}')\n",
"\n",
"# Final inference over the full dev-0 and test-A feature sets.\n",
"model.eval()\n",
"with torch.no_grad():\n",
"    dev_predictions_raw = model(dev_features_tensor).numpy()\n",
"    test_predictions_raw = model(test_features_tensor).numpy()\n",
"\n",
"# Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels.\n",
"dev_predictions = (dev_predictions_raw > 0.5).astype(int)\n",
"test_predictions = (test_predictions_raw > 0.5).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Write the thresholded predictions of both splits to their out.tsv files.\n",
"for predictions, out_path in (\n",
"    (dev_predictions, dev_0_out),\n",
"    (test_predictions, test_A_out),\n",
"):\n",
"    save_predictions(predictions, out_path)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dokładność: 0.8995\n",
" precision recall f1-score support\n",
"\n",
" 0 0.88 0.84 0.86 1983\n",
" 1 0.91 0.93 0.92 3469\n",
"\n",
" accuracy 0.90 5452\n",
" macro avg 0.89 0.89 0.89 5452\n",
"weighted avg 0.90 0.90 0.90 5452\n",
"\n"
]
}
],
"source": [
"# Score the predictions as saved on disk, so the file itself is what gets checked.\n",
"saved_dev_predictions = pd.read_csv(dev_0_out, header=None).values.flatten()\n",
"\n",
"accuracy = accuracy_score(dev_labels, saved_dev_predictions)\n",
"report = classification_report(dev_labels, saved_dev_predictions)\n",
"\n",
"print(f\"Dokładność: {accuracy:.4f}\")\n",
"print(report)\n",
"\n",
"# The dev-0 figures above cover every dev-0 row, including the part of the\n",
"# split the network was fitted on, so they overstate generalisation. Also\n",
"# report accuracy on the held-out validation rows only.\n",
"model.eval()\n",
"with torch.no_grad():\n",
"    valid_predictions = (model(X_valid_tensor).numpy() > 0.5).astype(int).flatten()\n",
"valid_accuracy = accuracy_score(y_valid, valid_predictions)\n",
"print(f\"Dokładność (zbiór walidacyjny): {valid_accuracy:.4f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

BIN
word2vec.model Normal file

Binary file not shown.