added task2 solution

This commit is contained in:
Michal Gulczynski 2024-05-15 21:55:44 +02:00
commit 6d04f5d01a
10 changed files with 125525 additions and 0 deletions

5452
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5453
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5447
test-A/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5447
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

142
test.ipynb Normal file
View File

@ -0,0 +1,142 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from gensim.models import Word2Vec\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"def load_train_data(file_path):\n",
"    \"\"\"Read a tab-separated training file into parallel text and label lists.\n",
"\n",
"    Each non-blank line is expected to hold an integer label in the first\n",
"    column and the text in the second column; further columns are ignored.\n",
"\n",
"    Args:\n",
"        file_path: Path to the UTF-8 encoded TSV file.\n",
"\n",
"    Returns:\n",
"        A ``(texts, labels)`` tuple of equally long lists.\n",
"\n",
"    Raises:\n",
"        ValueError: If a non-blank line has fewer than two columns or its\n",
"            label cannot be parsed as an integer.\n",
"    \"\"\"\n",
"    texts = []\n",
"    labels = []\n",
"    with open(file_path, 'r', encoding='utf-8') as file:\n",
"        for line_number, line in enumerate(file, start=1):\n",
"            stripped = line.strip()\n",
"            if not stripped:\n",
"                # Blank lines (e.g. a trailing newline at EOF) carry no example.\n",
"                continue\n",
"            parts = stripped.split('\\t')\n",
"            if len(parts) < 2:\n",
"                raise ValueError(\n",
"                    f'{file_path}:{line_number}: expected label<TAB>text, got {line!r}'\n",
"                )\n",
"            # Parse the label first so both lists stay the same length on failure.\n",
"            label = int(parts[0])\n",
"            labels.append(label)\n",
"            texts.append(parts[1])\n",
"    return texts, labels\n",
"\n",
"train_texts, train_labels = load_train_data('train.tsv')\n",
"sentences = [text.split() for text in train_texts]\n",
"\n",
"# Pass an explicit seed so the model's random initialisation is the same on every run.\n",
"# NOTE(review): gensim documents that a fully deterministic run also needs\n",
"# workers=1 and a fixed PYTHONHASHSEED; workers stays at 4 to keep training fast.\n",
"WORD2VEC_SEED = 42\n",
"word2vec_model = Word2Vec(\n",
"    sentences=sentences,\n",
"    vector_size=100,\n",
"    window=5,\n",
"    min_count=1,\n",
"    workers=4,\n",
"    seed=WORD2VEC_SEED,\n",
")\n",
"word2vec_model.save(\"word2vec.model\")"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"def load_test_data(file_name):\n",
"    \"\"\"Return the first tab-separated column of every line in ``file_name``.\n",
"\n",
"    Each line is whitespace-stripped before it is split. Exactly one entry is\n",
"    produced per line (a blank line yields an empty string), so the result\n",
"    keeps the same length and order as the file's lines.\n",
"    \"\"\"\n",
"    with open(file_name, 'r', encoding='utf-8') as handle:\n",
"        return [row.strip().partition('\\t')[0] for row in handle]\n",
"\n",
"# dev-0 split: input texts plus the expected labels, parsed as integers.\n",
"dev_in_path, dev_expected_path = \"dev-0/in.tsv\", \"dev-0/expected.tsv\"\n",
"test_texts = load_test_data(dev_in_path)\n",
"test_labels = np.asarray(load_test_data(dev_expected_path)).astype(int)\n",
"\n",
"# test-A split: only the input texts are loaded here.\n",
"test_a_texts = load_test_data(\"test-A/in.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test Accuracy: 0.9601980924431401\n"
]
}
],
"source": [
"def text_to_vector(text, model=None):\n",
"    \"\"\"Average the Word2Vec embeddings of the whitespace-separated words in ``text``.\n",
"\n",
"    Args:\n",
"        text: Input string; it is tokenised with ``str.split()``.\n",
"        model: Object exposing ``vector_size`` and ``wv`` (e.g. a gensim\n",
"            ``Word2Vec``). Defaults to the notebook-level ``word2vec_model``.\n",
"\n",
"    Returns:\n",
"        A 1-D NumPy array of length ``model.vector_size``: the mean vector of\n",
"        the words found in ``model.wv``, or all zeros when none are found.\n",
"    \"\"\"\n",
"    if model is None:\n",
"        # Backward-compatible default: use the model trained earlier in the notebook.\n",
"        model = word2vec_model\n",
"    keyed_vectors = model.wv\n",
"    vector = np.zeros(model.vector_size)\n",
"    count = 0\n",
"    for word in text.split():\n",
"        # Out-of-vocabulary words are skipped rather than counted as zeros.\n",
"        if word in keyed_vectors:\n",
"            vector += keyed_vectors.get_vector(word)\n",
"            count += 1\n",
"    if count:\n",
"        vector /= count\n",
"    return vector\n",
"\n",
"\n",
"# Reload the persisted embeddings and make them the model text_to_vector()\n",
"# reads, so the feature vectors below come from the saved file instead of the\n",
"# reloaded model being left unused.\n",
"word2vec_model_path = \"word2vec.model\"\n",
"word2vec_model_loaded = Word2Vec.load(word2vec_model_path)\n",
"word2vec_model = word2vec_model_loaded\n",
"\n",
"# One averaged embedding per document for each split.\n",
"train_word2vec = np.array([text_to_vector(text) for text in train_texts])\n",
"test_word2vec = np.array([text_to_vector(text) for text in test_texts])\n",
"test_a_word2vec = np.array([text_to_vector(text) for text in test_a_texts])\n",
"\n",
"# Fixed random_state so the forest, and therefore the reported accuracy, is\n",
"# repeatable for the same feature vectors.\n",
"classifier = RandomForestClassifier(random_state=42)\n",
"\n",
"classifier.fit(train_word2vec, train_labels)\n",
"\n",
"predictions = classifier.predict(test_word2vec)\n",
"predictions_test_a = classifier.predict(test_a_word2vec)\n",
"\n",
"accuracy = accuracy_score(test_labels, predictions)\n",
"print(\"Test Accuracy:\", accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"def write_predictions(path, labels):\n",
"    \"\"\"Write one predicted label per row to the TSV file at ``path``.\"\"\"\n",
"    with open(path, 'w', newline='', encoding='utf-8') as f:\n",
"        writer = csv.writer(f, delimiter='\\t')\n",
"        writer.writerows([label] for label in labels)\n",
"\n",
"\n",
"# out.tsv must contain exactly one prediction per input line so that it stays\n",
"# aligned with expected.tsv. The accuracy is printed in the evaluation cell,\n",
"# so it is deliberately not written into the predictions file.\n",
"write_predictions('dev-0/out.tsv', predictions)\n",
"write_predictions('test-A/out.tsv', predictions_test_a)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

98132
train.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
word2vec.model Normal file

Binary file not shown.

BIN
word2vec.model.syn1neg.npy Normal file

Binary file not shown.

Binary file not shown.