added task2 solution
This commit is contained in:
commit
6d04f5d01a
5452
dev-0/expected.tsv
Normal file
5452
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/in.tsv
Normal file
5452
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5453
dev-0/out.tsv
Normal file
5453
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5447
test-A/in.tsv
Normal file
5447
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5447
test-A/out.tsv
Normal file
5447
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
142
test.ipynb
Normal file
142
test.ipynb
Normal file
@ -0,0 +1,142 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"from gensim.models import Word2Vec\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"import csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def load_train_data(file_path):
    """Load a labelled TSV training file.

    Each line is expected to look like ``<label><TAB><text>``. Lines that do
    not contain at least two tab-separated fields (e.g. trailing blank lines)
    are skipped instead of crashing with an IndexError on ``parts[1]``.

    Returns:
        tuple: (texts, labels) — list of str and matching list of int labels.
    """
    texts = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue  # skip malformed or empty lines rather than crash
            texts.append(parts[1])
            labels.append(int(parts[0]))
    return texts, labels
|
||||
"\n",
|
||||
# Load the labelled training set and train a Word2Vec embedding model on it.
train_texts, train_labels = load_train_data('train.tsv')
# gensim expects an iterable of token lists; tokenise by whitespace.
sentences = [text.split() for text in train_texts]
# vector_size=100 embedding dims; min_count=1 keeps every word, even hapaxes.
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
# Persist the model to disk (the binary files word2vec.model* in this commit).
word2vec_model.save("word2vec.model")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 55,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def load_test_data(file_name):
    """Read a TSV file and return the first tab-separated field of every line."""
    with open(file_name, 'r', encoding='utf-8') as handle:
        return [row.strip().split('\t')[0] for row in handle]
|
||||
"\n",
|
||||
# dev-0: texts to classify plus their gold labels (used for accuracy below).
test_texts = load_test_data("dev-0/in.tsv")
test_labels = np.array(load_test_data("dev-0/expected.tsv")).astype(int)
# test-A: unlabelled texts; predictions for these go to test-A/out.tsv.
test_a_texts = load_test_data("test-A/in.tsv")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 56,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Test Accuracy: 0.9601980924431401\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
def text_to_vector(text):
    """Average the Word2Vec embeddings of the in-vocabulary words of `text`.

    Words absent from the model vocabulary are ignored; if no word of the
    text is known, the zero vector (length ``vector_size``) is returned.
    """
    vector = np.zeros(word2vec_model.vector_size)
    matched = 0
    for token in text.split():
        if token in word2vec_model.wv:
            vector += word2vec_model.wv.get_vector(token)
            matched += 1
    # Dividing only when something matched avoids a division by zero.
    return vector / matched if matched else vector
|
||||
"\n",
|
||||
"\n",
|
||||
word2vec_model_path = "word2vec.model"
# NOTE(review): this loaded model is never used below — text_to_vector reads
# the in-memory `word2vec_model` global instead. Dead load; confirm intent.
word2vec_model_loaded = Word2Vec.load(word2vec_model_path)

# Embed every document as the mean of its word vectors.
train_word2vec = np.array([text_to_vector(text) for text in train_texts])
test_word2vec = np.array([text_to_vector(text) for text in test_texts])
test_a_word2vec = np.array([text_to_vector(text) for text in test_a_texts])

# NOTE(review): no random_state is set, so the reported accuracy is not
# exactly reproducible across runs.
classifier = RandomForestClassifier()

classifier.fit(train_word2vec, train_labels)

# Predict on dev-0 (labelled) and test-A (unlabelled).
predictions = classifier.predict(test_word2vec)
predictions_test_a = classifier.predict(test_a_word2vec)

accuracy = accuracy_score(test_labels, predictions)
print("Test Accuracy:", accuracy)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Write dev-0 predictions, one label per line, so that out.tsv aligns 1:1
# with expected.tsv. The previous version also wrote an "Accuracy: ..." line
# into the file, which added one extra row (out.tsv had 5453 lines vs the
# 5452-line expected.tsv) and broke the alignment with the gold labels.
with open('dev-0/out.tsv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    for prediction in predictions:
        writer.writerow([prediction])

# Report accuracy on stdout instead of polluting the submission file.
print("Accuracy: " + str(accuracy))

# test-A predictions, one label per line.
with open('test-A/out.tsv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    for prediction in predictions_test_a:
        writer.writerow([prediction])
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
BIN
word2vec.model
Normal file
BIN
word2vec.model
Normal file
Binary file not shown.
BIN
word2vec.model.syn1neg.npy
Normal file
BIN
word2vec.model.syn1neg.npy
Normal file
Binary file not shown.
BIN
word2vec.model.wv.vectors.npy
Normal file
BIN
word2vec.model.wv.vectors.npy
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user