{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text classification with averaged Word2Vec vectors and a random forest\n",
    "\n",
    "Trains Word2Vec on `train.tsv`, represents each text as the mean of its word vectors, fits a `RandomForestClassifier`, prints the accuracy on `dev-0` and writes predictions for `dev-0` and `test-A`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from gensim.models import Word2Vec\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "import csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the training data and train Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_train_data(file_path):\n",
    "    \"\"\"Read a tab-separated training file into parallel lists.\n",
    "\n",
    "    Each usable line holds the integer label in the first column and the\n",
    "    text in the second column; any further columns are ignored.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path to the UTF-8 encoded TSV file.\n",
    "\n",
    "    Returns:\n",
    "        A ``(texts, labels)`` tuple of equally long lists.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the first column of a usable line is not an integer.\n",
    "    \"\"\"\n",
    "    texts = []\n",
    "    labels = []\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        for line in file:\n",
    "            parts = line.strip().split('\\t')\n",
    "            if len(parts) < 2:\n",
    "                # Blank or label-only lines have no text column; indexing\n",
    "                # parts[1] on them would raise IndexError, so skip them.\n",
    "                continue\n",
    "            label = int(parts[0])\n",
    "            texts.append(parts[1])\n",
    "            labels.append(label)\n",
    "    return texts, labels\n",
    "\n",
    "\n",
    "train_texts, train_labels = load_train_data('train.tsv')\n",
    "\n",
    "# Whitespace tokenisation; min_count=1 keeps every token seen in the training texts.\n",
    "sentences = [text.split() for text in train_texts]\n",
    "word2vec_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)\n",
    "word2vec_model.save(\"word2vec.model\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the evaluation data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_test_data(file_name):\n",
    "    \"\"\"Return the first tab-separated column of every line in a file.\n",
    "\n",
    "    Used below both for the input texts and for the expected labels.\n",
    "\n",
    "    Args:\n",
    "        file_name: Path to the UTF-8 encoded TSV file.\n",
    "\n",
    "    Returns:\n",
    "        A list with one string per line (an empty string for a blank line).\n",
    "    \"\"\"\n",
    "    with open(file_name, 'r', encoding='utf-8') as file:\n",
    "        return [line.strip().split('\\t')[0] for line in file]\n",
    "\n",
    "\n",
    "test_texts = load_test_data(\"dev-0/in.tsv\")\n",
    "test_labels = np.array(load_test_data(\"dev-0/expected.tsv\")).astype(int)\n",
    "test_a_texts = load_test_data(\"test-A/in.tsv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vectorise the texts and train the classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Accuracy: 0.9601980924431401\n"
     ]
    }
   ],
   "source": [
    "def text_to_vector(text, model=None):\n",
    "    \"\"\"Embed a text as the mean of the vectors of its known words.\n",
    "\n",
    "    Args:\n",
    "        text: Text that is split on whitespace into words.\n",
    "        model: Word2Vec model used for the lookups; defaults to the\n",
    "            notebook-level ``word2vec_model``.\n",
    "\n",
    "    Returns:\n",
    "        A float64 array of length ``model.vector_size``. It is all zeros\n",
    "        when none of the words is in the model vocabulary.\n",
    "    \"\"\"\n",
    "    if model is None:\n",
    "        model = word2vec_model\n",
    "    keyed_vectors = model.wv\n",
    "    vector = np.zeros(model.vector_size)\n",
    "    count = 0\n",
    "    for word in text.split():\n",
    "        if word in keyed_vectors:\n",
    "            vector += keyed_vectors.get_vector(word)\n",
    "            count += 1\n",
    "    # Guard against dividing by zero for texts without any known word.\n",
    "    if count != 0:\n",
    "        vector /= count\n",
    "    return vector\n",
    "\n",
    "\n",
    "word2vec_model_path = \"word2vec.model\"\n",
    "# Reload the model saved above and build the features from that persisted copy.\n",
    "word2vec_model_loaded = Word2Vec.load(word2vec_model_path)\n",
    "\n",
    "train_word2vec = np.array([text_to_vector(text, word2vec_model_loaded) for text in train_texts])\n",
    "test_word2vec = np.array([text_to_vector(text, word2vec_model_loaded) for text in test_texts])\n",
    "test_a_word2vec = np.array([text_to_vector(text, word2vec_model_loaded) for text in test_a_texts])\n",
    "\n",
    "# No random_state is set, so the fitted forest and the accuracy can vary between runs.\n",
    "classifier = RandomForestClassifier()\n",
    "\n",
    "classifier.fit(train_word2vec, train_labels)\n",
    "\n",
    "predictions = classifier.predict(test_word2vec)\n",
    "predictions_test_a = classifier.predict(test_a_word2vec)\n",
    "\n",
    "accuracy = accuracy_score(test_labels, predictions)\n",
    "print(\"Test Accuracy:\", accuracy)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_predictions(path, predictions):\n",
    "    \"\"\"Write one prediction per line to a tab-delimited file.\n",
    "\n",
    "    Args:\n",
    "        path: Destination file; it is overwritten if it exists.\n",
    "        predictions: Iterable of predicted labels, written in order.\n",
    "    \"\"\"\n",
    "    with open(path, 'w', newline='', encoding='utf-8') as f:\n",
    "        writer = csv.writer(f, delimiter='\\t')\n",
    "        writer.writerows([prediction] for prediction in predictions)\n",
    "\n",
    "\n",
    "# Only predictions go into out.tsv so that it has exactly one line per input\n",
    "# line; the accuracy is printed by the previous cell instead.\n",
    "save_predictions('dev-0/out.tsv', predictions)\n",
    "save_predictions('test-A/out.tsv', predictions_test_a)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}