# 4.1 KiB
# 4.1 KiB
import numpy as np
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import csv
def load_train_data(file_path):
    """Read a tab-separated training file into parallel text and label lists.

    Every non-blank line is stripped of surrounding whitespace and split on
    tabs. The first field is parsed as an integer label and the second field
    is used as the text; any further fields are ignored.

    Args:
        file_path: Path of the UTF-8 encoded TSV file.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists.

    Raises:
        IndexError: If a non-blank line has no second field.
        ValueError: If the first field of a line is not an integer.
    """
    texts = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # A blank line (typically a trailing newline at the end of the
            # file) carries no example; skip it instead of failing on the
            # missing text column.
            if not stripped:
                continue
            parts = stripped.split('\t')
            texts.append(parts[1])
            labels.append(int(parts[0]))
    return texts, labels
# Load the labelled training examples.
train_texts, train_labels = load_train_data('train.tsv')

# Tokenise each text on whitespace; the token lists are the Word2Vec corpus.
sentences = list(map(str.split, train_texts))

# Train the embeddings on the training corpus and persist them to disk.
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
word2vec_model.save("word2vec.model")
def load_test_data(file_name):
    """Return the first tab-separated field of every line in a UTF-8 file.

    Each line is stripped of surrounding whitespace before being split on
    tabs. Blank lines are kept as empty strings, so the result always has
    exactly one entry per line of the file.

    Args:
        file_name: Path of the UTF-8 encoded TSV file.

    Returns:
        A list of strings, one per input line.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        # Iterate the file lazily rather than materialising it via readlines().
        return [line.strip().split('\t')[0] for line in file]
# dev-0 split: input texts plus the expected labels, converted from strings
# to integers so they can be compared against classifier predictions.
test_texts = load_test_data("dev-0/in.tsv")
test_labels = np.asarray(load_test_data("dev-0/expected.tsv")).astype(int)

# test-A split: only the input texts are read here.
test_a_texts = load_test_data("test-A/in.tsv")
def text_to_vector(text):
    """Embed a text as the mean Word2Vec vector of its known tokens.

    The text is split on whitespace and every token present in the
    module-level ``word2vec_model`` vocabulary contributes its vector to the
    average. Tokens missing from the vocabulary are ignored.

    Args:
        text: The raw text to embed.

    Returns:
        A NumPy array of length ``word2vec_model.vector_size``; all zeros
        when none of the tokens is in the vocabulary.
    """
    keyed_vectors = word2vec_model.wv
    total = np.zeros(word2vec_model.vector_size)
    known_tokens = [token for token in text.split() if token in keyed_vectors]
    for token in known_tokens:
        total += keyed_vectors.get_vector(token)
    # Guard the division so a text with no known tokens stays a zero vector.
    if known_tokens:
        total /= len(known_tokens)
    return total
# Reload the embeddings that were saved to disk.
# NOTE(review): text_to_vector reads the in-memory `word2vec_model`, so this
# reloaded copy is not used for the features below — confirm whether that is
# intended.
word2vec_model_path = "word2vec.model"
word2vec_model_loaded = Word2Vec.load(word2vec_model_path)

# Turn every text of each split into its averaged embedding.
train_word2vec = np.array(list(map(text_to_vector, train_texts)))
test_word2vec = np.array(list(map(text_to_vector, test_texts)))
test_a_word2vec = np.array(list(map(text_to_vector, test_a_texts)))

# Fit a random forest (library defaults) on the training embeddings.
classifier = RandomForestClassifier()
classifier.fit(train_word2vec, train_labels)

# Predict labels for both evaluation splits.
predictions = classifier.predict(test_word2vec)
predictions_test_a = classifier.predict(test_a_word2vec)

# Only dev-0 has expected labels loaded, so accuracy is reported for it alone.
accuracy = accuracy_score(test_labels, predictions)
print("Test Accuracy:", accuracy)
# Output: Test Accuracy: 0.9601980924431401
def _write_predictions(path, labels):
    """Write one predicted label per line to a tab-delimited file.

    Args:
        path: Destination file; it is created or overwritten.
        labels: Iterable of predicted labels, written in iteration order.
    """
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows([label] for label in labels)


# The output files must contain predictions only, one per input line, so that
# they stay aligned row-for-row with the corresponding in.tsv. The accuracy is
# already reported on stdout, so it is no longer written into dev-0/out.tsv,
# where it added an extra non-prediction first line.
_write_predictions('dev-0/out.tsv', predictions)
_write_predictions('test-A/out.tsv', predictions_test_a)