6.1 KiB
6.1 KiB
import numpy as np
import csv
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score
# Load the training set. Every non-blank line of train/train.tsv is split on
# tabs: field 0 is parsed as the integer class label, field 1 is the document.
train_documents = []
train_classes = []
with open('train/train.tsv', 'r', encoding='utf-8') as file:
    # Iterate the file lazily; there is no need to hold every raw line in
    # memory via readlines().
    for line in file:
        # Drop the line terminator so the stored document does not end in
        # "\n" (the dev/test loaders strip their lines as well).
        line = line.rstrip('\n')
        # A blank line (e.g. a trailing empty line at the end of the file)
        # has no label; skip it instead of letting int() raise ValueError.
        if not line.strip():
            continue
        elements = line.split('\t')
        train_classes.append(int(elements[0]))
        # Documents are lower-cased before any tokenisation downstream.
        train_documents.append(elements[1].lower())
# Whitespace-tokenise the training documents and fit a Word2Vec model on them.
tokenized_train_documents = [doc.split() for doc in train_documents]
model = Word2Vec(
    sentences=tokenized_train_documents,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
# Persist the trained model to disk so it can be reloaded later.
model.save("word2vec.model")
def get_test_data(path):
    """Return the first tab-separated field of every line in the file at *path*.

    Each line is stripped of surrounding whitespace before it is split on
    tabs, so an empty line contributes an empty string to the result.
    The file is read as UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [row.strip().split('\t')[0] for row in handle]
# dev-0 inputs with their expected integer labels, plus the test-A inputs
# (no labels are read for test-A here). Documents are lower-cased.
dev0_documents = list(map(str.lower, get_test_data('dev-0/in.tsv')))
dev0_classes = list(map(int, get_test_data('dev-0/expected.tsv')))
a_documents = list(map(str.lower, get_test_data('test-A/in.tsv')))
def document_to_word2vec(doc):
    """Return the mean Word2Vec vector of the words in *doc*.

    *doc* is split on whitespace and only words present in the module-level
    ``model``'s vocabulary (``model.wv``) contribute. If none of the words
    are known, a zero vector of length ``model.vector_size`` is returned.
    """
    known_words = [token for token in doc.split() if token in model.wv]
    total = np.zeros(model.vector_size)
    for token in known_words:
        total += model.wv.get_vector(token)
    if not known_words:
        # Nothing to average: keep the all-zero vector.
        return total
    return total / len(known_words)
# Reload the model that was saved to disk.
# NOTE(review): document_to_word2vec reads the in-memory `model`, so
# `model_loaded` is not what the vectorisation below actually uses.
model_loaded = Word2Vec.load("word2vec.model")

# Turn every document of each split into a single averaged embedding vector.
train_documents_word2vec = list(map(document_to_word2vec, train_documents))
dev0_documents_word2vec = list(map(document_to_word2vec, dev0_documents))
a_documents_word2vec = list(map(document_to_word2vec, a_documents))
from sklearn.gaussian_process.kernels import RBF
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Candidate classifiers, kept as (display name, estimator) pairs so a label
# cannot drift away from the estimator it describes. The two parallel lists
# used to have different entries commented out ("Neural Net" in the names,
# SVC in the estimators), which made the MLPClassifier's score get reported
# under the name "Linear SVM".
_named_classifiers = [
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
    # Disabled SVM candidate (note: SVC's default kernel is RBF, not linear):
    # ("RBF SVM", SVC(gamma=2, C=1, random_state=42)),
    ("Naive Bayes", GaussianNB()),
    # Seeded like the other stochastic estimators so runs are repeatable.
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

# Parallel lists consumed by the evaluation loop via zip(names, classifiers).
names = [label for label, _ in _named_classifiers]
classifiers = [estimator for _, estimator in _named_classifiers]
# Fit every candidate on the training vectors, score it on dev-0, and keep the
# dev-0 and test-A predictions of the best-scoring candidate.
# NOTE: best_dev0_predictions / best_a_predictions are only bound once some
# classifier scores strictly above the initial best_accuracy of 0.
best_accuracy = 0
best_classifier_name = ""
for name, clf in zip(names, classifiers):
    # Each estimator is wrapped in a pipeline that standardises features first.
    clf = make_pipeline(StandardScaler(), clf)
    clf.fit(train_documents_word2vec, train_classes)

    dev0_predictions = clf.predict(dev0_documents_word2vec)
    a_predictions = clf.predict(a_documents_word2vec)

    # The message says "Test accuracy", but the score is measured on dev-0.
    dev0_accuracy = accuracy_score(dev0_classes, dev0_predictions)
    print("Test accuracy for classifier " + name + ":", dev0_accuracy)

    # Only a strictly better score replaces the current best (ties keep the
    # earlier classifier).
    if not dev0_accuracy > best_accuracy:
        continue
    best_accuracy, best_classifier_name = dev0_accuracy, name
    best_dev0_predictions, best_a_predictions = dev0_predictions, a_predictions
# Recorded output of the evaluation loop above:
# Test accuracy for classifier Linear SVM: 0.9745047688921497
# Test accuracy for classifier Naive Bayes: 0.892516507703595
# Test accuracy for classifier Random Forest: 0.960564930300807
# Test accuracy for classifier QDA: 0.923881144534116
# Write the best classifier's predictions, one per row, for dev-0 and test-A.
for out_path, predictions in (
    ('dev-0/out.tsv', best_dev0_predictions),
    ('test-A/out.tsv', best_a_predictions),
):
    # newline='' leaves line-ending handling to the csv writer.
    with open(out_path, 'w+', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerows([prediction] for prediction in predictions)