sport-text-classification-ball/solution.ipynb at solution2

s464920 88e1db4b02 solution with KeyedVectors (worse accuracy)

2024-05-17 22:16:22 +02:00

7.2 KiB

Raw Permalink Blame History

import numpy as np
import csv
import spacy
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score

train_documents = []
train_classes = []

# Every row of the training file is "<label>\t<text>[...]": the first field is
# parsed as an integer class, the second is kept as the lower-cased document.
with open('train/train.tsv', 'r', encoding='utf-8') as train_file:
    for row in train_file:
        fields = row.split('\t')
        train_classes.append(int(fields[0]))
        train_documents.append(fields[1].lower())

#model = Word2Vec(sentences=[doc.split() for doc in train_documents], vector_size=100, window=5, min_count=1, workers=4)
#model.save("word2vec.model")

def get_test_data(path):
    """Return the first tab-separated field of each line of the file at `path`.

    Each line is stripped of surrounding whitespace before it is split, so the
    trailing newline never ends up in the returned value. The file is read as
    UTF-8 and one entry is returned per line, in file order.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [row.strip().split('\t')[0] for row in handle]

# Development set: lower-cased documents and their integer labels.
dev0_documents = list(map(str.lower, get_test_data('dev-0/in.tsv')))
dev0_classes = list(map(int, get_test_data('dev-0/expected.tsv')))
# Test set: documents only (no expected labels are read for it).
a_documents = list(map(str.lower, get_test_data('test-A/in.tsv')))

# Word vectors used to embed the documents.
word2vec = KeyedVectors.load('word2vec_100_3_polish.bin')

# spaCy pipeline used for tokenization.
nlp = spacy.load('pl_core_news_sm')

def document_to_word2vec(document):
    """Embed a document as the sum of its in-vocabulary word vectors.

    The text is lower-cased and tokenized with the spaCy pipeline `nlp`;
    tokens that are not present in `word2vec` are skipped.

    Returns a 1-D array of length `word2vec.vector_size`. A document without
    any in-vocabulary token yields an all-zero vector instead of the scalar
    0.0 that `np.sum` produces for an empty list, so every document has the
    same shape and the results can be stacked into one feature matrix.
    """
    vectors = [
        word2vec[token.text]
        for token in nlp(document.lower())
        if token.text in word2vec
    ]
    if not vectors:
        # Nothing to sum: fall back to a neutral embedding of the right shape.
        return np.zeros(word2vec.vector_size)
    return np.sum(vectors, axis=0)

#def document_to_word2vec(doc):
#    vector = np.zeros(model.vector_size)
#    words = doc.split()
#    words_present = 0
#    for word in words:
#        if word in model.wv:
#            word_vector = model.wv.get_vector(word)
#            vector += word_vector
#            words_present += 1
#    if words_present > 0:
#        vector = vector / words_present
#    return vector

#model_loaded = Word2Vec.load("word2vec.model")

# Embed every document of the three corpora with document_to_word2vec.
train_documents_word2vec = list(map(document_to_word2vec, train_documents))
dev0_documents_word2vec = list(map(document_to_word2vec, dev0_documents))
a_documents_word2vec = list(map(document_to_word2vec, a_documents))

from sklearn.gaussian_process.kernels import RBF
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Expected shape of a document embedding and the zero vector that replaces any
# malformed one (e.g. the scalar obtained when a document has no known token).
a = (100,)
x = np.zeros(100)
# Repair all three feature sets, not only the training one: a single
# wrongly-shaped entry in dev-0 or test-A would otherwise make the feature
# list ragged and break `predict`.
for vectors in (train_documents_word2vec, dev0_documents_word2vec, a_documents_word2vec):
    for i, vector in enumerate(vectors):
        if vector.shape != a:
            vectors[i] = x

# Display names, index-aligned with `classifiers` below. The first entry used
# to read "Linear SVM" although the model at that position is an
# MLPClassifier, which made the printed accuracy report misleading.
names = [
    "Neural Net",
    "Naive Bayes",
    "Random Forest",
    "QDA"
]

# Candidate models; each is compared by its accuracy on dev-0.
classifiers = [
    MLPClassifier(alpha=1, max_iter=500, random_state=42),
    GaussianNB(),
    RandomForestClassifier(),
    QuadraticDiscriminantAnalysis()
]

# Track the classifier with the highest dev-0 accuracy together with the
# predictions it produced for dev-0 and test-A.
best_accuracy = 0
best_classifier_name = ""

for name, estimator in zip(names, classifiers):
    # Standardize the features before handing them to the estimator.
    pipeline = make_pipeline(StandardScaler(), estimator)
    pipeline.fit(train_documents_word2vec, train_classes)

    dev0_predictions = pipeline.predict(dev0_documents_word2vec)
    a_predictions = pipeline.predict(a_documents_word2vec)
    dev0_accuracy = accuracy_score(dev0_classes, dev0_predictions)
    print("Test accuracy for classifier " + name + ":", dev0_accuracy)

    # Strict comparison: on a tie the earlier classifier stays the best one.
    if dev0_accuracy > best_accuracy:
        best_accuracy, best_classifier_name = dev0_accuracy, name
        best_dev0_predictions, best_a_predictions = dev0_predictions, a_predictions

Test accuracy for classifier Linear SVM: 0.9425898752751284
Test accuracy for classifier Naive Bayes: 0.8050256786500367
Test accuracy for classifier Random Forest: 0.8971019809244314
Test accuracy for classifier QDA: 0.9277329420396185

# Write the best classifier's predictions, one value per row, for both the
# development set and the test set.
for out_path, predictions in (
    ('dev-0/out.tsv', best_dev0_predictions),
    ('test-A/out.tsv', best_a_predictions),
):
    with open(out_path, 'w+', newline='', encoding='utf-8') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerows([value] for value in predictions)

7.2 KiB Raw Permalink Blame History

7.2 KiB

Raw Permalink Blame History