Word2Vec/Word2Vec.ipynb

13 KiB

Definiowanie funkcji i sieci neuronowej

import numpy as np

def sigmoid(x, e=np.e):
    """Logistic sigmoid ``1 / (1 + e**(-x))``, applied element-wise.

    Args:
        x: Scalar or NumPy array of pre-activation values.
        e: Base of the exponential. Defaults to Euler's number at full
            float precision; a rounded literal such as 2.7183 shifts every
            output by roughly 1e-6.

    Returns:
        The sigmoid of ``x`` (a value between 0 and 1), with the same
        shape as ``x``.
    """
    return 1 / (1 + e ** (-x))


def sigmoid_derivative(x):
    """Return ``x * (1 - x)`` element-wise.

    This equals the derivative of the sigmoid when ``x`` is already the
    sigmoid's *output* (not the pre-activation input).
    """
    activated = x
    return (1 - activated) * activated


def tanh(x):
    """Hyperbolic-tangent activation; element-wise wrapper around ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def tanh_derivative(x):
    """Derivative of tanh evaluated at the pre-activation value ``x``.

    The function applies ``np.tanh`` to ``x`` itself and returns
    ``1 - tanh(x)**2``, so callers pass the raw input, not the activation.
    """
    squashed = np.tanh(x)
    return 1 - squashed ** 2

def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Element-wise ReLU slope: 0 where ``x <= 0``, 1 everywhere else."""
    non_positive = x <= 0
    return np.where(non_positive, 0, 1)


def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row has its maximum subtracted before exponentiation so that large
    inputs do not overflow; every row of the result sums to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    unnormalised = np.exp(x - row_max)
    row_totals = np.sum(unnormalised, axis=1, keepdims=True)
    return unnormalised / row_totals
class NeuralNetwork:
    """Feed-forward network with a single hidden layer.

    Training is full-batch gradient descent: every epoch runs one forward
    pass over the whole training set followed by one weight update.

    The hidden layer uses ``act_func`` ('sigmoid', 'relu' or 'tanh'). The
    output layer uses softmax when ``loss_func`` is
    'categorical_crossentropy' and ``act_func`` otherwise.

    Note:
        ``backward`` always uses ``a2 - Y`` as the output-layer error. That
        is the exact gradient for softmax + categorical cross-entropy and
        for sigmoid + log loss; for the remaining activation/loss pairs it
        is a simplification rather than the exact gradient.
    """

    def __init__(self, input_size, hidden_size, output_size,
                 act_func, loss_func,
                 learning_rate, epochs):
        """Store the hyper-parameters and initialise the parameters.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output units.
            act_func: 'sigmoid', 'relu' or 'tanh'.
            loss_func: 'mse', 'log_loss' or 'categorical_crossentropy'.
            learning_rate: Step size of the gradient-descent update.
            epochs: Number of full-batch training iterations run by ``fit``.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.activation_func = act_func
        self.loss_func = loss_func

        # Weights start as standard-normal samples, biases as zeros.
        self.w1 = np.random.randn(self.input_size, self.hidden_size)
        self.w2 = np.random.randn(self.hidden_size, self.output_size)

        self.b1 = np.zeros((1, self.hidden_size))
        self.b2 = np.zeros((1, self.output_size))

        # Per-epoch loss history, appended to by fit().
        self.train_loss = []
        self.test_loss = []

    def _activate(self, z):
        """Apply the configured activation function to ``z``.

        Raises:
            ValueError: If ``activation_func`` is not a supported name.
        """
        if self.activation_func == 'sigmoid':
            return sigmoid(z)
        if self.activation_func == 'relu':
            return relu(z)
        if self.activation_func == 'tanh':
            return tanh(z)
        raise ValueError('Nieprawidłowa funkcja aktywacji')

    def _hidden_derivative(self):
        """Return the hidden activation's derivative for the cached forward pass.

        Raises:
            ValueError: If ``activation_func`` is not a supported name.
        """
        if self.activation_func == 'sigmoid':
            # sigmoid_derivative is written in terms of the sigmoid *output*.
            return sigmoid_derivative(self.a1)
        if self.activation_func == 'relu':
            return relu_derivative(self.z1)
        if self.activation_func == 'tanh':
            # tanh_derivative applies tanh itself, so it must receive the
            # pre-activation z1; passing a1 would evaluate
            # 1 - tanh(tanh(z1))**2 and corrupt the gradient.
            return tanh_derivative(self.z1)
        raise ValueError('Nieprawidłowa funkcja aktywacji')

    def predict(self, X):
        """Run a forward pass and return the output activations.

        The intermediate values ``z1``, ``a1``, ``z2`` and ``a2`` are cached
        on the instance because ``backward`` reads them.

        Raises:
            ValueError: If ``activation_func`` is not a supported name.
        """
        self.z1 = np.dot(X, self.w1) + self.b1
        self.a1 = self._activate(self.z1)

        self.z2 = np.dot(self.a1, self.w2) + self.b2
        if self.loss_func == 'categorical_crossentropy':
            self.a2 = softmax(self.z2)
        else:
            self.a2 = self._activate(self.z2)
        return self.a2

    def backward(self, X, Y):
        """Back-propagate the error of the last forward pass and update weights.

        Must be called right after ``predict(X)`` on the same ``X``, since it
        relies on the activations cached by that call.

        Args:
            X: Input batch used in the preceding forward pass.
            Y: Targets with the same shape as the network output.

        Raises:
            ValueError: If ``activation_func`` is not a supported name.
        """
        m = X.shape[0]

        self.dz2 = self.a2 - Y

        self.dw2 = (1 / m) * np.dot(self.a1.T, self.dz2)
        self.db2 = (1 / m) * np.sum(self.dz2, axis=0, keepdims=True)

        self.dz1 = np.dot(self.dz2, self.w2.T) * self._hidden_derivative()
        self.dw1 = (1 / m) * np.dot(X.T, self.dz1)
        self.db1 = (1 / m) * np.sum(self.dz1, axis=0, keepdims=True)

        # Gradient-descent step on weights and biases.
        self.w2 -= self.learning_rate * self.dw2
        self.b2 -= self.learning_rate * self.db2
        self.w1 -= self.learning_rate * self.dw1
        self.b1 -= self.learning_rate * self.db1

    def loss(self, y_true, y_pred):
        """Compute the configured loss between targets and predictions.

        Predictions are clipped to ``[1e-10, 1 - 1e-10]`` only for the
        log-based losses, where it prevents ``log(0)``. Mean squared error
        is computed on the raw predictions so that outputs outside (0, 1)
        (possible with relu or tanh) are not silently distorted.

        Raises:
            ValueError: If ``loss_func`` is not a supported name.
        """
        if self.loss_func == 'mse':
            return np.mean((y_true - y_pred) ** 2)

        epsilon = 1e-10
        y_pred = np.clip(y_pred, epsilon, 1. - epsilon)
        if self.loss_func == 'log_loss':
            return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
        if self.loss_func == 'categorical_crossentropy':
            return -np.mean(y_true * np.log(y_pred))
        raise ValueError('Nieprawidłowa funkcja straty')

    def fit(self, X_train, y_train, X_test, y_test):
        """Train for ``self.epochs`` epochs, recording train and test loss.

        The training loss stored for an epoch is computed from the forward
        pass made *before* that epoch's weight update; the test loss is
        computed after the update.
        """
        for _ in range(self.epochs):
            self.predict(X_train)
            self.backward(X_train, y_train)

            train_loss = self.loss(y_train, self.a2)
            self.train_loss.append(train_loss)

            self.predict(X_test)
            test_loss = self.loss(y_test, self.a2)
            self.test_loss.append(test_loss)
import re

def tokenize_str(str_dirty):
    """Lower-case a text, delete ASCII punctuation and split it into tokens.

    Punctuation characters are removed outright (not replaced by a space),
    so ``"e-mail"`` becomes ``"email"``. Splitting happens on any run of
    whitespace *after* punctuation has been removed, so inputs such as
    ``"a - b"`` or text with leading/trailing spaces never yield
    empty-string tokens.

    Args:
        str_dirty: Raw text.

    Returns:
        List of tokens; an empty list for empty or punctuation-only input.
    """
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # One pass over the string instead of one .replace() call per character.
    cleaned = str_dirty.lower().translate(str.maketrans('', '', punctuation))
    return cleaned.split()
import csv

def load_data(path):
    """Read a labelled TSV file into tokenised texts and integer labels.

    Every row must carry the integer label in column 0 and the raw text in
    column 1.

    Args:
        path: Path of the tab-separated training file.

    Returns:
        Tuple ``(data, labels)`` where ``data`` is a list of token lists
        (one per row) and ``labels`` is the matching list of ints.

    Raises:
        IndexError: If a row has fewer than two columns.
        ValueError: If a label is not an integer.
    """
    data = []
    labels = []

    # The encoding is pinned so the result does not depend on the platform
    # default; with errors="ignore" a wrong default would silently drop or
    # garble non-ASCII letters.
    # NOTE(review): assumes the dataset is UTF-8 encoded — confirm.
    with open(path, encoding="utf-8", errors="ignore") as tsv_handle:
        for row in csv.reader(tsv_handle, delimiter="\t"):
            labels.append(int(row[0]))
            data.append(tokenize_str(row[1]))

    return data, labels

def load_test_data(path):
    """Read an unlabelled TSV file and tokenise the text in column 0.

    Args:
        path: Path of the tab-separated input file.

    Returns:
        List of token lists, one per row of the file.

    Raises:
        IndexError: If a row is empty.
    """
    # The encoding is pinned so the result does not depend on the platform
    # default; with errors="ignore" a wrong default would silently drop or
    # garble non-ASCII letters.
    # NOTE(review): assumes the dataset is UTF-8 encoded — confirm.
    with open(path, encoding="utf-8", errors="ignore") as tsv_handle:
        rows = csv.reader(tsv_handle, delimiter="\t")
        return [tokenize_str(row[0]) for row in rows]

def load_test_labels(path):
    """Read the integer labels stored in column 0 of a TSV file.

    Args:
        path: Path of the tab-separated label file.

    Returns:
        List of ints, one per row; any further columns are ignored.
    """
    with open(path, errors="ignore") as file:
        rows = csv.reader(file, delimiter="\t")
        return [int(row[0]) for row in rows]

Ładowanie danych

# Locations of the dataset splits.
TRAIN_PATH = "./sport-text-classification-ball-isi-public/train/train.tsv"
TEST_DEV_DATA = "./sport-text-classification-ball-isi-public/dev-0/in.tsv"
TEST_DEV_LABELS = "./sport-text-classification-ball-isi-public/dev-0/expected.tsv"
TEST_A_DATA = "./sport-text-classification-ball-isi-public/test-A/in.tsv"

# Training split: tokenised texts together with their labels.
X_train, y_train = load_data(TRAIN_PATH)

# dev-0 split: texts and labels live in two separate files.
X_test = load_test_data(TEST_DEV_DATA)
y_test = load_test_labels(TEST_DEV_LABELS)

# test-A split: texts only.
X_test2 = load_test_data(TEST_A_DATA)
from gensim.models import KeyedVectors

# Load pre-trained word embeddings (gensim KeyedVectors) from a local file.
# NOTE(review): the file name suggests 100-dimensional Polish vectors — confirm.
word2vec = KeyedVectors.load("word2vec_100_3_polish.bin")
import numpy as np
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric, remove_stopwords

def document_to_vector(document, model):
    """Represent a tokenised document as the mean of its word vectors.

    Args:
        document: Iterable of tokens.
        model: Embedding lookup supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model``, or a zero vector of length ``model.vector_size`` when no
        token is known.
    """
    known_vectors = []
    for token in document:
        if token in model:
            known_vectors.append(model[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)
# Replace every tokenised document by its averaged embedding and stack the
# results into one feature matrix per split.
X_train = np.array([document_to_vector(doc, word2vec) for doc in X_train])
X_test = np.array([document_to_vector(doc, word2vec) for doc in X_test])

# Labels become column vectors with one row per document.
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

Testy parametrów sieci

def accuracy(y_true, y_pred):
    """Fraction of predictions that match ``y_true`` after thresholding.

    A score strictly greater than 0.5 counts as class 1, anything else as
    class 0.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    hits = hard_labels == y_true
    return np.mean(hits)
# Network dimensions and training hyper-parameters shared by all runs below.
# The input width is the number of columns of the training feature matrix.
input_size = X_train.shape[1]
hidden_size = 64
output_size = 1  # single output unit
learning_rate = 0.01
epochs = 1000

# Activation and loss names that are combined into an experiment grid.
# NOTE(review): with output_size == 1, 'categorical_crossentropy' makes the
# network apply a row-wise softmax over a single column, which is always 1.0,
# so every sample is predicted as class 1 regardless of the activation. This
# is consistent with the identical accuracy printed for that loss — confirm
# whether this combination is meant to be part of the grid.
act_functions = ['relu', 'tanh', 'sigmoid']
loss_functions = ['categorical_crossentropy', 'mse', 'log_loss']
def run_and_test_model(act_func, loss_func):
    """Train a fresh network for one activation/loss pair and print its accuracy.

    Network sizes, learning rate, epoch count and the train/test arrays are
    read from module-level names. Accuracy is measured on ``X_test`` /
    ``y_test`` and printed; nothing is returned.

    Args:
        act_func: Activation name passed to ``NeuralNetwork``.
        loss_func: Loss name passed to ``NeuralNetwork``.
    """
    network = NeuralNetwork(
        input_size, hidden_size, output_size,
        act_func=act_func, loss_func=loss_func,
        learning_rate=learning_rate, epochs=epochs,
    )
    network.fit(X_train, y_train, X_test, y_test)

    test_acc = accuracy(y_test, network.predict(X_test))
    print(f'Dokładność na zbiorze {act_func} - {loss_func}: {test_acc * 100:.2f}%')
# Train and score one network per activation/loss combination; the loss
# varies fastest, so all losses are tried before moving to the next activation.
for act, loss in ((a, lf) for a in act_functions for lf in loss_functions):
    run_and_test_model(act, loss)
Dokładność na zbiorze relu - categorical_crossentropy: 63.63%
Dokładność na zbiorze relu - mse: 71.77%
Dokładność na zbiorze relu - log_loss: 43.56%
Dokładność na zbiorze tanh - categorical_crossentropy: 63.63%
Dokładność na zbiorze tanh - mse: 71.46%
Dokładność na zbiorze tanh - log_loss: 72.21%
Dokładność na zbiorze sigmoid - categorical_crossentropy: 63.63%
Dokładność na zbiorze sigmoid - mse: 71.53%
Dokładność na zbiorze sigmoid - log_loss: 65.00%
# Vectorise the test-A documents the same way as the other splits.
X_test2 = [document_to_vector(doc, word2vec) for doc in X_test2]

# Stack the test-A vectors themselves. Building this matrix from X_test
# would replace the test-A features with the dev-0 ones, and the test-A
# predictions written later would then describe the wrong documents.
X_test2 = np.array(X_test2)
# NOTE(review): no test-A label file is loaded in the visible code, so
# y_test2 is just a column-vector copy of the dev-0 labels — confirm it is
# still needed.
y_test2 = np.array(y_test).reshape(-1, 1)

# Final model: relu activation with MSE loss, trained on the training split
# and monitored on dev-0.
nn = NeuralNetwork(input_size, hidden_size, output_size,
                   act_func='relu', loss_func='mse',
                   learning_rate=learning_rate, epochs=epochs)

nn.fit(X_train, y_train, X_test, y_test)
def save_predictions_to_tsv(predictions, filename):
    """Write ``predictions`` to ``filename`` as tab-separated integers.

    Each row of the array becomes one line of the file.
    """
    np.savetxt(fname=filename, X=predictions, fmt='%d', delimiter='\t')

# dev-0: score the documents, turn scores into 0/1 labels (a score of 0.5 or
# more counts as class 1) and write them next to the dev-0 input file.
test_predictions = nn.predict(X_test)
binary_predictions = np.where(test_predictions >= 0.5, 1, 0)
save_predictions_to_tsv(binary_predictions, './sport-text-classification-ball-isi-public/dev-0/out.tsv')

# test-A: same procedure for the second feature matrix.
test_predictions2 = nn.predict(X_test2)
binary_predictions2 = np.where(test_predictions2 >= 0.5, 1, 0)
save_predictions_to_tsv(binary_predictions2, './sport-text-classification-ball-isi-public/test-A/out.tsv')