"""Binary sport-text classifier: averaged word2vec vectors + a small NumPy MLP.

Reconstructed from the code cells of the notebook ``Word2Vec.ipynb``, added in
commit 9cf44989ff11f54c1f160e0a6c2dbc7847f4a96d ('Upload files to "/"',
author s464909, Fri May 17 11:51:57 2024 +0200).

Importing this module only defines functions and classes; the data loading,
training and prediction export run when the file is executed as a script.
"""

import csv
import re
import string

import numpy as np


# ---------------------------------------------------------------------------
# Function and neural-network definitions
# ---------------------------------------------------------------------------

def sigmoid(x, e=np.e):
    """Return the logistic sigmoid ``1 / (1 + e**(-x))`` element-wise.

    ``e`` is the base of the exponential and defaults to Euler's number.
    """
    return 1 / (1 + e ** (-x))


def sigmoid_derivative(x):
    """Return ``x * (1 - x)``.

    This is the sigmoid derivative expressed in terms of the sigmoid *output*,
    so ``x`` must already be an activation value, not a pre-activation.
    """
    return x * (1 - x)


def tanh(x):
    """Return the hyperbolic tangent of ``x`` element-wise."""
    return np.tanh(x)


def tanh_derivative(x):
    """Return ``1 - tanh(x)**2``, the tanh derivative at *pre-activation* ``x``."""
    return 1 - np.tanh(x) ** 2


def relu(x):
    """Return ``max(0, x)`` element-wise."""
    return np.maximum(0, x)


def relu_derivative(x):
    """Return 0 where ``x <= 0`` and 1 elsewhere."""
    return np.where(x <= 0, 0, 1)


def softmax(x):
    """Return the row-wise softmax of the 2-D array ``x``.

    The row maximum is subtracted before exponentiating so that ``np.exp``
    does not overflow for large inputs.
    """
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exps / np.sum(exps, axis=1, keepdims=True)


# Maps the ``act_func`` name accepted by NeuralNetwork to its implementation.
_ACTIVATIONS = {'sigmoid': sigmoid, 'relu': relu, 'tanh': tanh}


class NeuralNetwork:
    """Fully connected network with one hidden layer, trained by full-batch
    gradient descent.

    Parameters
    ----------
    input_size, hidden_size, output_size:
        Layer widths.
    act_func:
        ``'sigmoid'``, ``'relu'`` or ``'tanh'``. Used for the hidden layer and,
        unless the loss is ``'categorical_crossentropy'``, for the output layer.
    loss_func:
        ``'mse'``, ``'log_loss'`` or ``'categorical_crossentropy'``.
    learning_rate:
        Step size of every gradient-descent update.
    epochs:
        Number of passes ``fit`` makes over the training set.
    """

    def __init__(self, input_size, hidden_size, output_size,
                 act_func, loss_func,
                 learning_rate, epochs):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.activation_func = act_func
        self.loss_func = loss_func

        # Weights are drawn from a standard normal; biases start at zero.
        self.w1 = np.random.randn(self.input_size, self.hidden_size)
        self.w2 = np.random.randn(self.hidden_size, self.output_size)

        self.b1 = np.zeros((1, self.hidden_size))
        self.b2 = np.zeros((1, self.output_size))

        # One entry per epoch, appended by fit().
        self.train_loss = []
        self.test_loss = []

    def _activation(self):
        """Return the configured activation function.

        Raises ValueError when ``activation_func`` is not a supported name.
        """
        try:
            return _ACTIVATIONS[self.activation_func]
        except (KeyError, TypeError):
            raise ValueError('Nieprawidłowa funkcja aktywacji') from None

    def predict(self, X):
        """Run a forward pass on ``X`` and return the output activations.

        The intermediate values ``z1``, ``a1``, ``z2`` and ``a2`` are stored on
        the instance because ``backward`` reads them.

        With ``loss_func == 'categorical_crossentropy'`` the output layer is a
        row-wise softmax. Note that softmax over a single output column is
        identically 1, so that loss is only meaningful for ``output_size > 1``.

        Raises ValueError for an unsupported activation name.
        """
        self.z1 = np.dot(X, self.w1) + self.b1
        activate = self._activation()
        self.a1 = activate(self.z1)

        self.z2 = np.dot(self.a1, self.w2) + self.b2
        if self.loss_func == 'categorical_crossentropy':
            self.a2 = softmax(self.z2)
        else:
            self.a2 = activate(self.z2)
        return self.a2

    def backward(self, X, Y):
        """Back-propagate from the last ``predict`` call and update parameters.

        ``predict(X)`` must have been called first so that ``z1``, ``a1`` and
        ``a2`` correspond to ``X``. Gradients are averaged over the rows of
        ``X`` and kept on the instance (``dw1``, ``db1``, ``dw2``, ``db2``).

        Raises ValueError for an unsupported activation name.
        """
        m = X.shape[0]

        # NOTE(review): ``a2 - Y`` is the exact output delta for sigmoid with
        # log_loss and for softmax with categorical cross-entropy; for the
        # other activation/loss pairs it omits the output activation's
        # derivative. Kept as in the notebook - confirm whether intended.
        self.dz2 = self.a2 - Y

        self.dw2 = (1 / m) * np.dot(self.a1.T, self.dz2)
        self.db2 = (1 / m) * np.sum(self.dz2, axis=0, keepdims=True)

        hidden_error = np.dot(self.dz2, self.w2.T)
        if self.activation_func == 'sigmoid':
            # sigmoid_derivative is written in terms of the activation output.
            self.dz1 = hidden_error * sigmoid_derivative(self.a1)
        elif self.activation_func == 'relu':
            # a1 > 0 exactly where z1 > 0, so the output works as the argument.
            self.dz1 = hidden_error * relu_derivative(self.a1)
        elif self.activation_func == 'tanh':
            # tanh_derivative applies tanh itself, so it needs the
            # pre-activation; passing a1 would evaluate 1 - tanh(tanh(z1))**2.
            self.dz1 = hidden_error * tanh_derivative(self.z1)
        else:
            raise ValueError('Nieprawidłowa funkcja aktywacji')
        self.dw1 = (1 / m) * np.dot(X.T, self.dz1)
        self.db1 = (1 / m) * np.sum(self.dz1, axis=0, keepdims=True)

        # Update weights and biases.
        self.w2 -= self.learning_rate * self.dw2
        self.b2 -= self.learning_rate * self.db2
        self.w1 -= self.learning_rate * self.dw1
        self.b1 -= self.learning_rate * self.db1

    def loss(self, y_true, y_pred):
        """Return the mean loss of ``y_pred`` against ``y_true``.

        Predictions are clipped into ``[1e-10, 1 - 1e-10]`` only for the two
        log-based losses, where the clip prevents ``log(0)``. MSE is computed
        on the raw predictions so that outputs outside (0, 1), such as negative
        tanh values, are not silently altered.

        Raises ValueError for an unsupported loss name.
        """
        if self.loss_func == 'mse':
            return np.mean((y_pred - y_true) ** 2)

        if self.loss_func not in ('log_loss', 'categorical_crossentropy'):
            raise ValueError('Nieprawidłowa funkcja straty')

        epsilon = 1e-10  # small value that keeps log() away from 0
        y_pred = np.clip(y_pred, epsilon, 1. - epsilon)
        if self.loss_func == 'log_loss':
            return -np.mean(y_true * np.log(y_pred)
                            + (1 - y_true) * np.log(1 - y_pred))
        return -np.mean(y_true * np.log(y_pred))

    def fit(self, X_train, y_train, X_test, y_test):
        """Train for ``self.epochs`` epochs, recording both losses per epoch.

        The training loss appended for an epoch comes from the forward pass
        made *before* that epoch's weight update; the test loss comes from a
        forward pass made after it.
        """
        for _ in range(self.epochs):
            self.predict(X_train)
            self.backward(X_train, y_train)

            train_loss = self.loss(y_train, self.a2)
            self.train_loss.append(train_loss)

            self.predict(X_test)
            test_loss = self.loss(y_test, self.a2)
            self.test_loss.append(test_loss)


# Matches every ASCII punctuation character; compiled once for tokenize_str.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def tokenize_str(str_dirty):
    """Lower-case ``str_dirty``, delete ASCII punctuation and split into words.

    Punctuation is deleted rather than replaced by a space, so ``"a-b"``
    becomes the single token ``"ab"``. Splitting is done on runs of any
    whitespace, so no empty tokens are produced and an empty or
    punctuation-only string yields ``[]``.
    """
    return _PUNCTUATION_RE.sub('', str_dirty.lower()).split()


def _read_tsv(path):
    """Return every row of the tab-separated file at ``path`` as a list of str.

    The file is read as UTF-8. Quoting is disabled so that a quotation mark in
    the text is treated as an ordinary character and cannot merge several
    physical lines into one field.
    """
    with open(path, encoding="utf-8", newline="") as handle:
        return list(csv.reader(handle, delimiter="\t", quoting=csv.QUOTE_NONE))


def load_data(path):
    """Load a labelled TSV file whose rows are ``label<TAB>text``.

    Returns ``(data, labels)``: ``data`` holds one token list per row and
    ``labels`` the matching integer labels. Completely blank lines are
    skipped. Turning the tokens into vectors is left to the caller.
    """
    data = []
    labels = []

    for row in _read_tsv(path):
        if not row:
            continue
        labels.append(int(row[0]))
        data.append(tokenize_str(row[1]))

    return data, labels


def load_test_data(path):
    """Load an unlabelled TSV file and return one token list per line.

    Only the first column is read. A blank line yields an empty token list
    instead of being dropped, so the result stays aligned line-for-line with a
    separate labels file.
    """
    return [tokenize_str(row[0]) if row else [] for row in _read_tsv(path)]


def load_test_labels(path):
    """Load a file with one integer label per line and return them as a list."""
    return [int(row[0]) for row in _read_tsv(path)]


def document_to_vector(document, model):
    """Return the mean embedding of the words of ``document`` known to ``model``.

    ``document`` is an iterable of tokens. ``model`` must support
    ``word in model``, ``model[word]`` and ``model.vector_size``. Words missing
    from the model are ignored; if none is known, a zero vector of length
    ``model.vector_size`` is returned.
    """
    word_vectors = [model[word] for word in document if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)


def _to_binary_labels(y_pred):
    """Threshold network outputs at 0.5 (strictly greater) into 0/1 ints."""
    return (y_pred > 0.5).astype(int)


def accuracy(y_true, y_pred):
    """Return the fraction of thresholded predictions that equal ``y_true``."""
    predictions = _to_binary_labels(y_pred)  # threshold for binary classification
    return np.mean(predictions == y_true)


def run_and_test_model(act_func, loss_func):
    """Train one network with the given activation/loss and print test accuracy.

    Reads the module-level settings ``input_size``, ``hidden_size``,
    ``output_size``, ``learning_rate`` and ``epochs`` and the arrays
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``; they must be defined
    before the call.
    """
    # Build the model.
    nn = NeuralNetwork(input_size, hidden_size, output_size,
                       act_func=act_func, loss_func=loss_func,
                       learning_rate=learning_rate, epochs=epochs)
    # Train the model.
    nn.fit(X_train, y_train, X_test, y_test)

    # Compute accuracy on the test set.
    test_predictions = nn.predict(X_test)
    test_acc = accuracy(y_test, test_predictions)
    print(f'Dokładność na zbiorze testowym: {test_acc * 100:.2f}%')


def save_predictions_to_tsv(predictions, filename):
    """Write integer ``predictions`` to ``filename`` as tab-separated text."""
    np.savetxt(filename, predictions, fmt='%d', delimiter='\t')


# ---------------------------------------------------------------------------
# Dataset locations
# ---------------------------------------------------------------------------

TRAIN_PATH = "./sport-text-classification-ball-isi-public/train/train.tsv"
TEST_DEV_DATA = "./sport-text-classification-ball-isi-public/dev-0/in.tsv"
TEST_A_DATA = "./sport-text-classification-ball-isi-public/test-A/in.tsv"
TEST_DEV_LABELS = "./sport-text-classification-ball-isi-public/dev-0/expected.tsv"


if __name__ == "__main__":
    from gensim.models import KeyedVectors
    # Imported by the notebook although no cell references these names.
    from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric, remove_stopwords

    # --- Loading the data --------------------------------------------------
    X_train, y_train = load_data(TRAIN_PATH)
    X_test, y_test = load_test_data(TEST_DEV_DATA), load_test_labels(TEST_DEV_LABELS)
    X_test2 = load_test_data(TEST_A_DATA)

    word2vec = KeyedVectors.load("word2vec_100_3_polish.bin")

    # Prepare the data for training: one averaged embedding per document and
    # labels as a column vector.
    X_train = np.array([document_to_vector(doc, word2vec) for doc in X_train])
    X_test = np.array([document_to_vector(doc, word2vec) for doc in X_test])
    y_train = np.array(y_train).reshape(-1, 1)
    y_test = np.array(y_test).reshape(-1, 1)

    # --- Network parameter tests -------------------------------------------
    # Settings and candidate lists for comparing configurations through
    # run_and_test_model().
    input_size = X_train.shape[1]
    hidden_size = 64
    output_size = 1
    learning_rate = 0.01
    epochs = 500

    act_functions = ['relu', 'tanh', 'sigmoid']
    loss_functions = ['categorical_crossentropy', 'mse', 'log_loss']

    # Accuracies recorded in the notebook (training set / test set). They were
    # produced by the notebook's original code and may differ for this version.
    #   relu    + categorical_crossentropy: 63.62% / 63.63%
    #   relu    + mse:                      49.04% / 49.60%
    #   relu    + log_loss:                 63.29% / 63.98%
    #   tanh    + categorical_crossentropy: 63.62% / 63.63%
    #   tanh    + mse:                      71.85% / 70.89%
    #   tanh    + log_loss:                 72.18% / 71.06%
    #   sigmoid + categorical_crossentropy: 63.62% / 63.63%
    #   sigmoid + mse:                      62.54% / 61.81%
    #   sigmoid + log_loss:                 58.20% / 58.05%

    # --- Final model -------------------------------------------------------
    input_size = X_train.shape[1]
    hidden_size = 72
    output_size = 1
    learning_rate = 0.01
    epochs = 1000

    nn = NeuralNetwork(input_size, hidden_size, output_size,
                       act_func='tanh', loss_func='mse',
                       learning_rate=learning_rate, epochs=epochs)
    # Train the model.
    nn.fit(X_train, y_train, X_test, y_test)

    # The output stored in the notebook for this step was 71.64%.
    test_predictions = nn.predict(X_test)
    test_acc = accuracy(y_test, test_predictions)
    print(f'Dokładność na zbiorze testowym: {test_acc * 100:.2f}%')

    # --- Exporting predictions ---------------------------------------------
    # Vectorise the test-A documents themselves. No label file is configured
    # for test-A, so only predictions are produced for it.
    X_test2 = np.array([document_to_vector(doc, word2vec) for doc in X_test2])

    # The same threshold helper is used as for the reported accuracy, so the
    # saved dev-0 labels match the printed score.
    binary_predictions = _to_binary_labels(test_predictions)
    save_predictions_to_tsv(binary_predictions, 'predictions_dev.tsv')

    test_predictions2 = nn.predict(X_test2)
    binary_predictions2 = _to_binary_labels(test_predictions2)
    save_predictions_to_tsv(binary_predictions2, 'predictions_a.tsv')