Upload files to "/"
This commit is contained in:
parent
bb74f0246d
commit
e1032f4494
421
Word2Vec.ipynb
Normal file
421
Word2Vec.ipynb
Normal file
@ -0,0 +1,421 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Definiowanie funkcji i sieci neuronowej"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"def sigmoid(x, e = 2.7183):\n",
|
||||||
|
" return 1 / (1 + e**(-x))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def sigmoid_derivative(x):\n",
|
||||||
|
" return x * (1 - x)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def tanh(x):\n",
|
||||||
|
" return np.tanh(x)\n",
|
||||||
|
"\n",
|
||||||
|
"def tanh_derivative(x):\n",
|
||||||
|
" return 1 - np.tanh(x) ** 2\n",
|
||||||
|
"\n",
|
||||||
|
"def relu(x):\n",
|
||||||
|
" return np.maximum(0, x)\n",
|
||||||
|
"\n",
|
||||||
|
"def relu_derivative(x):\n",
|
||||||
|
" return np.where(x <= 0, 0, 1)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def softmax(x):\n",
|
||||||
|
" exps = np.exp(x - np.max(x, axis=1, keepdims=True))\n",
|
||||||
|
" return exps/np.sum(exps, axis=1, keepdims=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"class NeuralNetwork:\n",
|
||||||
|
" def __init__(self, input_size, hidden_size, output_size, \n",
|
||||||
|
" act_func, loss_func, \n",
|
||||||
|
" learning_rate, epochs):\n",
|
||||||
|
" self.input_size = input_size\n",
|
||||||
|
" self.hidden_size = hidden_size\n",
|
||||||
|
" self.output_size = output_size\n",
|
||||||
|
" self.learning_rate = learning_rate\n",
|
||||||
|
" self.epochs = epochs\n",
|
||||||
|
" self.activation_func = act_func\n",
|
||||||
|
" self.loss_func = loss_func\n",
|
||||||
|
"\n",
|
||||||
|
" self.w1 = np.random.randn(self.input_size, self.hidden_size)\n",
|
||||||
|
" self.w2 = np.random.randn(self.hidden_size, self.output_size)\n",
|
||||||
|
"\n",
|
||||||
|
" self.b1 = np.zeros((1, self.hidden_size))\n",
|
||||||
|
" self.b2 = np.zeros((1, self.output_size))\n",
|
||||||
|
"\n",
|
||||||
|
" self.train_loss = []\n",
|
||||||
|
" self.test_loss = []\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" def predict(self, X):\n",
|
||||||
|
" self.z1 = np.dot(X, self.w1) + self.b1\n",
|
||||||
|
" if self.activation_func == 'sigmoid':\n",
|
||||||
|
" self.a1 = sigmoid(self.z1)\n",
|
||||||
|
" elif self.activation_func == 'relu':\n",
|
||||||
|
" self.a1 = relu(self.z1)\n",
|
||||||
|
" elif self.activation_func == 'tanh':\n",
|
||||||
|
" self.a1 = tanh(self.z1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" raise ValueError('Nieprawidłowa funkcja aktywacji')\n",
|
||||||
|
"\n",
|
||||||
|
" self.z2 = np.dot(self.a1, self.w2) + self.b2\n",
|
||||||
|
" if self.loss_func == 'categorical_crossentropy':\n",
|
||||||
|
" self.a2 = softmax(self.z2)\n",
|
||||||
|
" else:\n",
|
||||||
|
" if self.activation_func == 'sigmoid':\n",
|
||||||
|
" self.a2 = sigmoid(self.z2)\n",
|
||||||
|
" elif self.activation_func == 'relu':\n",
|
||||||
|
" self.a2 = relu(self.z2)\n",
|
||||||
|
" elif self.activation_func == 'tanh':\n",
|
||||||
|
" self.a2 = tanh(self.z2)\n",
|
||||||
|
" else:\n",
|
||||||
|
" raise ValueError('Nieprawidłowa funkcja aktywacji')\n",
|
||||||
|
" return self.a2\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" def backward(self, X, Y):\n",
|
||||||
|
" m = X.shape[0]\n",
|
||||||
|
" \n",
|
||||||
|
" self.dz2 = self.a2 - Y\n",
|
||||||
|
"\n",
|
||||||
|
" self.dw2 = (1 / m) * np.dot(self.a1.T, self.dz2)\n",
|
||||||
|
" self.db2 = (1 / m) * np.sum(self.dz2, axis=0, keepdims=True)\n",
|
||||||
|
" if self.activation_func == 'sigmoid':\n",
|
||||||
|
" self.dz1 = np.dot(self.dz2, self.w2.T) * sigmoid_derivative(self.a1)\n",
|
||||||
|
" elif self.activation_func == 'relu':\n",
|
||||||
|
" self.dz1 = np.dot(self.dz2, self.w2.T) * relu_derivative(self.a1)\n",
|
||||||
|
" elif self.activation_func == 'tanh':\n",
|
||||||
|
" self.dz1 = np.dot(self.dz2, self.w2.T) * tanh_derivative(self.z1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" raise ValueError('Nieprawidłowa funkcja aktywacji')\n",
|
||||||
|
" self.dw1 = (1 / m) * np.dot(X.T, self.dz1)\n",
|
||||||
|
" self.db1 = (1 / m) * np.sum(self.dz1, axis=0, keepdims=True)\n",
|
||||||
|
"\n",
|
||||||
|
" # Zaktualizuj wagi i przesunięcia\n",
|
||||||
|
" self.w2 -= self.learning_rate * self.dw2\n",
|
||||||
|
" self.b2 -= self.learning_rate * self.db2\n",
|
||||||
|
" self.w1 -= self.learning_rate * self.dw1\n",
|
||||||
|
" self.b1 -= self.learning_rate * self.db1\n",
|
||||||
|
"\n",
|
||||||
|
" def loss(self, y_true, y_pred):\n",
|
||||||
|
" epsilon = 1e-10 \n",
|
||||||
|
" y_pred = np.clip(y_pred, epsilon, 1. - epsilon)\n",
|
||||||
|
" if self.loss_func == 'mse':\n",
|
||||||
|
" return np.mean((y_true - y_pred) ** 2)\n",
|
||||||
|
" elif self.loss_func == 'log_loss':\n",
|
||||||
|
" return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))\n",
|
||||||
|
" elif self.loss_func == 'categorical_crossentropy':\n",
|
||||||
|
" return -np.mean(y_true * np.log(y_pred))\n",
|
||||||
|
" else:\n",
|
||||||
|
" raise ValueError('Nieprawidłowa funkcja straty')\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" def fit(self, X_train, y_train, X_test, y_test):\n",
|
||||||
|
" for _ in range(self.epochs):\n",
|
||||||
|
" self.predict(X_train)\n",
|
||||||
|
" self.backward(X_train, y_train)\n",
|
||||||
|
"\n",
|
||||||
|
" train_loss = self.loss(y_train, self.a2)\n",
|
||||||
|
" self.train_loss.append(train_loss)\n",
|
||||||
|
"\n",
|
||||||
|
" self.predict(X_test)\n",
|
||||||
|
" test_loss = self.loss(y_test, self.a2)\n",
|
||||||
|
" self.test_loss.append(test_loss)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
|
"def tokenize_str(str_dirty):\n",
|
||||||
|
" punctuation = '!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'\n",
|
||||||
|
" new_str = str_dirty.lower()\n",
|
||||||
|
" new_str = re.sub(' +', ' ', new_str)\n",
|
||||||
|
" for char in punctuation:\n",
|
||||||
|
" new_str = new_str.replace(char,'')\n",
|
||||||
|
" return new_str.split(' ')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import csv\n",
|
||||||
|
"\n",
|
||||||
|
"def load_data(path):\n",
|
||||||
|
" with open(path, errors=\"ignore\") as file:\n",
|
||||||
|
" tsv_file = csv.reader(file, delimiter=\"\\t\")\n",
|
||||||
|
" file = list(tsv_file)\n",
|
||||||
|
"\n",
|
||||||
|
" data = []\n",
|
||||||
|
" labels = []\n",
|
||||||
|
"\n",
|
||||||
|
" for elem in file:\n",
|
||||||
|
" labels.append(int(elem[0]))\n",
|
||||||
|
" data.append(tokenize_str(elem[1]))\n",
|
||||||
|
"\n",
|
||||||
|
" return data, labels\n",
|
||||||
|
"\n",
|
||||||
|
"def load_test_data(path):\n",
|
||||||
|
" with open(path, errors=\"ignore\") as file:\n",
|
||||||
|
" tsv_file = csv.reader(file, delimiter=\"\\t\")\n",
|
||||||
|
" data = list(tsv_file)\n",
|
||||||
|
" data = [tokenize_str(elem[0]) for elem in data]\n",
|
||||||
|
" return data\n",
|
||||||
|
"\n",
|
||||||
|
"def load_test_labels(path):\n",
|
||||||
|
" with open(path, errors=\"ignore\") as file:\n",
|
||||||
|
" tsv_file = csv.reader(file, delimiter=\"\\t\")\n",
|
||||||
|
" data = list(tsv_file)\n",
|
||||||
|
" data = [int(elem[0]) for elem in data]\n",
|
||||||
|
" \n",
|
||||||
|
" return data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Ładowanie danych"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"TRAIN_PATH = \"./sport-text-classification-ball-isi-public/train/train.tsv\"\n",
|
||||||
|
"TEST_DEV_DATA = \"./sport-text-classification-ball-isi-public/dev-0/in.tsv\"\n",
|
||||||
|
"TEST_A_DATA = \"./sport-text-classification-ball-isi-public/test-A/in.tsv\"\n",
|
||||||
|
"TEST_DEV_LABELS = \"./sport-text-classification-ball-isi-public/dev-0/expected.tsv\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X_train, y_train = load_data(TRAIN_PATH)\n",
|
||||||
|
"X_test, y_test = load_test_data(TEST_DEV_DATA), load_test_labels(TEST_DEV_LABELS)\n",
|
||||||
|
"X_test2 = load_test_data(TEST_A_DATA)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from gensim.models import KeyedVectors\n",
|
||||||
|
"\n",
|
||||||
|
"word2vec = KeyedVectors.load(\"word2vec_100_3_polish.bin\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"from gensim.models import KeyedVectors\n",
|
||||||
|
"from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric, remove_stopwords\n",
|
||||||
|
"\n",
|
||||||
|
"def document_to_vector(document, model):\n",
|
||||||
|
" words = document\n",
|
||||||
|
" word_vectors = [model[word] for word in words if word in model]\n",
|
||||||
|
" if len(word_vectors) == 0:\n",
|
||||||
|
" return np.zeros(model.vector_size)\n",
|
||||||
|
" return np.mean(word_vectors, axis=0)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X_train = [document_to_vector(doc, word2vec) for doc in X_train]\n",
|
||||||
|
"X_test = [document_to_vector(doc, word2vec) for doc in X_test]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X_train = np.array(X_train)\n",
|
||||||
|
"X_test = np.array(X_test)\n",
|
||||||
|
"y_train = np.array(y_train).reshape(-1, 1)\n",
|
||||||
|
"y_test = np.array(y_test).reshape(-1, 1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Testy parametrów sieci"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def accuracy(y_true, y_pred):\n",
|
||||||
|
" predictions = (y_pred > 0.5).astype(int)\n",
|
||||||
|
" return np.mean(predictions == y_true)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"input_size = X_train.shape[1]\n",
|
||||||
|
"hidden_size = 64\n",
|
||||||
|
"output_size = 1 \n",
|
||||||
|
"learning_rate = 0.01\n",
|
||||||
|
"epochs = 1000\n",
|
||||||
|
"\n",
|
||||||
|
"act_functions = ['relu', 'tanh', 'sigmoid']\n",
|
||||||
|
"loss_functions = ['categorical_crossentropy', 'mse', 'log_loss']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def run_and_test_model(act_func, loss_func):\n",
|
||||||
|
" nn = NeuralNetwork(input_size, hidden_size, output_size, \n",
|
||||||
|
" act_func=act_func, loss_func=loss_func, \n",
|
||||||
|
" learning_rate=learning_rate, epochs=epochs)\n",
|
||||||
|
" \n",
|
||||||
|
" nn.fit(X_train, y_train, X_test, y_test)\n",
|
||||||
|
" \n",
|
||||||
|
" test_predictions = nn.predict(X_test)\n",
|
||||||
|
" test_acc = accuracy(y_test, test_predictions)\n",
|
||||||
|
" print(f'Dokładność na zbiorze {act_func} - {loss_func}: {test_acc * 100:.2f}%')\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Dokładność na zbiorze relu - categorical_crossentropy: 63.63%\n",
|
||||||
|
"Dokładność na zbiorze relu - mse: 71.77%\n",
|
||||||
|
"Dokładność na zbiorze relu - log_loss: 43.56%\n",
|
||||||
|
"Dokładność na zbiorze tanh - categorical_crossentropy: 63.63%\n",
|
||||||
|
"Dokładność na zbiorze tanh - mse: 71.46%\n",
|
||||||
|
"Dokładność na zbiorze tanh - log_loss: 72.21%\n",
|
||||||
|
"Dokładność na zbiorze sigmoid - categorical_crossentropy: 63.63%\n",
|
||||||
|
"Dokładność na zbiorze sigmoid - mse: 71.53%\n",
|
||||||
|
"Dokładność na zbiorze sigmoid - log_loss: 65.00%\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for act in act_functions:\n",
|
||||||
|
" for loss in loss_functions:\n",
|
||||||
|
" run_and_test_model(act, loss)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X_test2 = [document_to_vector(doc, word2vec) for doc in X_test2]\n",
|
||||||
|
"\n",
|
||||||
|
"X_test2 = np.array(X_test2)\n",
|
||||||
|
"y_test2 = np.array(y_test).reshape(-1, 1)\n",
|
||||||
|
"\n",
|
||||||
|
"nn = NeuralNetwork(input_size, hidden_size, output_size, \n",
|
||||||
|
" act_func='relu', loss_func='mse', \n",
|
||||||
|
" learning_rate=learning_rate, epochs=epochs)\n",
|
||||||
|
"\n",
|
||||||
|
"nn.fit(X_train, y_train, X_test, y_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def save_predictions_to_tsv(predictions, filename):\n",
|
||||||
|
" np.savetxt(filename, predictions, fmt='%d', delimiter='\\t')\n",
|
||||||
|
"\n",
|
||||||
|
"test_predictions = nn.predict(X_test)\n",
|
||||||
|
"binary_predictions = (test_predictions >= 0.5).astype(int)\n",
|
||||||
|
"save_predictions_to_tsv(binary_predictions, './sport-text-classification-ball-isi-public/dev-0/out.tsv')\n",
|
||||||
|
"\n",
|
||||||
|
"test_predictions2 = nn.predict(X_test2)\n",
|
||||||
|
"binary_predictions2 = (test_predictions2 >= 0.5).astype(int)\n",
|
||||||
|
"save_predictions_to_tsv(binary_predictions2, './sport-text-classification-ball-isi-public/test-A/out.tsv')"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user