{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word2vec + feed-forward network text classifier\n",
    "\n",
    "Binary text classification: every document is tokenized, embedded as the mean of its word2vec vectors and scored by a small feed-forward network. Predictions for the dev and test splits are written to `dev-0/out.tsv` and `test-A/out.tsv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "import gensim\n",
    "import gensim.downloader\n",
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from gensim.models import KeyedVectors\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import GaussianNB, MultinomialNB"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "TRAIN_PATH = './train/train.tsv'\n",
    "DEV_IN_PATH = './dev-0/in.tsv'\n",
    "TEST_IN_PATH = './test-A/in.tsv'\n",
    "DEV_OUT_PATH = 'dev-0/out.tsv'\n",
    "TEST_OUT_PATH = 'test-A/out.tsv'\n",
    "WORD2VEC_PATH = './word2vec/word2vec_100_3_polish.bin'\n",
    "\n",
    "HIDDEN_SIZE = 200\n",
    "LEARNING_RATE = 0.01\n",
    "BATCH_SIZE = 12\n",
    "EPOCHS = 6\n",
    "THRESHOLD = 0.5  # sigmoid output above this value is labelled 1\n",
    "SEED = 0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_tsv(path):\n",
    "    \"\"\"Read a header-less, tab-separated file without quote processing.\n",
    "\n",
    "    Rows with an unexpected number of fields are skipped with a warning\n",
    "    instead of aborting the load.\n",
    "    \"\"\"\n",
    "    options = dict(sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
    "    try:\n",
    "        # pandas >= 1.3 spelling; `error_bad_lines` was removed in pandas 2.0.\n",
    "        return pd.read_table(path, on_bad_lines='warn', **options)\n",
    "    except TypeError:\n",
    "        # Older pandas does not know `on_bad_lines`.\n",
    "        return pd.read_table(path, error_bad_lines=False, **options)\n",
    "\n",
    "\n",
    "def tokenize_data(data):\n",
    "    \"\"\"Tokenize each document and keep only lower-cased alphabetic tokens.\n",
    "\n",
    "    Args:\n",
    "        data: iterable of raw document strings.\n",
    "\n",
    "    Returns:\n",
    "        A list holding one list of tokens per document.\n",
    "    \"\"\"\n",
    "    # Build new lists rather than deleting while indexing: deleting doc[i]\n",
    "    # and then advancing i skips the token that slides into position i.\n",
    "    return [\n",
    "        [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]\n",
    "        for text in data\n",
    "    ]\n",
    "\n",
    "\n",
    "def document_vectors(documents, word_vectors):\n",
    "    \"\"\"Embed each tokenized document as the mean of its known word vectors.\n",
    "\n",
    "    Args:\n",
    "        documents: list of token lists.\n",
    "        word_vectors: gensim KeyedVectors used for the lookup.\n",
    "\n",
    "    Returns:\n",
    "        float32 array of shape (len(documents), word_vectors.vector_size).\n",
    "        Documents without any in-vocabulary token stay all-zero.\n",
    "    \"\"\"\n",
    "    vectors = np.zeros((len(documents), word_vectors.vector_size), dtype=np.float32)\n",
    "    for row, tokens in enumerate(documents):\n",
    "        known = [word_vectors[token] for token in tokens if token in word_vectors]\n",
    "        if known:\n",
    "            vectors[row] = np.mean(known, axis=0)\n",
    "    return vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNetwork(torch.nn.Module):\n",
    "    \"\"\"One-hidden-layer network: Linear -> ReLU -> Linear -> sigmoid.\"\"\"\n",
    "\n",
    "    def __init__(self, input_size, hidden_size, num_classes):\n",
    "        super().__init__()\n",
    "        self.l1 = torch.nn.Linear(input_size, hidden_size)\n",
    "        self.l2 = torch.nn.Linear(hidden_size, num_classes)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Return sigmoid outputs in (0, 1) for a batch of input vectors.\"\"\"\n",
    "        hidden = torch.relu(self.l1(x))\n",
    "        return torch.sigmoid(self.l2(hidden))\n",
    "\n",
    "\n",
    "def predict_labels(model, vectors, threshold=THRESHOLD):\n",
    "    \"\"\"Return int32 0/1 labels (shape (n, 1)) for an array of document vectors.\"\"\"\n",
    "    model.eval()\n",
    "    # No gradients are needed for inference, so skip building the graph.\n",
    "    with torch.no_grad():\n",
    "        probabilities = model(torch.tensor(vectors, dtype=torch.float32))\n",
    "    return (probabilities.cpu().numpy() > threshold).astype(np.int32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and tokenize data\n",
    "\n",
    "The training file holds the label in column 0 and the text in column 1; the dev and test inputs hold only the text. Malformed rows (wrong number of fields) are skipped with a warning."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = read_tsv(TRAIN_PATH)\n",
    "dev_df = read_tsv(DEV_IN_PATH)\n",
    "test_df = read_tsv(TEST_IN_PATH)\n",
    "\n",
    "y_train = train_df[0].values\n",
    "X_train = tokenize_data(train_df[1].values)\n",
    "X_dev = tokenize_data(dev_df[0].values)\n",
    "X_test = tokenize_data(test_df[0].values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Document embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Kept under its own name so the classifier below does not shadow it.\n",
    "word2vec = KeyedVectors.load(WORD2VEC_PATH)\n",
    "\n",
    "x_train_vectors = document_vectors(X_train, word2vec)\n",
    "x_dev_vectors = document_vectors(X_dev, word2vec)\n",
    "x_test_vectors = document_vectors(X_test, word2vec)\n",
    "\n",
    "# Convert once here instead of once per batch inside the training loop.\n",
    "x_train_tensor = torch.tensor(x_train_vectors, dtype=torch.float32)\n",
    "y_train_tensor = torch.tensor(y_train.astype(np.float32)).reshape(-1, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed here so that re-running this cell reproduces the same initial weights.\n",
    "torch.manual_seed(SEED)\n",
    "\n",
    "model = NeuralNetwork(x_train_vectors.shape[1], HIDDEN_SIZE, 1)\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)\n",
    "\n",
    "for epoch in range(EPOCHS):\n",
    "    loss_score = 0.0\n",
    "    acc_score = 0\n",
    "    items_total = 0\n",
    "    model.train()\n",
    "    for i in range(0, y_train_tensor.shape[0], BATCH_SIZE):\n",
    "        X = x_train_tensor[i:i + BATCH_SIZE]\n",
    "        Y = y_train_tensor[i:i + BATCH_SIZE]\n",
    "\n",
    "        Y_predictions = model(X)\n",
    "        acc_score += torch.sum((Y_predictions > THRESHOLD) == Y).item()\n",
    "        items_total += Y.shape[0]\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        loss = criterion(Y_predictions, Y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        # BCELoss returns the batch mean; weight it by the batch size.\n",
    "        loss_score += loss.item() * Y.shape[0]\n",
    "\n",
    "    seen = max(items_total, 1)  # avoid dividing by zero on an empty training set\n",
    "    print(f'epoch {epoch + 1}/{EPOCHS}: loss={loss_score / seen:.4f} accuracy={acc_score / seen:.4f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev_predictions = predict_labels(model, x_dev_vectors)\n",
    "dev_predictions.tofile(DEV_OUT_PATH, sep='\\n')\n",
    "\n",
    "# The test output must come from the test vectors, not the dev vectors.\n",
    "test_predictions = predict_labels(model, x_test_vectors)\n",
    "test_predictions.tofile(TEST_OUT_PATH, sep='\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}