{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim.downloader\n",
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from nltk.tokenize import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_tsv(path, **kwargs):\n",
    "    \"\"\"Read a headerless, unquoted TSV file, skipping malformed rows.\n",
    "\n",
    "    Extra keyword arguments (e.g. ``names``, ``usecols``) are forwarded to\n",
    "    ``pd.read_table``. Returns the resulting DataFrame.\n",
    "    \"\"\"\n",
    "    # quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text.\n",
    "    common = dict(sep='\\t', quoting=3, header=None, **kwargs)\n",
    "    try:\n",
    "        # Spelling understood by pandas >= 1.3 (skip bad rows, emit a warning).\n",
    "        return pd.read_table(path, on_bad_lines='warn', **common)\n",
    "    except TypeError:\n",
    "        # Older pandas only accepts the since-removed error_bad_lines flag.\n",
    "        return pd.read_table(path, error_bad_lines=False, **common)\n",
    "\n",
    "\n",
    "# Load the data\n",
    "train_raw = read_tsv('train/in.tsv', names=['content', 'id'], usecols=['content'])\n",
    "y_train = read_tsv('train/expected.tsv', names=['label'])\n",
    "dev_raw = read_tsv('dev-0/in.tsv', names=['content', 'id'], usecols=['content'])\n",
    "test_raw = read_tsv('test-A/in.tsv', names=['content', 'id'], usecols=['content'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case the text; each stage gets its own name so cells can be re-run safely.\n",
    "train_text = train_raw.content.str.lower()\n",
    "dev_text = dev_raw.content.str.lower()\n",
    "test_text = test_raw.content.str.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/tomasz/nltk_data...\n",
      "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tokenizer model required by word_tokenize (must run before tokenizing).\n",
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_tokens = [word_tokenize(content) for content in train_text]\n",
    "dev_tokens = [word_tokenize(content) for content in dev_text]\n",
    "test_tokens = [word_tokenize(content) for content in test_text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2vec = gensim.downloader.load(\"word2vec-google-news-300\")\n",
    "# Dimensionality of the loaded vectors (300 for this model).\n",
    "EMBEDDING_DIM = word2vec.vector_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def document_vector(doc):\n",
    "    \"\"\"Average the word2vec vectors of the in-vocabulary tokens of ``doc``.\n",
    "\n",
    "    Out-of-vocabulary tokens are ignored. A document without any known token\n",
    "    is mapped to the all-zero vector so every document has a representation.\n",
    "    \"\"\"\n",
    "    vectors = [word2vec[w] for w in doc if w in word2vec]\n",
    "    if not vectors:\n",
    "        return np.zeros(EMBEDDING_DIM, dtype=np.float32)\n",
    "    return np.mean(vectors, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One float32 row per document; a single array converts to tensors cheaply.\n",
    "x_train = np.array([document_vector(doc) for doc in train_tokens], dtype=np.float32)\n",
    "x_dev = np.array([document_vector(doc) for doc in dev_tokens], dtype=np.float32)\n",
    "x_test = np.array([document_vector(doc) for doc in test_tokens], dtype=np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NeuralNetwork(torch.nn.Module):\n",
    "    \"\"\"Binary classifier: Linear -> ReLU -> Linear -> sigmoid.\n",
    "\n",
    "    Args:\n",
    "        hidden_size: width of the hidden layer.\n",
    "        input_size: number of input features (defaults to 300).\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, hidden_size, input_size=300):\n",
    "        super().__init__()\n",
    "        self.l1 = torch.nn.Linear(input_size, hidden_size)\n",
    "        self.l2 = torch.nn.Linear(hidden_size, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Return one value in (0, 1) per input row, shape (batch, 1).\"\"\"\n",
    "        x = self.l1(x)\n",
    "        x = torch.relu(x)\n",
    "        x = self.l2(x)\n",
    "        x = torch.sigmoid(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "hidden_size = 600\n",
    "epochs = 5\n",
    "batch_size = 15\n",
    "\n",
    "# Fix the seed so weight initialisation (and thus the results) is reproducible.\n",
    "torch.manual_seed(0)\n",
    "\n",
    "model = NeuralNetwork(hidden_size, input_size=EMBEDDING_DIM)\n",
    "criterion = torch.nn.BCELoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tomasz/.local/lib/python3.8/site-packages/torch/autograd/__init__.py:130: UserWarning: CUDA initialization: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:100.)\n",
      " Variable._execution_engine.run_backward(\n"
     ]
    }
   ],
   "source": [
    "# Bad rows are skipped on load, so make sure inputs and labels still line up.\n",
    "assert len(x_train) == len(y_train), 'features and labels differ in length'\n",
    "\n",
    "# Convert once instead of once per batch.\n",
    "features = torch.from_numpy(x_train)\n",
    "labels = torch.tensor(y_train['label'].astype(np.float32).to_numpy()).reshape(-1, 1)\n",
    "\n",
    "for epoch in range(epochs):\n",
    "    model.train()\n",
    "    for i in range(0, len(features), batch_size):\n",
    "        X = features[i:i + batch_size]\n",
    "        y = labels[i:i + batch_size]\n",
    "\n",
    "        outputs = model(X)\n",
    "        loss = criterion(outputs, y)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        loss.backward()\n",
    "        optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(net, features, batch_size, threshold=0.5):\n",
    "    \"\"\"Return 0/1 predictions of ``net`` for every row of ``features``.\n",
    "\n",
    "    Args:\n",
    "        net: trained model returning one probability per row.\n",
    "        features: 2-D float array, one row per document.\n",
    "        batch_size: number of rows scored per forward pass.\n",
    "        threshold: probabilities above this value are labelled 1.\n",
    "\n",
    "    Returns:\n",
    "        1-D int32 NumPy array with one label per input row.\n",
    "    \"\"\"\n",
    "    net.eval()\n",
    "    predictions = []\n",
    "    with torch.no_grad():\n",
    "        for start in range(0, len(features), batch_size):\n",
    "            batch = torch.as_tensor(features[start:start + batch_size], dtype=torch.float32)\n",
    "            outputs = net(batch)\n",
    "            # Flatten (batch, 1) to plain Python ints so the result is 1-D.\n",
    "            predictions.extend((outputs > threshold).reshape(-1).int().tolist())\n",
    "    return np.asarray(predictions, dtype=np.int32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NeuralNetwork(\n",
       " (l1): Linear(in_features=300, out_features=600, bias=True)\n",
       " (l2): Linear(in_features=600, out_features=1, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each split is scored by the same helper, so test predictions can no longer\n",
    "# be filled with dev predictions by accident.\n",
    "y_dev = predict(model, x_dev, batch_size)\n",
    "y_test = predict(model, x_test, batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exactly one prediction per input row is expected in each output file.\n",
    "assert len(y_dev) == len(x_dev) and len(y_test) == len(x_test)\n",
    "\n",
    "dev_out = pd.DataFrame({'label': y_dev})\n",
    "test_out = pd.DataFrame({'label': y_test})\n",
    "\n",
    "dev_out.to_csv(r'dev-0/out.tsv', sep='\\t', index=False, header=False)\n",
    "test_out.to_csv(r'test-A/out.tsv', sep='\\t', index=False, header=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}