init
This commit is contained in:
parent 8cb18767bb
commit 54ddfbc0e2
5452 dev-0/expected.tsv Normal file
File diff suppressed because it is too large
5452 dev-0/in.tsv Normal file
File diff suppressed because it is too large
236 neural.ipynb Normal file
@@ -0,0 +1,236 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
"import gensim\n",
|
||||
"import nltk\n",
|
||||
"import pandas as pd \n",
|
||||
"import numpy as np \n",
|
||||
"import os\n",
|
||||
"import io\n",
|
||||
"import gzip\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"# wget http://publications.it.p.lodz.pl/2016/word_embeddings/pl-embeddings-cbow.txt 900MB\n",
|
||||
"\n",
|
||||
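    "# read a gzipped TSV into a DataFrame; error_bad_lines=False skips malformed rows\n",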
"def read_data_gz(baseUrl):\n",
|
||||
" f = gzip.open(baseUrl,'r')\n",
|
||||
" data_unzip = f.read()\n",
|
||||
" data = pd.read_table(io.StringIO(data_unzip.decode('utf-8')), error_bad_lines=False, header= None) \n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"def preprocess(data):\n",
|
||||
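    "    # tokenize each document, lowercase alphabetic tokens, drop all other tokens\n",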
" data_tokenize = [nltk.word_tokenize(x) for x in data]\n",
|
||||
"\n",
|
||||
" for doc in data_tokenize:\n",
|
||||
" i = 0\n",
|
||||
" while i < len(doc):\n",
|
||||
" if doc[i].isalpha():\n",
|
||||
" doc[i] = doc[i].lower()\n",
|
||||
" else:\n",
|
||||
" del doc[i]\n",
|
||||
" i += 1\n",
|
||||
" return data_tokenize\n",
|
||||
"\n",
|
||||
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||
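    "    # feed-forward binary classifier: 100-d document vector -> 200 hidden units -> 1 sigmoid output\n",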
"\n",
|
||||
" def __init__(self):\n",
|
||||
" super(NeuralNetworkModel, self).__init__()\n",
|
||||
" self.fc1 = torch.nn.Linear(100,200)\n",
|
||||
" self.fc2 = torch.nn.Linear(200,1)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.fc1(x)\n",
|
||||
" x = torch.relu(x)\n",
|
||||
" x = self.fc2(x)\n",
|
||||
" x = torch.sigmoid(x)\n",
|
||||
" return x\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
     ]
    }
   ],
   "source": [
    "data_train = read_data_gz('./train/train.tsv.gz')\n",
    "data_dev = pd.read_table('./dev-0/in.tsv', error_bad_lines=False, header=None)\n",
    "\n",
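    "# column 0 holds the labels, column 1 the documents\n",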
"y_train = data_train[0].values\n",
|
||||
"x_train = data_train[1].values\n",
|
||||
"x_dev = data_dev[0].values\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
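    "# load the pretrained Polish CBOW word embeddings (plain-text word2vec format, 100-d vectors)\n",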
"model = gensim.models.KeyedVectors.load_word2vec_format('pl-embeddings-cbow.txt', binary=False)"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train_tokenize = preprocess(x_train)\n",
    "x_dev_tokenize = preprocess(x_dev)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
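    "# represent each document as the mean of its word vectors; documents with no in-vocabulary words fall back to a zero vector\n",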
"x_train_vectors = [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_train_tokenize]\n",
|
||||
"x_train_vectors = np.array(x_train_vectors)\n",
|
||||
"\n",
|
||||
"x_dev_vectors= [np.mean([model[word] for word in content if word in model] or [np.zeros(100)], axis=0) for content in x_dev_tokenize]\n",
|
||||
"x_dev_vectors = np.array(x_dev_vectors, dtype=np.float32)\n",
|
||||
"x_dev_tensor = torch.tensor(x_dev_vectors.astype(np.float32))\n"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training the model...\n"
     ]
    }
   ],
   "source": [
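    "# train with binary cross-entropy on the sigmoid outputs, plain SGD, mini-batches of 10\n",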
"# -------------------------------------------------------------------------------------------------------------------------------------------\n",
|
||||
"model_nn = NeuralNetworkModel()\n",
|
||||
"criterion = torch.nn.BCELoss()\n",
|
||||
"optimizer = torch.optim.SGD(model_nn.parameters(), lr=0.01)\n",
|
||||
" \n",
|
||||
"batch_size = 10\n",
|
||||
"print('Trenowanie modelu...')\n",
|
||||
" \n",
|
||||
"for epoch in range(6):\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" items_total = 0\n",
|
||||
" model_nn.train()\n",
|
||||
" for i in range(0, y_train.shape[0], batch_size):\n",
|
||||
" X = x_train_vectors[i:i+batch_size]\n",
|
||||
" X = torch.tensor(X.astype(np.float32))\n",
|
||||
" Y = y_train[i:i+batch_size]\n",
|
||||
" Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n",
|
||||
" \n",
|
||||
" Y_predictions = model_nn(X)\n",
|
||||
" acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
|
||||
" items_total += Y.shape[0] \n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(Y_predictions, Y)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() * Y.shape[0]\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
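    "# hard predictions: threshold the sigmoid outputs at 0.5\n",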
"# -------------------------------------------------------------------------------------------------------------------------------------------\n",
|
||||
"ypred = model_nn(x_dev_tensor)\n",
|
||||
"ypred = ypred.cpu().detach().numpy() \n",
|
||||
"ypred = (ypred > 0.5)\n",
|
||||
"ypred = np.asarray(ypred, dtype=np.int32)\n",
|
||||
"\n",
|
||||
"y_exptected = pd.read_table('./dev-0/expected.tsv', header= None)\n",
|
||||
"y_exptected = y_exptected.values"
|
||||
]
|
||||
},
|
||||
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score =  0.973037417461482\n",
      "------------------------------------------------------------\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.97      0.96      0.96      1983\n",
      "           1       0.97      0.98      0.98      3469\n",
      "\n",
      "    accuracy                           0.97      5452\n",
      "   macro avg       0.97      0.97      0.97      5452\n",
      "weighted avg       0.97      0.97      0.97      5452\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "print(\"Score = \", accuracy_score(y_expected, ypred))\n",
    "\n",
    "print('-' * 60)\n",
    "print(classification_report(y_expected, ypred))\n"
   ]
  }
 ],
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
BIN train/train.tsv.gz Normal file
Binary file not shown.