done

2021-05-26 01:35:32 +02:00 · 2021-05-26 01:35:32 +02:00 · 54b82a7411
commit 54b82a7411
parent 9cb2fb2612
6 changed files with 120260 additions and 0 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dev-0/out2.tsv
+++ b/dev-0/out2.tsv
--- a/regresja-logistyczna.ipynb
+++ b/regresja-logistyczna.ipynb
@ -0,0 +1,320 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d2b899fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import os\n",
+    "import gensim\n",
+    "from gensim.models import Word2Vec\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.gridspec as gridspec\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "import torch\n",
+    "import csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "39a1f19a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "98129\n",
+      "98129\n",
+      "5452\n",
+      "5452\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "b'Skipping line 1983: expected 1 fields, saw 2\\nSkipping line 5199: expected 1 fields, saw 2\\n'\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the raw TSV splits. Every file is read as headerless, tab-separated text with\n",
+    "# quoting disabled; rows with an unexpected number of fields are skipped.\n",
+    "tsv_options = dict(sep='\\t', header=None, quoting=csv.QUOTE_NONE, error_bad_lines=False)\n",
+    "train = pd.read_table('train/train.tsv', **tsv_options)\n",
+    "x_dev = pd.read_table('dev-0/in.tsv', **tsv_options)\n",
+    "y_dev = pd.read_table('dev-0/expected.tsv', **tsv_options)\n",
+    "x_test = pd.read_table('test-A/in.tsv', **tsv_options)\n",
+    "\n",
+    "# Split into inputs and labels: column 0 of train.tsv becomes y_train, column 1 becomes x_train.\n",
+    "y_train = train[0].values\n",
+    "x_train = train[1].values\n",
+    "x_dev, x_test = x_dev[0].values, x_test[0].values\n",
+    "\n",
+    "# Report how many rows each loaded split contains.\n",
+    "print(len(x_train), len(y_train), len(x_dev), len(y_dev), sep='\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "c637937e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk\n",
+    "#nltk.download('punkt')\n",
+    "\n",
+    "\n",
+    "# tokenization\n",
+    "def tokenize_data(data):\n",
+    "    \"\"\"Tokenize each text and keep only its alphabetic tokens, lower-cased.\n",
+    "\n",
+    "    Args:\n",
+    "        data: iterable of strings, one document per item.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list with one list of lower-cased alphabetic tokens per document.\n",
+    "    \"\"\"\n",
+    "    # Build a filtered copy instead of deleting from the list while indexing into it:\n",
+    "    # deleting doc[i] and then advancing i skips the token that slides into slot i,\n",
+    "    # which let non-alphabetic and non-lower-cased tokens through.\n",
+    "    return [\n",
+    "        [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]\n",
+    "        for text in data\n",
+    "    ]\n",
+    "\n",
+    "# Run the same tokenizer over the train, dev and test texts (in that order).\n",
+    "x_train_tokenized, x_dev_tokenized, x_test_tokenized = (\n",
+    "    tokenize_data(split) for split in (x_train, x_dev, x_test)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "890b3cca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('róż', 0.8955456018447876), ('kwiatek', 0.8504886031150818), ('fiołek', 0.831953763961792), ('chryzantema', 0.8315931558609009), ('bukiet', 0.8306410908699036), ('wiśnia', 0.8005671501159668), ('żonkil', 0.8005172610282898), ('liść', 0.7998315095901489), ('lilia', 0.7931062579154968), ('peonia', 0.7918344140052795)]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/Cellar/jupyterlab/3.0.14/libexec/lib/python3.9/site-packages/gensim/models/keyedvectors.py:772: RuntimeWarning: invalid value encountered in true_divide\n",
+      "  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from gensim.models import KeyedVectors\n",
+    "\n",
+    "word2vec_model = KeyedVectors.load(\"word2vec.bin\")\n",
+    "\n",
+    "# sprawdzenie czy dziala\n",
+    "print(word2vec_model.similar_by_word(\"kwiat\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "6bd92640",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def document_vector(tokens):\n",
+    "    \"\"\"Average the word2vec embeddings of the in-vocabulary tokens of one document.\n",
+    "\n",
+    "    Args:\n",
+    "        tokens: list of token strings for a single document.\n",
+    "\n",
+    "    Returns:\n",
+    "        A 1-D vector of length word2vec_model.vector_size; all zeros when no token\n",
+    "        of the document is in the vocabulary.\n",
+    "    \"\"\"\n",
+    "    known = [word2vec_model[word] for word in tokens if word in word2vec_model]\n",
+    "    if not known:\n",
+    "        return np.zeros(word2vec_model.vector_size, dtype=np.float32)\n",
+    "    return np.mean(known, axis=0)\n",
+    "\n",
+    "\n",
+    "def vectorize(tokenized_docs):\n",
+    "    \"\"\"Stack the document vectors of all documents into one float32 matrix.\"\"\"\n",
+    "    return np.array([document_vector(doc) for doc in tokenized_docs], dtype=np.float32)\n",
+    "\n",
+    "\n",
+    "# Embed the *tokenized* documents. Iterating over the raw strings would walk them\n",
+    "# character by character and average single-letter embeddings instead of word embeddings.\n",
+    "# The raw x_train / x_dev / x_test arrays are left untouched so this cell can be re-run.\n",
+    "x_train_vec = vectorize(x_train_tokenized)\n",
+    "x_train_tensor = torch.tensor(x_train_vec)\n",
+    "\n",
+    "x_dev_vec = vectorize(x_dev_tokenized)\n",
+    "\n",
+    "x_test_vec = vectorize(x_test_tokenized)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "df544bfb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class NeuralNetworkModel(torch.nn.Module):\n",
+    "    \"\"\"Feed-forward binary classifier: Linear(100, 200) -> ReLU -> Linear(200, 1) -> sigmoid.\"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.fc1 = torch.nn.Linear(in_features=100, out_features=200)\n",
+    "        self.fc2 = torch.nn.Linear(in_features=200, out_features=1)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        \"\"\"Return sigmoid(fc2(relu(fc1(x)))) for the input batch x.\"\"\"\n",
+    "        hidden = torch.relu(self.fc1(x))\n",
+    "        return torch.sigmoid(self.fc2(hidden))\n",
+    "\n",
+    "\n",
+    "nn_model = NeuralNetworkModel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "884d80ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Binary cross-entropy on the sigmoid outputs of nn_model, optimised with plain SGD.\n",
+    "# The optimizer has to be built from nn_model's own parameters: the name `model` is\n",
+    "# never defined in this notebook, so using it either fails on a fresh kernel or\n",
+    "# silently updates some other leftover object while nn_model stays untrained.\n",
+    "criterion = torch.nn.BCELoss()\n",
+    "optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.01)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "eacc269d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "3"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "5"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "batch_size = 12\n",
+    "\n",
+    "# Convert the training data to tensors once instead of once per mini-batch.\n",
+    "X_all = torch.tensor(x_train_vec.astype(np.float32))\n",
+    "Y_all = torch.tensor(y_train.astype(np.float32)).reshape(-1, 1)\n",
+    "\n",
+    "for epoch in range(6):\n",
+    "    # Per-epoch running totals: summed loss, number of correct predictions, samples seen.\n",
+    "    loss_score = 0\n",
+    "    acc_score = 0\n",
+    "    items_total = 0\n",
+    "    nn_model.train()\n",
+    "\n",
+    "    for i in range(0, Y_all.shape[0], batch_size):\n",
+    "        X = X_all[i:i + batch_size]\n",
+    "        Y = Y_all[i:i + batch_size]\n",
+    "        Y_predictions = nn_model(X)\n",
+    "        # A prediction counts as correct when thresholding at 0.5 reproduces the label.\n",
+    "        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n",
+    "        items_total += Y.shape[0]\n",
+    "\n",
+    "        optimizer.zero_grad()\n",
+    "        loss = criterion(Y_predictions, Y)\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "\n",
+    "        # criterion returns the batch mean, so weight it by the batch size.\n",
+    "        loss_score += loss.item() * Y.shape[0]\n",
+    "\n",
+    "    # Report the metrics that were accumulated above (guarding against an empty training set).\n",
+    "    mean_loss = loss_score / max(items_total, 1)\n",
+    "    accuracy = acc_score / max(items_total, 1)\n",
+    "    print(f'epoch {epoch}: loss={mean_loss:.4f}, accuracy={accuracy:.4f}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "id": "daa85677",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_labels(features):\n",
+    "    \"\"\"Predict 0/1 labels for a matrix of document vectors with nn_model.\n",
+    "\n",
+    "    Args:\n",
+    "        features: numpy array with one document vector per row.\n",
+    "\n",
+    "    Returns:\n",
+    "        An int32 numpy array of shape (n_rows, 1): 1 where the model output exceeds 0.5, else 0.\n",
+    "    \"\"\"\n",
+    "    nn_model.eval()\n",
+    "    # Inference only: no gradients are needed.\n",
+    "    with torch.no_grad():\n",
+    "        probabilities = nn_model(torch.tensor(features.astype(np.float32)))\n",
+    "    return (probabilities.cpu().numpy() > 0.5).astype(np.int32)\n",
+    "\n",
+    "\n",
+    "# predictions for the dev split\n",
+    "y_pred_dev = predict_labels(x_dev_vec)\n",
+    "y_pred_dev.tofile('dev-0/out2.tsv', sep='\\n')\n",
+    "\n",
+    "# predictions for the test split - computed from the test vectors, not the dev vectors\n",
+    "y_pred_test = predict_labels(x_test_vec)\n",
+    "y_pred_test.tofile('test-A/out2.tsv', sep='\\n')\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/test-A/out2.tsv
+++ b/test-A/out2.tsv
--- a/train/train.tsv
+++ b/train/train.tsv