try 1500 100 900

2021-01-27 00:01:14 +01:00 · 2021-01-27 00:01:14 +01:00 · 6eb75c8749
commit 6eb75c8749
parent a26f56402c
7 changed files with 438881 additions and 0 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dev-1/out.tsv
+++ b/dev-1/out.tsv
--- a/BIN
+++ b/BIN
--- a/logistic_regression.ipynb
+++ b/logistic_regression.ipynb
@ -0,0 +1,343 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0-final"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.8.0 64-bit ('tau': conda)",
+   "metadata": {
+    "interpreter": {
+     "hash": "99b9bc2e2925de034137bab8ac26137a7eaafe59960ece65892d3f1bd8bee5d4"
+    }
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import torch\n",
+    "from unidecode import unidecode\n",
+    "from string import punctuation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Root directory of the challenge data.\n",
+    "filedir = '/home/ubuntu/Pulpit/TAU/petite-difference-challenge2'\n",
+    "\n",
+    "# Training hyperparameters: SGD learning rate and number of passes over the training set.\n",
+    "learningRate = 0.1\n",
+    "epochs = 100\n",
+    "\n",
+    "# Training files: input texts and their expected labels.\n",
+    "trainin = f'{filedir}/train/intrain5k.tsv'\n",
+    "trainex = f'{filedir}/train/extrain5k.tsv'\n",
+    "\n",
+    "# Prediction inputs (*in) and the files the predictions are written to (*out).\n",
+    "dev0in = f'{filedir}/dev-0/in.tsv'\n",
+    "dev0out = f'{filedir}/dev-0/out.tsv'\n",
+    "dev1in = f'{filedir}/dev-1/in.tsv'\n",
+    "dev1out = f'{filedir}/dev-1/out.tsv'\n",
+    "testAin = f'{filedir}/test-A/in.tsv'\n",
+    "testAout = f'{filedir}/test-A/out.tsv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "dane treningowe wczytane\n"
+     ]
+    }
+   ],
+   "source": [
+    "#training data\n",
+    "# Translation table that maps every character of string.punctuation to a space.\n",
+    "punct_to_space = str.maketrans(punctuation, ' ' * len(punctuation))\n",
+    "\n",
+    "# Context managers close both files even if reading fails.\n",
+    "with open(trainin, 'r') as train_in_file, open(trainex, 'r') as train_ex_file:\n",
+    "    trainin_data = train_in_file.readlines()\n",
+    "    trainex_data = train_ex_file.readlines()\n",
+    "\n",
+    "# Every input line needs a label; fail early with a clear message otherwise.\n",
+    "if len(trainex_data) < len(trainin_data):\n",
+    "    raise ValueError('fewer labels than training inputs')\n",
+    "\n",
+    "train_data = []\n",
+    "for text, label in zip(trainin_data, trainex_data):\n",
+    "    # Strings are immutable: str.replace returns a new string, so calling it\n",
+    "    # without using the result leaves the punctuation in place. translate()\n",
+    "    # applies all substitutions in one pass and its result is kept.\n",
+    "    cleaned = unidecode(text.lower()).translate(punct_to_space)\n",
+    "\n",
+    "    #keep only purely alphabetic tokens\n",
+    "    tokens = [w for w in cleaned.split() if w.isalpha()]\n",
+    "\n",
+    "    train_data.append((tokens, int(label)))\n",
+    "\n",
+    "# Vocabulary: each distinct training token gets the next free index.\n",
+    "# Note: prediction inputs should be tokenised the same way as the training text.\n",
+    "word_ix = {}\n",
+    "for sent, _ in train_data:\n",
+    "    for word in sent:\n",
+    "        word_ix.setdefault(word, len(word_ix))\n",
+    "\n",
+    "print(\"dane treningowe wczytane\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "model regresji gotowy\n"
+     ]
+    }
+   ],
+   "source": [
+    "class LogisticRegression(torch.nn.Module):\n",
+    "    \"\"\"Single linear layer followed by log-softmax over two outputs.\n",
+    "\n",
+    "    The input width is read from the module-level `word_ix` vocabulary\n",
+    "    when the model is constructed.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self.linear = torch.nn.Linear(len(word_ix), 2)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        \"\"\"Return log-softmax of the linear layer's output, taken along dim 1.\"\"\"\n",
+    "        logits = self.linear(x)\n",
+    "        return torch.log_softmax(logits, dim=1)\n",
+    "\n",
+    "# The model is placed on the CPU device.\n",
+    "device = torch.device('cpu')\n",
+    "model = LogisticRegression().to(device)\n",
+    "# NLLLoss is applied to the log-softmax values returned by forward().\n",
+    "criterion = torch.nn.NLLLoss()\n",
+    "optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)\n",
+    "\n",
+    "print('model regresji gotowy')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_vector(s, wi):\n",
+    "    \"\"\"Build a (1, len(wi)) tensor counting occurrences of known words.\n",
+    "\n",
+    "    s  -- iterable of tokens\n",
+    "    wi -- mapping from token to column index; tokens missing from it are ignored\n",
+    "    \"\"\"\n",
+    "    counts = torch.zeros(len(wi))\n",
+    "    for column in (wi[token] for token in s if token in wi):\n",
+    "        counts[column] += 1\n",
+    "    # Add a leading dimension of size 1 so the result is a single row.\n",
+    "    return counts.unsqueeze(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "trening zakonczony\n"
+     ]
+    }
+   ],
+   "source": [
+    "#training\n",
+    "# Only labels 0 and 1 are accepted; any other value raises KeyError\n",
+    "# instead of being fed to the loss. Built once rather than per sample.\n",
+    "label_to_class = {0: 0, 1: 1}\n",
+    "\n",
+    "for epoch in range(epochs):\n",
+    "    # One optimizer step per training example.\n",
+    "    for inp, label in train_data:\n",
+    "        # Clear gradients left over from the previous example.\n",
+    "        model.zero_grad()\n",
+    "\n",
+    "        inputs = create_vector(inp, word_ix)\n",
+    "        labels = torch.LongTensor([label_to_class[label]])\n",
+    "\n",
+    "        outputs = model(inputs)\n",
+    "        loss = criterion(outputs, labels)\n",
+    "\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "\n",
+    "print('trening zakonczony')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "dane dev0 wczytane\n",
+      "dane dev1 wczytane\n",
+      "dane testA wczytane\n"
+     ]
+    }
+   ],
+   "source": [
+    "#prediction data\n",
+    "def _tokenize_lines(lines):\n",
+    "    \"\"\"Turn raw text lines into lists of lower-case alphabetic tokens.\n",
+    "\n",
+    "    Each line is lower-cased, passed through unidecode, has every character\n",
+    "    of string.punctuation replaced by a space, and is split on whitespace;\n",
+    "    tokens that are not purely alphabetic are dropped.\n",
+    "\n",
+    "    Returns one list of tokens per input line, in the same order.\n",
+    "    \"\"\"\n",
+    "    # str.replace returns a new string, so calling it without using the\n",
+    "    # result changes nothing; translate() applies every substitution in one\n",
+    "    # pass and its result is kept.\n",
+    "    punct_to_space = str.maketrans(punctuation, ' ' * len(punctuation))\n",
+    "    tokenized = []\n",
+    "    for raw in lines:\n",
+    "        cleaned = unidecode(raw.lower()).translate(punct_to_space)\n",
+    "        tokenized.append([w for w in cleaned.split() if w.isalpha()])\n",
+    "    return tokenized\n",
+    "\n",
+    "# Context managers close each input file as soon as it has been read.\n",
+    "with open(dev0in, 'r') as dev0_file:\n",
+    "    dev0in_data = dev0_file.readlines()\n",
+    "with open(dev1in, 'r') as dev1_file:\n",
+    "    dev1in_data = dev1_file.readlines()\n",
+    "with open(testAin, 'r') as testA_file:\n",
+    "    testAin_data = testA_file.readlines()\n",
+    "\n",
+    "dev0_data = _tokenize_lines(dev0in_data)\n",
+    "print(\"dane dev0 wczytane\")\n",
+    "\n",
+    "dev1_data = _tokenize_lines(dev1in_data)\n",
+    "print(\"dane dev1 wczytane\")\n",
+    "\n",
+    "testA_data = _tokenize_lines(testAin_data)\n",
+    "print(\"dane testA wczytane\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dev 0 predict\n",
+    "\n",
+    "# The context manager flushes and closes the output file even if a\n",
+    "# prediction raises part-way through; no_grad() disables gradient tracking.\n",
+    "with open(dev0out, 'w') as outfile, torch.no_grad():\n",
+    "    for line in dev0_data:\n",
+    "        v = create_vector(line, word_ix)\n",
+    "        prob = model(v)\n",
+    "        # Write 0 only when the first score is strictly larger; ties go to 1.\n",
+    "        outfile.write(\"0\\n\" if prob[0][0] > prob[0][1] else \"1\\n\")\n",
+    "\n",
+    "print('plik wyjściowy dla dev0 został utworzony')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#dev 1 predict\n",
+    "\n",
+    "# The context manager flushes and closes the output file even if a\n",
+    "# prediction raises part-way through; no_grad() disables gradient tracking.\n",
+    "with open(dev1out, 'w') as outfile, torch.no_grad():\n",
+    "    for line in dev1_data:\n",
+    "        v = create_vector(line, word_ix)\n",
+    "        prob = model(v)\n",
+    "        # Write 0 only when the first score is strictly larger; ties go to 1.\n",
+    "        outfile.write(\"0\\n\" if prob[0][0] > prob[0][1] else \"1\\n\")\n",
+    "\n",
+    "print('plik wyjściowy dla dev1 został utworzony')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "plik wyjściowy dla testA został utworzony\n"
+     ]
+    }
+   ],
+   "source": [
+    "#test A predict\n",
+    "\n",
+    "# The context manager flushes and closes the output file even if a\n",
+    "# prediction raises part-way through; no_grad() disables gradient tracking.\n",
+    "with open(testAout, 'w') as outfile, torch.no_grad():\n",
+    "    for line in testA_data:\n",
+    "        v = create_vector(line, word_ix)\n",
+    "        prob = model(v)\n",
+    "        # Write 0 only when the first score is strictly larger; ties go to 1.\n",
+    "        outfile.write(\"0\\n\" if prob[0][0] > prob[0][1] else \"1\\n\")\n",
+    "\n",
+    "print('plik wyjściowy dla testA został utworzony')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ]
+}
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/extrain5k.tsv
+++ b/train/extrain5k.tsv
--- a/train/intrain5k.tsv
+++ b/train/intrain5k.tsv