{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.0 64-bit ('tau': conda)",
   "metadata": {
    "interpreter": {
     "hash": "99b9bc2e2925de034137bab8ac26137a7eaafe59960ece65892d3f1bd8bee5d4"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from unidecode import unidecode\n",
    "from string import punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base directory that every data path below is built from\n",
    "filedir = '/home/ubuntu/Pulpit/TAU/petite-difference-challenge2'\n",
    "\n",
    "# Training hyperparameters\n",
    "learningRate = 0.1\n",
    "epochs = 100\n",
    "\n",
    "# Training files: input texts and the expected labels\n",
    "trainin = f'{filedir}/train/intrain5k.tsv'\n",
    "trainex = f'{filedir}/train/extrain5k.tsv'\n",
    "\n",
    "# Inputs to predict on and the files the predictions are written to\n",
    "dev0in = f'{filedir}/dev-0/in.tsv'\n",
    "dev0out = f'{filedir}/dev-0/out.tsv'\n",
    "dev1in = f'{filedir}/dev-1/in.tsv'\n",
    "dev1out = f'{filedir}/dev-1/out.tsv'\n",
    "testAin = f'{filedir}/test-A/in.tsv'\n",
    "testAout = f'{filedir}/test-A/out.tsv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "dane treningowe wczytane\n"
     ]
    }
   ],
   "source": [
    "# Training data: one text per line in `trainin`, one integer label per line in `trainex`.\n",
    "# `with` closes the files; the explicit encoding avoids depending on the platform default.\n",
    "with open(trainin, 'r', encoding='utf-8') as texts_file:\n",
    "    trainin_data = texts_file.readlines()\n",
    "with open(trainex, 'r', encoding='utf-8') as labels_file:\n",
    "    trainex_data = labels_file.readlines()\n",
    "\n",
    "# Fail early with a clear message instead of an IndexError half-way through the loop.\n",
    "if len(trainex_data) < len(trainin_data):\n",
    "    raise ValueError('fewer labels than training texts')\n",
    "\n",
    "train_data = []\n",
    "for text, label in zip(trainin_data, trainex_data):\n",
    "    # Lowercase and transliterate to ASCII.\n",
    "    inline = unidecode(text.lower())\n",
    "\n",
    "    # str.replace returns a new string, so the result must be assigned back;\n",
    "    # without the assignment the punctuation is never actually removed.\n",
    "    for p in punctuation:\n",
    "        inline = inline.replace(p, ' ')\n",
    "\n",
    "    # Keep only purely alphabetic tokens.\n",
    "    inline = [w for w in inline.split() if w.isalpha()]\n",
    "\n",
    "    train_data.append((inline, int(label)))\n",
    "\n",
    "# Vocabulary: every distinct training token gets the next free index.\n",
    "word_ix = {}\n",
    "for sent, _ in train_data:\n",
    "    for word in sent:\n",
    "        if word not in word_ix:\n",
    "            word_ix[word] = len(word_ix)\n",
    "\n",
    "print('dane treningowe wczytane')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "model regresji gotowy\n"
     ]
    }
   ],
   "source": [
    "class LogisticRegression(torch.nn.Module):\n",
    "    \"\"\"Logistic regression: one linear layer followed by log-softmax.\n",
    "\n",
    "    The output is log-probabilities, which is the input torch.nn.NLLLoss expects.\n",
    "\n",
    "    Args:\n",
    "        input_size: number of input features. When omitted, falls back to\n",
    "            len(word_ix), i.e. the module-level vocabulary, which must then\n",
    "            already be defined.\n",
    "        num_classes: number of output classes (default 2).\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, input_size=None, num_classes=2):\n",
    "        super().__init__()\n",
    "        if input_size is None:\n",
    "            # Backward-compatible default: size the layer from the global vocabulary.\n",
    "            input_size = len(word_ix)\n",
    "        self.linear = torch.nn.Linear(input_size, num_classes)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Return log-probabilities for a 2-D input of shape (batch, input_size).\"\"\"\n",
    "        # dim=1 normalises over the class axis of the (batch, num_classes) output.\n",
    "        return torch.nn.functional.log_softmax(self.linear(x), dim=1)\n",
    "\n",
    "# Fix the RNG seed so the random weight initialisation (and therefore the\n",
    "# trained model) is the same on every fresh run of the notebook.\n",
    "torch.manual_seed(0)\n",
    "\n",
    "model = LogisticRegression()\n",
    "device = torch.device('cpu')\n",
    "model.to(device)\n",
    "# NLLLoss expects log-probabilities, which the model's log_softmax output provides.\n",
    "criterion = torch.nn.NLLLoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)\n",
    "\n",
    "print('model regresji gotowy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_vector(s, wi):\n",
    "    \"\"\"Turn a token sequence into a bag-of-words count vector.\n",
    "\n",
    "    Args:\n",
    "        s: iterable of tokens.\n",
    "        wi: mapping from token to its index in the vector.\n",
    "\n",
    "    Returns:\n",
    "        A float tensor of shape (1, len(wi)) whose entry at wi[token] is the\n",
    "        number of times that token occurs in s. Tokens missing from wi are ignored.\n",
    "    \"\"\"\n",
    "    counts = torch.zeros(len(wi))\n",
    "    for token in s:\n",
    "        if token not in wi:\n",
    "            # Unknown token: it has no slot in the vector.\n",
    "            continue\n",
    "        slot = wi[token]\n",
    "        counts[slot] += 1\n",
    "    # Add a leading batch dimension of size 1.\n",
    "    return counts.view(1, -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "trening zakonczony\n"
     ]
    }
   ],
   "source": [
    "# Training: plain SGD, one example per step, in file order, for `epochs` passes.\n",
    "\n",
    "# Validate the labels once up front instead of failing somewhere in the middle of training.\n",
    "unexpected_labels = {label for _, label in train_data} - {0, 1}\n",
    "if unexpected_labels:\n",
    "    raise ValueError(f'labels must be 0 or 1, got: {sorted(unexpected_labels)}')\n",
    "\n",
    "for epoch in range(epochs):\n",
    "    for inp, label in train_data:\n",
    "        # Gradients accumulate across backward() calls, so clear them before each step.\n",
    "        model.zero_grad()\n",
    "\n",
    "        inputs = create_vector(inp, word_ix)\n",
    "        # NLLLoss takes the target class index as a LongTensor of shape (batch,).\n",
    "        labels = torch.LongTensor([label])\n",
    "\n",
    "        outputs = model(inputs)\n",
    "        loss = criterion(outputs, labels)\n",
    "\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "print('trening zakonczony')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "dane dev0 wczytane\n",
      "dane dev1 wczytane\n",
      "dane testA wczytane\n"
     ]
    }
   ],
   "source": [
    "def load_tokenized_lines(path):\n",
    "    \"\"\"Read a text file and return (raw_lines, tokenized_lines).\n",
    "\n",
    "    Each line is lowercased, transliterated to ASCII with unidecode, has its\n",
    "    punctuation replaced by spaces, is split on whitespace and reduced to\n",
    "    purely alphabetic tokens. Keep this in sync with the preprocessing applied\n",
    "    to the training data.\n",
    "\n",
    "    Args:\n",
    "        path: path of the text file, one document per line.\n",
    "\n",
    "    Returns:\n",
    "        A tuple of the raw lines as read and a list holding one token list per line.\n",
    "    \"\"\"\n",
    "    # `with` closes the file; the explicit encoding avoids the platform default.\n",
    "    with open(path, 'r', encoding='utf-8') as handle:\n",
    "        raw_lines = handle.readlines()\n",
    "\n",
    "    tokenized = []\n",
    "    for raw in raw_lines:\n",
    "        text = unidecode(raw.lower())\n",
    "        # str.replace returns a new string; assign it back so the punctuation\n",
    "        # is actually removed.\n",
    "        for p in punctuation:\n",
    "            text = text.replace(p, ' ')\n",
    "        tokenized.append([w for w in text.split() if w.isalpha()])\n",
    "    return raw_lines, tokenized\n",
    "\n",
    "\n",
    "# Data to predict on\n",
    "dev0in_data, dev0_data = load_tokenized_lines(dev0in)\n",
    "print('dane dev0 wczytane')\n",
    "\n",
    "dev1in_data, dev1_data = load_tokenized_lines(dev1in)\n",
    "print('dane dev1 wczytane')\n",
    "\n",
    "testAin_data, testA_data = load_tokenized_lines(testAin)\n",
    "print('dane testA wczytane')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dev-0 predictions: one line per input, containing 0 or 1\n",
    "\n",
    "# `with` guarantees the output file is flushed and closed even if prediction\n",
    "# fails part-way; no_grad skips gradient tracking during inference.\n",
    "with open(dev0out, 'w') as outfile, torch.no_grad():\n",
    "    for line in dev0_data:\n",
    "        v = create_vector(line, word_ix)\n",
    "        prob = model(v)\n",
    "        # Pick the class with the larger score; a tie goes to class 1.\n",
    "        if prob[0][0] > prob[0][1]:\n",
    "            outfile.write('0\\n')\n",
    "        else:\n",
    "            outfile.write('1\\n')\n",
    "\n",
    "print('plik wyjściowy dla dev0 został utworzony')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dev-1 predictions: one line per input, containing 0 or 1\n",
    "\n",
    "# `with` guarantees the output file is flushed and closed even if prediction\n",
    "# fails part-way; no_grad skips gradient tracking during inference.\n",
    "with open(dev1out, 'w') as outfile, torch.no_grad():\n",
    "    for line in dev1_data:\n",
    "        v = create_vector(line, word_ix)\n",
    "        prob = model(v)\n",
    "        # Pick the class with the larger score; a tie goes to class 1.\n",
    "        if prob[0][0] > prob[0][1]:\n",
    "            outfile.write('0\\n')\n",
    "        else:\n",
    "            outfile.write('1\\n')\n",
    "\n",
    "print('plik wyjściowy dla dev1 został utworzony')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "plik wyjściowy dla testA został utworzony\n"
     ]
    }
   ],
   "source": [
    "# test-A predictions: one line per input, containing 0 or 1\n",
    "\n",
    "# `with` guarantees the output file is flushed and closed even if prediction\n",
    "# fails part-way; no_grad skips gradient tracking during inference.\n",
    "with open(testAout, 'w') as outfile, torch.no_grad():\n",
    "    for line in testA_data:\n",
    "        v = create_vector(line, word_ix)\n",
    "        prob = model(v)\n",
    "        # Pick the class with the larger score; a tie goes to class 1.\n",
    "        if prob[0][0] > prob[0][1]:\n",
    "            outfile.write('0\\n')\n",
    "        else:\n",
    "            outfile.write('1\\n')\n",
    "\n",
    "print('plik wyjściowy dla testA został utworzony')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}