{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.0 64-bit ('tau': conda)",
   "metadata": {
    "interpreter": {
     "hash": "99b9bc2e2925de034137bab8ac26137a7eaafe59960ece65892d3f1bd8bee5d4"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bag-of-words logistic regression for petite-difference-challenge2\n",
    "\n",
    "Trains a two-class logistic regression on word counts from the training split and writes one predicted label (`0` or `1`) per input line for `dev-0`, `dev-1` and `test-A`.\n",
    "\n",
    "Run the notebook top to bottom on a fresh kernel (Restart & Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from string import punctuation\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from unidecode import unidecode"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset root. Defaults to the original location; set PDC_DATA_DIR to run elsewhere.\n",
    "filedir = os.environ.get('PDC_DATA_DIR', '/home/ubuntu/Pulpit/TAU/petite-difference-challenge2')\n",
    "\n",
    "# Training hyper-parameters\n",
    "learningRate = 0.1\n",
    "epochs = 100\n",
    "seed = 0  # seeds torch so the random weight initialisation is repeatable\n",
    "\n",
    "# Training files: input texts and expected labels\n",
    "trainin = filedir + '/train/intrain5k.tsv'\n",
    "trainex = filedir + '/train/extrain5k.tsv'\n",
    "\n",
    "# Files to predict on (in) and where the predictions are written (out)\n",
    "dev0in = filedir + '/dev-0/in.tsv'\n",
    "dev0out = filedir + '/dev-0/out.tsv'\n",
    "dev1in = filedir + '/dev-1/in.tsv'\n",
    "dev1out = filedir + '/dev-1/out.tsv'\n",
    "testAin = filedir + '/test-A/in.tsv'\n",
    "testAout = filedir + '/test-A/out.tsv'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text preprocessing\n",
    "\n",
    "The same `tokenize` function is used for the training data and for every file we predict on, so the features seen at inference time match the ones the model was trained on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Translation table mapping every character of string.punctuation to a space.\n",
    "_PUNCT_TO_SPACE = str.maketrans(punctuation, ' ' * len(punctuation))\n",
    "\n",
    "\n",
    "def tokenize(line):\n",
    "    \"\"\"Turn one raw text line into a list of purely alphabetic tokens.\n",
    "\n",
    "    The line is lowercased, passed through ``unidecode``, has its punctuation\n",
    "    replaced by spaces and is split on whitespace. Tokens that are not purely\n",
    "    alphabetic (``str.isalpha``) are dropped.\n",
    "\n",
    "    Args:\n",
    "        line: raw text of one example.\n",
    "\n",
    "    Returns:\n",
    "        List of token strings, possibly empty.\n",
    "    \"\"\"\n",
    "    # str is immutable, so the cleaned text has to be taken from the return\n",
    "    # value; calling replace()/translate() without using the result is a no-op.\n",
    "    text = unidecode(line.lower()).translate(_PUNCT_TO_SPACE)\n",
    "    return [token for token in text.split() if token.isalpha()]\n",
    "\n",
    "\n",
    "def read_lines(path):\n",
    "    \"\"\"Return all lines of the UTF-8 text file at ``path``; the file is closed on exit.\"\"\"\n",
    "    with open(path, 'r', encoding='utf-8') as handle:\n",
    "        return handle.readlines()\n",
    "\n",
    "\n",
    "def load_tokenized(path):\n",
    "    \"\"\"Read ``path`` and return one token list (see ``tokenize``) per line.\"\"\"\n",
    "    return [tokenize(line) for line in read_lines(path)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load training data and build the vocabulary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainin_data = read_lines(trainin)\n",
    "trainex_data = read_lines(trainex)\n",
    "\n",
    "# Every input line needs a label; surplus label lines are ignored.\n",
    "if len(trainex_data) < len(trainin_data):\n",
    "    raise ValueError(\n",
    "        f'{trainex} has {len(trainex_data)} labels for {len(trainin_data)} lines in {trainin}'\n",
    "    )\n",
    "\n",
    "# List of (tokens, label) pairs.\n",
    "train_data = []\n",
    "for text, expected in zip(trainin_data, trainex_data):\n",
    "    label = int(expected)\n",
    "    # The model below has exactly two output classes.\n",
    "    if label not in (0, 1):\n",
    "        raise ValueError(f'unexpected label {label} in {trainex}; expected 0 or 1')\n",
    "    train_data.append((tokenize(text), label))\n",
    "\n",
    "# Vocabulary: each distinct training token gets the next free feature index.\n",
    "word_ix = {}\n",
    "for sent, _ in train_data:\n",
    "    for word in sent:\n",
    "        word_ix.setdefault(word, len(word_ix))\n",
    "\n",
    "print('dane treningowe wczytane')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LogisticRegression(torch.nn.Module):\n",
    "    \"\"\"One linear layer over bag-of-words counts followed by log-softmax over two classes.\"\"\"\n",
    "\n",
    "    def __init__(self, vocab_size=None):\n",
    "        \"\"\"Create the layer.\n",
    "\n",
    "        Args:\n",
    "            vocab_size: number of input features; defaults to ``len(word_ix)``.\n",
    "        \"\"\"\n",
    "        super().__init__()\n",
    "        if vocab_size is None:\n",
    "            vocab_size = len(word_ix)\n",
    "        self.linear = torch.nn.Linear(vocab_size, 2)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Return per-class log-probabilities, normalised along dim 1.\"\"\"\n",
    "        return torch.nn.functional.log_softmax(self.linear(x), dim=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed before the model is built: the linear layer is initialised randomly.\n",
    "torch.manual_seed(seed)\n",
    "\n",
    "model = LogisticRegression()\n",
    "device = torch.device('cpu')\n",
    "model.to(device)\n",
    "# NLLLoss expects log-probabilities, which is what forward() returns.\n",
    "criterion = torch.nn.NLLLoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)\n",
    "\n",
    "print('model regresji gotowy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_vector(s, wi):\n",
    "    \"\"\"Build a bag-of-words count vector for one example.\n",
    "\n",
    "    Args:\n",
    "        s: iterable of tokens.\n",
    "        wi: dict mapping token -> feature index.\n",
    "\n",
    "    Returns:\n",
    "        Tensor of shape (1, len(wi)) with the number of occurrences of each\n",
    "        known token; tokens missing from ``wi`` are ignored.\n",
    "    \"\"\"\n",
    "    v = torch.zeros(len(wi))\n",
    "    for w in s:\n",
    "        ix = wi.get(w)\n",
    "        if ix is not None:\n",
    "            v[ix] += 1\n",
    "    return v.view(1, -1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training\n",
    "\n",
    "Plain SGD with one example per step, for `epochs` passes over the training data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for epoch in range(epochs):\n",
    "    for inp, label in train_data:\n",
    "        optimizer.zero_grad()\n",
    "\n",
    "        inputs = create_vector(inp, word_ix)\n",
    "        # NLLLoss takes integer class indices.\n",
    "        labels = torch.tensor([label], dtype=torch.long)\n",
    "\n",
    "        outputs = model(inputs)\n",
    "        loss = criterion(outputs, labels)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "print('trening zakonczony')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev0_data = load_tokenized(dev0in)\n",
    "print('dane dev0 wczytane')\n",
    "\n",
    "dev1_data = load_tokenized(dev1in)\n",
    "print('dane dev1 wczytane')\n",
    "\n",
    "testA_data = load_tokenized(testAin)\n",
    "print('dane testA wczytane')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_predictions(data, out_path):\n",
    "    \"\"\"Predict a label for every example and write one label per line.\n",
    "\n",
    "    Args:\n",
    "        data: list of token lists, one per example.\n",
    "        out_path: file to (over)write with \"0\" or \"1\" on each line.\n",
    "    \"\"\"\n",
    "    # no_grad: inference only, so no autograd bookkeeping is needed.\n",
    "    with open(out_path, 'w') as outfile, torch.no_grad():\n",
    "        for line in data:\n",
    "            log_probs = model(create_vector(line, word_ix))\n",
    "            # Class 0 only when it is strictly more likely; ties go to class 1.\n",
    "            if log_probs[0][0] > log_probs[0][1]:\n",
    "                outfile.write('0\\n')\n",
    "            else:\n",
    "                outfile.write('1\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prediction_jobs = (\n",
    "    ('dev0', dev0_data, dev0out),\n",
    "    ('dev1', dev1_data, dev1out),\n",
    "    ('testA', testA_data, testAout),\n",
    ")\n",
    "\n",
    "for name, data, out_path in prediction_jobs:\n",
    "    write_predictions(data, out_path)\n",
    "    print(f'plik wyjściowy dla {name} został utworzony')"
   ]
  }
 ]
}