petite-difference-challenge2/logistic_regression.ipynb

343 lines
8.5 KiB
Plaintext
Raw Permalink Normal View History

2021-01-27 00:01:14 +01:00
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.0 64-bit ('tau': conda)",
"metadata": {
"interpreter": {
"hash": "99b9bc2e2925de034137bab8ac26137a7eaafe59960ece65892d3f1bd8bee5d4"
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from string import punctuation\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from unidecode import unidecode"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Root of the challenge checkout. The original location stays the default;\n",
"# set PETITE_DIFFERENCE_DIR to point the notebook at another copy.\n",
"filedir = os.environ.get(\n",
"    'PETITE_DIFFERENCE_DIR',\n",
"    '/home/ubuntu/Pulpit/TAU/petite-difference-challenge2',\n",
")\n",
"\n",
"# training hyperparameters\n",
"learningRate = 0.1\n",
"epochs = 100\n",
"\n",
"# training files: input texts and their expected labels\n",
"trainin = filedir + '/train/intrain5k.tsv'\n",
"trainex = filedir + '/train/extrain5k.tsv'\n",
"\n",
"# evaluation inputs and the prediction files written for them\n",
"dev0in = filedir + '/dev-0/in.tsv'\n",
"dev0out = filedir + '/dev-0/out.tsv'\n",
"dev1in = filedir + '/dev-1/in.tsv'\n",
"dev1out = filedir + '/dev-1/out.tsv'\n",
"testAin = filedir + '/test-A/in.tsv'\n",
"testAout = filedir + '/test-A/out.tsv'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"dane treningowe wczytane\n"
]
}
],
"source": [
"# Load the training data: one text per line of `trainin`, with its label\n",
"# on the same line number of `trainex`.\n",
"with open(trainin, 'r', encoding='utf-8') as infile:\n",
"    trainin_data = infile.readlines()\n",
"with open(trainex, 'r', encoding='utf-8') as exfile:\n",
"    trainex_data = exfile.readlines()\n",
"\n",
"# zip() below would silently truncate, so fail loudly on missing labels.\n",
"if len(trainex_data) < len(trainin_data):\n",
"    raise ValueError('fewer labels than training texts')\n",
"\n",
"# str.replace returns a new string instead of editing in place, so map\n",
"# every ASCII punctuation character to a space with a translation table.\n",
"# NOTE: the cell that loads the dev/test data must tokenise the same way.\n",
"punctuation_to_space = str.maketrans(punctuation, ' ' * len(punctuation))\n",
"\n",
"train_data = []\n",
"for text, label in zip(trainin_data, trainex_data):\n",
"    inline = unidecode(text.lower()).translate(punctuation_to_space)\n",
"\n",
"    # keep only purely alphabetic tokens\n",
"    inline = [w for w in inline.split() if w.isalpha()]\n",
"\n",
"    train_data.append((inline, int(label)))\n",
"\n",
"# vocabulary: word -> column index, numbered in order of first appearance\n",
"word_ix = {}\n",
"for sent, _ in train_data:\n",
"    for word in sent:\n",
"        if word not in word_ix:\n",
"            word_ix[word] = len(word_ix)\n",
"\n",
"print(\"dane treningowe wczytane\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"model regresji gotowy\n"
]
}
],
"source": [
"class LogisticRegression(torch.nn.Module):\n",
"    \"\"\"Bag-of-words classifier: one linear layer followed by log-softmax.\n",
"\n",
"    input_dim: number of input features; when omitted, the size of the\n",
"        notebook-level vocabulary `word_ix` is used.\n",
"    num_classes: number of output classes (2 by default).\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, input_dim=None, num_classes=2):\n",
"        super().__init__()\n",
"        if input_dim is None:\n",
"            input_dim = len(word_ix)\n",
"        self.linear = torch.nn.Linear(input_dim, num_classes)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Return per-class log-probabilities, normalised along dim 1.\"\"\"\n",
"        return torch.nn.functional.log_softmax(self.linear(x), dim=1)\n",
"\n",
"# Fix the seed so the random weight initialisation is the same on every run.\n",
"SEED = 0\n",
"torch.manual_seed(SEED)\n",
"\n",
"model = LogisticRegression(len(word_ix))\n",
"device = torch.device('cpu')\n",
"model.to(device)\n",
"# NLLLoss consumes the log-probabilities that forward() returns.\n",
"criterion = torch.nn.NLLLoss()\n",
"optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)\n",
"\n",
"print('model regresji gotowy')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def create_vector(s, wi):\n",
"    \"\"\"Build the bag-of-words count row for one tokenised sentence.\n",
"\n",
"    s: iterable of tokens.\n",
"    wi: mapping from token to column index.\n",
"    Returns a tensor of shape (1, len(wi)) holding the number of times each\n",
"    known token occurs in `s`; tokens missing from `wi` are ignored.\n",
"    \"\"\"\n",
"    counts = torch.zeros(len(wi))\n",
"    for column in (wi[token] for token in s if token in wi):\n",
"        counts[column] += 1\n",
"    return counts.view(1, -1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"trening zakonczony\n"
]
}
],
"source": [
"# Training: plain SGD, one sample per step, samples visited in list order.\n",
"# Only the labels 0 and 1 are accepted; any other value raises KeyError.\n",
"label_to_class = {0: 0, 1: 1}\n",
"\n",
"for epoch in range(epochs):\n",
"    for inp, label in train_data:\n",
"        # clear gradients left over from the previous step\n",
"        model.zero_grad()\n",
"\n",
"        inputs = create_vector(inp, word_ix)\n",
"        labels = torch.LongTensor([label_to_class[label]])\n",
"\n",
"        outputs = model(inputs)\n",
"        loss = criterion(outputs, labels)\n",
"        loss.backward()\n",
"        optimizer.step()\n",
"\n",
"print('trening zakonczony')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"dane dev0 wczytane\n",
"dane dev1 wczytane\n",
"dane testA wczytane\n"
]
}
],
"source": [
"# Load the texts to classify (dev-0, dev-1, test-A).\n",
"\n",
"# str.replace returns a new string instead of editing in place, so map\n",
"# every ASCII punctuation character to a space with a translation table.\n",
"# NOTE: the training data must be tokenised the same way as these files.\n",
"punctuation_to_space = str.maketrans(punctuation, ' ' * len(punctuation))\n",
"\n",
"def read_tokenized(path):\n",
"    \"\"\"Read `path` and return one list of alphabetic tokens per line.\n",
"\n",
"    Each line is lower-cased, passed through unidecode, has its ASCII\n",
"    punctuation replaced by spaces and is split on whitespace; tokens\n",
"    that are not purely alphabetic are dropped.\n",
"    \"\"\"\n",
"    with open(path, 'r', encoding='utf-8') as handle:\n",
"        return [\n",
"            [\n",
"                w\n",
"                for w in unidecode(line.lower()).translate(punctuation_to_space).split()\n",
"                if w.isalpha()\n",
"            ]\n",
"            for line in handle\n",
"        ]\n",
"\n",
"dev0_data = read_tokenized(dev0in)\n",
"print(\"dane dev0 wczytane\")\n",
"\n",
"dev1_data = read_tokenized(dev1in)\n",
"print(\"dane dev1 wczytane\")\n",
"\n",
"testA_data = read_tokenized(testAin)\n",
"print(\"dane testA wczytane\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# dev-0: write one predicted label (0 or 1) per input line.\n",
"# The with-block closes the output file even if prediction fails midway.\n",
"with open(dev0out, 'w') as outfile:\n",
"    with torch.no_grad():\n",
"        for line in dev0_data:\n",
"            log_probs = model(create_vector(line, word_ix))\n",
"            # class 0 only on a strict win; ties fall through to class 1\n",
"            if log_probs[0][0] > log_probs[0][1]:\n",
"                outfile.write(\"0\\n\")\n",
"            else:\n",
"                outfile.write(\"1\\n\")\n",
"\n",
"print('plik wyjściowy dla dev0 został utworzony')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# dev-1: write one predicted label (0 or 1) per input line.\n",
"# The with-block closes the output file even if prediction fails midway.\n",
"with open(dev1out, 'w') as outfile:\n",
"    with torch.no_grad():\n",
"        for line in dev1_data:\n",
"            log_probs = model(create_vector(line, word_ix))\n",
"            # class 0 only on a strict win; ties fall through to class 1\n",
"            if log_probs[0][0] > log_probs[0][1]:\n",
"                outfile.write(\"0\\n\")\n",
"            else:\n",
"                outfile.write(\"1\\n\")\n",
"\n",
"print('plik wyjściowy dla dev1 został utworzony')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"plik wyjściowy dla testA został utworzony\n"
]
}
],
"source": [
"# test-A: write one predicted label (0 or 1) per input line.\n",
"# The with-block closes the output file even if prediction fails midway.\n",
"with open(testAout, 'w') as outfile:\n",
"    with torch.no_grad():\n",
"        for line in testA_data:\n",
"            log_probs = model(create_vector(line, word_ix))\n",
"            # class 0 only on a strict win; ties fall through to class 1\n",
"            if log_probs[0][0] > log_probs[0][1]:\n",
"                outfile.write(\"0\\n\")\n",
"            else:\n",
"                outfile.write(\"1\\n\")\n",
"\n",
"print('plik wyjściowy dla testA został utworzony')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
]
}