try 1500 100 900
parent a26f56402c
commit 6eb75c8749
137314
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
156606
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
343
logistic_regression.ipynb
Normal file
@@ -0,0 +1,343 @@
{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.0 64-bit ('tau': conda)",
   "metadata": {
    "interpreter": {
     "hash": "99b9bc2e2925de034137bab8ac26137a7eaafe59960ece65892d3f1bd8bee5d4"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from unidecode import unidecode\n",
    "from string import punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "filedir = '/home/ubuntu/Pulpit/TAU/petite-difference-challenge2'\n",
    "\n",
    "# training parameters\n",
    "learningRate = 0.1\n",
    "epochs = 100\n",
    "\n",
    "# training files\n",
    "trainin = filedir + '/train/intrain5k.tsv'\n",
    "trainex = filedir + '/train/extrain5k.tsv'\n",
    "\n",
    "# data files\n",
    "dev0in = filedir + '/dev-0/in.tsv'\n",
    "dev0out = filedir + '/dev-0/out.tsv'\n",
    "dev1in = filedir + '/dev-1/in.tsv'\n",
    "dev1out = filedir + '/dev-1/out.tsv'\n",
    "testAin = filedir + '/test-A/in.tsv'\n",
    "testAout = filedir + '/test-A/out.tsv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "training data loaded\n"
     ]
    }
   ],
   "source": [
    "# training data\n",
    "trainin_data = open(trainin, 'r').readlines()\n",
    "trainex_data = open(trainex, 'r').readlines()\n",
    "\n",
    "train_data = []\n",
    "for i in range(len(trainin_data)):\n",
    "    inline = unidecode(trainin_data[i].lower())\n",
    "\n",
    "    for p in punctuation:\n",
    "        if p in inline:\n",
    "            inline = inline.replace(p, ' ')\n",
    "\n",
    "    # keep only alphabetic tokens\n",
    "    inline = list(filter(lambda w: w.isalpha(), inline.split()))\n",
    "\n",
    "    train_data.append((inline, int(trainex_data[i])))\n",
    "\n",
    "word_ix = {}\n",
    "for sent, _ in train_data:\n",
    "    for word in sent:\n",
    "        if word not in word_ix:\n",
    "            word_ix[word] = len(word_ix)\n",
    "\n",
    "print(\"training data loaded\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "regression model ready\n"
     ]
    }
   ],
   "source": [
    "class LogisticRegression(torch.nn.Module):\n",
    "    def __init__(self):\n",
    "        super(LogisticRegression, self).__init__()\n",
    "        self.linear = torch.nn.Linear(len(word_ix), 2)\n",
    "\n",
    "    def forward(self, x):\n",
    "        return torch.nn.functional.log_softmax(self.linear(x), dim=1)\n",
    "\n",
    "model = LogisticRegression()\n",
    "device = torch.device('cpu')\n",
    "model.to(device)\n",
    "criterion = torch.nn.NLLLoss()\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)\n",
    "\n",
    "print('regression model ready')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_vector(s, wi):\n",
    "    # bag-of-words count vector for sentence s over vocabulary wi\n",
    "    v = torch.zeros(len(wi))\n",
    "    for w in s:\n",
    "        if w in wi:\n",
    "            v[wi[w]] += 1\n",
    "    return v.view(1, -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "training finished\n"
     ]
    }
   ],
   "source": [
    "# training\n",
    "for epoch in range(epochs):\n",
    "    train_len = len(train_data)\n",
    "    for inp, label in train_data:\n",
    "        model.zero_grad()\n",
    "\n",
    "        inputs = create_vector(inp, word_ix)\n",
    "\n",
    "        labels = torch.LongTensor([{0:0, 1:1}[label]])\n",
    "\n",
    "        outputs = model(inputs)\n",
    "\n",
    "        loss = criterion(outputs, labels)\n",
    "\n",
    "        loss.backward()\n",
    "\n",
    "        optimizer.step()\n",
    "\n",
    "print('training finished')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "dev0 data loaded\n",
      "dev1 data loaded\n",
      "testA data loaded\n"
     ]
    }
   ],
   "source": [
    "# data for prediction\n",
    "dev0in_data = open(dev0in, 'r').readlines()\n",
    "dev1in_data = open(dev1in, 'r').readlines()\n",
    "testAin_data = open(testAin, 'r').readlines()\n",
    "\n",
    "dev0_data = []\n",
    "for i in range(len(dev0in_data)):\n",
    "    inline = unidecode(dev0in_data[i].lower())\n",
    "\n",
    "    for p in punctuation:\n",
    "        if p in inline:\n",
    "            inline = inline.replace(p, ' ')\n",
    "\n",
    "    # keep only alphabetic tokens\n",
    "    inline = list(filter(lambda w: w.isalpha(), inline.split()))\n",
    "\n",
    "    dev0_data.append(inline)\n",
    "#dev0in_data.close()\n",
    "\n",
    "print(\"dev0 data loaded\")\n",
    "\n",
    "dev1_data = []\n",
    "for i in range(len(dev1in_data)):\n",
    "    inline = unidecode(dev1in_data[i].lower())\n",
    "\n",
    "    for p in punctuation:\n",
    "        if p in inline:\n",
    "            inline = inline.replace(p, ' ')\n",
    "\n",
    "    # keep only alphabetic tokens\n",
    "    inline = list(filter(lambda w: w.isalpha(), inline.split()))\n",
    "\n",
    "    dev1_data.append(inline)\n",
    "#dev1in_data.close()\n",
    "\n",
    "print(\"dev1 data loaded\")\n",
    "\n",
    "testA_data = []\n",
    "for i in range(len(testAin_data)):\n",
    "    inline = unidecode(testAin_data[i].lower())\n",
    "\n",
    "    for p in punctuation:\n",
    "        if p in inline:\n",
    "            inline = inline.replace(p, ' ')\n",
    "\n",
    "    # keep only alphabetic tokens\n",
    "    inline = list(filter(lambda w: w.isalpha(), inline.split()))\n",
    "\n",
    "    testA_data.append(inline)\n",
    "#testAin_data.close()\n",
    "\n",
    "print(\"testA data loaded\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dev-0 predictions\n",
    "\n",
    "outfile = open(dev0out, 'w')\n",
    "with torch.no_grad():\n",
    "    for line in dev0_data:\n",
    "        v = create_vector(line, word_ix)\n",
    "        prob = model(v)\n",
    "        if prob[0][0] > prob[0][1]:\n",
    "            outfile.write(\"0\\n\")\n",
    "        else:\n",
    "            outfile.write(\"1\\n\")\n",
    "outfile.close()\n",
    "\n",
    "print('output file for dev0 has been created')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dev-1 predictions\n",
    "\n",
    "outfile = open(dev1out, 'w')\n",
    "with torch.no_grad():\n",
    "    for line in dev1_data:\n",
    "        v = create_vector(line, word_ix)\n",
    "        prob = model(v)\n",
    "        if prob[0][0] > prob[0][1]:\n",
    "            outfile.write(\"0\\n\")\n",
    "        else:\n",
    "            outfile.write(\"1\\n\")\n",
    "outfile.close()\n",
    "\n",
    "print('output file for dev1 has been created')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "output file for testA has been created\n"
     ]
    }
   ],
   "source": [
    "# test-A predictions\n",
    "\n",
    "outfile = open(testAout, 'w')\n",
    "with torch.no_grad():\n",
    "    for line in testA_data:\n",
    "        v = create_vector(line, word_ix)\n",
    "        prob = model(v)\n",
    "        if prob[0][0] > prob[0][1]:\n",
    "            outfile.write(\"0\\n\")\n",
    "        else:\n",
    "            outfile.write(\"1\\n\")\n",
    "outfile.close()\n",
    "\n",
    "print('output file for testA has been created')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}
134618
test-A/out.tsv
Normal file
File diff suppressed because it is too large
5000
train/extrain5k.tsv
Normal file
File diff suppressed because it is too large
5000
train/intrain5k.tsv
Normal file
File diff suppressed because it is too large