# (export residue) 8.5 KiB
# (export residue) 8.5 KiB
import numpy as np
import pandas as pd
import torch
from unidecode import unidecode
from string import punctuation
# Root directory of the challenge data; every path below is built from it.
filedir = '/home/ubuntu/Pulpit/TAU/petite-difference-challenge2'

# Optimizer step size and number of passes over the training set.
learningRate = 0.1
epochs = 100

# Training files: input texts and the expected labels.
trainin = '/'.join((filedir, 'train', 'intrain5k.tsv'))
trainex = '/'.join((filedir, 'train', 'extrain5k.tsv'))

# Evaluation sets: the input read for each one and the output file
# the predictions are written to.
dev0in = '/'.join((filedir, 'dev-0', 'in.tsv'))
dev0out = '/'.join((filedir, 'dev-0', 'out.tsv'))
dev1in = '/'.join((filedir, 'dev-1', 'in.tsv'))
dev1out = '/'.join((filedir, 'dev-1', 'out.tsv'))
testAin = '/'.join((filedir, 'test-A', 'in.tsv'))
testAout = '/'.join((filedir, 'test-A', 'out.tsv'))
# Training data: one text per line in `trainin`, one integer label per line
# in `trainex`. The files are closed as soon as their lines are read.
with open(trainin, 'r') as texts_file:
    trainin_data = texts_file.readlines()
with open(trainex, 'r') as labels_file:
    trainex_data = labels_file.readlines()

# Every text needs a label; report a shortfall up front instead of failing
# with an IndexError part-way through the loop.
if len(trainex_data) < len(trainin_data):
    raise ValueError(
        'label file has fewer lines ({}) than the text file ({})'.format(
            len(trainex_data), len(trainin_data)
        )
    )

# Translation table that turns every character of string.punctuation into a
# space in one pass.
punct_to_space = str.maketrans(punctuation, ' ' * len(punctuation))

train_data = []
for text, label in zip(trainin_data, trainex_data):
    # str methods return a new string, so the cleaned text must be kept:
    # calling replace() and discarding the result left punctuation in place,
    # and tokens such as "word," were then dropped by the isalpha() filter.
    inline = unidecode(text.lower()).translate(punct_to_space)
    # keep only purely alphabetic tokens
    inline = [w for w in inline.split() if w.isalpha()]
    train_data.append((inline, int(label)))

# Vocabulary: each distinct training token gets the next free index, in
# order of first appearance.
word_ix = {}
for sent, _ in train_data:
    for word in sent:
        if word not in word_ix:
            word_ix[word] = len(word_ix)
print("dane treningowe wczytane")
# Output: dane treningowe wczytane
class LogisticRegression(torch.nn.Module):
    """Single linear layer followed by a log-softmax over the classes.

    Args:
        vocab_size: Number of input features. When omitted, the size of the
            module-level ``word_ix`` vocabulary is used, which is what the
            no-argument constructor has always done.
        num_classes: Number of output classes (default 2).
    """

    def __init__(self, vocab_size=None, num_classes=2):
        super().__init__()
        if vocab_size is None:
            # Backward-compatible default: size the layer from the global
            # vocabulary built from the training data.
            vocab_size = len(word_ix)
        self.linear = torch.nn.Linear(vocab_size, num_classes)

    def forward(self, x):
        """Return log-probabilities for ``x``, normalized along dim 1.

        ``x`` must have at least two dimensions with the feature axis last,
        e.g. the ``(1, len(word_ix))`` tensor built by ``create_vector``.
        """
        return torch.nn.functional.log_softmax(self.linear(x), dim=1)
# Build the model on the CPU (Module.to returns the module itself), then
# pair it with negative log-likelihood loss and plain SGD.
device = torch.device('cpu')
model = LogisticRegression().to(device)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)
print('model regresji gotowy')
# Output: model regresji gotowy
def create_vector(s, wi):
    """Build a bag-of-words count vector for the token sequence ``s``.

    Args:
        s: Iterable of tokens.
        wi: Mapping from token to its column index.

    Returns:
        A tensor of shape ``(1, len(wi))`` in which each column holds the
        number of times the corresponding token occurs in ``s``. Tokens
        missing from ``wi`` are ignored.
    """
    counts = torch.zeros(len(wi))
    for token in s:
        column = wi.get(token)
        if column is None:
            # Token is not in the vocabulary: nothing to count.
            continue
        counts[column] += 1
    # Add a leading batch dimension of size one.
    return counts.view(1, -1)
# Training: one SGD step per training example, repeated for `epochs` passes.
# Only labels 0 and 1 are accepted; any other value raises KeyError.
label_to_class = {0: 0, 1: 1}
for epoch in range(epochs):
    train_len = len(train_data)
    for inp, label in train_data:
        # Clear the gradients left over from the previous example.
        model.zero_grad()
        inputs = create_vector(inp, word_ix)
        labels = torch.tensor([label_to_class[label]], dtype=torch.long)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
print('trening zakonczony')
# Output: trening zakonczony
# Data to predict on.

# Translation table that turns every character of string.punctuation into a
# space in one pass.
_PUNCT_TO_SPACE = str.maketrans(punctuation, ' ' * len(punctuation))


def _tokenize(line):
    """Return the alphabetic tokens of ``line``.

    The line is lowercased, passed through ``unidecode``, has its
    punctuation replaced by spaces, and is split on whitespace; only tokens
    for which ``str.isalpha()`` holds are kept.
    """
    # The translated string has to be kept: str methods return a new string,
    # and calling replace() without using its result left punctuation in
    # place, so tokens such as "word," were dropped by the isalpha() filter.
    cleaned = unidecode(line.lower()).translate(_PUNCT_TO_SPACE)
    return [w for w in cleaned.split() if w.isalpha()]


def _read_lines(path):
    """Return all lines of the text file at ``path``, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.readlines()


# Read every input first so a missing file fails before any processing.
dev0in_data = _read_lines(dev0in)
dev1in_data = _read_lines(dev1in)
testAin_data = _read_lines(testAin)

dev0_data = [_tokenize(line) for line in dev0in_data]
print("dane dev0 wczytane")

dev1_data = [_tokenize(line) for line in dev1in_data]
print("dane dev1 wczytane")

testA_data = [_tokenize(line) for line in testAin_data]
print("dane testA wczytane")
# Output: dane dev0 wczytane
# Output: dane dev1 wczytane
# Output: dane testA wczytane
def _write_predictions(out_path, sentences):
    """Write one predicted class per line of ``out_path``.

    Each item of ``sentences`` is a token list. It is vectorized with
    ``create_vector`` against ``word_ix`` and scored by ``model``; "0" is
    written when class 0 has the strictly larger score, otherwise "1".
    The file is closed even if prediction raises part-way through.
    """
    with open(out_path, 'w') as outfile, torch.no_grad():
        for sentence in sentences:
            prob = model(create_vector(sentence, word_ix))
            outfile.write("0\n" if prob[0][0] > prob[0][1] else "1\n")


# dev-0 predictions
_write_predictions(dev0out, dev0_data)
print('plik wyjściowy dla dev0 został utworzony')

# dev-1 predictions
_write_predictions(dev1out, dev1_data)
print('plik wyjściowy dla dev1 został utworzony')

# test-A predictions
_write_predictions(testAout, testA_data)
print('plik wyjściowy dla testA został utworzony')
# Output: plik wyjściowy dla testA został utworzony