petite-difference-challenge2/logistic_regression.ipynb

8.5 KiB

import numpy as np
import pandas as pd
import torch
from unidecode import unidecode
from string import punctuation
# Root directory of the challenge data.
filedir = '/home/ubuntu/Pulpit/TAU/petite-difference-challenge2'

# Training hyper-parameters: SGD step size and number of passes over the data.
learningRate = 0.1
epochs = 100

# Training files: input texts and their expected labels.
trainin = f'{filedir}/train/intrain5k.tsv'
trainex = f'{filedir}/train/extrain5k.tsv'

# Evaluation files: inputs to read and prediction files to write.
dev0in = f'{filedir}/dev-0/in.tsv'
dev0out = f'{filedir}/dev-0/out.tsv'
dev1in = f'{filedir}/dev-1/in.tsv'
dev1out = f'{filedir}/dev-1/out.tsv'
testAin = f'{filedir}/test-A/in.tsv'
testAout = f'{filedir}/test-A/out.tsv'
# Training data: read the input texts and their labels, closing both files
# as soon as the lines have been loaded.
with open(trainin, 'r', encoding='utf-8') as in_file, \
        open(trainex, 'r', encoding='utf-8') as label_file:
    trainin_data = in_file.readlines()
    trainex_data = label_file.readlines()

# Every input line needs a label; fail with a clear message instead of an
# IndexError in the middle of the loop.
if len(trainex_data) < len(trainin_data):
    raise ValueError(
        f"{trainex} has {len(trainex_data)} labels "
        f"but {trainin} has {len(trainin_data)} lines"
    )

# Translation table that turns every punctuation character into a space.
# str.replace returns a new string, so calling it without using the result
# (as the previous version did) left the punctuation in place.
punct_to_space = str.maketrans(punctuation, ' ' * len(punctuation))

train_data = []
for raw_line, raw_label in zip(trainin_data, trainex_data):
    inline = unidecode(raw_line.lower()).translate(punct_to_space)

    # keep only purely alphabetic tokens
    tokens = [w for w in inline.split() if w.isalpha()]

    train_data.append((tokens, int(raw_label)))

# Vocabulary: each distinct token gets the next free index, in order of
# first appearance in the training data.
word_ix = {}
for sent, _ in train_data:
    for word in sent:
        word_ix.setdefault(word, len(word_ix))

print("dane treningowe wczytane")
dane treningowe wczytane
class LogisticRegression(torch.nn.Module):
    """Single linear layer followed by log-softmax over the classes.

    Args:
        vocab_size: Number of input features. When omitted, the size of the
            module-level ``word_ix`` vocabulary is used, which is what the
            zero-argument constructor always did.
        num_classes: Number of output classes (default 2).
    """

    def __init__(self, vocab_size=None, num_classes=2):
        super().__init__()
        if vocab_size is None:
            # Backward-compatible default: size the layer from the global vocabulary.
            vocab_size = len(word_ix)
        self.linear = torch.nn.Linear(vocab_size, num_classes)

    def forward(self, x):
        """Return per-class log-probabilities for each row of ``x``.

        ``dim=1`` normalises across the class axis, so ``x`` must be 2-D
        with one example per row.
        """
        return torch.nn.functional.log_softmax(self.linear(x), dim=1)

# Build the classifier and place it on the CPU; Module.to returns the module
# itself, so the call can be chained onto the constructor.
device = torch.device('cpu')
model = LogisticRegression().to(device)

# Negative log-likelihood loss pairs with the model's log-softmax output.
criterion = torch.nn.NLLLoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)

print('model regresji gotowy')
model regresji gotowy
def create_vector(s, wi):
    """Build a bag-of-words count vector for the token sequence ``s``.

    Args:
        s: Iterable of tokens.
        wi: Mapping from token to column index; its length sets the vector size.

    Returns:
        A float tensor of shape ``(1, len(wi))`` holding, per index, how many
        times the corresponding token occurs in ``s``. Tokens missing from
        ``wi`` are ignored.
    """
    counts = torch.zeros(len(wi))
    known_indices = (wi[token] for token in s if token in wi)
    for index in known_indices:
        counts[index] += 1
    # Reshape to a single-row batch.
    return counts.view(1, -1)
# Training: one SGD update per example, repeated for `epochs` full passes.
# Only the labels 0 and 1 are accepted; any other label raises KeyError.
label_to_class = {0: 0, 1: 1}
for epoch in range(epochs):
    train_len = len(train_data)
    for tokens, gold in train_data:
        # Clear the gradients accumulated by the previous step.
        model.zero_grad()

        features = create_vector(tokens, word_ix)
        target = torch.LongTensor([label_to_class[gold]])

        # Forward pass and loss, then back-propagate and update the weights.
        loss = criterion(model(features), target)
        loss.backward()
        optimizer.step()

print('trening zakonczony')
trening zakonczony
# Data to run predictions on
def _read_lines(path):
    """Return every line of the text file at ``path``; the file is closed on return."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.readlines()


def _tokenize_lines(lines):
    """Convert raw text lines into lists of lowercase alphabetic tokens.

    Each line is lowercased, transliterated with ``unidecode``, has every
    punctuation character replaced by a space, and is split on whitespace;
    only purely alphabetic tokens are kept.
    """
    # str.replace returns a new string, so calling it without using the result
    # (as the previous version did) never removed any punctuation. A single
    # translate pass does the substitution for all characters at once.
    punct_to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    tokenized = []
    for raw_line in lines:
        text = unidecode(raw_line.lower()).translate(punct_to_space)
        tokenized.append([w for w in text.split() if w.isalpha()])
    return tokenized


dev0in_data = _read_lines(dev0in)
dev1in_data = _read_lines(dev1in)
testAin_data = _read_lines(testAin)

dev0_data = _tokenize_lines(dev0in_data)
print("dane dev0 wczytane")

dev1_data = _tokenize_lines(dev1in_data)
print("dane dev1 wczytane")

testA_data = _tokenize_lines(testAin_data)
print("dane testA wczytane")
dane dev0 wczytane
dane dev1 wczytane
dane testA wczytane
def _write_predictions(out_path, sentences):
    """Write one predicted label per tokenised sentence to ``out_path``.

    Each sentence is vectorised with ``create_vector`` against ``word_ix`` and
    scored by ``model``. "0" is written when class 0 scores strictly higher
    than class 1, otherwise "1" (so ties go to "1"). The ``with`` statement
    closes the output file even if scoring raises part-way through.
    """
    with open(out_path, 'w') as outfile, torch.no_grad():
        for line in sentences:
            v = create_vector(line, word_ix)
            prob = model(v)
            outfile.write("0\n" if prob[0][0] > prob[0][1] else "1\n")


# dev-0 predictions
_write_predictions(dev0out, dev0_data)
print('plik wyjściowy dla dev0 został utworzony')

# dev-1 predictions
_write_predictions(dev1out, dev1_data)
print('plik wyjściowy dla dev1 został utworzony')

# test-A predictions
_write_predictions(testAout, testA_data)
print('plik wyjściowy dla testA został utworzony')
plik wyjściowy dla testA został utworzony