petite-difference-challenge2/logistic_regression.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class LogisticRegression(nn.Module):
    def __init__(self, vocab_size):
        super(LogisticRegression, self).__init__()
        # Single linear layer: bag-of-words vector -> two class scores.
        self.linear = nn.Linear(vocab_size, 2)

    def forward(self, x):
        # Return log-probabilities: nn.NLLLoss expects log_softmax output,
        # not the sigmoid the original code applied here.
        return F.log_softmax(self.linear(x), dim=1)
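
# Quick sanity check (a sketch, not part of the pipeline): with a
# three-word vocabulary the model maps a (1, 3) input to a (1, 2) row of
# log-probabilities, so exp() of that row sums to 1:
#   model = LogisticRegression(3)
#   model(torch.zeros(1, 3)).shape  # -> torch.Size([1, 2])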

def make_vector(sentence, dictionary):
    # Bag-of-words: count each word's occurrences at its dictionary index.
    vector = torch.zeros(len(dictionary))
    for word in sentence:
        vector[dictionary[word]] += 1
    return vector.view(1, -1)
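
# For example, with dictionary {'a': 0, 'b': 1},
# make_vector(['a', 'a', 'b'], dictionary) returns tensor([[2., 1.]]).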

def read_data(path):
    # Read the first 1000 lines and split each into whitespace tokens.
    with open(path, 'r') as f:
        lines = f.readlines()[:1000]
    return [line.split() for line in lines]

def main():
    train_data = read_data("train/in.tsv")
    with open('train/expected.tsv', 'r') as f:
        train_data_output = [int(line) for line in f.readlines()[:1000]]

    with open('dev-0/in.tsv', 'r') as f:
        test_data = [line.split() for line in f]

    # Build the word-to-index dictionary over train and test data together,
    # so make_vector never meets an out-of-vocabulary word.
    dictionary = {}
    for sent in train_data + test_data:
        for word in sent:
            if word not in dictionary:
                dictionary[word] = len(dictionary)

    model = LogisticRegression(len(dictionary))
    criterion = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    epochs = 100
    for epoch in range(epochs):
        if epoch % 10 == 0:
            print(str(epoch / epochs * 100) + "%")
        # Plain zip, not itertools.zip_longest: a length mismatch between
        # data and labels should drop pairs, not produce None labels.
        for instance, label in zip(train_data, train_data_output):
            vector = make_vector(instance, dictionary)
            target = torch.LongTensor([label])
            model.zero_grad()
            log_probs = model(vector)
            loss = criterion(log_probs, target)
            loss.backward()
            optimizer.step()

    # Write one predicted label (0 or 1) per test sentence.
    with open('dev-0/out.tsv', 'w') as output:
        for instance in test_data:
            vec = make_vector(instance, dictionary)
            log_probs = model(vec)
            y_pred = np.argmax(log_probs[0].detach().numpy())
            output.write(str(int(y_pred)) + '\n')
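
# Note: an equivalent formulation returns raw logits from forward() and
# uses nn.CrossEntropyLoss(), which combines log_softmax with NLLLoss.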

if __name__ == '__main__':
    main()
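
# Usage (paths as in the code, data format assumed from how it is read):
# train/in.tsv holds one whitespace-tokenized sentence per line,
# train/expected.tsv one 0/1 label per line, and dev-0/in.tsv the
# sentences to classify. Running
#   python logistic_regression.py
# writes one predicted label per dev-0 sentence to dev-0/out.tsv.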