# neural-network-ISI/solution.py
#
# 79 lines
# 2.6 KiB
# Python

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import torch
import numpy as np
from model import n_features, NeuralNetworkModel, get_loss_acc, batch_size
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
# Loaded once at import time from the file named 'stopwords' in the current
# working directory (one stop word per line).
# The file is read as UTF-8 explicitly because the Polish list contains
# non-ASCII letters and the platform default encoding is not always UTF-8.
# A set is used so that the per-token membership test in preprocess() is O(1).
with open('stopwords', encoding='utf-8') as f:
    stopwords = {line.rstrip() for line in f}
def preprocess(doc):
    """Tokenize *doc* into lowercase words with stop words removed.

    The text is lowercased and split on single space characters only, so
    other whitespace (tabs, newlines) stays inside the tokens.  Empty
    tokens produced by consecutive spaces and tokens found in the
    module-level ``stopwords`` collection are dropped.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    tokens = []
    for token in doc.lower().split(' '):
        # Skip the empty strings left by repeated spaces and any stop word.
        if token == '' or token in stopwords:
            continue
        tokens.append(token)
    return tokens
# Placeholder Doc2Vec instance; train() rebinds this global to a model fitted
# on the training documents, which classify() then uses to infer vectors.
vectorizer = Doc2Vec()
# Classifier network defined in the project's `model` module.
model = NeuralNetworkModel()
# Binary cross-entropy loss applied to the model's outputs.
# NOTE(review): BCELoss requires inputs in [0, 1], so the network presumably
# ends in a sigmoid — confirm in model.py.
criterion = torch.nn.BCELoss()
# Plain SGD over all model parameters with a fixed learning rate of 0.1.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
def train():
    """Fit the Doc2Vec vectorizer and train the classifier on ``train/train.tsv``.

    Every line of the training file is split on tabs; the first field is
    parsed as an integer label and the second field is the document text.
    The module-level ``vectorizer`` is rebound to a Doc2Vec model fitted on
    the preprocessed documents, and its document vectors are used as the
    training features.  The module-level ``model`` is then trained for 5
    epochs in mini-batches of ``batch_size`` with the module-level
    ``criterion`` and ``optimizer``.  After each epoch the epoch index and
    the result of ``get_loss_acc`` on the full training set are printed.

    Raises:
        IndexError: if a line does not contain a tab-separated second field.
        ValueError: if a label field is not an integer literal.
    """
    global vectorizer

    # Explicit UTF-8: the stop-word list is Polish, so the corpus presumably
    # contains non-ASCII characters and must not depend on the locale default.
    with open('train/train.tsv', encoding='utf-8') as f:
        lines = [line.rstrip() for line in f]

    labels = []
    docs_preprocessed = []
    for line in lines:
        fields = line.split('\t')
        labels.append(int(fields[0]))
        docs_preprocessed.append(preprocess(fields[1]))
    # Column vector of labels, shape (n_documents, 1).
    y = np.reshape(labels, (len(labels), 1))

    # Tag each document with its row index so that row i of the fitted
    # document-vector matrix corresponds to label i.
    tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
    print("Fitting vectorizer...")
    vectorizer = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features)
    print("DONE!")

    x = vectorizer.dv.vectors

    # Convert to float32 tensors once; the batches below are slices of these.
    features = torch.tensor(x.astype(np.float32))
    targets = torch.tensor(y.astype(np.float32)).reshape(-1, 1)

    for epoch in range(5):
        model.train()
        for start in range(0, targets.shape[0], batch_size):
            batch_x = features[start:start + batch_size]
            batch_y = targets[start:start + batch_size]
            optimizer.zero_grad()
            loss = criterion(model(batch_x), batch_y)
            loss.backward()
            optimizer.step()
        print(epoch)
        # get_loss_acc receives the original numpy arrays, as before.
        print(get_loss_acc(model, x, y, criterion, optimizer))
def classify(path):
    """Predict binary labels for ``path + 'in.tsv'`` and write ``path + 'out.tsv'``.

    Each line of the input file is treated as one document: it is
    preprocessed, turned into a vector with the module-level ``vectorizer``
    and scored by the module-level ``model``.  A score of at least 0.5 is
    written as ``1``, anything lower as ``0``, one prediction per line in
    input order.

    Args:
        path: Prefix that is concatenated directly with the file names, so
            a directory must be given with its trailing separator
            (e.g. ``'dev-0/'``).
    """
    print("Predicting for", path)
    # Explicit UTF-8 so decoding does not depend on the locale default.
    with open(path + 'in.tsv', encoding='utf-8') as f:
        docs = [preprocess(line.rstrip()) for line in f]

    test_x = np.array([vectorizer.infer_vector(doc) for doc in docs], np.float32)

    # Inference only: switch layers such as dropout/batch-norm to evaluation
    # behaviour and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        scores = model(torch.tensor(test_x)).numpy()[:, 0]
    predictions = (scores >= 0.5).astype(int)

    with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
        file.writelines(f"{prediction}\n" for prediction in predictions)
# Script driver: fit the vectorizer and classifier, then write predictions
# for the 'dev-0/' and 'test-A/' directories, in that order.
train()
for split_dir in ('dev-0/', 'test-A/'):
    classify(split_dir)