79 lines
2.6 KiB
Python
79 lines
2.6 KiB
Python
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
|
import torch
|
|
import numpy as np
|
|
from model import n_features, NeuralNetworkModel, get_loss_acc, batch_size
|
|
|
|
# Stop words removed by preprocess(), one word per line of the 'stopwords' file.
# Source: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
#
# The list is Polish, so it is expected to contain non-ASCII characters; decode
# it explicitly (assumed UTF-8, as published upstream) instead of relying on
# the platform's default encoding, which would silently corrupt those words.
with open('stopwords', encoding='utf-8') as f:
    stopwords = [line.rstrip() for line in f]
|
|
|
|
|
|
def preprocess(doc):
    """Lowercase *doc*, split it on single spaces and drop noise tokens.

    A token is dropped when it is the empty string (which consecutive spaces
    produce) or when it appears in the module-level ``stopwords`` collection.

    Returns the surviving tokens as a list, in their original order.
    """
    tokens = doc.lower().split(' ')
    return [token for token in tokens if token != '' and token not in stopwords]
|
|
|
|
|
|
# --- Module-level state shared by train() and classify() ---------------------

# Placeholder instance; train() rebinds this name to a Doc2Vec model fitted on
# the training corpus.
vectorizer = Doc2Vec()

# Binary cross-entropy loss used for the classifier's outputs.
criterion = torch.nn.BCELoss()

# The classifier and the plain-SGD optimizer that updates its parameters.
model = NeuralNetworkModel()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)
|
|
|
|
|
|
def train():
    """Fit the Doc2Vec vectorizer and train the classifier on train/train.tsv.

    Every line of ``train/train.tsv`` is split on tabs; the first field is
    parsed as an integer label and the second field is the document text,
    which is tokenised with ``preprocess``.

    Side effects:
        * rebinds the module-level ``vectorizer`` to a Doc2Vec model fitted on
          the training documents;
        * updates the module-level ``model`` in place through ``optimizer``;
        * prints progress, and after each epoch the epoch index followed by
          the result of ``get_loss_acc`` on the full training set.

    Raises:
        IndexError: if a line has no tab-separated second field.
        ValueError: if a label is not an integer literal.
    """
    global vectorizer

    labels = []
    docs_preprocessed = []
    # Explicit encoding: the corpus is assumed to be UTF-8 text, and the
    # platform default would otherwise decide how it is decoded.
    # The file is only needed while parsing, so it is closed before fitting.
    with open('train/train.tsv', encoding='utf-8') as f:
        for line in f:
            y_with_doc = line.rstrip().split('\t')
            labels.append(int(y_with_doc[0]))
            docs_preprocessed.append(preprocess(y_with_doc[1]))

    # Column vector of labels, one row per document.
    y = np.reshape(labels, (len(labels), 1))

    # Tag each document with its row index so that vectorizer.dv.vectors lines
    # up with the rows of y.
    tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]

    print("Fitting vectorizer...")
    vectorizer = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features)
    print("DONE!")

    x = vectorizer.dv.vectors

    # Convert once up front instead of re-converting every batch of every epoch.
    features = torch.tensor(x.astype(np.float32))
    targets = torch.tensor(y.astype(np.float32)).reshape(-1, 1)

    for epoch in range(5):
        # Re-enter training mode each epoch; get_loss_acc below is defined
        # elsewhere and may switch the model to evaluation mode.
        model.train()
        for start in range(0, y.shape[0], batch_size):
            stop = start + batch_size
            batch_predictions = model(features[start:stop])

            optimizer.zero_grad()
            loss = criterion(batch_predictions, targets[start:stop])
            loss.backward()
            optimizer.step()

        print(epoch)
        print(get_loss_acc(model, x, y, criterion, optimizer))
|
|
|
|
|
|
def classify(path):
    """Predict a 0/1 label for every line of ``path + 'in.tsv'``.

    Each input line is tokenised with ``preprocess``, embedded with the
    module-level ``vectorizer`` and scored by the module-level ``model``.
    A score of at least 0.5 is written as ``1``, anything lower as ``0``, one
    prediction per line, to ``path + 'out.tsv'``.

    Args:
        path: Prefix joined to the file names by plain string concatenation,
            so a directory must include its trailing separator
            (for example ``'dev-0/'``).

    Side effects:
        Prints a progress line, overwrites ``out.tsv`` and leaves ``model`` in
        evaluation mode.
    """
    print("Predicting for", path)

    # Explicit encoding: the input is assumed to be UTF-8 text, matching the
    # training data, rather than whatever the platform default happens to be.
    with open(path + 'in.tsv', encoding='utf-8') as f:
        docs = [preprocess(line.rstrip()) for line in f]

    predictions = []
    # An empty input file would produce a 1-D array that cannot be indexed
    # with [:, 0]; in that case simply write an empty output file.
    if docs:
        test_x = np.array([vectorizer.infer_vector(doc) for doc in docs], np.float32)

        # train() leaves the model in training mode; switch to evaluation mode
        # so layers that behave differently while training (e.g. dropout) do
        # not affect predictions, and skip autograd bookkeeping.
        model.eval()
        with torch.no_grad():
            scores = model(torch.tensor(test_x)).numpy()[:, 0]

        predictions = [1 if value >= 0.5 else 0 for value in scores]

    with open(path + 'out.tsv', 'w') as file:
        file.writelines("%i\n" % prediction for prediction in predictions)
|
|
|
|
|
|
# Script body: fit everything once, then write predictions for each data
# directory in turn.
train()
for dataset_dir in ('dev-0/', 'test-A/'):
    classify(dataset_dir)