"""Train a Doc2Vec + neural-network binary text classifier and label two data sets.

Running this module loads the stopword list from ``stopwords``, fits the
vectorizer and the classifier on ``train/train.tsv`` and then writes 0/1
predictions for ``dev-0/in.tsv`` and ``test-A/in.tsv`` next to the inputs.
"""
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import torch
import numpy as np

from model import n_features, NeuralNetworkModel, get_loss_acc, batch_size

# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
# The list is Polish text, so the encoding is pinned instead of relying on the
# platform default.
with open('stopwords', encoding='utf-8') as f:
    stopwords = [line.rstrip() for line in f]

# Set copy of the list above so the per-token membership test in preprocess()
# does not scan the whole list for every word.
_stopword_set = frozenset(stopwords)


def preprocess(doc):
    """Lower-case *doc*, split it on single spaces and drop stopwords and empty tokens.

    Args:
        doc: Raw document text.

    Returns:
        List of the remaining tokens, in their original order.
    """
    return [
        word
        for word in doc.lower().split(' ')
        if word and word not in _stopword_set
    ]


# Untrained placeholder; train() rebinds this name to the fitted vectorizer.
vectorizer = Doc2Vec()
model = NeuralNetworkModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)


def train(epochs=5):
    """Fit the Doc2Vec vectorizer and the classifier on ``train/train.tsv``.

    Every non-blank line is read as an integer label, a tab, and the document
    text; only the first field after the label is used as text. The fitted
    vectorizer replaces the module-level ``vectorizer`` and the module-level
    ``model`` is optimised in place. After each epoch the epoch index and the
    result of ``get_loss_acc`` on the training data are printed.

    Args:
        epochs: Number of passes over the training data.

    Raises:
        ValueError: If a label field is not an integer.
    """
    global vectorizer

    labels = []
    docs_preprocessed = []
    with open('train/train.tsv', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank line (e.g. at the end of the file) carries no label.
                continue
            fields = line.split('\t')
            labels.append(int(fields[0]))
            # rstrip() above also removes a trailing tab, so a row whose text
            # is empty arrives here with the label as its only field.
            text = fields[1] if len(fields) > 1 else ''
            docs_preprocessed.append(preprocess(text))

    y = np.reshape(labels, (len(labels), 1))

    # Tag each document with its row index so that row i of the learned
    # document vectors lines up with y[i].
    tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]

    print("Fitting vectorizer...")
    vectorizer = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features)
    print("DONE!")

    x = vectorizer.dv.vectors

    # Convert once up front; the batches below are plain slices of these tensors.
    features = torch.tensor(x.astype(np.float32))
    targets = torch.tensor(y.astype(np.float32))

    for epoch in range(epochs):
        model.train()
        for start in range(0, y.shape[0], batch_size):
            batch_x = features[start:start + batch_size]
            batch_y = targets[start:start + batch_size]

            optimizer.zero_grad()
            loss = criterion(model(batch_x), batch_y)
            loss.backward()
            optimizer.step()

        print(epoch)
        print(get_loss_acc(model, x, y, criterion, optimizer))


def classify(path):
    """Predict a 0/1 label for every line of ``<path>in.tsv`` and write ``<path>out.tsv``.

    One prediction is written per input line, in input order, so blank input
    lines are kept rather than skipped.

    Args:
        path: Prefix prepended verbatim to ``in.tsv`` and ``out.tsv``; it must
            include the trailing separator (e.g. ``'dev-0/'``).
    """
    print("Predicting for", path)
    with open(path + 'in.tsv', encoding='utf-8') as f:
        docs = [preprocess(line.rstrip()) for line in f]

    test_x = np.array([vectorizer.infer_vector(doc) for doc in docs], np.float32)

    # train() leaves the model in training mode; switch to evaluation mode so
    # that any training-only layers the model may contain behave
    # deterministically, and skip gradient tracking since nothing is optimised.
    model.eval()
    with torch.no_grad():
        scores = model(torch.tensor(test_x)).numpy()[:, 0]
    predictions = [1 if value >= 0.5 else 0 for value in scores]

    with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
        file.writelines(f"{prediction}\n" for prediction in predictions)


train()
classify('dev-0/')
classify('test-A/')