"""Train a neural network model on averaged Word2Vec document vectors.

Run as a script, this reads tokenized documents and expected labels,
fits Word2Vec on the training documents, turns every document into the
mean of its word vectors, trains ``NeuralNetworkModel`` and writes the
predictions for the train, dev and test splits.
"""
import numpy as np
import torch
from gensim.models import Word2Vec

import inout as io
from nnModel import NeuralNetworkModel, trainModel, predict


def getX(train, dev, test):
    """Read the three input files and whitespace-tokenize their documents.

    Each path is loaded with ``io.read``; the first field of every record
    is split on whitespace.
    NOTE(review): assumes ``io.read`` yields indexable records whose first
    field is the document text -- confirm against ``inout``.

    Returns:
        A list with three entries (train, dev, test, in that order), each
        a list of token lists.
    """
    return [
        [record[0].split() for record in io.read(path)]
        for path in (train, dev, test)
    ]


def getY(dir):
    """Load every file listed in ``dir`` with ``io.read`` as a NumPy array.

    The parameter name shadows the ``dir`` builtin; it is kept unchanged so
    existing keyword callers continue to work.

    Returns:
        A list of ``np.ndarray`` objects, one per path, in input order.
    """
    return [np.array(io.read(file)) for file in dir]


def vectorize(word2vec, documents):
    """Represent each document as the mean of its word vectors.

    Words missing from the model's vocabulary contribute a zero vector, so
    they still count towards the mean's denominator. A document without any
    tokens maps to the zero vector; averaging an empty list would instead
    produce a scalar NaN and make the rows of the result ragged.

    Args:
        word2vec: Trained model exposing its keyed vectors as ``.wv``.
        documents: Iterable of token lists.

    Returns:
        ``np.ndarray`` with one row per document.
    """
    keyed_vectors = word2vec.wv
    # Take the width from the model instead of hard-coding 100 so that the
    # fallback vector always matches the embedding size.
    zero_vector = np.zeros(keyed_vectors.vector_size, dtype=float)

    vectorized = []
    for tokens in documents:
        if not tokens:
            vectorized.append(zero_vector)
            continue
        word_vectors = [
            keyed_vectors[word] if word in keyed_vectors else zero_vector
            for word in tokens
        ]
        vectorized.append(np.mean(word_vectors, axis=0))
    return np.array(vectorized)


if __name__ == '__main__':
    trainX, devX, testX = getX('train/in.tsv.xz', 'dev-0/in.tsv.xz', 'test-A/in.tsv.xz')
    trainY, devY = getY(['train/expected.tsv', 'dev-0/expected.tsv'])

    # The embeddings are fitted on the training documents only; dev and test
    # documents are merely projected with the resulting vocabulary.
    word2vec = Word2Vec(trainX, vector_size=100, min_count=2)

    trainX = vectorize(word2vec, trainX)
    devX = vectorize(word2vec, devX)
    testX = vectorize(word2vec, testX)

    nnModel = NeuralNetworkModel()
    optimizer = torch.optim.SGD(nnModel.parameters(), lr=0.1)
    trainModel(nnModel, trainX, trainY, devX, devY, optimizer)

    io.write(predict(nnModel, trainX), 'train/out.tsv')
    io.write(predict(nnModel, devX), 'dev-0/out.tsv')
    io.write(predict(nnModel, testX), 'test-A/out.tsv')