import gensim.downloader as api
import numpy as np
import pandas as pd
import torch
from nltk.tokenize import word_tokenize  # requires the nltk 'punkt' data: nltk.download('punkt')


class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward binary classifier over 300-d word2vec document vectors."""

    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        # Attribute names must be valid identifiers (self.1 / self.2 is a SyntaxError).
        self.fc1 = torch.nn.Linear(300, 300)
        self.fc2 = torch.nn.Linear(300, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x


nm = NeuralNetworkModel()
word2vec = api.load('word2vec-google-news-300')


def d2v(doc):
    # Document vector: mean of the word2vec vectors of the document's known
    # words, falling back to a zero vector when no word is in the vocabulary.
    return np.mean([word2vec[word] for word in doc if word in word2vec]
                   or [np.zeros(300)], axis=0)


# Load data (header=None so the single text/label column is addressable as column 0).
train = pd.read_table('train/in.tsv.xz', compression='xz', sep='\t',
                      quoting=3, header=None)
trainy = pd.read_table('train/expected.tsv', sep='\t', quoting=3, header=None)[0]
dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\t',
                    quoting=3, header=None)
test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t',
                     quoting=3, header=None)

# Tokenize each document and collapse it to a single 300-d embedding.
train = [d2v(word_tokenize(x)) for x in train[0]]
dev = [d2v(word_tokenize(x)) for x in dev[0]]
test = [d2v(word_tokenize(x)) for x in test[0]]

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(nm.parameters())

print('training')
for epoch in range(5):
    nm.train()
    for i in range(0, len(train), 5):  # mini-batches of 5
        X = torch.tensor(np.array(train[i:i + 5])).float()
        y = torch.tensor(trainy[i:i + 5].astype(np.float32).to_numpy()).reshape(-1, 1)
        optimizer.zero_grad()
        outputs = nm(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

print('predicting')
dev_pred = []
test_pred = []
nm.eval()
with torch.no_grad():
    for i in range(0, len(dev), 5):
        X = torch.tensor(np.array(dev[i:i + 5])).float()
        outputs = nm(X)
        dev_pred.extend((outputs > 0.5).int().flatten().tolist())
    for i in range(0, len(test), 5):
        X = torch.tensor(np.array(test[i:i + 5])).float()
        outputs = nm(X)
        test_pred.extend((outputs > 0.5).int().flatten().tolist())

# Plain lists have no .to_csv; wrap the predictions in a Series before writing.
pd.Series(dev_pred).to_csv('dev-0/out.tsv', sep='\t', index=False, header=False)
pd.Series(test_pred).to_csv('test-A/out.tsv', sep='\t', index=False, header=False)