import gensim.downloader
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np
from torch import relu, sigmoid
#from timeit import default_timer as timer


class NNet(nn.Module):
    """Simple feed-forward binary classifier over 100-dim GloVe embeddings."""

    def __init__(self):
        super(NNet, self).__init__()
        self.ll1 = nn.Linear(100, 1000)
        self.ll2 = nn.Linear(1000, 400)
        self.ll3 = nn.Linear(400, 1)

    def forward(self, x):
        x = relu(self.ll1(x))
        x = relu(self.ll2(x))
        x = sigmoid(self.ll3(x))
        return x


def read_data(folder_name):
    # in.tsv: tokenized text (the last two columns are dropped); expected.tsv: 0/1 labels
    with open(f'{folder_name}/in.tsv', encoding='utf-8') as file:
        x = [line.lower().split()[:-2] for line in file.readlines()]
    with open(f'{folder_name}/expected.tsv', encoding='utf-8') as file:
        y = [int(line.split()[0]) for line in file.readlines()]
    return x, y


def process_data(data, word2vec):
    # Represent each document as the mean of its word vectors
    # (or a zero vector if none of its words are in the vocabulary).
    processed_data = []
    for reddit in data:
        words_sim = [word2vec[word] for word in reddit if word in word2vec]
        processed_data.append(np.mean(words_sim or [np.zeros(100)], axis=0))
    return processed_data


def predict(folder_name, model, word_vec):
    with open(f'{folder_name}/in.tsv', encoding='utf-8') as file:
        x_data = [line.lower().split()[:-2] for line in file.readlines()]
    x_features = process_data(x_data, word_vec)
    y_predictions = []
    with torch.no_grad():
        for inputs in x_features:
            inputs = torch.tensor(inputs.astype(np.float32)).to(device)
            y_predicted = model(inputs)
            y_predictions.append(y_predicted > 0.5)
    return y_predictions


def save_predictions(folder_name, predicted_labels):
    predictions = [pred.int()[0].item() for pred in predicted_labels]
    with open(f"{folder_name}/out.tsv", "w", encoding="UTF-8") as file_out:
        for pred in predictions:
            file_out.write(f"{pred}\n")


device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)  # GPU is a bit faster here

word_vectors = gensim.downloader.load("glove-wiki-gigaword-100")

x_data, y_train = read_data('train')
x_train = process_data(x_data, word_vectors)

model = NNet().to(device)
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.005)  # , momentum=0.9)

for epoch in range(2):
    running_loss = 0.0
    correct = 0.
    total = 0.
    for i, (inputs, label) in enumerate(zip(x_train, y_train)):
        inputs = torch.tensor(inputs.astype(np.float32)).to(device)
        label = torch.tensor(np.array(label).astype(np.float32)).reshape(1).to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        y_predicted = model(inputs)
        loss = criterion(y_predicted, label)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        correct += ((y_predicted > 0.5) == label).type(torch.float).sum().item()
        total += label.size(0)
        if i % 10000 == 9999:  # print every 10000 samples
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 10000:.3f}')
            print(f'Accuracy score: {100 * correct / total} %')
            running_loss = 0.0

predicted = predict('dev-0', model, word_vectors)
save_predictions('dev-0', predicted)

predicted = predict('test-A', model, word_vectors)
save_predictions('test-A', predicted)