import numpy as np
import pandas as pd
import torch
from csv import QUOTE_NONE
from nltk.tokenize import word_tokenize
import gensim.downloader


#Based on source material from classes
class MyNeuralNetwork(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MyNeuralNetwork, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x


#Pretrained 300-dimensional word2vec embeddings (downloaded on first run)
word2vec = gensim.downloader.load('word2vec-google-news-300')


def get_word2vec(document):
    #Average the embeddings of all in-vocabulary tokens; fall back to a zero
    #vector for documents with no known tokens
    return np.mean([word2vec[token] for token in document if token in word2vec]
                   or [np.zeros(300)], axis=0)


#Basic paths + reading from files
#(error_bad_lines was removed in pandas 2.x; use on_bad_lines='skip' there)
XtrainingData = pd.read_table('train/in.tsv.xz', error_bad_lines=False, header=None,
                              quoting=QUOTE_NONE, names=['content', 'id'])
YtrainingData = pd.read_table('train/expected.tsv', error_bad_lines=False, header=None,
                              quoting=QUOTE_NONE, names=['label'])['label']
XtestData = pd.read_table('test-A/in.tsv.xz', error_bad_lines=False, header=None,
                          quoting=QUOTE_NONE, names=['content', 'id'])
XdevData = pd.read_table('dev-0/in.tsv.xz', error_bad_lines=False, header=None,
                         quoting=QUOTE_NONE, names=['content', 'id'])

#Data filtering and preprocessing: lowercase, tokenize, then average word2vec vectors
#(word_tokenize needs the NLTK 'punkt' data: nltk.download('punkt'))
XtrainingData = [word_tokenize(row) for row in XtrainingData['content'].str.lower()]
XtestData = [word_tokenize(row) for row in XtestData['content'].str.lower()]
XdevData = [word_tokenize(row) for row in XdevData['content'].str.lower()]

XtrainingData = [get_word2vec(document) for document in XtrainingData]
XtestData = [get_word2vec(document) for document in XtestData]
XdevData = [get_word2vec(document) for document in XdevData]

#Basic parameters for the model
eph = 30
batches = 5
network = MyNeuralNetwork(300, 600, 1)
crit = torch.nn.BCELoss()
opt = torch.optim.SGD(network.parameters(), lr=0.03)

########Accuracy for different parameters according to Geval###########
#0.7561 for 5 epochs and 5 batches
#0.7728 for 30 epochs and 5 batches
#0.7712 for 30 epochs and 15 batches
#######################################################################

#Model training according to source files from classes
for epoch in range(eph):
    network.train()
    for i in range(0, YtrainingData.shape[0], batches):
        x = XtrainingData[i:i + batches]
        x = torch.tensor(np.array(x))
        y = YtrainingData[i:i + batches]
        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
        outcome = network(x.float())
        loss = crit(outcome, y)
        opt.zero_grad()
        loss.backward()
        opt.step()

#Basic evaluation
YtestPred = []
YpredDev = []
with torch.no_grad():
    for i in range(0, len(XdevData), batches):
        x = XdevData[i:i + batches]
        x = torch.tensor(np.array(x))
        outcome = network(x.float())
        predict = outcome > 0.5
        YpredDev += predict.tolist()

    for i in range(0, len(XtestData), batches):
        x = XtestData[i:i + batches]
        x = torch.tensor(np.array(x))
        outcome = network(x.float())
        predict = outcome > 0.5
        YtestPred += predict.tolist()

#Saving outputs (one 0/1 prediction per line)
np.asarray(YpredDev, dtype=np.int32).tofile('./dev-0/out.tsv', sep='\n')
np.asarray(YtestPred, dtype=np.int32).tofile('./test-A/out.tsv', sep='\n')
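
#Optional local sanity check (a sketch, not part of the original pipeline): the
#Geval dev-0 scores listed above can be roughly reproduced by comparing the dev
#predictions with the reference labels. This assumes a dev-0/expected.tsv file
#with one 0/1 label per line is present in the repository; if it is not, this
#block simply does nothing.
import os

if os.path.exists('dev-0/expected.tsv'):
    YdevData = pd.read_table('dev-0/expected.tsv', header=None,
                             quoting=QUOTE_NONE, names=['label'])['label']
    devAccuracy = np.mean(np.asarray(YpredDev, dtype=np.int32).reshape(-1)
                          == YdevData.to_numpy())
    print(f'dev-0 accuracy: {devAccuracy:.4f}')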