diff --git a/regression.py b/regression.py index 6eece22..c4f3811 100644 --- a/regression.py +++ b/regression.py @@ -21,11 +21,13 @@ class MyNeuralNetwork(torch.nn.Module): word2vec = gensim.downloader.load('word2vec-google-news-300') def get_word2vec(document): + return np.mean([word2vec[token] for token in document if token in word2vec] or [np.zeros(300)], axis=0) #Basic paths + reading from files XtrainingData = pd.read_table('train/in.tsv.xz', error_bad_lines=False, header=None, quoting=QUOTE_NONE, names=['content', 'id']) YtrainingData = pd.read_table('train/expected.tsv', error_bad_lines=False, header=None, quoting=QUOTE_NONE, names=['label'])['label'] + XtestData = pd.read_table('test-A/in.tsv.xz', error_bad_lines=False, header=None, quoting=QUOTE_NONE, names=['content', 'id']) XdevData = pd.read_table('dev-0/in.tsv.xz', error_bad_lines=False, header=None, quoting=QUOTE_NONE, names=['content', 'id']) @@ -42,43 +44,57 @@ XdevData = [get_word2vec(document) for document in XdevData] eph = 30 batches = 5 network = MyNeuralNetwork(300, 600, 1) -criterion = torch.nn.BCELoss() -optimizer = torch.optim.SGD(network.parameters(), lr=0.02) +crit = torch.nn.BCELoss() +opt = torch.optim.SGD(network.parameters(), lr=0.03) +########Accuracy for different parameters according to Geval########### +#0.7561 for 5 epochs and 5 batches +#0.7728 for 30 epochs and 5 batches +#0.7712 for 30 epochs and 15 batches +####################################################################### #Model training according to source files from classes for epoch in range(eph): network.train() + for i in range(0, YtrainingData.shape[0], batches): x = XtrainingData[i :i + batches] x = torch.tensor(x) y = YtrainingData[i :i + batches] y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1) - outputs = network(x.float()) - loss = criterion(outputs, y) - optimizer.zero_grad() + outcome = network(x.float()) + loss = crit(outcome, y) + opt.zero_grad() loss.backward() - optimizer.step() + opt.step() #Basic evaluation -YpredDev = [] YtestPred = [] +YpredDev = [] with torch.no_grad(): for i in range(0, len(XdevData), batches): x = XdevData[i :i + batches] x = torch.tensor(x) - outputs = network(x.float()) - prediction = outputs > 0.5 - YpredDev += prediction.tolist() + outcome = network(x.float()) + predict = outcome > 0.5 + + YpredDev += predict.tolist() for i in range(0, len(XtestData), batches): x = XtestData[i :i + batches] x = torch.tensor(x) - outputs = network(x.float()) - prediction = outputs > 0.5 - YtestPred += prediction.tolist() + outcome = network(x.float()) + predict = outcome > 0.5 + + YtestPred += predict.tolist() #Saving outputs np.asarray(YpredDev, dtype=np.int32).tofile('./dev-0/out.tsv', sep='\n') np.asarray(YtestPred, dtype=np.int32).tofile('./test-A/out.tsv', sep='\n') + +########Accuracy for different parameters according to Geval########### +#0.7561 for 5 epochs and 5 batches +#0.7728 for 30 epochs and 5 batches +#0.7712 for 30 epochs and 15 batches +#######################################################################