From 38911e69e59379f11ada33925ac6b00aad5c449f Mon Sep 17 00:00:00 2001
From: Damian Bregier
Date: Tue, 25 May 2021 23:57:06 +0200
Subject: [PATCH] ADD: Model file

---
 regression.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 regression.py

diff --git a/regression.py b/regression.py
new file mode 100644
index 0000000..6eece22
--- /dev/null
+++ b/regression.py
@@ -0,0 +1,84 @@
+import numpy as np
+import pandas as pd
+import torch
+from csv import QUOTE_NONE
+from nltk.tokenize import word_tokenize
+import gensim.downloader
+
+#Based on source material from classes
+class MyNeuralNetwork(torch.nn.Module):
+    def __init__(self, input_size, hidden_size, num_classes):
+        super(MyNeuralNetwork, self).__init__()
+        self.fc1 = torch.nn.Linear(input_size, hidden_size)
+        self.fc2 = torch.nn.Linear(hidden_size, num_classes)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = torch.relu(x)
+        x = self.fc2(x)
+        x = torch.sigmoid(x)
+        return x
+
+word2vec = gensim.downloader.load('word2vec-google-news-300')
+#Average the word2vec vectors of tokens known to the model; documents with no known tokens get a zero vector
+def get_word2vec(document):
+    return np.mean([word2vec[token] for token in document if token in word2vec] or [np.zeros(300)], axis=0)
+
+#Basic paths + reading from files
+XtrainingData = pd.read_table('train/in.tsv.xz', error_bad_lines=False, header=None, quoting=QUOTE_NONE, names=['content', 'id'])
+YtrainingData = pd.read_table('train/expected.tsv', error_bad_lines=False, header=None, quoting=QUOTE_NONE, names=['label'])['label']
+XtestData = pd.read_table('test-A/in.tsv.xz', error_bad_lines=False, header=None, quoting=QUOTE_NONE, names=['content', 'id'])
+XdevData = pd.read_table('dev-0/in.tsv.xz', error_bad_lines=False, header=None, quoting=QUOTE_NONE, names=['content', 'id'])
+
+#Data filtering and preprocessing
+XtrainingData = [word_tokenize(row) for row in XtrainingData['content'].str.lower()]
+XtestData = [word_tokenize(row) for row in XtestData['content'].str.lower()]
+XdevData = [word_tokenize(row) for row in XdevData['content'].str.lower()]
+XtrainingData = [get_word2vec(document) for document in XtrainingData]
+XtestData = [get_word2vec(document) for document in XtestData]
+XdevData = [get_word2vec(document) for document in XdevData]
+
+#Basic parameters for the model
+eph = 30  # number of training epochs
+batches = 5  # mini-batch size
+network = MyNeuralNetwork(300, 600, 1)  # 300-dim word2vec input, 600 hidden units, 1 output
+criterion = torch.nn.BCELoss()
+optimizer = torch.optim.SGD(network.parameters(), lr=0.02)
+
+#Model training according to source files from classes
+for epoch in range(eph):
+    network.train()
+    for i in range(0, YtrainingData.shape[0], batches):
+        x = XtrainingData[i:i + batches]
+        x = torch.tensor(x)
+        y = YtrainingData[i:i + batches]
+        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
+
+        outputs = network(x.float())
+        loss = criterion(outputs, y)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+#Basic evaluation
+YpredDev = []
+YtestPred = []
+
+with torch.no_grad():
+    for i in range(0, len(XdevData), batches):
+        x = XdevData[i:i + batches]
+        x = torch.tensor(x)
+        outputs = network(x.float())
+        prediction = outputs > 0.5
+        YpredDev += prediction.tolist()
+
+    for i in range(0, len(XtestData), batches):
+        x = XtestData[i:i + batches]
+        x = torch.tensor(x)
+        outputs = network(x.float())
+        prediction = outputs > 0.5
+        YtestPred += prediction.tolist()
+
+#Saving outputs
+np.asarray(YpredDev, dtype=np.int32).tofile('./dev-0/out.tsv', sep='\n')
+np.asarray(YtestPred, dtype=np.int32).tofile('./test-A/out.tsv', sep='\n')