From 46fee9605e02f4e31246742326652ced97546645 Mon Sep 17 00:00:00 2001
From: Joanna Kurczalska
Date: Tue, 21 Jun 2022 22:56:12 +0200
Subject: [PATCH] Upload files to ''
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 run.py | 196 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 105 insertions(+), 91 deletions(-)

diff --git a/run.py b/run.py
index c3acf31..dcb005c 100644
--- a/run.py
+++ b/run.py
@@ -1,91 +1,105 @@
-import numpy as np
-import pandas as pd
-import torch
-import csv
-import lzma
-import gensim.downloader
-from nltk import word_tokenize
-
-#print('loading data')
-
-x_train = pd.read_table('train/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
-y_train = pd.read_table('train/expected.tsv', sep='\t', header=None, quoting=3)
-x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
-x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
-
-#print('model initialization')
-class NeuralNetworkModel(torch.nn.Module):
-    def __init__(self):
-        super(NeuralNetworkModel, self).__init__()
-        self.l01 = torch.nn.Linear(300, 300)
-        self.l02 = torch.nn.Linear(300, 1)
-
-    def forward(self, x):
-        x = self.l01(x)
-        x = torch.relu(x)
-        x = self.l02(x)
-        x = torch.sigmoid(x)
-        return x
-
-
-#print('data preparation')
-
-x_train = x_train[0].str.lower()
-y_train = y_train[0]
-x_dev = x_dev[0].str.lower()
-x_test = x_test[0].str.lower()
-
-x_train = [word_tokenize(x) for x in x_train]
-x_dev = [word_tokenize(x) for x in x_dev]
-x_test = [word_tokenize(x) for x in x_test]
-
-word2vec = gensim.downloader.load('word2vec-google-news-300')
-x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]
-x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]
-x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]
-
-
-#print('model training')
-model = NeuralNetworkModel()
-BATCH_SIZE = 5
-criterion = torch.nn.BCELoss()
-optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-
-for epoch in range(BATCH_SIZE):
-    model.train()
-    for i in range(0, y_train.shape[0], BATCH_SIZE):
-        X = x_train[i:i + BATCH_SIZE]
-        X = torch.tensor(X)
-        y = y_train[i:i + BATCH_SIZE]
-        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
-        optimizer.zero_grad()
-        outputs = model(X.float())
-        loss = criterion(outputs, y)
-        loss.backward()
-        optimizer.step()
-
-#print('predicting results')
-y_dev = []
-y_test = []
-model.eval()
-
-with torch.no_grad():
-    for i in range(0, len(x_dev), BATCH_SIZE):
-        X = x_dev[i:i + BATCH_SIZE]
-        X = torch.tensor(X)
-        outputs = model(X.float())
-        prediction = (outputs > 0.5)
-        y_dev += prediction.tolist()
-
-    for i in range(0, len(x_test), BATCH_SIZE):
-        X = x_test[i:i + BATCH_SIZE]
-        X = torch.tensor(X)
-        outputs = model(X.float())
-        y = (outputs >= 0.5)
-        y_test += prediction.tolist()
-
-# print('exporting to files')
-y_dev = np.asarray(y_dev, dtype=np.int32)
-y_test = np.asarray(y_test, dtype=np.int32)
-y_dev.tofile('./dev-0/out.tsv', sep='\n')
-y_test.tofile('./test-A/out.tsv', sep='\n')
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import pickle
+import numpy as np
+import pandas as pd
+from word2vec import Word2Vec
+
+
+class FFN(nn.Module):
+
+    def __init__(self, input_dim, output_dim, hidden1_size, hidden2_size, lr, epochs, batch_size):
+        super(FFN, self).__init__()
+        self.path = 'model1.pickle'
+        self.lr = lr
+        self.epochs = epochs
+        self.output_dim = output_dim
+        self.word2vec = Word2Vec()
+        self.word2vec.load()
+        self.batch_size = batch_size
+        self.input_dim = input_dim
+        # Layer sizes are per sample: the first layer consumes one embedding
+        # vector (input_dim), the last emits output_dim scores (1 for binary
+        # classification). The batch is the tensor's leading axis, so
+        # batch_size must not appear in the layer shapes.
+        self.fc1 = nn.Linear(input_dim, hidden1_size)
+        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
+        self.fc3 = nn.Linear(hidden2_size, hidden2_size)
+        self.fc4 = nn.Linear(hidden2_size, hidden2_size)
+        self.fc5 = nn.Linear(hidden2_size, output_dim)
+
+    def forward(self, data):
+        data = F.relu(self.fc1(data))
+        data = F.relu(self.fc2(data))
+        data = F.relu(self.fc3(data))
+        data = F.relu(self.fc4(data))
+        data = torch.sigmoid(self.fc5(data))  # F.sigmoid is deprecated
+        return data
+
+    def serialize(self):
+        with open(self.path, 'wb') as file:
+            pickle.dump(self, file)
+
+    def load(self):
+        # Rebinding `self` inside a method has no effect on the caller's
+        # object, so copy the unpickled model's state into this instance.
+        with open(self.path, 'rb') as file:
+            self.__dict__.update(pickle.load(file).__dict__)
+
+    def batch(self, iterable, n=1):
+        length = len(iterable)
+        for ndx in range(0, length, n):
+            yield iterable[ndx:min(ndx + n, length)]
+
+    def fit(self, data, expected):
+        """data is an iterable of sentences; expected holds their 0/1 labels.
+
+        Named fit rather than train so it does not shadow nn.Module.train(),
+        which PyTorch itself calls internally (e.g. from eval()).
+        """
+        criterion = nn.BCELoss()
+        optimizer = optim.Adam(self.parameters(), lr=self.lr)
+        batch_size = self.batch_size
+        for epoch in range(self.epochs):
+            epoch_loss = 0.0
+            idx = 0
+            # Iterate over full batches only, dropping the trailing remainder.
+            for i in range(0, int(len(data) / batch_size) * batch_size, batch_size):
+                inputs = data[i:i + batch_size]
+                labels = expected[i:i + batch_size]
+                optimizer.zero_grad()
+                outputs = self.forward(torch.tensor(
+                    self.word2vec.list_of_sentences2vec(inputs), dtype=torch.float32))
+                target = torch.tensor(labels.values, dtype=torch.float32)
+                loss = criterion(outputs.view(-1), target.view(-1))
+                loss.backward()
+                optimizer.step()
+
+                epoch_loss += loss.item()
+                if idx % 1000 == 0:
+                    print('epoch: {}, idx: {}, loss: {}'.format(epoch, idx, epoch_loss / 1000))
+                    epoch_loss = 0
+                idx += 1
+        self.serialize()
+
+    def test(self, data, expected, path):
+        correct = 0
+        incorrect = 0
+        predictions = []
+        batch_size = self.batch_size
+        for i in range(0, int(len(data) / batch_size) * batch_size, batch_size):
+            inputs = data[i:i + batch_size]
+            labels = expected[i:i + batch_size]
+            with torch.no_grad():
+                predicted = self.forward(torch.tensor(
+                    self.word2vec.list_of_sentences2vec(inputs), dtype=torch.float32))
+            score = [1 if x > 0.5 else 0 for x in predicted]
+
+            for x, y in zip(score, labels):
+                if x == y:
+                    correct += 1
+                else:
+                    incorrect += 1
+            # One list of batch_size predictions per batch, so the final
+            # reshape below yields one row per scored example.
+            predictions.append(score)
+
+        print(correct)
+        print(incorrect)
+        print(correct / (incorrect + correct))
+        df = pd.DataFrame(np.asarray(predictions).reshape(int(len(data) / batch_size) * batch_size))
+        df.reset_index(drop=True, inplace=True)
+        df.to_csv(path, sep="\t", index=False)
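
A minimal usage sketch for review purposes, not part of the patch. It assumes
the TSV layout from the old run.py, that the local word2vec.Word2Vec class
really exposes the load() and list_of_sentences2vec() methods used above and
produces 300-dimensional sentence embeddings, and that dev-0/expected.tsv
exists; all hyperparameter values are illustrative:

    import pandas as pd
    from run import FFN

    x_train = pd.read_table('train/in.tsv.xz', compression='xz', sep='\t',
                            header=None, quoting=3)[0].str.lower()
    y_train = pd.read_table('train/expected.tsv', sep='\t',
                            header=None, quoting=3)[0]

    # Hypothetical hyperparameters; output_dim=1 matches the BCE setup.
    model = FFN(input_dim=300, output_dim=1, hidden1_size=600,
                hidden2_size=300, lr=0.001, epochs=5, batch_size=5)
    model.fit(x_train, y_train)

    x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\t',
                          header=None, quoting=3)[0].str.lower()
    y_dev = pd.read_table('dev-0/expected.tsv', sep='\t',
                          header=None, quoting=3)[0]
    model.test(x_dev, y_dev, './dev-0/out.tsv')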