Upload files to ''

This commit is contained in:
Joanna Kurczalska 2022-06-21 22:56:12 +02:00
parent 7196c1a211
commit 46fee9605e

run.py (196 changed lines)

@@ -1,91 +1,105 @@
-import numpy as np
-import pandas as pd
-import torch
-import csv
-import lzma
-import gensim.downloader
-from nltk import word_tokenize
-
-#print('wczytanie danych')
-x_train = pd.read_table('train/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
-y_train = pd.read_table('train/expected.tsv', sep='\t', header=None, quoting=3)
-x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
-x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
-
-#print('inicjalizacja modelu')
-class NeuralNetworkModel(torch.nn.Module):
-    def __init__(self):
-        super(NeuralNetworkModel, self).__init__()
-        self.l01 = torch.nn.Linear(300, 300)
-        self.l02 = torch.nn.Linear(300, 1)
-
-    def forward(self, x):
-        x = self.l01(x)
-        x = torch.relu(x)
-        x = self.l02(x)
-        x = torch.sigmoid(x)
-        return x
-
-#print('przygotowanie danych')
-x_train = x_train[0].str.lower()
-y_train = y_train[0]
-x_dev = x_dev[0].str.lower()
-x_test = x_test[0].str.lower()
-x_train = [word_tokenize(x) for x in x_train]
-x_dev = [word_tokenize(x) for x in x_dev]
-x_test = [word_tokenize(x) for x in x_test]
-word2vec = gensim.downloader.load('word2vec-google-news-300')
-x_train = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_train]
-x_dev = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_dev]
-x_test = [np.mean([word2vec[word] for word in content if word in word2vec] or [np.zeros(300)], axis=0) for content in x_test]
-
-#print('trenowanie modelu')
-model = NeuralNetworkModel()
-BATCH_SIZE = 5
-criterion = torch.nn.BCELoss()
-optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-
-for epoch in range(BATCH_SIZE):
-    model.train()
-    for i in range(0, y_train.shape[0], BATCH_SIZE):
-        X = x_train[i:i + BATCH_SIZE]
-        X = torch.tensor(X)
-        y = y_train[i:i + BATCH_SIZE]
-        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
-        optimizer.zero_grad()
-        outputs = model(X.float())
-        loss = criterion(outputs, y)
-        loss.backward()
-        optimizer.step()
-
-#print('predykcja wynikow')
-y_dev = []
-y_test = []
-model.eval()
-
-with torch.no_grad():
-    for i in range(0, len(x_dev), BATCH_SIZE):
-        X = x_dev[i:i + BATCH_SIZE]
-        X = torch.tensor(X)
-        outputs = model(X.float())
-        prediction = (outputs > 0.5)
-        y_dev += prediction.tolist()
-
-    for i in range(0, len(x_test), BATCH_SIZE):
-        X = x_test[i:i + BATCH_SIZE]
-        X = torch.tensor(X)
-        outputs = model(X.float())
-        y = (outputs >= 0.5)
-        y_test += prediction.tolist()
-
-# print('eksportowanie do plików')
-y_dev = np.asarray(y_dev, dtype=np.int32)
-y_test = np.asarray(y_test, dtype=np.int32)
-y_dev.tofile('./dev-0/out.tsv', sep='\n')
-y_test.tofile('./test-A/out.tsv', sep='\n')
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import transforms
+import pickle
+import numpy as np
+import pandas as pd
+from word2vec import Word2Vec
+
+
+class FFN(nn.Module):
+
+    def __init__(self, input_dim, output_dim, hidden1_size, hidden2_size, lr, epochs, batch_size):
+        super(FFN, self).__init__()
+        self.path = 'model1.pickle'
+        self.lr = lr
+        self.epochs = epochs
+        self.output_dim = output_dim
+        self.word2vec = Word2Vec()
+        self.word2vec.load()
+        self.batch_size = batch_size
+        self.input_dim = input_dim
+        self.fc1 = nn.Linear(batch_size, hidden1_size)
+        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
+        self.fc3 = nn.Linear(hidden2_size, hidden2_size)
+        self.fc4 = nn.Linear(hidden2_size, hidden2_size)
+        self.fc5 = nn.Linear(hidden2_size, batch_size)
+
+    def forward(self, data):
+        data = F.relu(self.fc1(data))
+        data = F.relu(self.fc2(data))
+        data = F.relu(self.fc3(data))
+        data = F.relu(self.fc4(data))
+        data = F.sigmoid(self.fc5(data))
+        return data
+
+    def serialize(self):
+        with open(self.path, 'wb') as file:
+            pickle.dump(self, file)
+
+    def load(self):
+        with open(self.path, 'rb') as file:
+            self = pickle.load(file)
+
+    def batch(self, iterable, n=1):
+        l = len(iterable)
+        for ndx in range(0, l, n):
+            yield iterable[ndx:min(ndx + n, l)]
+
+    """
+    data is a tuple of embedding vector and a label of 0/1
+    """
+
+    def train(self, data, expected):
+        self.zero_grad()
+        criterion = torch.nn.BCELoss()
+        optimizer = optim.Adam(self.parameters(), lr=self.lr)
+        batch_size = self.batch_size
+        num_of_classes = self.output_dim
+        for epoch in range(self.epochs):
+            epoch_loss = 0.0
+            idx = 0
+            for i in range(0, int(len(data) / batch_size) * batch_size, batch_size):
+                inputs = data[i:i + batch_size]
+                labels = expected[i:i + batch_size]
+
+                optimizer.zero_grad()
+
+                outputs = self.forward(torch.tensor(self.word2vec.list_of_sentences2vec(inputs)))
+                target = torch.tensor(labels.values).double()
+                loss = criterion(outputs.view(batch_size), target.view(-1, ))
+                loss.backward()
+                optimizer.step()
+
+                epoch_loss += loss.item()
+                if (idx % 1000 == 0):
+                    print('epoch: {}, idx: {}, loss: {}'.format(epoch, idx, epoch_loss / 1000))
+                    epoch_loss = 0
+                idx += 1
+            self.serialize()
+
+    def test(self, data, expected, path):
+        correct = 0
+        incorrect = 0
+        total = 0
+        predictions = []
+        batch_size = self.batch_size
+        for i in range(0, int(len(data) / batch_size) * batch_size, batch_size):
+            inputs = data[i:i + batch_size]
+            labels = expected[i:i + batch_size]
+            predicted = self.forward(torch.tensor(self.word2vec.list_of_sentences2vec(inputs)))
+            score = [1 if x > 0.5 else 0 for x in predicted]
+            for x, y in zip(score, labels):
+                if (x == y):
+                    correct += 1
+                else:
+                    incorrect += 1
+            predictions.append(score)
+        print(correct)
+        print(incorrect)
+        print(correct / (incorrect + correct))
+        df = pd.DataFrame(np.asarray(predictions).reshape(int(len(data) / batch_size) * batch_size))
+        df.reset_index(drop=True, inplace=True)
+        df.to_csv(path, sep="\t", index=False)
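
A few review notes on the new FFN. As committed, fc1 takes batch_size input features and fc5 emits batch_size outputs, so the whole batch is flattened into one feature vector, and F.sigmoid is deprecated in newer PyTorch in favor of torch.sigmoid. A minimal sketch of the more conventional layout follows, under the unverified assumption (the helper lives in the external word2vec module) that list_of_sentences2vec yields one input_dim-dimensional embedding per sentence; FFNHead and its default sizes are hypothetical:

import torch
import torch.nn as nn
import torch.nn.functional as F

class FFNHead(nn.Module):
    # Hypothetical variant: layers sized by the embedding dimension,
    # one probability per example, so batch_size stays flexible.
    def __init__(self, input_dim=300, hidden1_size=600, hidden2_size=300):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden1_size)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.fc3 = nn.Linear(hidden2_size, 1)

    def forward(self, data):  # data: (batch_size, input_dim)
        data = F.relu(self.fc1(data))
        data = F.relu(self.fc2(data))
        return torch.sigmoid(self.fc3(data)).squeeze(1)  # (batch_size,)

probs = FFNHead()(torch.randn(5, 300))  # five 300-dim sentence embeddings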
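
FFN.load as committed assigns pickle.load(file) to self, which only rebinds the local name; the instance it was called on is left unchanged. A sketch of a loader that returns the unpickled model instead, assuming the same model1.pickle path:

import pickle

def load_model(path='model1.pickle'):
    # 'self = pickle.load(file)' inside a method rebinds a local variable;
    # returning the object and assigning at the call site actually works.
    with open(path, 'rb') as file:
        return pickle.load(file)

model = load_model()  # instead of: model = FFN(...); model.load()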
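
Naming the training loop train also shadows nn.Module.train(mode=True), which PyTorch calls internally; model.eval(), for instance, is implemented as self.train(False) and would now hit the custom method with the wrong arguments. A tiny demonstration of the collision (Demo is a hypothetical minimal module):

import torch.nn as nn

class Demo(nn.Module):
    # Overriding train() replaces nn.Module.train(mode=True).
    def train(self, data, expected):
        return data

demo = Demo()
# demo.eval() now raises TypeError: nn.Module.eval() calls self.train(False),
# and the override demands a second positional argument 'expected'.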
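
One bug in the removed script is also worth recording: the test-A loop thresholds its outputs into y but then appends the stale prediction left over from the last dev-0 batch, so test-A/out.tsv received that dev batch repeated. Reusing the script's own names, the loop presumably should have read:

with torch.no_grad():
    for i in range(0, len(x_test), BATCH_SIZE):
        X = torch.tensor(x_test[i:i + BATCH_SIZE])
        outputs = model(X.float())
        prediction = (outputs >= 0.5)   # threshold this batch's outputs...
        y_test += prediction.tolist()   # ...and append them, not a stale dev-0 batch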