Prześlij pliki do ''

This commit is contained in:
Joanna Kurczalska 2022-06-21 22:56:12 +02:00
parent 7196c1a211
commit 46fee9605e

168
run.py
View File

@@ -1,91 +1,105 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
import pickle
import numpy as np
import pandas as pd
import torch
from word2vec import Word2Vec
import csv
import lzma
import gensim.downloader
from nltk import word_tokenize
#print('wczytanie danych')
x_train = pd.read_table('train/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
y_train = pd.read_table('train/expected.tsv', sep='\t', header=None, quoting=3)
x_dev = pd.read_table('dev-0/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
x_test = pd.read_table('test-A/in.tsv.xz', compression='xz', sep='\t', header=None, quoting=3)
#print('inicjalizacja modelu')
class NeuralNetworkModel(torch.nn.Module):
    """Binary classifier: 300-dim embedding -> 300 hidden (ReLU) -> 1 sigmoid score."""

    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.l01 = torch.nn.Linear(300, 300)
        self.l02 = torch.nn.Linear(300, 1)

    def forward(self, x):
        # Hidden layer with ReLU, then a single sigmoid-activated output unit.
        hidden = torch.relu(self.l01(x))
        return torch.sigmoid(self.l02(hidden))
# NOTE(review): this span was a corrupted two-column diff (old script fused with
# the new class line-by-line). Reconstructed below is the post-commit version:
# the FFN classifier. Two defects fixed during reconstruction: deprecated
# F.sigmoid -> torch.sigmoid, and the no-op `self = pickle.load(file)` in load().
class FFN(nn.Module):
    """Feed-forward binary classifier over Word2Vec sentence embeddings.

    The model pickles itself to `self.path` after training. Its `train`/`test`
    methods deliberately shadow nn.Module.train/test-style names — kept for
    backward compatibility with existing callers.
    """

    def __init__(self, input_dim, output_dim, hidden1_size, hidden2_size, lr, epochs, batch_size):
        super(FFN, self).__init__()
        self.path = 'model1.pickle'
        self.lr = lr
        self.epochs = epochs
        self.output_dim = output_dim
        # Project-local embedding helper; loads pre-trained vectors from disk.
        self.word2vec = Word2Vec()
        self.word2vec.load()
        self.batch_size = batch_size
        self.input_dim = input_dim
        # NOTE(review): fc1's input width is batch_size, i.e. the net consumes a
        # whole batch as one vector — unusual; confirm this is intended.
        self.fc1 = nn.Linear(batch_size, hidden1_size)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.fc3 = nn.Linear(hidden2_size, hidden2_size)
        self.fc4 = nn.Linear(hidden2_size, hidden2_size)
        self.fc5 = nn.Linear(hidden2_size, batch_size)

    def forward(self, data):
        """Four ReLU layers followed by a sigmoid output in [0, 1]."""
        data = F.relu(self.fc1(data))
        data = F.relu(self.fc2(data))
        data = F.relu(self.fc3(data))
        data = F.relu(self.fc4(data))
        # F.sigmoid is deprecated; torch.sigmoid is the supported equivalent.
        data = torch.sigmoid(self.fc5(data))
        return data

    def serialize(self):
        """Pickle the whole model to self.path."""
        with open(self.path, 'wb') as file:
            pickle.dump(self, file)

    def load(self):
        """Restore a previously pickled model's state into this instance.

        Bug fix: the original did `self = pickle.load(file)`, which only rebinds
        the local name and leaves the instance untouched. Copy the loaded
        object's attributes into this instance instead.
        """
        with open(self.path, 'rb') as file:
            loaded = pickle.load(file)
        self.__dict__.update(loaded.__dict__)

    def batch(self, iterable, n=1):
        """Yield successive chunks of `iterable` of size n (last may be shorter)."""
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx:min(ndx + n, l)]

    def train(self, data, expected):
        """Train with BCE loss and Adam, then serialize the model.

        data: sequence of sentences (embedded via self.word2vec);
        expected: pandas Series of 0/1 labels aligned with `data`.
        """
        self.zero_grad()
        criterion = torch.nn.BCELoss()
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        batch_size = self.batch_size
        for epoch in range(self.epochs):
            epoch_loss = 0.0
            idx = 0
            # Only full batches: fc1's input width equals batch_size.
            for i in range(0, int(len(data) / batch_size) * batch_size, batch_size):
                inputs = data[i:i + batch_size]
                labels = expected[i:i + batch_size]
                optimizer.zero_grad()
                outputs = self.forward(torch.tensor(self.word2vec.list_of_sentences2vec(inputs)))
                target = torch.tensor(labels.values).double()
                loss = criterion(outputs.view(batch_size), target.view(-1, ))
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                # Running-average loss report every 1000 batches.
                if idx % 1000 == 0:
                    print('epoch: {}, idx: {}, loss: {}'.format(epoch, idx, epoch_loss / 1000))
                    epoch_loss = 0
                idx += 1
        self.serialize()

    def test(self, data, expected, path):
        """Evaluate accuracy, print correct/incorrect counts, write predictions to `path` (TSV)."""
        correct = 0
        incorrect = 0
        predictions = []
        batch_size = self.batch_size
        for i in range(0, int(len(data) / batch_size) * batch_size, batch_size):
            inputs = data[i:i + batch_size]
            labels = expected[i:i + batch_size]
            predicted = self.forward(torch.tensor(self.word2vec.list_of_sentences2vec(inputs)))
            # Threshold sigmoid scores at 0.5 to get hard 0/1 labels.
            score = [1 if x > 0.5 else 0 for x in predicted]
            for x, y in zip(score, labels):
                if x == y:
                    correct += 1
                else:
                    incorrect += 1
            predictions.append(score)
        print(correct)
        print(incorrect)
        print(correct / (incorrect + correct))
        df = pd.DataFrame(np.asarray(predictions).reshape(int(len(data) / batch_size) * batch_size))
        df.reset_index(drop=True, inplace=True)
        df.to_csv(path, sep="\t", index=False)