Added model training

This commit is contained in:
Maciej Sobkowiak 2021-05-25 22:27:39 +02:00
parent 894a4fbebb
commit 1b3c4dd9ef

75
main.py
View File

@ -1,21 +1,49 @@
from nltk.util import pr
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import torch
from gensim import downloader from gensim import downloader
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
x_labels = (pd.read_csv('in-header.tsv', sep='\t')).columns
y_labels = (pd.read_csv('out-header.tsv', sep='\t')).columns
class NeuralNetworkModel(torch.nn.Module):
    """Feed-forward binary classifier with one hidden ReLU layer.

    The network maps a batch of fixed-size feature vectors to one sigmoid
    activation per row, which makes its output directly usable with
    ``torch.nn.BCELoss``.

    Args:
        dim: Width of every input feature vector. It has to equal the size
            of the vectors passed to ``forward``. Defaults to 100, the value
            that used to be hard-coded.
        hidden_dim: Number of units in the hidden layer. Defaults to 500,
            the value that used to be hard-coded.
    """

    def __init__(self, dim=100, hidden_dim=500):
        super().__init__()
        self.fc1 = torch.nn.Linear(dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Float tensor whose last dimension has size ``dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1, holding sigmoid activations in [0, 1].
        """
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
def read_data(remove_n=200000):
    """Load the column headers and the train, dev and test TSV files.

    All paths are relative to the current working directory. Data files are
    read without a header row, with quoting disabled (``quoting=3``) and
    with malformed lines skipped; their column names come from the two
    header files.

    To speed up development, ``remove_n`` randomly chosen rows are removed
    from the training set. The same rows are removed from ``x_train`` and
    ``y_train`` so that features and labels stay aligned.

    Args:
        remove_n: Number of training rows to discard at random.

    Returns:
        Tuple ``(x_labels, y_labels, x_train, y_train, x_dev, x_test)``:
        the two column indexes followed by four DataFrames.

    Raises:
        ValueError: If ``remove_n`` exceeds the number of training rows
            (raised by ``np.random.choice`` with ``replace=False``).
    """
    x_labels = pd.read_csv('in-header.tsv', sep='\t').columns
    y_labels = pd.read_csv('out-header.tsv', sep='\t').columns

    def _read_tsv(path, names):
        # on_bad_lines='skip' replaces error_bad_lines=False, which was
        # deprecated in pandas 1.3 and removed in pandas 2.0.
        return pd.read_table(path, on_bad_lines='skip', header=None,
                             quoting=3, names=names)

    x_train = _read_tsv('train/in.tsv', x_labels)
    y_train = _read_tsv('train/expected.tsv', y_labels)
    x_dev = _read_tsv('dev-0/in.tsv', x_labels)
    x_test = _read_tsv('test-A/in.tsv', x_labels)

    # remove some rows for faster development
    drop_indices = np.random.choice(x_train.index, remove_n, replace=False)
    x_train = x_train.drop(drop_indices)
    # Labels must lose exactly the same rows as the features; otherwise
    # row i of x_train no longer corresponds to row i of y_train.
    y_train = y_train.drop(drop_indices, errors='ignore')

    return x_labels, y_labels, x_train, y_train, x_dev, x_test
# Load every split, then reduce the train and dev frames to their first
# input column, lower-cased.
x_labels, y_labels, x_train, y_train, x_dev, x_test = read_data()
text_column = x_labels[0]
x_train = x_train[text_column].str.lower()
x_dev = x_dev[text_column].str.lower()
@ -26,16 +54,33 @@ x_train = [word_tokenize(x) for x in x_train]
def _document_vector(tokens, vectors):
    """Return the mean embedding of the tokens known to ``vectors``.

    Falls back to a zero vector of the embedding size when no token is in
    the vocabulary, so every document gets a vector of the same length.
    """
    known = [vectors[token] for token in tokens if token in vectors]
    if not known:
        return np.zeros(vectors.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)


x_dev = [word_tokenize(x) for x in x_dev]
x_test = [word_tokenize(x) for x in x_test]

# Use the 100-dimensional GloVe vectors: NeuralNetworkModel() builds an input
# layer that is 100 wide, so 200-dimensional vectors cannot be fed to it.
w2v = downloader.load('glove-wiki-gigaword-100')

# Represent every document by the average of its word vectors.
x_train = [_document_vector(doc, w2v) for doc in x_train]
x_dev = [_document_vector(doc, w2v) for doc in x_dev]
x_test = [_document_vector(doc, w2v) for doc in x_test]

nn_model = NeuralNetworkModel()
BATCH_SIZE = 5
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)

# Convert once, outside the epoch loop. x_train is a plain list of NumPy
# vectors (it has no .astype/.todense), and y_train is a pandas object, so
# both go through NumPy before becoming tensors.
train_features = torch.tensor(np.array(x_train, dtype=np.float32))
train_labels = torch.tensor(
    np.asarray(y_train, dtype=np.float32)).reshape(-1, 1)
if len(train_features) != len(train_labels):
    raise ValueError(
        'x_train has %d rows but y_train has %d; features and labels '
        'must be aligned before training'
        % (len(train_features), len(train_labels)))

for epoch in range(5):
    nn_model.train()
    for i in range(0, len(train_features), BATCH_SIZE):
        X = train_features[i:i + BATCH_SIZE]
        Y = train_labels[i:i + BATCH_SIZE]

        Y_predictions = nn_model(X)

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()

# Show the predictions for the final batch as a quick sanity check.
print(Y_predictions)