diff --git a/main.py b/main.py
index a1329ab..ae10119 100644
--- a/main.py
+++ b/main.py
@@ -1,21 +1,50 @@
 import pandas as pd
 import numpy as np
+import torch
 from gensim import downloader
 from nltk.tokenize import word_tokenize
 
-x_labels = (pd.read_csv('in-header.tsv', sep='\t')).columns
-y_labels = (pd.read_csv('out-header.tsv', sep='\t')).columns
-x_train = pd.read_table('train/in.tsv', error_bad_lines=False,
-                        header=None, quoting=3, names=x_labels)
-y_train = pd.read_table('train/expected.tsv', error_bad_lines=False,
-                        header=None, quoting=3, names=y_labels)
-x_dev = pd.read_table('dev-0/in.tsv', error_bad_lines=False,
-                      header=None, quoting=3, names=x_labels)
-x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
-                       header=None, quoting=3, names=x_labels)
+class NeuralNetworkModel(torch.nn.Module):
 
-print(x_train)
+    def __init__(self):
+        super(NeuralNetworkModel, self).__init__()
+        dim = 200  # glove-wiki-gigaword-200 vectors are 200-dimensional
+        self.fc1 = torch.nn.Linear(dim, 500)
+        self.fc2 = torch.nn.Linear(500, 1)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = torch.relu(x)
+        x = self.fc2(x)
+        x = torch.sigmoid(x)
+        return x
+
+
+def read_data():
+    x_labels = (pd.read_csv('in-header.tsv', sep='\t')).columns
+    y_labels = (pd.read_csv('out-header.tsv', sep='\t')).columns
+
+    x_train = pd.read_table('train/in.tsv', error_bad_lines=False,
+                            header=None, quoting=3, names=x_labels)
+    y_train = pd.read_table('train/expected.tsv', error_bad_lines=False,
+                            header=None, quoting=3, names=y_labels)
+    x_dev = pd.read_table('dev-0/in.tsv', error_bad_lines=False,
+                          header=None, quoting=3, names=x_labels)
+    x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
+                           header=None, quoting=3, names=x_labels)
+
+    # drop some rows for faster development; drop the same indices from
+    # the labels so features and targets stay aligned
+    remove_n = 200000
+    drop_indices = np.random.choice(x_train.index, remove_n, replace=False)
+    x_train = x_train.drop(drop_indices).reset_index(drop=True)
+    y_train = y_train.drop(drop_indices).reset_index(drop=True)
+
+    return x_labels, y_labels, x_train, y_train, x_dev, x_test
+
+
+x_labels, y_labels, x_train, y_train, x_dev, x_test = read_data()
 
 x_train = x_train[x_labels[0]].str.lower()
 x_dev = x_dev[x_labels[0]].str.lower()
@@ -26,16 +55,35 @@
 x_train = [word_tokenize(x) for x in x_train]
 x_dev = [word_tokenize(x) for x in x_dev]
 x_test = [word_tokenize(x) for x in x_test]
 
-print(x_train)
+w2v = downloader.load('glove-wiki-gigaword-200')
+# average the word vectors of each document; fall back to a 200-dim zero
+# vector when no token of a document is in the embedding vocabulary
+x_train = [np.mean([w2v[word] for word in doc if word in w2v]
+           or [np.zeros(200)], axis=0) for doc in x_train]
+x_dev = [np.mean([w2v[word] for word in doc if word in w2v]
+         or [np.zeros(200)], axis=0) for doc in x_dev]
+x_test = [np.mean([w2v[word] for word in doc if word in w2v]
+          or [np.zeros(200)], axis=0) for doc in x_test]
 
-# w2v = downloader.load('glove-wiki-gigaword-200')
+nn_model = NeuralNetworkModel()
+BATCH_SIZE = 5
+criterion = torch.nn.BCELoss()
+optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)
 
-# def document_vector(doc):
-#     return np.mean([word2vec[word] for word in doc if word in word2vec] or [np.zeros(50)], axis=0)
+for epoch in range(5):
+    nn_model.train()
+    for i in range(0, y_train.shape[0], BATCH_SIZE):
+        X = x_train[i:i+BATCH_SIZE]
+        X = torch.tensor(np.array(X, dtype=np.float32))
+        Y = y_train[i:i+BATCH_SIZE]
+        Y = torch.tensor(Y.to_numpy(dtype=np.float32)).reshape(-1, 1)
 
-# for doc in x_train:
+        Y_predictions = nn_model(X)
 
-#     x_train = [document_vector(doc) for doc in x_train]
-#     x_dev = [document_vector(doc) for doc in x_dev]
-#     x_test = [document_vector(doc) for doc in x_test]
+        optimizer.zero_grad()
+        loss = criterion(Y_predictions, Y)
+        loss.backward()
+        optimizer.step()
+
+print(Y_predictions)  # predictions for the last training batch only