Compare commits

..

No commits in common. "master" and "master" have entirely different histories.

7 changed files with 0 additions and 310532 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

BIN
geval

Binary file not shown.

105
main.py
View File

@@ -1,105 +0,0 @@
# noinspection PyUnresolvedReferences
import csv
import torch
import numpy as np
import pandas as pd
from nltk.util import pr
from gensim import downloader
from nltk.tokenize import word_tokenize
# Number of samples per mini-batch; read by predict() and by the training loop.
BATCH_SIZE = 5
class NeuralNetworkModel(torch.nn.Module):
    """Two-layer feed-forward binary classifier.

    The network is ``Linear -> ReLU -> Linear -> sigmoid`` and emits one
    value per input row.

    Args:
        input_dim: Length of each input feature vector. Defaults to 200,
            the value that was previously hard-coded.
        hidden_dim: Width of the hidden layer. Defaults to 500, the value
            that was previously hard-coded.
    """

    def __init__(self, input_dim=200, hidden_dim=500):
        super().__init__()
        # The attribute names `one` / `two` are kept so that state_dict keys
        # and any external references to the layers stay the same.
        self.one = torch.nn.Linear(input_dim, hidden_dim)
        self.two = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Float tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor with last dimension 1 holding sigmoid-squashed scores.
        """
        hidden = torch.relu(self.one(x))
        return torch.sigmoid(self.two(hidden))
def read_data(remove_n=200000):
    """Load the header, train, dev and test TSV files from the working directory.

    Column names come from ``in-header.tsv`` / ``out-header.tsv``; the data
    files themselves have no header row and are read with quoting disabled.

    Args:
        remove_n: Number of randomly chosen training rows to discard (from
            both inputs and expected outputs) for faster development.
            ``0`` or ``None`` keeps the full training set. Defaults to
            200000, the value that was previously hard-coded.

    Returns:
        Tuple ``(x_labels, y_labels, x_train, y_train, x_dev, x_test)``.

    Raises:
        ValueError: If ``remove_n`` exceeds the number of training rows.
    """

    def _load(path, names):
        # All data files share the same layout: headerless, tab-separated,
        # with quote characters treated as ordinary text.
        return pd.read_table(path, header=None, quoting=csv.QUOTE_NONE, names=names)

    x_labels = pd.read_csv('in-header.tsv', sep='\t').columns
    y_labels = pd.read_csv('out-header.tsv', sep='\t').columns

    x_train = _load('train/in.tsv', x_labels)
    y_train = _load('train/expected.tsv', y_labels)
    x_dev = _load('dev-0/in.tsv', x_labels)
    x_test = _load('test-A/in.tsv', x_labels)

    if remove_n:
        if remove_n > len(x_train):
            # numpy would raise ValueError here as well; fail with a message
            # that names the actual problem.
            raise ValueError(
                f"remove_n={remove_n} exceeds the number of training rows "
                f"({len(x_train)})"
            )
        # Drop the same index labels from inputs and targets so the two
        # frames stay aligned row-for-row.
        drop_indices = np.random.choice(x_train.index, remove_n, replace=False)
        x_train = x_train.drop(drop_indices)
        y_train = y_train.drop(drop_indices)

    return x_labels, y_labels, x_train, y_train, x_dev, x_test
def _tokenize_column(frame, column):
    """Lower-case the text in `column` of `frame` and tokenize every row."""
    return [word_tokenize(text) for text in frame[column].str.lower()]


def _mean_embedding(tokens, w2v):
    """Average the vectors of the tokens known to `w2v`.

    Falls back to a zero vector of the embedding's own dimensionality when
    none of the tokens is in the vocabulary.
    """
    vectors = [w2v[token] for token in tokens if token in w2v]
    if not vectors:
        return np.zeros(w2v.vector_size)
    return np.mean(vectors, axis=0)


def process_data(x_labels, y_labels, x_train, y_train, x_dev, x_test):
    """Turn the raw frames into per-document embedding vectors.

    The first input column of each split is lower-cased, tokenized, and each
    document is represented by the mean of its tokens' vectors from the
    'glove-wiki-gigaword-200' model loaded through gensim's downloader.

    Args:
        x_labels: Input column names; only the first one is used.
        y_labels: Output column names; only the first one is used.
        x_train, x_dev, x_test: Frames holding the text column.
        y_train: Frame holding the target column.

    Returns:
        Tuple ``(x_train, y_train, x_dev, x_test)`` where the ``x_*`` items
        are lists of numpy vectors and ``y_train`` is the target column.
    """
    text_column = x_labels[0]
    y_train = y_train[y_labels[0]]

    # Tokenize every split before loading the (large) embedding model so a
    # tokenization failure surfaces without paying for the load.
    tokenized = [_tokenize_column(frame, text_column) for frame in (x_train, x_dev, x_test)]

    w2v = downloader.load('glove-wiki-gigaword-200')
    x_train, x_dev, x_test = (
        [_mean_embedding(doc, w2v) for doc in docs] for docs in tokenized
    )
    return x_train, y_train, x_dev, x_test
def predict(model, x_data, out_path, batch_size=None):
    """Classify `x_data` with `model` and write 0/1 labels to `out_path`.

    Args:
        model: Callable torch module returning one score per input row;
            a row is labelled 1 when its score is greater than 0.5.
        x_data: Sequence of equally sized feature vectors.
        out_path: Destination file; one label per line, no header or index.
        batch_size: Rows per forward pass. Defaults to the module-level
            ``BATCH_SIZE`` when omitted.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    labels = []
    model.eval()
    with torch.no_grad():
        for start in range(0, len(x_data), batch_size):
            # Stack the batch into one ndarray first; building a tensor
            # straight from a list of ndarrays is a slow path in PyTorch.
            batch = np.asarray(x_data[start:start + batch_size])
            # Use the `model` argument (not a global) so the function works
            # for whichever model the caller passes in.
            scores = model(torch.tensor(batch, dtype=torch.float32))
            labels.extend((scores > 0.5).int().flatten().tolist())

    y_data = np.asarray(labels, dtype=np.int32)
    pd.DataFrame(y_data).to_csv(out_path, sep='\t', index=False, header=False)
if __name__ == "__main__":
    # Script entry point: load data, embed it, train the classifier for five
    # epochs of mini-batch SGD, then write predictions for dev and test.
    x_labels, y_labels, x_train, y_train, x_dev, x_test = read_data()
    x_train, y_train, x_dev, x_test = process_data(x_labels, y_labels, x_train, y_train, x_dev, x_test)

    nn_model = NeuralNetworkModel()
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)

    # Convert the training data to tensors once instead of rebuilding a
    # tensor from a list of ndarrays for every batch of every epoch.
    train_features = torch.tensor(np.asarray(x_train, dtype=np.float32))
    train_targets = torch.tensor(y_train.astype(np.float32).to_numpy()).reshape(-1, 1)

    for epoch in range(5):
        nn_model.train()
        for start in range(0, train_targets.shape[0], BATCH_SIZE):
            X = train_features[start:start + BATCH_SIZE]
            Y = train_targets[start:start + BATCH_SIZE]

            Y_predictions = nn_model(X)
            loss = criterion(Y_predictions, Y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    predict(nn_model, x_dev, 'dev-0/out.tsv')
    predict(nn_model, x_test, 'test-A/out.tsv')

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

289579
train/in.tsv

File diff suppressed because one or more lines are too long