Compare commits

..

No commits in common. "master" and "master" have entirely different histories.

7 changed files with 0 additions and 310546 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

BIN
geval

Binary file not shown.

119
main.py
View File

@@ -1,119 +0,0 @@
from nltk.util import pr
import pandas as pd
import numpy as np
import torch
from gensim import downloader
from nltk.tokenize import word_tokenize
import csv
import os
# Directory containing this script, with a trailing path separator so that
# plain string concatenation onto ``dir_path`` keeps working.
# The platform separator is used instead of a hard-coded backslash so the
# paths resolve on POSIX systems as well as on Windows.
dir_path = os.path.dirname(os.path.realpath(__file__)) + os.sep

# Header files: their column names are used to label the data tables.
IN_HEADER_PATH = os.path.join(dir_path, 'in-header.tsv')
OUT_HEADER_PATH = os.path.join(dir_path, 'out-header.tsv')

# Training inputs and expected outputs.
IN_TRAIN_TABLE_PATH = os.path.join(dir_path, 'train', 'in.tsv')
IN_EXPECTED_TABLE_PATH = os.path.join(dir_path, 'train', 'expected.tsv')

# Inputs of the evaluation splits.
X_DEV_PATH = os.path.join(dir_path, 'dev-0', 'in.tsv')
X_TEST_PATH = os.path.join(dir_path, 'test-A', 'in.tsv')

# Destinations for the predictions of each evaluation split.
DEV_OUT_PATH = os.path.join(dir_path, 'dev-0', 'out.tsv')
TEST_OUT_PATH = os.path.join(dir_path, 'test-A', 'out.tsv')

# Number of rows per batch for both training and prediction.
BATCH_SIZE = 5
class NeuralNetworkModel(torch.nn.Module):
    """Two-layer feed-forward network producing one sigmoid output per row.

    The forward pass is ``Linear -> ReLU -> Linear -> sigmoid``.

    Args:
        input_dim: Size of each input feature vector. Defaults to 200, the
            value that was previously hard-coded.
        hidden_dim: Width of the hidden layer. Defaults to 500, the value
            that was previously hard-coded.
    """

    def __init__(self, input_dim: int = 200, hidden_dim: int = 500):
        super().__init__()
        # Attribute names ``one``/``two`` are kept so existing state dicts
        # and external references stay valid.
        self.one = torch.nn.Linear(input_dim, hidden_dim)
        self.two = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return sigmoid activations with a last dimension of size 1.

        Args:
            x: Float tensor whose last dimension equals ``input_dim``.
        """
        hidden = torch.relu(self.one(x))
        return torch.sigmoid(self.two(hidden))
def read():
    """Load column labels and the train/dev/test tables from disk.

    Column labels are taken from the header files. The data tables are read
    without a header row and with quoting disabled. 200000 randomly chosen
    rows are then removed from the training inputs and, using the same
    indices, from the expected outputs.

    Returns:
        Tuple ``(x_labels, y_labels, x_train, y_train, x_dev, x_test)``.
    """

    def load_table(path, column_names):
        # Data files have no header row; labels come from the header files.
        return pd.read_table(path, header=None, quoting=csv.QUOTE_NONE, names=column_names)

    x_labels = pd.read_csv(IN_HEADER_PATH, sep='\t').columns
    y_labels = pd.read_csv(OUT_HEADER_PATH, sep='\t').columns

    x_train = load_table(IN_TRAIN_TABLE_PATH, x_labels)
    y_train = load_table(IN_EXPECTED_TABLE_PATH, y_labels)
    x_dev = load_table(X_DEV_PATH, x_labels)
    x_test = load_table(X_TEST_PATH, x_labels)

    # Drop the same randomly selected rows from inputs and targets so the
    # two frames stay aligned.
    discarded = np.random.choice(x_train.index, 200000, replace=False)
    return (
        x_labels,
        y_labels,
        x_train.drop(discarded),
        y_train.drop(discarded),
        x_dev,
        x_test,
    )
def process(x_labels, y_labels, x_train, y_train, x_dev, x_test):
    """Turn the raw text tables into averaged word-vector features.

    For each split, the first input column is lower-cased and tokenized.
    Each document is then represented by the mean of the
    'glove-wiki-gigaword-200' vectors of its tokens found in the embedding
    vocabulary. A document with no such token becomes a 200-element zero
    vector. The target is the first column of ``y_train``.

    Returns:
        Tuple ``(x_train, y_train, x_dev, x_test)`` where the ``x_*`` values
        are lists of NumPy vectors and ``y_train`` is a pandas Series.
    """
    text_column = x_labels[0]
    lowered = [frame[text_column].str.lower() for frame in (x_train, x_dev, x_test)]

    y_train = y_train[y_labels[0]]

    tokenized = [[word_tokenize(doc) for doc in column] for column in lowered]

    word2vec = downloader.load('glove-wiki-gigaword-200')

    def embed(tokens):
        # Average the vectors of in-vocabulary tokens; use zeros if none.
        known = [word2vec[tok] for tok in tokens if tok in word2vec]
        if not known:
            known = [np.zeros(200)]
        return np.mean(known, axis=0)

    x_train, x_dev, x_test = [[embed(doc) for doc in docs] for docs in tokenized]
    return x_train, y_train, x_dev, x_test
def predict(model, x_data, out_path, batch_size=None):
    """Run ``model`` over ``x_data`` in batches and write 0/1 labels to a file.

    Args:
        model: Module that returns one value per input row; outputs above
            0.5 are written as 1, all others as 0.
        x_data: Sequence of equal-length feature vectors.
        out_path: Destination file; one label per line, no header or index.
        batch_size: Rows per forward pass. Defaults to the module-level
            ``BATCH_SIZE`` when omitted.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    labels = []
    model.eval()
    with torch.no_grad():
        for start in range(0, len(x_data), batch_size):
            # Stack the batch into one float32 array first: building a
            # tensor directly from a list of ndarrays is very slow.
            batch = np.asarray(x_data[start:start + batch_size], dtype=np.float32)
            probabilities = model(torch.from_numpy(batch))
            # Collect plain Python booleans rather than tensors so the final
            # array conversion does not depend on implicit tensor handling.
            labels.extend((probabilities > 0.5).flatten().tolist())

    y_data = np.asarray(labels, dtype=np.int32)
    pd.DataFrame(y_data).to_csv(out_path, sep='\t', index=False, header=False)
def main():
    """Train the classifier and write predictions for the dev and test sets.

    Loads and vectorizes the data with ``read`` and ``process``, then trains
    ``NeuralNetworkModel`` for 5 epochs using SGD (learning rate 0.1) and
    binary cross-entropy on consecutive batches of ``BATCH_SIZE`` rows.
    Finally writes predictions to ``DEV_OUT_PATH`` and ``TEST_OUT_PATH``.
    """
    x_labels, y_labels, x_train, y_train, x_dev, x_test = read()
    x_train, y_train, x_dev, x_test = process(x_labels, y_labels, x_train, y_train, x_dev, x_test)

    # Convert the whole training set to tensors once. Building a tensor from
    # a list of ndarrays on every batch is very slow, and the conversion
    # does not depend on the loop.
    features = torch.tensor(np.asarray(x_train, dtype=np.float32))
    targets = torch.tensor(y_train.astype(np.float32).to_numpy()).reshape(-1, 1)

    nn_model = NeuralNetworkModel()
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)

    for _ in range(5):
        nn_model.train()
        for start in range(0, targets.shape[0], BATCH_SIZE):
            batch_x = features[start:start + BATCH_SIZE]
            batch_y = targets[start:start + BATCH_SIZE]
            loss = criterion(nn_model(batch_x), batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    predict(nn_model, x_dev, DEV_OUT_PATH)
    predict(nn_model, x_test, TEST_OUT_PATH)
# Run the training/prediction pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

289579
train/in.tsv

File diff suppressed because one or more lines are too long