# forked from kubapok/en-ner-conll-2003

import pandas as pd
import numpy as np
import csv
import torch
from tqdm import tqdm
import gensim.downloader
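
# A minimal two-layer network for per-token classification: one hidden
# layer with ReLU, then log-softmax so the outputs pair with NLLLoss below.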
class NeuralNetwork(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.l1(x)
        x = torch.relu(x)
        x = self.l2(x)
        x = torch.log_softmax(x, dim=1)
        return x
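
# Pretrained Google News vectors, fetched through gensim's downloader on
# first run and cached locally afterwards.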
print("Loading word embeddings...")
word2vec = gensim.downloader.load('word2vec-google-news-300')
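
# BIO tag inventory for CoNLL-2003; INPUT_SIZE is the embedding width plus
# the four binary surface features produced by vectorize() below.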
WORD_FEATURES_LEN = word2vec.vector_size
NUM_EXTRA_FEATURES = 4
INPUT_SIZE = WORD_FEATURES_LEN + NUM_EXTRA_FEATURES
LABEL = {'O': 0, 'B-LOC': 1, 'I-LOC': 2, 'B-MISC': 3, 'I-MISC': 4, 'B-ORG': 5, 'I-ORG': 6, 'B-PER': 7, 'I-PER': 8}
INDEX_TO_LABEL = {index: label for label, index in LABEL.items()}
NUM_LABELS = len(LABEL)
PUNCTUATION = {',', '<', '/', '>', '%', '$', '#', '@', '^', '*', '(', ')', '[', ']', '{', '}', ':'}
OUT_OF_VOCABULARY = np.ones(WORD_FEATURES_LEN)

X_train = []
y_train = []
X_dev = []
X_test = []
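
# Inverse lookup: class index back to its BIO tag.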
def map_number_to_label(number):
    return INDEX_TO_LABEL[number]
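
# Builds a column vector for one token: its word2vec embedding (an all-ones
# vector for out-of-vocabulary words) followed by four binary surface
# features: capitalized, leading digit, single character, leading punctuation.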
def vectorize(word):
    extra_features = [word[0].isupper(), word[0].isdigit(), len(word) == 1, word[0] in PUNCTUATION]
    word = word.lower()
    if word in word2vec:
        vec = word2vec[word]
    else:
        vec = OUT_OF_VOCABULARY
    vec = vec.reshape(-1, 1)
    extra_features = np.array(extra_features).reshape(-1, 1)
    return np.concatenate((vec, extra_features), axis=0)
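
# Turns a tensor of class indices into the space-separated tag string
# written to out.tsv.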
def prediction_to_string(prediction):
    output = prediction.tolist()
    output = [map_number_to_label(x) for x in output]
    return ' '.join(output)
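
# Each TSV row holds one sentence: in train, column 0 is the tag sequence
# and column 1 the tokens; dev and test rows hold only the tokens.
# on_bad_lines='skip' replaces the error_bad_lines flag removed in pandas 2.0.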
train_set = pd.read_table('train/train.tsv.xz', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
dev_set = pd.read_table('dev-0/in.tsv', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
test_set = pd.read_table('test-A/in.tsv', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)

for index, row in tqdm(train_set.iterrows(), desc="Loading train data", total=train_set.shape[0]):
    labels, words = row[0], row[1]
    words, labels = words.split(), labels.split()
    for word in words:
        X_train.append(vectorize(word))
    for label in labels:
        y_train.append(LABEL[label])
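
# Dev and test features stay grouped per sentence so predictions can be
# written one line per sentence.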
for index, row in tqdm(dev_set.iterrows(), desc="Loading dev data", total=dev_set.shape[0]):
    words = row[0].split()
    words = [vectorize(word) for word in words]
    X_dev.append(words)

for index, row in tqdm(test_set.iterrows(), desc="Loading test data", total=test_set.shape[0]):
    words = row[0].split()
    words = [vectorize(word) for word in words]
    X_test.append(words)
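
# 600 hidden units; the input width is the 300 embedding dimensions plus
# the 4 surface features.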
model = NeuralNetwork(INPUT_SIZE, 600, NUM_LABELS)

criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters())

batch_size = 64
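
# A single pass over the training data, in minibatches; each token is
# classified independently of its neighbours.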
print("Training model...")
for epoch in range(1):
    model.train()
    for i in range(0, len(y_train), batch_size):
        X = X_train[i:i + batch_size]
        X = np.array(X).reshape(len(X), INPUT_SIZE)
        X = torch.tensor(X)
        y = y_train[i:i + batch_size]
        y = np.array(y)
        y = torch.tensor(y)

        outputs = model(X.float())
        loss = criterion(outputs, y.long())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
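
# Sentences vary in length, so each one is scored as its own batch.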
print("Making predictions...")
dev_prediction = []
test_prediction = []

model.eval()
with torch.no_grad():
    for i in range(0, len(X_dev)):
        X = X_dev[i]
        X = np.array(X).reshape(len(X), INPUT_SIZE)
        X = torch.tensor(X)

        output = model(X.float())
        prediction = torch.argmax(output, dim=1)
        dev_prediction.append(prediction_to_string(prediction))

    for i in range(0, len(X_test)):
        X = X_test[i]
        X = np.array(X).reshape(len(X), INPUT_SIZE)
        X = torch.tensor(X)

        output = model(X.float())
        prediction = torch.argmax(output, dim=1)
        test_prediction.append(prediction_to_string(prediction))
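
# Write one space-separated tag sequence per line, aligned with the inputs.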
dev_prediction = np.asarray(dev_prediction)
test_prediction = np.asarray(test_prediction)

dev_prediction.tofile('./dev-0/out.tsv', sep='\n', format='%s')
test_prediction.tofile('./test-A/out.tsv', sep='\n', format='%s')