import csv

import gensim.downloader
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm


class NeuralNetwork(torch.nn.Module):
    """Two-layer feed-forward classifier over per-word feature vectors."""

    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.l1 = torch.nn.Linear(input_size, hidden_size)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.l1(x)
        x = torch.relu(x)
        x = self.l2(x)
        x = torch.log_softmax(x, dim=1)
        return x


print("Loading word embeddings...")
word2vec = gensim.downloader.load('word2vec-google-news-300')

WORD_FEATURES_LEN = word2vec.vector_size
LABEL = {'O': 0, 'B-LOC': 1, 'I-LOC': 2, 'B-MISC': 3, 'I-MISC': 4,
         'B-ORG': 5, 'I-ORG': 6, 'B-PER': 7, 'I-PER': 8}
NUM_LABELS = len(LABEL)
# Four hand-crafted orthographic features are appended to each embedding.
NUM_EXTRA_FEATURES = 4
INPUT_SIZE = WORD_FEATURES_LEN + NUM_EXTRA_FEATURES
PUNCTUATION = {',', '<', '/', '>', '%', '$', '#', '@', '^', '*',
               '(', ')', '[', ']', '{', '}', ':'}
# Fixed all-ones vector used in place of an embedding for unknown words.
OUT_OF_VOCABULARY = np.ones(WORD_FEATURES_LEN)

X_train = []
y_train = []
X_dev = []
X_test = []


def map_number_to_label(number):
    """Invert the LABEL mapping: class index -> tag string."""
    return list(LABEL.keys())[list(LABEL.values()).index(number)]


def vectorize(word):
    """Build an (INPUT_SIZE, 1) feature vector: embedding + extra features."""
    extra_features = [word[0].isupper(),       # starts with a capital letter
                      word[0].isdigit(),       # starts with a digit
                      len(word) == 1,          # single character
                      word[0] in PUNCTUATION]  # starts with punctuation
    word = word.lower()
    if word in word2vec:
        vec = word2vec[word]
    else:
        vec = OUT_OF_VOCABULARY
    vec = vec.reshape(-1, 1)
    extra_features = np.array(extra_features).reshape(-1, 1)
    return np.concatenate((vec, extra_features), axis=0)


def prediction_to_string(prediction):
    """Convert a tensor of class indices into a space-separated tag string."""
    output = prediction.tolist()
    output = [map_number_to_label(x) for x in output]
    return ' '.join(output)


# `on_bad_lines='skip'` replaces the deprecated `error_bad_lines=False`
# (removed in pandas 2.0).
train_set = pd.read_table('train/train.tsv.xz', on_bad_lines='skip',
                          header=None, quoting=csv.QUOTE_NONE)
dev_set = pd.read_table('dev-0/in.tsv', on_bad_lines='skip',
                        header=None, quoting=csv.QUOTE_NONE)
test_set = pd.read_table('test-A/in.tsv', on_bad_lines='skip',
                         header=None, quoting=csv.QUOTE_NONE)

# Training data: column 0 holds space-separated tags, column 1 the words.
for index, row in tqdm(train_set.iterrows(), desc="Loading train data",
                       total=train_set.shape[0]):
    labels, words = row[0], row[1]
    words, labels = words.split(), labels.split()
    for word in words:
        X_train.append(vectorize(word))
    for label in labels:
        y_train.append(LABEL[label])

# Dev and test data are kept grouped by sentence so predictions can be
# written out one line per sentence.
for index, row in tqdm(dev_set.iterrows(), desc="Loading dev data",
                       total=dev_set.shape[0]):
    words = row[0].split()
    X_dev.append([vectorize(word) for word in words])

for index, row in tqdm(test_set.iterrows(), desc="Loading test data",
                       total=test_set.shape[0]):
    words = row[0].split()
    X_test.append([vectorize(word) for word in words])

model = NeuralNetwork(INPUT_SIZE, 600, NUM_LABELS)
criterion = torch.nn.NLLLoss()  # pairs with log_softmax in the forward pass
optimizer = torch.optim.Adam(model.parameters())
batch_size = 64

print("Training model...")
for epoch in range(1):
    model.train()
    for i in range(0, len(y_train), batch_size):
        X = X_train[i:i + batch_size]
        X = torch.tensor(np.array(X).reshape(len(X), INPUT_SIZE))
        y = torch.tensor(np.array(y_train[i:i + batch_size]))
        outputs = model(X.float())
        loss = criterion(outputs, y.long())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print("Making predictions...")
dev_prediction = []
test_prediction = []
model.eval()
with torch.no_grad():
    # Each sentence is classified in a single batch, one row per word.
    for sentence in X_dev:
        X = torch.tensor(np.array(sentence).reshape(len(sentence), INPUT_SIZE))
        output = model(X.float())
        prediction = torch.argmax(output, dim=1)
        dev_prediction.append(prediction_to_string(prediction))
    for sentence in X_test:
        X = torch.tensor(np.array(sentence).reshape(len(sentence), INPUT_SIZE))
        output = model(X.float())
        prediction = torch.argmax(output, dim=1)
        test_prediction.append(prediction_to_string(prediction))

np.asarray(dev_prediction).tofile('./dev-0/out.tsv', sep='\n', format='%s')
np.asarray(test_prediction).tofile('./test-A/out.tsv', sep='\n', format='%s')