From 28988a94a388521e7315d3f24d9a5a74c066a2ed Mon Sep 17 00:00:00 2001
From: s440058
Date: Mon, 21 Jun 2021 18:19:40 +0200
Subject: [PATCH] add gru

---
 gru.py                | 216 ++++++++++++++++++++++++++++++++++++++++++
 requirements-test.txt |   6 ++
 requirements.txt      |   9 +
 3 files changed, 231 insertions(+)
 create mode 100644 gru.py
 create mode 100644 requirements-test.txt
 create mode 100644 requirements.txt

diff --git a/gru.py b/gru.py
new file mode 100644
index 0000000..32ffbbe
--- /dev/null
+++ b/gru.py
@@ -0,0 +1,216 @@
+import csv
+from collections import Counter
+
+import pandas as pd
+import torch
+from torchcrf import CRF
+from torchtext.vocab import Vocab
+from tqdm import tqdm
+
+DATA_PATH = ['train/train.tsv', 'dev-0/in.tsv', 'dev-0/expected.tsv', 'test-A/in.tsv']
+DATA_PATH_OUTPUT = ['dev-0/out.tsv', 'test-A/out.tsv']
+
+# Tag set shared by the label encoder and the result writer. 'I-MISC' (class 8)
+# was missing from the original mapping even though the models predict 9 classes,
+# and the original result list contained 'B-PER' twice instead of 'I-PER'.
+LABELS = ['O', 'B-LOC', 'I-LOC', 'B-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'I-MISC']
+
+
+def get_data(path):
+    return pd.read_table(path, error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
+
+
+def split(x):
+    return x.split()
+
+
+def replace(x):
+    # Convert textual NER tags to the class ids defined by LABELS.
+    return [LABELS.index(word) for word in x]
+
+
+def build_vocab(dataset):
+    counter = Counter()
+    for document in dataset:
+        counter.update(document)
+    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
+
+
+def labels_process(dt):
+    # Pad every label sequence with class 0 ('O') at the <bos>/<eos> positions.
+    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]
+
+
+def data_process(dt, vocab):
+    return [torch.tensor([vocab['<bos>']] + [vocab[token] for token in document] + [vocab['<eos>']],
+                         dtype=torch.long)
+            for document in dt]
+
+
+class GRU(torch.nn.Module):
+    def __init__(self, doc_vocab_len):
+        super(GRU, self).__init__()
+        self.emb = torch.nn.Embedding(doc_vocab_len, 100)
+        self.dropout = torch.nn.Dropout(0.2)
+        self.rec = torch.nn.GRU(100, 256, 2, batch_first=True, bidirectional=True)
+        self.fc1 = torch.nn.Linear(2 * 256, 9)
+
+    def forward(self, x):
+        emb = torch.relu(self.emb(x))
+        emb = self.dropout(emb)
+        gru_output, h_n = self.rec(emb)
+        return self.fc1(gru_output)
+
+
+class NERModel(torch.nn.Module):
+    # Window classifier: embeds a 3-token window and maps it to 9 classes.
+    def __init__(self, vocab_len):
+        super(NERModel, self).__init__()
+        self.emb = torch.nn.Embedding(vocab_len, 200)
+        self.fc1 = torch.nn.Linear(600, 9)
+
+    def forward(self, x):
+        x = self.emb(x)
+        x = x.reshape(600)  # flatten the 3 x 200 window embeddings
+        x = self.fc1(x)
+        return x
+
+
+def configure(train, vocab):
+    train_labels = labels_process(train[0])
+    train_tokens_ids = data_process(train[1], vocab)
+    ner_model = NERModel(len(vocab.itos))
+    criterion = torch.nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(ner_model.parameters())
+    gru_model = GRU(len(vocab.itos))
+    return train_labels, train_tokens_ids, ner_model, criterion, optimizer, gru_model
+
+
+def training(gru_model, crf, train_labels, train_tokens_ids, ner_model, optimizer, criterion):
+    # Note: only the window-based NERModel is optimised here; the GRU and CRF
+    # parameters are never updated by this loop, and only the first 100
+    # documents are used.
+    for epoch in range(2):
+        loss_score = 0
+        acc_score = 0
+        prec_score = 0
+        selected_items = 0
+        recall_score = 0
+        relevant_items = 0
+        items_total = 0
+        gru_model.train()
+        crf.train()
+        for i in range(100):
+            for j in range(1, len(train_labels[i]) - 1):
+                X = train_tokens_ids[i][j - 1: j + 2]
+                Y = train_labels[i][j: j + 1]
+
+                Y_predictions = ner_model(X)
+
+                acc_score += int(torch.argmax(Y_predictions) == Y)
+
+                if torch.argmax(Y_predictions) != 0:
+                    selected_items += 1
+                    if torch.argmax(Y_predictions) == Y.item():
+                        prec_score += 1
+
+                if Y.item() != 0:
+                    relevant_items += 1
+                    if torch.argmax(Y_predictions) == Y.item():
+                        recall_score += 1
+
+                items_total += 1
+
+                optimizer.zero_grad()
+                loss = criterion(Y_predictions.unsqueeze(0), Y)
+                loss.backward()
+                optimizer.step()
+                loss_score += loss.item()
+
+        precision = prec_score / selected_items if selected_items else 0.0
+        recall = recall_score / relevant_items if relevant_items else 0.0
+        print(f'epoch {epoch}: loss={loss_score / items_total:.4f}',
+              f'accuracy={acc_score / items_total:.4f}',
+              f'precision={precision:.4f} recall={recall:.4f}')
+
+
+def generate_result(result, path):
+    # 'w' instead of 'a' so re-runs do not append duplicate predictions.
+    with open(path, 'w') as f:
+        for sentence in result:
+            f.write(' '.join(LABELS[tag] for tag in sentence) + '\n')
+
+
+def get_crf(device):
+    return CRF(9).to(device)
+
+
+def decode_dataset(gru_model, tokens_ids, crf, device):
+    # Viterbi-decode the most likely tag sequence for every document, dropping
+    # the <bos>/<eos> positions so the output aligns with the input tokens.
+    Y_pred = []
+    gru_model.eval()
+    crf.eval()
+    for i in tqdm(range(len(tokens_ids))):
+        batch_tokens = tokens_ids[i].unsqueeze(0).to(device)
+        emissions = gru_model(batch_tokens).squeeze(0).unsqueeze(1).to(device)
+        Y_pred.append(crf.decode(emissions)[0][1:-1])
+    return Y_pred
+
+
+def eval_dev(gru_model, tokens_ids, labels, crf, device):
+    # Decode the dev set and report token-level accuracy against the gold
+    # labels (the original read the labels but never compared against them).
+    Y_pred = decode_dataset(gru_model, tokens_ids, crf, device)
+    correct = 0
+    total = 0
+    for pred, gold in zip(Y_pred, labels):
+        gold = gold.tolist()[1:-1]  # strip the <bos>/<eos> padding
+        correct += sum(int(p == g) for p, g in zip(pred, gold))
+        total += len(gold)
+    print(f'dev accuracy: {correct / total:.4f}')
+    return Y_pred
+
+
+def eval_test(gru_model, tokens_ids, crf, device):
+    return decode_dataset(gru_model, tokens_ids, crf, device)
+
+
+def main():
+    # prepare train
+    train = get_data(DATA_PATH[0])
+    train[0] = train[0].map(split)
+    train[1] = train[1].map(split)
+    train[0] = train[0].map(replace)
+
+    # configure
+    vocab = build_vocab(train[1])
+    train_labels, train_tokens_ids, ner_model, criterion, optimizer, gru_model = configure(train, vocab)
+    device = torch.device('cpu')
+    crf = get_crf(device)
+
+    # train
+    training(gru_model, crf, train_labels, train_tokens_ids, ner_model, optimizer, criterion)
+
+    # dev
+    dev_in = get_data(DATA_PATH[1])
+    dev_ex = get_data(DATA_PATH[2])
+    dev_in[0] = dev_in[0].map(split)
+    dev_ex[0] = dev_ex[0].map(split)
+    dev_ex[0] = dev_ex[0].map(replace)
+    dev_labels = labels_process(dev_ex[0])
+    dev_tokens_ids = data_process(dev_in[0], vocab)
+    dev_pred = eval_dev(gru_model, dev_tokens_ids, dev_labels, crf, device)
+
+    # test
+    test_in = get_data(DATA_PATH[3])
+    test_in[0] = test_in[0].map(split)
+    test_tokens_ids = data_process(test_in[0], vocab)
+    test_pred = eval_test(gru_model, test_tokens_ids, crf, device)
+
+    # results (the original computed predictions but never wrote them out)
+    generate_result(dev_pred, DATA_PATH_OUTPUT[0])
+    generate_result(test_pred, DATA_PATH_OUTPUT[1])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..8699828
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,6 @@
+# This only installs PyTorch with a specific CUDA version which may not be
+# compatible with yours. If so, install PyTorch with the correct CUDA version
+# as instructed on https://pytorch.org/get-started/locally/
+torch
+pytest==3.2.5
+pytest-cov==2.5.1
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..df46974
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+-r requirements-test.txt
+# runtime dependencies imported by gru.py
+pandas
+pytorch-crf
+torchtext
+tqdm
+flake8==3.5.0
+flake8-mypy==17.8.0
+yapf==0.25.0
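
Note on the patch: training() only optimises the window-based NERModel, while
the GRU and CRF are used for decoding but never trained. For reference, a
minimal, hypothetical sketch of joint GRU+CRF training with pytorch-crf's
log-likelihood is given below; the vocabulary size and the toy batch are
illustrative assumptions, not values taken from the patch.

    import torch
    from torchcrf import CRF

    from gru import GRU  # the module added by this patch

    gru_model = GRU(doc_vocab_len=1000)  # illustrative vocabulary size
    crf = CRF(9)
    optimizer = torch.optim.Adam(list(gru_model.parameters()) + list(crf.parameters()))

    # Toy batch: one sequence of 7 token ids with 7 gold tag ids.
    tokens = torch.randint(0, 1000, (7,))
    tags = torch.randint(0, 9, (7,))

    # Same shapes as in gru.py: emissions are (seq_len, batch=1, num_tags).
    emissions = gru_model(tokens.unsqueeze(0)).squeeze(0).unsqueeze(1)
    loss = -crf(emissions, tags.unsqueeze(1))  # CRF.forward returns the log-likelihood
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()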