forked from kubapok/en-ner-conll-2003
fix
This commit is contained in:
parent
6837f07f7e
commit
1e6e429e58
111
generate.py
Normal file
111
generate.py
Normal file
@ -0,0 +1,111 @@
|
||||
import pandas as pd
|
||||
import pickle
|
||||
import torch
|
||||
from sklearn.metrics import accuracy_score, f1_score, classification_report
|
||||
from model import Model
|
||||
from tqdm import tqdm
|
||||
import csv
|
||||
def process_output(lines):
    """Repair BIO tag sequences and join each into a space-separated string.

    Within every sequence an ``I-`` tag that opens an entity (it is the first
    tag, or the previous tag is ``O``) is rewritten to ``B-``. Any other
    ``I-`` tag takes the entity type of the tag immediately before it.
    ``O`` and ``B-`` tags pass through unchanged.

    Args:
        lines: iterable of tag sequences, each an iterable of tag strings.

    Returns:
        A list holding one space-joined tag string per input sequence.
    """
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label.startswith("I-"):
                if last_label is None or last_label == "O":
                    # Only the prefix changes; the entity type is kept as is.
                    label = "B-" + label[2:]
                else:
                    # Continue the entity opened by the previous tag.
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result
|
||||
def data_process(dt):
    """Convert documents into tensors of vocabulary ids.

    Each document is split on single spaces, every token is normalised with
    ``process_token`` and looked up in the module-level ``vocab``.

    Returns:
        A list with one ``torch.long`` tensor per document.
    """
    tensors = []
    for document in dt:
        ids = [vocab[process_token(token)] for token in document.split(" ")]
        tensors.append(torch.tensor(ids, dtype=torch.long))
    return tensors
|
||||
def labels_process(dt):
    """Convert space-separated tag strings into tensors of tag ids.

    Every tag is looked up in the module-level ``labels_vocab``.

    Returns:
        A list with one ``torch.long`` tensor per document.
    """
    tensors = []
    for document in dt:
        ids = [labels_vocab[tag] for tag in document.split(" ")]
        tensors.append(torch.tensor(ids, dtype=torch.long))
    return tensors
|
||||
def process_document(document):
    """Split *document* on single spaces and normalise every token.

    Returns:
        A list with ``process_token`` applied to each token, in order.
    """
    tokens = document.split(" ")
    return list(map(process_token, tokens))
|
||||
def save_file(path, obj):
    """Write the string *obj* to *path*, opening the file in ``"w"`` mode."""
    with open(path, mode="w") as out:
        out.write(obj)
|
||||
def process_token(token):
    """Return *token* converted to lower case."""
    lowered = token.lower()
    return lowered
|
||||
|
||||
# Evaluation inputs: the text column and the expected tag column are read
# from two separate single-column TSV files.
# NOTE(review): read_csv uses its default quoting here; confirm that tokens
# containing a double quote do not merge rows in these files.
data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['data'])
ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=['labels'])

in_data, target = data['data'], ex_data['labels']

num_tags = 9
seq_length = 5
save_path = "dev-0/out.tsv"

# NOTE: pickle executes arbitrary code on load; only use a trusted vocab file.
with open("vocab.pickle", "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

# Tag -> id mapping; ids are the positions in this list (0..8).
labels_vocab = {
    tag: tag_id
    for tag_id, tag in enumerate([
        'O',
        'B-PER',
        'B-LOC',
        'I-PER',
        'B-MISC',
        'I-MISC',
        'I-LOC',
        'B-ORG',
        'I-ORG',
    ])
}

# Reverse mapping, used to turn predicted ids back into tag strings.
inv_labels_vocab = {tag_id: tag for tag, tag_id in labels_vocab.items()}

train_tokens_ids = data_process(in_data)
train_labels = labels_process(target)
|
||||
|
||||
# Use the GPU when one is present, otherwise fall back to the CPU instead of
# crashing on a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Model(num_tags, seq_length, vocab)
model.to(device)

# Switch both the module and its CRF layer to evaluation mode.
model.eval()
model.eval_mode()

# Flat lists of tag ids accumulated over all documents.
predicted = []
correct = []

# map_location lets a checkpoint saved on a GPU load on a CPU-only host.
model.load_state_dict(torch.load("model.torch", map_location=device))
|
||||
# Decode every document in consecutive windows of at most seq_length tokens.
with torch.no_grad():  # inference only; no autograd bookkeeping is needed
    for i in tqdm(range(len(train_tokens_ids))):
        doc_tokens = train_tokens_ids[i]
        doc_labels = train_labels[i]
        # Stepping over the whole length makes the final, shorter window start
        # right after the last full one. The remainder used to be sliced from
        # the start of the last full window, so trailing tokens were never
        # tagged and earlier tokens were processed twice.
        for k in range(0, len(doc_tokens), seq_length):
            batch_tokens = doc_tokens[k: k + seq_length].unsqueeze(0)
            predicted_tags = model.decode(batch_tokens.to(device))
            # decode() is given a batch of one; take its only sequence.
            predicted += predicted_tags[0]
            correct += doc_labels[k: k + seq_length].tolist()
|
||||
|
||||
# Print token-level metrics computed over the integer tag ids.
print(classification_report(correct, predicted))
print(accuracy_score(correct, predicted))
print(f1_score(correct, predicted, average="micro"))

# Dump both id sequences, one id per line, for later inspection.
for dump_path, tag_ids in (("correct.txt", correct), ("predicted.txt", predicted)):
    save_file(dump_path, '\n'.join(map(str, tag_ids)))
|
||||
|
||||
# Map predicted ids back to tag strings.
predicted = [inv_labels_vocab[tag_id] for tag_id in predicted]

# Token count of every input document, used to cut the flat prediction list
# back into per-document rows.
slices = [len(document.split(" ")) for document in in_data]

with open(save_path, "w") as save:
    writer = csv.writer(save, delimiter='\t', lineterminator='\n')
    accumulator = 0
    output = []
    for doc_length in slices:
        output.append(predicted[accumulator: accumulator + doc_length])
        accumulator += doc_length
    # One single-column row per document, after BIO repair.
    writer.writerows([row] for row in process_output(output))
|
||||
|
||||
|
65
main.py
65
main.py
@ -13,38 +13,10 @@ import numpy as np
|
||||
from sklearn.metrics import accuracy_score, f1_score, classification_report
|
||||
import csv
|
||||
import pickle
|
||||
from model import Model
|
||||
|
||||
class Model(torch.nn.Module):
    """GRU tagger with a CRF output layer.

    Token ids are embedded, passed through ReLU and a single-layer GRU,
    projected to per-tag scores and then handed to the CRF.
    """

    def __init__(self, num_tags, seq_length):
        """Build the layers.

        Args:
            num_tags: number of tags; sizes both the projection and the CRF.
            seq_length: output width of the ``fc1`` layer.

        The embedding size is taken from the module-level ``vocab``.
        """
        super().__init__()
        self.emb = torch.nn.Embedding(len(vocab.get_itos()), 100)
        self.gru = torch.nn.GRU(100, 256, 1, batch_first=True)
        # The projection width must match the CRF tag count; it was a literal 9.
        self.hidden2tag = torch.nn.Linear(256, num_tags)
        self.crf = CRF(num_tags, batch_first=True)
        self.relu = torch.nn.ReLU()
        # fc1, softmax and sigm are not used by forward() or decode(); they are
        # kept so parameter names still match previously saved state dicts.
        self.fc1 = torch.nn.Linear(1, seq_length)
        self.softmax = torch.nn.Softmax(dim=0)
        self.sigm = torch.nn.Sigmoid()
        # NOTE(review): this pipeline is loaded but never used in this class;
        # confirm nothing depends on the load before removing it.
        nlp = spacy.load("en_core_web_sm")

    def forward(self, data, tags):
        """Return the negated result of the CRF call for *data* and *tags*.

        ``tags`` is transposed before it is passed to the CRF.
        """
        emb = self.relu(self.emb(data))
        out, h_n = self.gru(emb)
        out = self.hidden2tag(out)
        out = self.crf(out, tags.T)
        return -out

    def decode(self, data):
        """Return the CRF decoding of the projected GRU outputs for *data*."""
        emb = self.relu(self.emb(data))
        out, h_n = self.gru(emb)
        out = self.hidden2tag(out)
        out = self.crf.decode(out)
        return out

    def train_mode(self):
        """Put the CRF layer into training mode."""
        self.crf.train()

    def eval_mode(self):
        """Put the CRF layer into evaluation mode."""
        self.crf.eval()
|
||||
|
||||
def process_output(lines):
|
||||
result = []
|
||||
@ -64,7 +36,14 @@ def process_output(lines):
|
||||
return result
|
||||
|
||||
def process_document(document):
    """Split *document* on single spaces and normalise every token.

    The block carried two consecutive ``return`` statements, the second of
    which could never run. Only the ``process_token`` variant is kept, which
    matches how ``data_process`` normalises tokens.

    Returns:
        A list with ``process_token`` applied to each token, in order.
    """
    return [process_token(x) for x in document.split(" ")]
|
||||
|
||||
def save_file(path, obj):
    """Write the string *obj* to *path*, opening the file in ``"w"`` mode."""
    with open(path, mode="w") as out:
        out.write(obj)
|
||||
|
||||
def process_token(token):
    """Return *token* converted to lower case."""
    lowered = token.lower()
    return lowered
|
||||
|
||||
def build_vocab(dataset):
|
||||
counter = Counter()
|
||||
@ -78,26 +57,28 @@ def build_vocab(dataset):
|
||||
return v
|
||||
|
||||
def data_process(dt):
    """Convert documents into tensors of vocabulary ids.

    The block carried two consecutive ``return`` statements, the second of
    which could never run. Only the variant that normalises tokens with
    ``process_token`` before the ``vocab`` lookup is kept.

    Returns:
        A list with one ``torch.long`` tensor per document.
    """
    return [
        torch.tensor(
            [vocab[process_token(token)] for token in document.split(" ")],
            dtype=torch.long,
        )
        for document in dt
    ]
|
||||
|
||||
def labels_process(dt):
    """Convert space-separated tag strings into tensors of tag ids.

    Every tag is looked up in the module-level ``labels_vocab``.

    Returns:
        A list with one ``torch.long`` tensor per document.
    """
    tensors = []
    for document in dt:
        ids = [labels_vocab[tag] for tag in document.split(" ")]
        tensors.append(torch.tensor(ids, dtype=torch.long))
    return tensors
|
||||
|
||||
# mode = "train"
|
||||
# mode = "eval"
|
||||
mode = "generate"
|
||||
mode = "eval"
|
||||
# mode = "generate"
|
||||
|
||||
save_path = "dev-0/out.tsv"
|
||||
|
||||
data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['0'])
|
||||
data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['data'])
|
||||
# data.columns = ["labels", "text"]
|
||||
|
||||
# train_target = pd.read_csv("train/train.tsv", sep = '\t', names = ['labels', 'data'])
|
||||
|
||||
ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=['labels'])
|
||||
|
||||
in_data = data["0"]
|
||||
target = ex_data["labels"]
|
||||
in_data = data['data']
|
||||
target = ex_data['labels']
|
||||
# in_data = data["0"]
|
||||
# target = ex_data["labels"]
|
||||
|
||||
# test_data = pd.read_csv("test-A/in.tsv", sep = '\t', names=['0'])
|
||||
# test_data.columns = ['0']
|
||||
@ -137,10 +118,10 @@ num_tags = 9
|
||||
NUM_EPOCHS = 5
|
||||
seq_length = 5
|
||||
|
||||
model = Model(num_tags, seq_length)
|
||||
device = torch.device("cpu")
|
||||
model = Model(num_tags, seq_length, vocab)
|
||||
device = torch.device("cuda")
|
||||
model.to(device)
|
||||
# model.cuda(0)
|
||||
model.cuda(0)
|
||||
|
||||
if mode == "train":
|
||||
criterion = torch.nn.CrossEntropyLoss()
|
||||
@ -191,7 +172,9 @@ if mode == "eval" or mode == "generate":
|
||||
if mode == "eval":
|
||||
print(classification_report(correct, predicted))
|
||||
print(accuracy_score(correct, predicted))
|
||||
print(f1_score(correct, predicted, average="weighted"))
|
||||
print(f1_score(correct, predicted, average="micro"))
|
||||
save_file("correct.txt", '\n'.join([str(x) for x in correct]))
|
||||
save_file("predicted.txt", '\n'.join([str(x) for x in predicted]))
|
||||
|
||||
predicted = list(map(lambda x: inv_labels_vocab[x], predicted))
|
||||
slices = [len(x.split(" ")) for x in in_data]
|
||||
|
34
model.py
Normal file
34
model.py
Normal file
@ -0,0 +1,34 @@
|
||||
import torch
|
||||
from torchcrf import CRF
|
||||
|
||||
class Model(torch.nn.Module):
    """GRU tagger with a CRF output layer.

    Token ids are embedded, passed through ReLU and a single-layer GRU,
    projected to per-tag scores and then handed to the CRF.
    """

    def __init__(self, num_tags, seq_length, vocab):
        """Build the layers.

        Args:
            num_tags: number of tags; sizes both the projection and the CRF.
            seq_length: output width of the ``fc1`` layer.
            vocab: object whose ``get_itos()`` length sets the embedding size.
        """
        super().__init__()
        self.emb = torch.nn.Embedding(len(vocab.get_itos()), 100)
        self.gru = torch.nn.GRU(100, 256, 1, batch_first=True)
        # The projection width must match the CRF tag count; it was a literal 9.
        self.hidden2tag = torch.nn.Linear(256, num_tags)
        self.crf = CRF(num_tags, batch_first=True)
        self.relu = torch.nn.ReLU()
        # fc1, softmax and sigm are not used by forward() or decode(); they are
        # kept so parameter names still match previously saved state dicts.
        self.fc1 = torch.nn.Linear(1, seq_length)
        self.softmax = torch.nn.Softmax(dim=0)
        self.sigm = torch.nn.Sigmoid()

    def forward(self, data, tags):
        """Return the negated result of the CRF call for *data* and *tags*.

        ``tags`` is transposed before it is passed to the CRF.
        """
        emb = self.relu(self.emb(data))
        out, h_n = self.gru(emb)
        out = self.hidden2tag(out)
        out = self.crf(out, tags.T)
        return -out

    def decode(self, data):
        """Return the CRF decoding of the projected GRU outputs for *data*."""
        emb = self.relu(self.emb(data))
        out, h_n = self.gru(emb)
        out = self.hidden2tag(out)
        out = self.crf.decode(out)
        return out

    def train_mode(self):
        """Put the CRF layer into training mode."""
        self.crf.train()

    def eval_mode(self):
        """Put the CRF layer into evaluation mode."""
        self.crf.eval()
|
Loading…
Reference in New Issue
Block a user