en-ner-conll-2003/main.py

import csv
import pickle
from collections import Counter, OrderedDict

import numpy as np  # used by the commented-out test-A variant below
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
# Renamed so the module-level `vocab` object built below does not shadow the factory.
from torchtext.vocab import vocab as vocab_factory
from tqdm import tqdm

from model import Model
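
# NOTE: model.py is not shown here. From the calls below, Model is assumed to
# expose roughly this interface (inferred from usage, not a confirmed API):
#   model(tokens, tags)  -> scalar training loss (e.g. CRF negative log-likelihood)
#   model.decode(tokens) -> list of tag-id sequences (e.g. torchcrf Viterbi decode)
#   model.train_mode() / model.eval_mode() -> toggle CRF-specific behaviour
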
def process_output(lines):
    # Repair BIO sequences: an I- tag that does not continue an entity of the
    # same type is rewritten so the output stays well-formed.
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label != "O" and label[0:2] == "I-":
                if last_label is None or last_label == "O":
                    # An I- tag cannot open an entity; promote it to B-.
                    label = label.replace("I-", "B-")
                else:
                    # Otherwise continue the entity type of the previous tag.
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result
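
# Example: process_output([["O", "I-PER", "I-PER"]]) -> ["O B-PER I-PER"]:
# the stray I-PER after O is promoted to B-PER, and the following I- tag
# inherits the repaired tag's entity type.
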
def process_document(document):
    # Documents arrive pre-tokenised, with tokens separated by single spaces.
    return [process_token(x) for x in document.split(" ")]


def save_file(path, obj):
    with open(path, "w") as file:
        file.write(obj)


def process_token(token):
    return token.lower()

def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(process_document(document))
    # Order tokens by descending frequency before building the torchtext vocab.
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    v = vocab_factory(ordered_dict)
    # Unseen tokens fall back to index 0.
    v.set_default_index(0)
    return v
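
# Example (hypothetical data): v = build_vocab(["John lives in Warsaw"]) gives
# an index for every lower-cased token, e.g. v["john"], while tokens never
# seen in the data map to the default index 0.
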
def data_process(dt):
    # Relies on the module-level `vocab` built or unpickled below.
    return [torch.tensor([vocab[process_token(token)] for token in document.split(" ")],
                         dtype=torch.long) for document in dt]


def labels_process(dt):
    return [torch.tensor([labels_vocab[token] for token in document.split(" ")],
                         dtype=torch.long) for document in dt]
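
# Example: with labels_vocab below, labels_process(["O B-PER"]) returns
# [tensor([0, 1])].
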
# mode = "train"
# mode = "eval"
mode = "generate"
save_path = "dev-0/out.tsv"
data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['data'])
# data.columns = ["labels", "text"]
# train_target = pd.read_csv("train/train.tsv", sep = '\t', names = ['labels', 'data'])
ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=['labels'])
in_data = data['data']
target = ex_data['labels']
# in_data = data["0"]
# target = ex_data["labels"]
# test_data = pd.read_csv("test-A/in.tsv", sep = '\t', names=['0'])
# test_data.columns = ['0']
# data = test_data['0']
# in_data = test_data['0']
# target = list(np.zeros(len(in_data)))
# target = train_target['labels']
# labels_vocab = build_vocab(data['labels'])
if mode == "train":
vocab = build_vocab(in_data)
with open("vocab.pickle", "wb") as file:
pickle.dump(vocab, file)
print("Vocab saved")
else:
with open("vocab.pickle", "rb") as file:
vocab = pickle.load(file)
labels_vocab = {
    "O": 0,
    "B-PER": 1,
    "B-LOC": 2,
    "I-PER": 3,
    "B-MISC": 4,
    "I-MISC": 5,
    "I-LOC": 6,
    "B-ORG": 7,
    "I-ORG": 8,
}
inv_labels_vocab = {v: k for k, v in labels_vocab.items()}
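
# These are the nine BIO tags of the CoNLL-2003 task (PER, LOC, ORG and MISC
# entities plus O), matching num_tags = 9 below.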

train_tokens_ids = data_process(in_data)
train_labels = labels_process(target)

num_tags = 9
NUM_EPOCHS = 5
seq_length = 5

model = Model(num_tags, seq_length, vocab)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
if mode == "train":
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
for i in range(NUM_EPOCHS):
model.train()
model.train_mode()
#for i in tqdm(range(500)):
for i in tqdm(range(len(train_labels))):
for k in range(0, len(train_tokens_ids[i]) - seq_length, seq_length):
batch_tokens = train_tokens_ids[i][k: k + seq_length].unsqueeze(0)
tags = train_labels[i][k: k + seq_length].unsqueeze(1)
predicted_tags = model(batch_tokens.to(device), tags.to(device))
predicted_tags.backward()
optimizer.step()
model.zero_grad()
model.crf.zero_grad()
optimizer.zero_grad()
torch.save(model.state_dict(), "model.torch")
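
# Windowing note: with seq_length = 5, a 12-token document contributes the
# training windows [0:5] and [5:10]; its 2-token tail is skipped in training
# but is still decoded by the remainder handling below.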
if mode == "eval" or mode == "generate":
model.eval()
model.eval_mode()
predicted = []
correct = []
model.load_state_dict(torch.load("model.torch"))
for i in tqdm(range(0, len(train_tokens_ids))):
last_idx = 0
for k in range(0, len(train_tokens_ids[i]) - seq_length + 1, seq_length):
batch_tokens = train_tokens_ids[i][k: k + seq_length].unsqueeze(0)
tags = train_labels[i][k: k + seq_length].unsqueeze(1)
predicted_tags = model.decode(batch_tokens.to(device))
predicted += predicted_tags[0]
correct += [x[0] for x in tags.numpy().tolist()]
last_idx = k
l = len(train_tokens_ids[i])
rest = l - int(l/seq_length) * seq_length
if rest != 0:
batch_tokens = train_tokens_ids[i][last_idx: last_idx + rest].unsqueeze(0)
tags = train_labels[i][last_idx: last_idx + rest].unsqueeze(1)
predicted_tags = model.decode(batch_tokens.to(device))
predicted += predicted_tags[0]
correct += [x[0] for x in tags.numpy().tolist()]
if mode == "eval":
print(classification_report(correct, predicted))
print(accuracy_score(correct, predicted))
print(f1_score(correct, predicted, average="micro"))
save_file("correct.txt", '\n'.join([str(x) for x in correct]))
save_file("predicted.txt", '\n'.join([str(x) for x in predicted]))
    # Map numeric tag ids back to BIO strings, then re-split the flat tag list
    # into one line of tags per input document.
    predicted = [inv_labels_vocab[x] for x in predicted]
    slices = [len(x.split(" ")) for x in in_data]
    with open(save_path, "w") as save:
        writer = csv.writer(save, delimiter="\t", lineterminator="\n")
        accumulator = 0
        output = []
        for slice_len in slices:
            output.append(predicted[accumulator: accumulator + slice_len])
            accumulator += slice_len
        for line in process_output(output):
            writer.writerow([line])
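
# Typical workflow, as suggested by the mode flag above: run once with
# mode = "train" to produce vocab.pickle and model.torch, then again with
# mode = "generate" to write one space-separated tag line per input document
# to dev-0/out.tsv.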