its working

This commit is contained in:
wangobango 2021-06-21 00:43:43 +02:00
parent 434e164ea3
commit 50621d5a7f
1 changed files with 40 additions and 9 deletions

49
main.py
View File

@ -9,6 +9,9 @@ from collections import Counter, OrderedDict
import spacy import spacy
from torchcrf import CRF from torchcrf import CRF
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
nlp = spacy.load('en_core_web_sm') nlp = spacy.load('en_core_web_sm')
@ -31,7 +34,7 @@ class Model(torch.nn.Module):
out = self.hidden2tag(out) out = self.hidden2tag(out)
out = self.crf(out, tags.T) out = self.crf(out, tags.T)
# out = self.sigm(self.fc1(torch.tensor([out]))) # out = self.sigm(self.fc1(torch.tensor([out])))
return out return -out
def decode(self, data): def decode(self, data):
emb = self.relu(self.emb(data)) emb = self.relu(self.emb(data))
@ -41,6 +44,12 @@ class Model(torch.nn.Module):
out = self.crf.decode(out) out = self.crf.decode(out)
return out return out
def train_mode(self):
self.crf.train()
def eval_mode(self):
self.crf.eval()
def process_document(document): def process_document(document):
# return [str(tok.lemma) for tok in nlp(document)] # return [str(tok.lemma) for tok in nlp(document)]
@ -62,7 +71,7 @@ def data_process(dt):
def labels_process(dt): def labels_process(dt):
return [ torch.tensor([labels_vocab[token] for token in document.split(" ") ], dtype = torch.long) for document in dt] return [ torch.tensor([labels_vocab[token] for token in document.split(" ") ], dtype = torch.long) for document in dt]
save_path = "train/out.tsv"
data = pd.read_csv("train/train.tsv", sep="\t") data = pd.read_csv("train/train.tsv", sep="\t")
data.columns = ["labels", "text"] data.columns = ["labels", "text"]
@ -81,12 +90,14 @@ labels_vocab = {
'I-ORG': 8 'I-ORG': 8
} }
inv_labels_vocab = {v: k for k, v in labels_vocab.items()}
train_tokens_ids = data_process(data["text"]) train_tokens_ids = data_process(data["text"])
train_labels = labels_process(data["labels"]) train_labels = labels_process(data["labels"])
num_tags = 9 num_tags = 9
NUM_EPOCHS = 5 NUM_EPOCHS = 5
seq_length = 15 seq_length = 5
model = Model(num_tags, seq_length) model = Model(num_tags, seq_length)
device = torch.device("cuda") device = torch.device("cuda")
@ -99,13 +110,14 @@ optimizer = torch.optim.Adam(model.parameters())
train_dataloader = DataLoader(list(zip(train_tokens_ids, train_labels)), batch_size=64, shuffle=True) train_dataloader = DataLoader(list(zip(train_tokens_ids, train_labels)), batch_size=64, shuffle=True)
# test_dataloader = DataLoader(train_labels, batch_size=64, shuffle=True) # test_dataloader = DataLoader(train_labels, batch_size=64, shuffle=True)
mode = "train" # mode = "train"
# mode = "eval" mode = "eval"
# mode = "generate" # mode = "generate"
if mode == "train": if mode == "train":
for i in range(NUM_EPOCHS): for i in range(NUM_EPOCHS):
model.train() model.train()
model.train_mode()
#for i in tqdm(range(500)): #for i in tqdm(range(500)):
for i in tqdm(range(len(train_labels))): for i in tqdm(range(len(train_labels))):
for k in range(0, len(train_tokens_ids[i]) - seq_length, seq_length): for k in range(0, len(train_tokens_ids[i]) - seq_length, seq_length):
@ -114,22 +126,41 @@ if mode == "train":
predicted_tags = model(batch_tokens.to(device), tags.to(device)) predicted_tags = model(batch_tokens.to(device), tags.to(device))
optimizer.zero_grad()
# tags = torch.tensor([x[0] for x in tags]) # tags = torch.tensor([x[0] for x in tags])
# loss = criterion(predicted_tags.unsqueeze(0),tags.T) # loss = criterion(predicted_tags.unsqueeze(0),tags.T)
predicted_tags.backward() predicted_tags.backward()
optimizer.step() optimizer.step()
model.zero_grad() model.zero_grad()
model.crf.zero_grad()
optimizer.zero_grad()
torch.save(model.state_dict(), "model.torch") torch.save(model.state_dict(), "model.torch")
if mode == "eval": if mode == "eval":
model.eval() model.eval()
for i in tqdm(range(len(train_labels))): model.eval_mode()
predicted = []
correct = []
model.load_state_dict(torch.load("model.torch"))
for i in tqdm(range(0, len(train_labels))):
for k in range(0, len(train_tokens_ids[i]) - seq_length, seq_length): for k in range(0, len(train_tokens_ids[i]) - seq_length, seq_length):
batch_tokens = train_tokens_ids[i][k: k + seq_length].unsqueeze(0) batch_tokens = train_tokens_ids[i][k: k + seq_length].unsqueeze(0)
tags = train_labels[i][k: k + seq_length].unsqueeze(1) tags = train_labels[i][k: k + seq_length].unsqueeze(1)
predicted_tags = model.decode(batch_tokens.to(device)) predicted_tags = model.decode(batch_tokens.to(device))
print('dupa') predicted += predicted_tags[0]
correct += [x[0] for x in tags.numpy().tolist()]
print(classification_report(correct, predicted))
print(accuracy_score(correct, predicted))
print(f1_score(correct, predicted, average="weighted"))
predicted = list(map(lambda x: inv_labels_vocab[x], predicted))
slices = [len(x.split(" ")) for x in data["text"]]
with open(save_path, "a") as save:
accumulator = 0
for slice in slices:
save.write(' '.join(predicted[accumulator: accumulator + slice]))
accumulator += slice