Przeniesienie trenowania, łączenie aktów

This commit is contained in:
Anna Nowak 2021-05-17 11:20:18 +02:00
parent 3c39ab377d
commit 65e298c256
2 changed files with 112 additions and 57 deletions

View File

@@ -1,18 +1,8 @@
import jsgf import jsgf
import codecs
from conllu import parse_incr
from tabulate import tabulate from tabulate import tabulate
import os.path from flair.data import Sentence, Token
from flair.data import Corpus, Sentence, Token
from flair.datasets import SentenceDataset from flair.datasets import SentenceDataset
from flair.embeddings import StackedEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.models import SequenceTagger from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
import random import random
import torch import torch
@@ -30,7 +20,7 @@ class ML_NLU:
def __init__(self, acts, arguments):
    """Store the configuration and load the two pre-trained taggers.

    Args:
        acts: stored unchanged on ``self.acts``.
        arguments: stored unchanged on ``self.arguments``.
    """
    self.acts = acts
    self.arguments = arguments
    # setup() returns the pair (slot tagger, frame tagger), in that order.
    self.slot_model, self.frame_model = self.setup()
def nolabel2o(self, line, i):
    """Return field ``i`` of ``line``, mapping the 'NoLabel' marker to 'O'.

    Args:
        line: indexable sequence of field values.
        i: index of the field to read.

    Returns:
        'O' when the field equals 'NoLabel', otherwise the field unchanged.
    """
    value = line[i]
    return 'O' if value == 'NoLabel' else value
@@ -54,61 +44,48 @@ class ML_NLU:
return SentenceDataset(fsentences) return SentenceDataset(fsentences)
def predict(self, sentence):
    """Tag a tokenized utterance with slots and pick its most likely intent.

    Both taggers annotate the same flair sentence in place. Every 'frame'
    label found on any token contributes its score to that intent's total,
    and the intent with the highest total is returned.

    Args:
        sentence: list of word strings.

    Returns:
        A pair ``(slots, intent)`` where ``slots`` is a list of
        ``(word, slot_tag)`` tuples in input order and ``intent`` is the
        intent name with the highest summed score, or ``None`` when no
        'frame' label was produced.
    """
    csentence = [{'form': word} for word in sentence]
    fsentence = self.conllu2flair([csentence])[0]
    self.slot_model.predict(fsentence)
    self.frame_model.predict(fsentence)

    # Sum the scores of every 'frame' label per intent name.
    possible_intents = {}
    for token in fsentence:
        for intent in token.annotation_layers.get("frame", ()):
            possible_intents[intent.value] = (
                possible_intents.get(intent.value, 0.0) + intent.score
            )

    # Select by accumulated score; a bare max() over the dict would compare
    # the intent *names* and return the alphabetically greatest one.
    best_intent = max(possible_intents, key=possible_intents.get, default=None)

    slots = [
        (word, ftoken.get_tag('slot').value)
        for word, ftoken in zip(sentence, fsentence)
    ]
    return slots, best_intent
def setup(self):
    """Load the pre-trained slot and frame taggers from disk.

    No training happens here; both model files must already exist at the
    paths below.

    Returns:
        A pair ``(slot_model, frame_model)`` of loaded ``SequenceTagger``
        instances.
    """
    slot_model = SequenceTagger.load('slot-model/final-model.pt')
    frame_model = SequenceTagger.load('frame-model/final-model.pt')
    return slot_model, frame_model
def test_nlu(self, utterance):
    """Run NLU on a raw utterance and return its act and slot arguments.

    Args:
        utterance: text to analyse; it is split on whitespace.

    Returns:
        ``{'act': <intent>, 'slots': <list of (slot_name, text)>}`` for a
        non-empty utterance, or the string 'Critical Error' when the
        utterance is empty or otherwise falsy.
    """
    if not utterance:
        return 'Critical Error'

    slots, act = self.predict(utterance.split())
    # Keep only tokens that carry a slot label.
    slots = [x for x in slots if x[1] != 'O']
    arguments = self.convert_slot_to_argument(slots)
    return {'act': act, 'slots': arguments}
def convert_slot_to_argument(self, slots):
    """Merge BIO-tagged tokens into ``(slot_name, text)`` arguments.

    A 'B-<name>' tag opens a new argument. A following 'I-<name>' tag with
    exactly the same name appends its word to the open argument, separated
    by a space. 'I-' tags that have no open argument, or whose name differs
    from it, are ignored.

    Args:
        slots: iterable of ``(word, tag)`` pairs in sentence order.

    Returns:
        List of ``(slot_name, text)`` tuples in order of appearance.
    """
    arguments = []
    candidate = None  # [slot_name, text] of the argument being built
    for slot in slots:
        word, tag = slot[0], slot[1]
        if tag.startswith("B-"):
            if candidate is not None:
                arguments.append(candidate)
            # Strip only the leading prefix, not every "B-" in the tag.
            candidate = [tag[2:], word]
        elif (
            tag.startswith("I-")
            and candidate is not None
            # Exact name match: a suffix test would let 'I-datetime'
            # continue a 'time' argument.
            and tag[2:] == candidate[0]
        ):
            candidate[1] += " " + word
    if candidate is not None:
        arguments.append(candidate)
    return [(name, text) for name, text in arguments]
class Book_NLU: #Natural Language Understanding class Book_NLU: #Natural Language Understanding
""" """
Moduł odpowiedzialny za analizę tekstu. W wyniku jego działania tekstowa reprezentacja wypowiedzi użytkownika zostaje zamieniona na jej reprezentację semantyczną, najczęściej w postaci ramy. Moduł odpowiedzialny za analizę tekstu. W wyniku jego działania tekstowa reprezentacja wypowiedzi użytkownika zostaje zamieniona na jej reprezentację semantyczną, najczęściej w postaci ramy.

78
train.py Normal file
View File

@@ -0,0 +1,78 @@
from conllu import parse_incr
from tabulate import tabulate
from flair.data import Corpus, Sentence, Token
from flair.datasets import SentenceDataset
from flair.embeddings import StackedEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
def nolabel2o(line, i):
    """Return field ``i`` of ``line``, mapping the 'NoLabel' marker to 'O'.

    Passed to ``parse_incr`` as a field parser for the label columns.

    Args:
        line: indexable sequence of field values.
        i: index of the field to read.

    Returns:
        'O' when the field equals 'NoLabel', otherwise the field unchanged.
    """
    value = line[i]
    return 'O' if value == 'NoLabel' else value
def conllu2flair(sentences, label=None):
    """Convert parsed CoNLL-U sentences into a flair ``SentenceDataset``.

    Args:
        sentences: iterable of sentences, each an iterable of token
            mappings with at least a 'form' entry.
        label: optional name of the token field to copy onto each flair
            token as a tag of the same name; when falsy no tags are added.

    Returns:
        A ``SentenceDataset`` wrapping one flair ``Sentence`` per input
        sentence.
    """
    fsentences = []
    for sentence in sentences:
        fsentence = Sentence()
        for token in sentence:
            ftoken = Token(token['form'])
            if label:
                ftoken.add_tag(label, token[label])
            fsentence.add_token(ftoken)
        fsentences.append(fsentence)
    return SentenceDataset(fsentences)
# Column layout used when parsing each token line of Janet.conllu.
fields = ['id', 'form', 'frame', 'slot']

# Parse the corpus once, normalising 'NoLabel' to 'O' in both label columns.
# Each corpus below reads only its own label column, so one parsed copy can
# back both of them.
with open('Janet.conllu', encoding='utf-8') as trainfile:
    slot_trainset = list(parse_incr(
        trainfile,
        fields=fields,
        field_parsers={'slot': nolabel2o, 'frame': nolabel2o},
    ))
frame_trainset = slot_trainset

# NOTE(review): the test split is built from the same sentences as the train
# split, so reported test scores measure fit to the training data only.
slot_corpus = Corpus(train=conllu2flair(slot_trainset, 'slot'),
                     test=conllu2flair(slot_trainset, 'slot'))
frame_corpus = Corpus(train=conllu2flair(frame_trainset, 'frame'),
                      test=conllu2flair(frame_trainset, 'frame'))

slot_tag_dictionary = slot_corpus.make_tag_dictionary(tag_type='slot')
frame_tag_dictionary = frame_corpus.make_tag_dictionary(tag_type='frame')

# One stacked embedding instance is passed to both taggers.
embedding_types = [
    WordEmbeddings('pl'),
    FlairEmbeddings('pl-forward'),
    FlairEmbeddings('pl-backward'),
    CharacterEmbeddings(),
]
embeddings = StackedEmbeddings(embeddings=embedding_types)

slot_tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
                             tag_dictionary=slot_tag_dictionary,
                             tag_type='slot', use_crf=True)
frame_tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
                              tag_dictionary=frame_tag_dictionary,
                              tag_type='frame', use_crf=True)

# NOTE(review): slot training is currently disabled, so this script only
# writes the frame model. Re-enable the block below to (re)train the slot
# model into 'slot-model'.
# slot_trainer = ModelTrainer(slot_tagger, slot_corpus)
# slot_trainer.train('slot-model',
#                    learning_rate=0.1,
#                    mini_batch_size=32,
#                    max_epochs=100,
#                    train_with_dev=False)

frame_trainer = ModelTrainer(frame_tagger, frame_corpus)
frame_trainer.train('frame-model',
                    learning_rate=0.1,
                    mini_batch_size=32,
                    max_epochs=100,
                    train_with_dev=False)