From 65e298c256b37e578734fac4e2b0bdd2eb344e4b Mon Sep 17 00:00:00 2001 From: Anna Nowak Date: Mon, 17 May 2021 11:20:18 +0200 Subject: [PATCH] =?UTF-8?q?Przeniesienie=20trenowania,=20=C5=82=C4=85czeni?= =?UTF-8?q?e=20akt=C3=B3w?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makiety.py | 91 ++++++++++++++++++++---------------------------------- train.py | 78 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 57 deletions(-) create mode 100644 train.py diff --git a/Makiety.py b/Makiety.py index af358b5..8a16597 100644 --- a/Makiety.py +++ b/Makiety.py @@ -1,18 +1,8 @@ import jsgf -import codecs -from conllu import parse_incr from tabulate import tabulate -import os.path - - -from flair.data import Corpus, Sentence, Token +from flair.data import Sentence, Token from flair.datasets import SentenceDataset -from flair.embeddings import StackedEmbeddings -from flair.embeddings import WordEmbeddings -from flair.embeddings import CharacterEmbeddings -from flair.embeddings import FlairEmbeddings from flair.models import SequenceTagger -from flair.trainers import ModelTrainer import random import torch @@ -30,7 +20,7 @@ class ML_NLU: def __init__(self, acts, arguments): self.acts = acts self.arguments = arguments - self.model = self.setup() + self.slot_model, self.frame_model = self.setup() def nolabel2o(self, line, i): return 'O' if line[i] == 'NoLabel' else line[i] @@ -54,60 +44,47 @@ class ML_NLU: return SentenceDataset(fsentences) - def predict(self, model, sentence): + def predict(self, sentence): csentence = [{'form': word} for word in sentence] fsentence = self.conllu2flair([csentence])[0] - model.predict(fsentence) - return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)] + self.slot_model.predict(fsentence) + self.frame_model.predict(fsentence) + possible_intents = {} + for token in fsentence: + for intent in token.annotation_layers["frame"]: + if(intent.value in possible_intents): + possible_intents[intent.value] += intent.score + else: + possible_intents[intent.value] = intent.score + return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)], max(possible_intents) def setup(self): - - if os.path.isfile('slot-model/final-model.pt'): - model = SequenceTagger.load('slot-model/final-model.pt') - else: - fields = ['id', 'form', 'frame', 'slot'] - - with open('Janet.conllu', encoding='utf-8') as trainfile: - trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': self.nolabel2o})) - with open('Janet.conllu', encoding='utf-8') as testfile: - testset = list(parse_incr(testfile, fields=fields, field_parsers={'slot': self.nolabel2o})) - - tabulate(trainset[0], tablefmt='html') - - corpus = Corpus(train=self.conllu2flair(trainset, 'slot'), test=self.conllu2flair(testset, 'slot')) - tag_dictionary = corpus.make_tag_dictionary(tag_type='slot') - - embedding_types = [ - WordEmbeddings('pl'), - FlairEmbeddings('pl-forward'), - FlairEmbeddings('pl-backward'), - CharacterEmbeddings(), - ] - - embeddings = StackedEmbeddings(embeddings=embedding_types) - tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, - tag_dictionary=tag_dictionary, - tag_type='slot', use_crf=True) - - trainer = ModelTrainer(tagger, corpus) - trainer.train('slot-model', - learning_rate=0.1, - mini_batch_size=32, - max_epochs=10, - train_with_dev=False) - - model = SequenceTagger.load('slot-model/final-model.pt') - - - return model + slot_model = SequenceTagger.load('slot-model/final-model.pt') + frame_model = SequenceTagger.load('frame-model/final-model.pt') + return slot_model, frame_model def test_nlu(self, utterance): - if utterance: - return tabulate(self.predict(self.model, utterance.split()), tablefmt='tsv') - + slots, act = self.predict(utterance.split()) + slots = [x for x in slots if x[1] != 'O'] + arguments = self.convert_slot_to_argument(slots) + return {'act': act, 'slots': arguments} else: return 'Critical Error' + + def convert_slot_to_argument(self, slots): + arguments = [] + candidate = None + for slot in slots: + if slot[1].startswith("B-"): + if(candidate != None): + arguments.append(candidate) + candidate = [slot[1].replace("B-", ""), slot[0]] + if slot[1].startswith("I-") and candidate != None and slot[1].endswith(candidate[0]): + candidate[1] += " " + slot[0] + if(candidate != None): + arguments.append(candidate) + return [(x[0], x[1]) for x in arguments] class Book_NLU: #Natural Language Understanding """ diff --git a/train.py b/train.py new file mode 100644 index 0000000..5a4c0c9 --- /dev/null +++ b/train.py @@ -0,0 +1,78 @@ +from conllu import parse_incr +from tabulate import tabulate +from flair.data import Corpus, Sentence, Token +from flair.datasets import SentenceDataset +from flair.embeddings import StackedEmbeddings +from flair.embeddings import WordEmbeddings +from flair.embeddings import CharacterEmbeddings +from flair.embeddings import FlairEmbeddings +from flair.models import SequenceTagger +from flair.trainers import ModelTrainer + +def nolabel2o(line, i): + return 'O' if line[i] == 'NoLabel' else line[i] + +def conllu2flair(sentences, label=None): + fsentences = [] + + for sentence in sentences: + fsentence = Sentence() + + for token in sentence: + ftoken = Token(token['form']) + + if label: + ftoken.add_tag(label, token[label]) + + fsentence.add_token(ftoken) + + fsentences.append(fsentence) + + return SentenceDataset(fsentences) + +fields = ['id', 'form', 'frame', 'slot'] + +with open('Janet.conllu', encoding='utf-8') as trainfile: + slot_trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': nolabel2o})) +with open('Janet.conllu', encoding='utf-8') as trainfile: + frame_trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'frame': nolabel2o})) + +tabulate(slot_trainset[0], tablefmt='html') + + +slot_corpus = Corpus(train=conllu2flair(slot_trainset, 'slot'), test=conllu2flair(slot_trainset, 'slot')) +frame_corpus = Corpus(train=conllu2flair(frame_trainset, 'frame'), test=conllu2flair(frame_trainset, 'frame')) + +slot_tag_dictionary = slot_corpus.make_tag_dictionary(tag_type='slot') +frame_tag_dictionary = frame_corpus.make_tag_dictionary(tag_type='frame') + + +embedding_types = [ + WordEmbeddings('pl'), + FlairEmbeddings('pl-forward'), + FlairEmbeddings('pl-backward'), + CharacterEmbeddings(), +] + +embeddings = StackedEmbeddings(embeddings=embedding_types) +slot_tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, + tag_dictionary=slot_tag_dictionary, + tag_type='slot', use_crf=True) +frame_tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, + tag_dictionary=frame_tag_dictionary, + tag_type='frame', use_crf=True) + +# slot_trainer = ModelTrainer(slot_tagger, slot_corpus) +# slot_trainer.train('slot-model', +# learning_rate=0.1, +# mini_batch_size=32, +# max_epochs=100, +# train_with_dev=False) + + +frame_trainer = ModelTrainer(frame_tagger, frame_corpus) +frame_trainer.train('frame-model', + learning_rate=0.1, + mini_batch_size=32, + max_epochs=100, + train_with_dev=False) \ No newline at end of file