Przeniesienie trenowania, łączenie aktów

2021-05-17 11:20:18 +02:00 · 2021-05-17 11:20:18 +02:00 · 65e298c256
commit 65e298c256
parent 3c39ab377d
2 changed files with 112 additions and 57 deletions
--- a/Makiety.py
+++ b/Makiety.py
@ -1,18 +1,8 @@
 import jsgf
-import codecs
-from conllu import parse_incr
 from tabulate import tabulate
-import os.path
-
-
-from flair.data import Corpus, Sentence, Token
+from flair.data import Sentence, Token
 from flair.datasets import SentenceDataset
-from flair.embeddings import StackedEmbeddings
-from flair.embeddings import WordEmbeddings
-from flair.embeddings import CharacterEmbeddings
-from flair.embeddings import FlairEmbeddings
 from flair.models import SequenceTagger
-from flair.trainers import ModelTrainer

 import random
 import torch
@ -30,7 +20,7 @@ class ML_NLU:
    def __init__(self, acts, arguments):
        self.acts = acts
        self.arguments = arguments
-        self.model = self.setup()
+        # setup() returns a (slot_model, frame_model) pair of loaded taggers.
+        self.slot_model, self.frame_model = self.setup()

    def nolabel2o(self, line, i):
        return 'O' if line[i] == 'NoLabel' else line[i]
@ -54,60 +44,47 @@ class ML_NLU:
        return SentenceDataset(fsentences)


-    def predict(self, model, sentence):
+    def predict(self, sentence):
+        """Tag a tokenized utterance with slots and pick its dialogue act.
+
+        Both loaded taggers are run over the same sentence. The scores of all
+        'frame' labels are summed per label value across the tokens, and the
+        label with the highest total is reported as the act.
+
+        Args:
+            sentence: sequence of word strings (an already split utterance).
+
+        Returns:
+            A pair ``(slots, act)``: ``slots`` is a list of
+            ``(word, slot_tag)`` tuples in input order, ``act`` is the
+            best-scoring frame label, or None when no frame label was
+            predicted at all (e.g. an empty sentence).
+        """
        csentence = [{'form': word} for word in sentence]
        fsentence = self.conllu2flair([csentence])[0]
-        model.predict(fsentence)
-        return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)]
+        self.slot_model.predict(fsentence)
+        self.frame_model.predict(fsentence)
+        possible_intents = {}
+        for token in fsentence:
+            for intent in token.annotation_layers.get("frame", []):
+                accumulated = possible_intents.get(intent.value, 0)
+                possible_intents[intent.value] = accumulated + intent.score
+        # max() over the bare dict would compare the label names themselves;
+        # the act has to be selected by its accumulated score instead.
+        if possible_intents:
+            best_intent = max(possible_intents, key=possible_intents.get)
+        else:
+            best_intent = None
+        slots = [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)]
+        return slots, best_intent

    def setup(self):
-
-        if os.path.isfile('slot-model/final-model.pt'):
-            model = SequenceTagger.load('slot-model/final-model.pt')
-        else:
-            fields = ['id', 'form', 'frame', 'slot']
-
-            with open('Janet.conllu', encoding='utf-8') as trainfile:
-                trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': self.nolabel2o}))
-            with open('Janet.conllu', encoding='utf-8') as testfile:
-                testset = list(parse_incr(testfile, fields=fields, field_parsers={'slot': self.nolabel2o}))
-
-            tabulate(trainset[0], tablefmt='html')
-
-            corpus = Corpus(train=self.conllu2flair(trainset, 'slot'), test=self.conllu2flair(testset, 'slot'))
-            tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')
-
-            embedding_types = [
-            WordEmbeddings('pl'),
-            FlairEmbeddings('pl-forward'),
-            FlairEmbeddings('pl-backward'),
-            CharacterEmbeddings(),
-            ]
-
-            embeddings = StackedEmbeddings(embeddings=embedding_types)
-            tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
-                                    tag_dictionary=tag_dictionary,
-                                    tag_type='slot', use_crf=True)
-
-            trainer = ModelTrainer(tagger, corpus)
-            trainer.train('slot-model',
-                        learning_rate=0.1,
-                        mini_batch_size=32,
-                        max_epochs=10,
-                        train_with_dev=False)
-
-            model = SequenceTagger.load('slot-model/final-model.pt')
-        
-
-        return model
+        # Load the two saved taggers, slot model first, and hand them back
+        # as a (slot, frame) pair; no training happens here any more.
+        model_paths = ('slot-model/final-model.pt', 'frame-model/final-model.pt')
+        slot_tagger, frame_tagger = (SequenceTagger.load(path) for path in model_paths)
+        return slot_tagger, frame_tagger

    def test_nlu(self, utterance):
-
+        """Run NLU on a raw utterance string.
+
+        Returns a dict with the predicted 'act' and the list of 'slots'
+        (tokens tagged 'O' are dropped before the slots are merged), or the
+        string 'Critical Error' when the utterance is empty/falsy.
+        """
        if utterance:
-            return tabulate(self.predict(self.model, utterance.split()), tablefmt='tsv')
-            
+            tagged, act = self.predict(utterance.split())
+            labelled = [pair for pair in tagged if pair[1] != 'O']
+            return {'act': act, 'slots': self.convert_slot_to_argument(labelled)}
        else:
            return 'Critical Error'
+    
+    def convert_slot_to_argument(self, slots):
+        """Merge BIO-tagged tokens into (slot name, text) arguments.
+
+        A 'B-<name>' tag opens a new argument; following 'I-<name>' tags with
+        exactly the same name extend it, joined by single spaces. 'I-' tags
+        that do not continue the currently open argument are ignored, as are
+        tags with any other prefix.
+
+        Args:
+            slots: iterable of (word, tag) pairs.
+
+        Returns:
+            List of (slot name, merged text) tuples in order of appearance.
+        """
+        arguments = []
+        candidate = None  # [slot name, words collected so far] of the open span
+        for slot in slots:
+            word, tag = slot[0], slot[1]
+            prefix, name = tag[:2], tag[2:]
+            if prefix == "B-":
+                if candidate is not None:
+                    arguments.append(candidate)
+                candidate = [name, [word]]
+            # Require the full slot name to match: a suffix test would let
+            # e.g. 'I-subtitle' continue a 'title' span.
+            elif prefix == "I-" and candidate is not None and name == candidate[0]:
+                candidate[1].append(word)
+        if candidate is not None:
+            arguments.append(candidate)
+        return [(name, " ".join(words)) for name, words in arguments]

 class Book_NLU: #Natural Language Understanding
    """
--- a/train.py
+++ b/train.py
@ -0,0 +1,78 @@
+from conllu import parse_incr
+from tabulate import tabulate
+from flair.data import Corpus, Sentence, Token
+from flair.datasets import SentenceDataset
+from flair.embeddings import StackedEmbeddings
+from flair.embeddings import WordEmbeddings
+from flair.embeddings import CharacterEmbeddings
+from flair.embeddings import FlairEmbeddings
+from flair.models import SequenceTagger
+from flair.trainers import ModelTrainer
+
+def nolabel2o(line, i):
+    """Return field ``i`` of ``line``, with 'NoLabel' replaced by 'O'."""
+    value = line[i]
+    if value == 'NoLabel':
+        return 'O'
+    return value
+
+def conllu2flair(sentences, label=None):
+    """Convert parsed CoNLL-U sentences into a flair SentenceDataset.
+
+    Each token's 'form' becomes a flair Token; when ``label`` is given, the
+    token's value under that key is attached as a tag of the same name.
+    """
+    def build(conllu_sentence):
+        flair_sentence = Sentence()
+        for entry in conllu_sentence:
+            flair_token = Token(entry['form'])
+            if label:
+                flair_token.add_tag(label, entry[label])
+            flair_sentence.add_token(flair_token)
+        return flair_sentence
+
+    return SentenceDataset([build(conllu_sentence) for conllu_sentence in sentences])
+
+# Column names handed to the CoNLL-U parser for Janet.conllu.
+fields = ['id', 'form', 'frame', 'slot']
+
+# The same file is parsed twice: once mapping 'NoLabel' to 'O' in the slot
+# column, once doing the same in the frame column.
+with open('Janet.conllu', encoding='utf-8') as trainfile:
+    slot_trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': nolabel2o}))
+with open('Janet.conllu', encoding='utf-8') as trainfile:
+    frame_trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'frame': nolabel2o}))
+
+# NOTE(review): the rendered table is discarded, so this call has no visible effect.
+tabulate(slot_trainset[0], tablefmt='html')
+
+
+# NOTE(review): train and test are built from the same sentences, so any test
+# score is measured on the training data.
+slot_corpus = Corpus(train=conllu2flair(slot_trainset, 'slot'), test=conllu2flair(slot_trainset, 'slot'))
+frame_corpus = Corpus(train=conllu2flair(frame_trainset, 'frame'), test=conllu2flair(frame_trainset, 'frame'))
+
+slot_tag_dictionary = slot_corpus.make_tag_dictionary(tag_type='slot')
+frame_tag_dictionary = frame_corpus.make_tag_dictionary(tag_type='frame')
+
+
+embedding_types = [
+    WordEmbeddings('pl'),
+    FlairEmbeddings('pl-forward'),
+    FlairEmbeddings('pl-backward'),
+    CharacterEmbeddings(),
+]
+
+# NOTE(review): both taggers below receive the same StackedEmbeddings instance.
+embeddings = StackedEmbeddings(embeddings=embedding_types)
+slot_tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
+                        tag_dictionary=slot_tag_dictionary,
+                        tag_type='slot', use_crf=True)
+frame_tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
+                        tag_dictionary=frame_tag_dictionary,
+                        tag_type='frame', use_crf=True)
+
+# Slot-model training is currently disabled; running this script trains only
+# the frame model and writes it under 'frame-model'.
+# slot_trainer = ModelTrainer(slot_tagger, slot_corpus)
+# slot_trainer.train('slot-model',
+#             learning_rate=0.1,
+#             mini_batch_size=32,
+#             max_epochs=100,
+#             train_with_dev=False)
+
+            
+frame_trainer = ModelTrainer(frame_tagger, frame_corpus)
+frame_trainer.train('frame-model',
+            learning_rate=0.1,
+            mini_batch_size=32,
+            max_epochs=100,
+            train_with_dev=False)