aitech-sd-lab/modules/NLU.py

import re
from conllu import parse_incr
from flair.data import Corpus, Sentence, Token
from flair.datasets import SentenceDataset
from flair.embeddings import StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
import random
import torch
from flair.datasets import CSVClassificationCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, CharacterEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
import os


class NLU:
    """Natural-language-understanding helper built on two flair models.

    ``slot_model`` (a ``SequenceTagger``) tags every token of an utterance
    with a slot label; ``intent_model`` (a ``TextClassifier``) assigns an
    intent label to the whole utterance.  Both attributes are ``None`` until
    the matching ``load_*`` or ``train_*`` method has been called.
    """

    def __init__(self):
        self.slot_model = None
        self.intent_model = None

    def nolabel2o(self, line, i):
        """Return field ``i`` of ``line``, mapping ``'NoLabel'`` to ``'O'``.

        Used as the ``slot`` field parser for ``conllu.parse_incr``.
        """
        return 'O' if line[i] == 'NoLabel' else line[i]

    def conllu2flair(self, sentences, label=None):
        """Convert parsed CoNLL-U sentences into a flair ``SentenceDataset``.

        Args:
            sentences: iterable of sentences; each sentence is an iterable of
                mappings that have at least a ``'form'`` key.
            label: optional field name; when given, ``token[label]`` is
                attached to every flair token as a tag of that same name.

        Returns:
            A ``SentenceDataset`` with one flair ``Sentence`` per input
            sentence.
        """
        fsentences = []
        for sentence in sentences:
            fsentence = Sentence()
            for token in sentence:
                ftoken = Token(token['form'])
                if label:
                    ftoken.add_tag(label, token[label])
                fsentence.add_token(ftoken)
            fsentences.append(fsentence)
        return SentenceDataset(fsentences)

    @staticmethod
    def _load_best_or_final(model_cls, model_path):
        """Load ``best-model.pt`` from ``model_path``, else ``final-model.pt``.

        ``model_cls`` is any class exposing a ``load(path)`` callable.  Only
        ``Exception`` subclasses trigger the fallback, so ``KeyboardInterrupt``
        and ``SystemExit`` raised while loading are no longer swallowed.  An
        error raised while loading ``final-model.pt`` propagates to the
        caller.
        """
        try:
            return model_cls.load(f'{model_path}/best-model.pt')
        except Exception:
            # best-model.pt may be missing or unreadable; use the final
            # checkpoint instead.
            return model_cls.load(f'{model_path}/final-model.pt')

    @staticmethod
    def _write_evaluation_file(log_path, eval_path):
        """Copy the final ``Results:`` section of a training log to a file.

        The output always starts with a fixed header line.  Everything from
        the last line equal to ``'Results:\\n'`` to the end of the log is
        appended after it; when the log has no such line only the header is
        written.

        Raises:
            OSError: if the log cannot be read or the output cannot be
                written.
        """
        with open(log_path, encoding='utf-8') as log_file:
            log_lines = log_file.readlines()

        # Keep the index of the LAST marker line.
        results_start = None
        for num, line in enumerate(log_lines):
            if line == 'Results:\n':
                results_start = num

        with open(eval_path, 'w', encoding='utf-8') as eval_file:
            eval_file.write('*** This evaluation file was generated automatically by the training script ***\n\n')
            if results_start is not None:
                eval_file.writelines(log_lines[results_start:])

    def _read_slot_conllu(self, path):
        """Parse a CoNLL-U file with columns ``id``, ``form``, ``frame``, ``slot``."""
        fields = ['id', 'form', 'frame', 'slot']
        with open(path, encoding='utf-8') as conllu_file:
            return list(parse_incr(conllu_file, fields=fields, field_parsers={'slot': self.nolabel2o}))

    def load_slot_model(self, model_path):
        """Load the slot tagger stored in directory ``model_path`` into ``self.slot_model``."""
        self.slot_model = self._load_best_or_final(SequenceTagger, model_path)

    def train_slot_model(self, train_path, test_path):
        """Train (or reuse) the slot tagger and write its evaluation summary.

        Training runs only when the directory ``slot-model-pl`` does not exist
        yet; in either case the model in that directory is then loaded into
        ``self.slot_model`` and the ``Results:`` section of its
        ``training.log`` is copied to ``nlu_evaluation.txt``.

        Args:
            train_path: path to the training CoNLL-U file.
            test_path: path to the test CoNLL-U file.
        """
        trainset = self._read_slot_conllu(train_path)
        testset = self._read_slot_conllu(test_path)

        # Fixed seeds for reproducible training runs.
        random.seed(42)
        torch.manual_seed(42)

        if torch.cuda.is_available():
            # NOTE(review): the CUDA seeds are 0 while the CPU seeds are 42 -
            # confirm this difference is intended.
            torch.cuda.manual_seed(0)
            torch.cuda.manual_seed_all(0)
            torch.backends.cudnn.enabled = False
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

        corpus = Corpus(train=self.conllu2flair(trainset, 'slot'), test=self.conllu2flair(testset, 'slot'))
        tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')

        embedding_types = [
            WordEmbeddings('pl'),
            FlairEmbeddings('pl-forward'),
            FlairEmbeddings('pl-backward'),
            CharacterEmbeddings(),
        ]

        embeddings = StackedEmbeddings(embeddings=embedding_types)
        tagger = SequenceTagger(hidden_size=512, embeddings=embeddings,
                                tag_dictionary=tag_dictionary,
                                tag_type='slot', use_crf=True)
        trainer = ModelTrainer(tagger, corpus)

        dirpath = 'slot-model-pl'

        # An existing directory is treated as an already trained model.
        if not os.path.isdir(dirpath):
            trainer.train(dirpath,
                          learning_rate=0.1,
                          mini_batch_size=32,
                          max_epochs=20,
                          train_with_dev=True)

        self.load_slot_model(dirpath)

        # Create a separate file holding the model's metrics.
        self._write_evaluation_file(f'{dirpath}/training.log', 'nlu_evaluation.txt')

    def predict_slots(self, sentence):
        """Tag each whitespace-separated token of ``sentence`` with a slot.

        Returns:
            A list of ``(token_text, slot_tag)`` tuples in input order.
        """
        sentence = sentence.split()
        csentence = [{'form': word} for word in sentence]
        fsentence = self.conllu2flair([csentence])[0]
        self.slot_model.predict(fsentence)
        return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)]

    def load_intent_model(self, model_path):
        """Load the intent classifier stored in directory ``model_path`` into ``self.intent_model``."""
        self.intent_model = self._load_best_or_final(TextClassifier, model_path)

    def train_intent_model(self, data_path):
        """Train (or reuse) the intent classifier.

        ``data_path`` is handed to ``CSVClassificationCorpus`` and read as
        tab-separated data without a header row: column 0 is the text,
        column 1 the intent label.  Training runs only when the directory
        ``intent-model-pl`` does not exist yet; the model in that directory
        is then loaded into ``self.intent_model``.
        """
        column_name_map = {0: "text", 1: "label_intent"}
        corpus = CSVClassificationCorpus(data_path,
                                         column_name_map,
                                         skip_header=False,
                                         delimiter='\t', label_type='label_intent'
                                         )
        label_dict = corpus.make_label_dictionary(label_type='label_intent')

        word_embeddings = [
            WordEmbeddings('pl'),
            FlairEmbeddings('polish-forward'),
            FlairEmbeddings('polish-backward'),
            CharacterEmbeddings(),
        ]

        document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=512)
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type='label_intent')
        trainer = ModelTrainer(classifier, corpus)

        dirpath = 'intent-model-pl'

        # An existing directory is treated as an already trained model.
        if not os.path.isdir(dirpath):
            trainer.train(dirpath,
                          learning_rate=0.1,
                          mini_batch_size=32,
                          anneal_factor=0.5,
                          patience=5,
                          max_epochs=20)

        self.load_intent_model(dirpath)

    def predict_intent(self, sentence):
        """Return the first predicted intent label for ``sentence``.

        Returns an empty string when the classifier attached no label.
        """
        sentence = Sentence(sentence)
        self.intent_model.predict(sentence)
        labels = sentence.labels
        return labels[0].value if labels else ''


# Punctuation stripped from both ends of a slot value.  Raw strings are used
# so that ``\[`` and ``\]`` are regex escapes, not invalid string escapes.
_EDGE_PUNCT_CLASS = r'''[!"#$%&'()*+,.;:<=>?\[\]^_`{|}~]+'''
_EDGE_PUNCT_RE = re.compile('^' + _EDGE_PUNCT_CLASS + '|' + _EDGE_PUNCT_CLASS + '$')


def format_prediction(prediction, intent, domain='Cinema'):
    """Group tagged tokens into ``[intent, domain, slot, value]`` lists.

    Args:
        prediction: sequence of ``(token_text, tag)`` pairs.  A tag starting
            with ``'B'`` opens a slot, and directly following tags starting
            with ``'I'`` extend it; every other token is ignored.
        intent: intent label copied into each output entry.
        domain: domain label copied into each output entry.

    Returns:
        One list per ``'B'`` tag, in input order.  The slot name is the tag
        with its first two characters removed; the value is the slot's tokens
        joined by single spaces with punctuation stripped from both ends.
    """
    out_list = []
    for idx, item in enumerate(prediction):
        word, tag = item[0], item[1]
        # startswith() also copes with an empty tag, where tag[0] would raise.
        if not tag.startswith('B'):
            continue
        words = [word]
        for follower in prediction[idx + 1:]:
            if not follower[1].startswith('I'):
                break
            words.append(follower[0])
        value = _EDGE_PUNCT_RE.sub('', ' '.join(words))
        out_list.append([intent, domain, tag[2:], value])
    return out_list


# Tests (disabled: kept below as an inert string literal)
"""
nlu = NLU()
# raz:
nlu.train_slot_model('../data/train+test-pl.conllu', '../data/train+test-pl.conllu')
nlu.train_intent_model('../data/intent_data')
# potem:
# nlu.load_slot_model('slot-model-pl')
# nlu.load_intent_model('intent-model-pl')
sentence = "3 studenckie, miejsca 2-5, rząd 7"
slots = nlu.predict_slots(sentence)
intent = nlu.predict_intent(sentence)
formatted_prediction = format_prediction(slots, intent)
print(formatted_prediction)
"""