import os
import random
import re

import torch
from conllu import parse_incr
from flair.data import Corpus, Sentence, Token
from flair.datasets import CSVClassificationCorpus, SentenceDataset
from flair.embeddings import (CharacterEmbeddings, DocumentRNNEmbeddings,
                              FlairEmbeddings, StackedEmbeddings,
                              WordEmbeddings)
from flair.models import SequenceTagger, TextClassifier
from flair.trainers import ModelTrainer


class NLU:
    def __init__(self):
        self.slot_model = None
        self.intent_model = None

    def nolabel2o(self, line, i):
        # Map the 'NoLabel' placeholder used in the CoNLL-U files to the standard 'O' tag.
        return 'O' if line[i] == 'NoLabel' else line[i]

    def conllu2flair(self, sentences, label=None):
        # Convert parsed CoNLL-U sentences into a Flair SentenceDataset,
        # optionally copying the given column (e.g. 'slot') onto the tokens as tags.
        fsentences = []

        for sentence in sentences:
            fsentence = Sentence()

            for token in sentence:
                ftoken = Token(token['form'])

                if label:
                    ftoken.add_tag(label, token[label])

                fsentence.add_token(ftoken)

            fsentences.append(fsentence)

        return SentenceDataset(fsentences)

    def load_slot_model(self, model_path):
        try:
            self.slot_model = SequenceTagger.load(f'{model_path}/best-model.pt')
        except Exception:
            self.slot_model = SequenceTagger.load(f'{model_path}/final-model.pt')

    def train_slot_model(self, train_path, test_path):
        fields = ['id', 'form', 'frame', 'slot']

        with open(train_path, encoding='utf-8') as trainfile:
            trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': self.nolabel2o}))
        with open(test_path, encoding='utf-8') as testfile:
            testset = list(parse_incr(testfile, fields=fields, field_parsers={'slot': self.nolabel2o}))

        # Fix the random seeds so training runs are reproducible.
        random.seed(42)
        torch.manual_seed(42)

        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
            torch.cuda.manual_seed_all(0)
            torch.backends.cudnn.enabled = False
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

        corpus = Corpus(train=self.conllu2flair(trainset, 'slot'), test=self.conllu2flair(testset, 'slot'))
        tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')

        embedding_types = [
            WordEmbeddings('pl'),
            FlairEmbeddings('pl-forward'),
            FlairEmbeddings('pl-backward'),
            CharacterEmbeddings(),
        ]

        embeddings = StackedEmbeddings(embeddings=embedding_types)
        tagger = SequenceTagger(hidden_size=512,
                                embeddings=embeddings,
                                tag_dictionary=tag_dictionary,
                                tag_type='slot',
                                use_crf=True)

        trainer = ModelTrainer(tagger, corpus)

        dirpath = 'slot-model-pl'

        # Train only if no model has been saved to this directory yet.
        if not os.path.isdir(dirpath):
            trainer.train(dirpath,
                          learning_rate=0.1,
                          mini_batch_size=32,
                          max_epochs=20,
                          train_with_dev=True)

        self.load_slot_model(dirpath)

        # Create a separate file with the evaluation metrics reported in the training log.
        with open(f'{dirpath}/training.log', encoding='utf-8') as log_file:
            log_lines = log_file.readlines()

        # Find the final 'Results:' section of the training log.
        results_start = None
        for num, line in enumerate(log_lines):
            if line == 'Results:\n':
                results_start = num

        with open('nlu_evaluation.txt', 'w', encoding='utf-8') as eval_file:
            eval_file.write('*** This evaluation file was generated automatically by the training script ***\n\n')
            if results_start is not None:
                eval_file.writelines(log_lines[results_start:])

    def predict_slots(self, sentence):
        sentence = sentence.split()
        csentence = [{'form': word} for word in sentence]
        fsentence = self.conllu2flair([csentence])[0]
        self.slot_model.predict(fsentence)

        return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)]

    def load_intent_model(self, model_path):
        try:
            self.intent_model = TextClassifier.load(f'{model_path}/best-model.pt')
        except Exception:
            self.intent_model = TextClassifier.load(f'{model_path}/final-model.pt')

    def train_intent_model(self, data_path):
        # The intent corpus is tab-separated: first column the utterance, second the intent label.
        column_name_map = {0: 'text', 1: 'label_intent'}
        corpus = CSVClassificationCorpus(data_path,
                                         column_name_map,
                                         skip_header=False,
                                         delimiter='\t',
                                         label_type='label_intent')
        label_dict = corpus.make_label_dictionary(label_type='label_intent')

        word_embeddings = [
            WordEmbeddings('pl'),
            FlairEmbeddings('polish-forward'),
            FlairEmbeddings('polish-backward'),
            CharacterEmbeddings(),
        ]

        document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=512)
        classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type='label_intent')

        trainer = ModelTrainer(classifier, corpus)

        dirpath = 'intent-model-pl'

        # Train only if no model has been saved to this directory yet.
        if not os.path.isdir(dirpath):
            trainer.train(dirpath,
                          learning_rate=0.1,
                          mini_batch_size=32,
                          anneal_factor=0.5,
                          patience=5,
                          max_epochs=20)

        self.load_intent_model(dirpath)

    def predict_intent(self, sentence):
        sentence = Sentence(sentence)
        self.intent_model.predict(sentence)

        try:
            label_text = sentence.labels[0].value
        except IndexError:
            label_text = ''

        return label_text


def format_prediction(prediction, intent):
    # Group BIO-tagged tokens into [intent, domain, slot name, slot value] lists.
    out_list = []

    for idx, (word, tag) in enumerate(prediction):
        if tag[0] == 'B':
            slot_list = [intent, 'Cinema', tag[2:], word]

            # Append the following 'I-' tokens to the slot value.
            for next_word, next_tag in prediction[idx + 1:]:
                if next_tag[0] != 'I':
                    break
                slot_list[3] += ' ' + next_word

            out_list.append(slot_list)

    # Strip leading and trailing punctuation from the slot values.
    for slot in out_list:
        slot[3] = re.sub(r"^[!\"#$%&'()*+,.;:<=>?\[\]^_`{|}~]+", '', slot[3])
        slot[3] = re.sub(r"[!\"#$%&'()*+,.;:<=>?\[\]^_`{|}~]+$", '', slot[3])

    return out_list


# Tests
"""
nlu = NLU()

# first run (train the models):
nlu.train_slot_model('../data/train+test-pl.conllu', '../data/train+test-pl.conllu')
nlu.train_intent_model('../data/intent_data')

# afterwards (load the trained models):
# nlu.load_slot_model('slot-model-pl')
# nlu.load_intent_model('intent-model-pl')

sentence = "3 studenckie, miejsca 2-5, rząd 7"
slots = nlu.predict_slots(sentence)
intent = nlu.predict_intent(sentence)
formatted_prediction = format_prediction(slots, intent)
print(formatted_prediction)
"""
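
# Minimal sketch of the format_prediction() output, runnable without the trained Flair
# models. The BIO tags, slot names ('quantity', 'ticketType', 'seat', 'row') and the
# intent name 'book' below are hand-made, illustrative assumptions, not values taken
# from the training data; the actual slot and intent inventories come from the corpus.
if __name__ == '__main__':
    example_slots = [('3', 'B-quantity'), ('studenckie,', 'B-ticketType'),
                     ('miejsca', 'B-seat'), ('2-5,', 'I-seat'),
                     ('rząd', 'B-row'), ('7', 'I-row')]
    print(format_prediction(example_slots, 'book'))
    # Prints (trailing punctuation is stripped from the slot values):
    # [['book', 'Cinema', 'quantity', '3'],
    #  ['book', 'Cinema', 'ticketType', 'studenckie'],
    #  ['book', 'Cinema', 'seat', 'miejsca 2-5'],
    #  ['book', 'Cinema', 'row', 'rząd 7']]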