aitech-sd-lab/NLU_lab_7-8/NLU.py

from conllu import parse_incr
from flair.data import Corpus, Sentence, Token
from flair.datasets import SentenceDataset
from flair.embeddings import StackedEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
import random
import torch
import os


class NLU:
    """Slot tagger for Polish utterances built on a flair ``SequenceTagger``.

    Typical use: call :meth:`train_model` (or :meth:`load_model` for an
    already trained model) and then :meth:`predict` on raw sentences.
    """

    def __init__(self):
        # Tagger used by predict(); set by load_model() / train_model().
        self.model = None

    def nolabel2o(self, line, i):
        """Field parser for the slot column: map ``'NoLabel'`` to ``'O'``.

        Passed to ``parse_incr`` through ``field_parsers``.

        Args:
            line: Indexable sequence of column values for one token line.
            i: Index of the column being parsed.

        Returns:
            ``'O'`` when the column holds ``'NoLabel'``, otherwise the
            column value unchanged.
        """
        return 'O' if line[i] == 'NoLabel' else line[i]

    def conllu2flair(self, sentences, label=None):
        """Convert parsed sentences into a flair ``SentenceDataset``.

        Args:
            sentences: Iterable of sentences; each sentence is an iterable of
                token mappings that contain at least the ``'form'`` key.
            label: Optional key name. When given, ``token[label]`` is attached
                to every flair token as a tag of type ``label``.

        Returns:
            A ``SentenceDataset`` with one flair ``Sentence`` per input
            sentence, in the same order.
        """
        fsentences = []
        for sentence in sentences:
            fsentence = Sentence()
            for token in sentence:
                ftoken = Token(token['form'])
                if label:
                    ftoken.add_tag(label, token[label])
                fsentence.add_token(ftoken)
            fsentences.append(fsentence)
        return SentenceDataset(fsentences)

    def load_model(self, model_path):
        """Load a trained tagger from ``model_path`` into ``self.model``."""
        self.model = SequenceTagger.load(model_path)

    def train_model(self, train_path, test_path, model_dir='slot-model-pl',
                    eval_path='evaluation.txt'):
        """Train the slot tagger (if needed), load it and export its metrics.

        Training is skipped when ``model_dir`` already exists; in that case
        the model stored there is loaded as-is. Afterwards the trailing
        ``Results:`` section of ``<model_dir>/training.log`` is copied to
        ``eval_path``.

        Args:
            train_path: Path to the training file (columns: id, form, frame,
                slot), read as UTF-8.
            test_path: Path to the test file in the same format.
            model_dir: Directory where the trainer stores the model and
                its ``training.log``.
            eval_path: Destination of the generated evaluation summary.
        """
        fields = ['id', 'form', 'frame', 'slot']
        parsers = {'slot': self.nolabel2o}

        with open(train_path, encoding='utf-8') as trainfile:
            trainset = list(parse_incr(trainfile, fields=fields, field_parsers=parsers))
        with open(test_path, encoding='utf-8') as testfile:
            testset = list(parse_incr(testfile, fields=fields, field_parsers=parsers))

        # Fixed seeds and deterministic cuDNN settings for repeatable runs.
        random.seed(42)
        torch.manual_seed(42)

        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
            torch.cuda.manual_seed_all(0)
            torch.backends.cudnn.enabled = False
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

        if not os.path.isdir(model_dir):
            # The corpus, embeddings and tagger are only needed for training,
            # so they are not built when a trained model is already on disk.
            corpus = Corpus(train=self.conllu2flair(trainset, 'slot'),
                            test=self.conllu2flair(testset, 'slot'))
            tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')

            embedding_types = [
                WordEmbeddings('pl'),
                FlairEmbeddings('pl-forward'),
                FlairEmbeddings('pl-backward'),
                CharacterEmbeddings(),
            ]
            embeddings = StackedEmbeddings(embeddings=embedding_types)
            tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
                                    tag_dictionary=tag_dictionary,
                                    tag_type='slot', use_crf=True)

            trainer = ModelTrainer(tagger, corpus)
            trainer.train(model_dir,
                          learning_rate=0.1,
                          mini_batch_size=32,
                          max_epochs=10,
                          train_with_dev=True)

        # Prefer the best checkpoint and fall back to the final one.
        # ``except Exception`` keeps the best-effort fallback without
        # swallowing KeyboardInterrupt / SystemExit like a bare ``except``.
        try:
            self.load_model(os.path.join(model_dir, 'best-model.pt'))
        except Exception:
            self.load_model(os.path.join(model_dir, 'final-model.pt'))

        # Create a separate file with the metrics of the model.
        self._write_evaluation(os.path.join(model_dir, 'training.log'), eval_path)

    def _write_evaluation(self, log_path, eval_path):
        """Copy the last ``Results:`` section of the log to ``eval_path``.

        The output starts with a fixed header line. Everything from the last
        line equal to ``'Results:\\n'`` up to the end of the log follows it.
        When the log has no such line, only the header is written.
        """
        with open(log_path, encoding='utf-8') as log_file:
            log_lines = log_file.readlines()

        # Default to "nothing to copy"; the last marker found wins.
        start = len(log_lines)
        for num, line in enumerate(log_lines):
            if line == 'Results:\n':
                start = num

        with open(eval_path, 'w', encoding='utf-8') as eval_file:
            eval_file.write('*** This evaluation file was generated automatically by the training script ***\n\n')
            eval_file.writelines(log_lines[start:])

    def predict(self, sentence):
        """Tag every whitespace-separated word of ``sentence`` with a slot.

        Args:
            sentence: Raw utterance as a single string.

        Returns:
            A list of ``(word, slot_tag)`` tuples in input order; an empty
            list when ``sentence`` contains no words.

        Raises:
            RuntimeError: If no model has been loaded or trained yet.
        """
        if self.model is None:
            raise RuntimeError('No model loaded: call load_model() or train_model() first.')

        words = sentence.split()
        if not words:
            return []

        csentence = [{'form': word} for word in words]
        fsentence = self.conllu2flair([csentence])[0]
        self.model.predict(fsentence)
        return [(word, ftoken.get_tag('slot').value) for word, ftoken in zip(words, fsentence)]


# Example usage:
# nlu = NLU()
# nlu.train_model('train-pl.conllu', 'test-pl.conllu')
# or
# nlu.load_model('slot-model-pl/final-model.pt')
# print(nlu.predict("Poproszę jeden bilet na film Batman na imię Jan Kowalski"))

# Returned:
# [('Poproszę', 'O'), ('jeden', 'O'), ('bilet', 'O'), ('na', 'O'), ('film', 'O'), ('Batman', 'B-movie'),
# ('na', 'O'), ('imię', 'O'), ('Jan', 'B-name'), ('Kowalski', 'I-name')]
add readme, clean up code 2022-05-06 22:04:07 +02:00			`from conllu import parse_incr`
			`from flair.data import Corpus, Sentence, Token`
			`from flair.datasets import SentenceDataset`
			`from flair.embeddings import StackedEmbeddings`
			`from flair.embeddings import WordEmbeddings`
			`from flair.embeddings import CharacterEmbeddings`
			`from flair.embeddings import FlairEmbeddings`
			`from flair.models import SequenceTagger`
			`from flair.trainers import ModelTrainer`
			`import random`
			`import torch`
			`import os`


			`class NLU:`
			`def __init__(self):`
			`self.model = None`

			`def nolabel2o(self, line, i):`
			`return 'O' if line[i] == 'NoLabel' else line[i]`

			`def conllu2flair(self, sentences, label=None):`
			`fsentences = []`
			`for sentence in sentences:`
			`fsentence = Sentence()`
			`for token in sentence:`
			`ftoken = Token(token['form'])`
			`if label:`
			`ftoken.add_tag(label, token[label])`
			`fsentence.add_token(ftoken)`
			`fsentences.append(fsentence)`
			`return SentenceDataset(fsentences)`

			`def load_model(self, model_path):`
			`self.model = SequenceTagger.load(model_path)`

			`def train_model(self, train_path, test_path):`
			`fields = ['id', 'form', 'frame', 'slot']`

			`with open(train_path, encoding='utf-8') as trainfile:`
			`trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': self.nolabel2o}))`
			`with open(test_path, encoding='utf-8') as testfile:`
			`testset = list(parse_incr(testfile, fields=fields, field_parsers={'slot': self.nolabel2o}))`

			`random.seed(42)`
			`torch.manual_seed(42)`

			`if torch.cuda.is_available():`
			`torch.cuda.manual_seed(0)`
			`torch.cuda.manual_seed_all(0)`
			`torch.backends.cudnn.enabled = False`
			`torch.backends.cudnn.benchmark = False`
			`torch.backends.cudnn.deterministic = True`

			`corpus = Corpus(train=self.conllu2flair(trainset, 'slot'), test=self.conllu2flair(testset, 'slot'))`

			`tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')`

			`embedding_types = [`
			`WordEmbeddings('pl'),`
			`FlairEmbeddings('pl-forward'),`
			`FlairEmbeddings('pl-backward'),`
			`CharacterEmbeddings(),`
			`]`

			`embeddings = StackedEmbeddings(embeddings=embedding_types)`
			`tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,`
			`tag_dictionary=tag_dictionary,`
			`tag_type='slot', use_crf=True)`

			`if not os.path.isdir('slot-model-pl'):`
			`trainer = ModelTrainer(tagger, corpus)`
			`trainer.train('slot-model-pl',`
			`learning_rate=0.1,`
			`mini_batch_size=32,`
			`max_epochs=10,`
			`train_with_dev=True)`

			`try:`
			`self.load_model('slot-model-pl/best-model.pt')`
			`except:`
			`self.load_model('slot-model-pl/final-model.pt')`

			`# Tworzenie osobnego pliku z metrykami dla modelu`
			`log_file = open('slot-model-pl/training.log', encoding='utf-8')`
			`log_lines = log_file.readlines()`
			`log_file.close()`
			`with open('slot-model-pl/training.log', encoding='utf-8') as log_file, open('evaluation.txt', 'w',`
			`encoding='utf-8') \`
			`as eval_file:`
			`for num, line in enumerate(log_file):`
			`if line == 'Results:\n':`
			`lines_to_write_start = num`
			`eval_file.write('* This evaluation file was generated automatically by the training script *\n\n')`
			`for line in log_lines[lines_to_write_start:]:`
			`eval_file.write(line)`

			`def predict(self, sentence):`
			`sentence = sentence.split()`
			`csentence = [{'form': word} for word in sentence]`
			`fsentence = self.conllu2flair([csentence])[0]`
			`self.model.predict(fsentence)`
			`return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)]`


			`# Można przetestować...`
			`# nlu = NLU()`
			`# nlu.train_model('train-pl.conllu', 'test-pl.conllu')`
			`# lub`
			`# nlu.load_model('slot-model-pl/final-model.pt')`
			`# print(nlu.predict("Poproszę jeden bilet na film Batman na imię Jan Kowalski"))`

			`# Zwrócone:`
			`# [('Poproszę', 'O'), ('jeden', 'O'), ('bilet', 'O'), ('na', 'O'), ('film', 'O'), ('Batman', 'B-movie'),`
			`# ('na', 'O'), ('imię', 'O'), ('Jan', 'B-name'), ('Kowalski', 'I-name')]`