class NLU:
    """Slot-filling NLU module backed by a flair ``SequenceTagger``.

    A new instance has no model. Call :meth:`load_model` to load a saved
    tagger, or :meth:`train_model` to train one (or reuse an already trained
    one), and then :meth:`predict` to label the tokens of an utterance.
    """

    def __init__(self):
        # Set by load_model() / train_model(); predict() refuses to run while None.
        self.model = None

    def nolabel2o(self, line, i):
        """Field parser for the slot column: map ``'NoLabel'`` to ``'O'``.

        Args:
            line: Sequence of column values of a single CoNLL-U row.
            i: Index of the column to read.

        Returns:
            ``'O'`` if the value is ``'NoLabel'``, otherwise the value unchanged.
        """
        value = line[i]
        return 'O' if value == 'NoLabel' else value

    def conllu2flair(self, sentences, label=None):
        """Convert parsed sentences into a flair ``SentenceDataset``.

        Args:
            sentences: Iterable of sentences; each sentence is an iterable of
                token mappings that provide at least the ``'form'`` key.
            label: Optional name of the token field to copy as a flair tag.
                When falsy, tokens are created without tags.

        Returns:
            A ``SentenceDataset`` with one flair ``Sentence`` per input sentence.
        """
        fsentences = []
        for sentence in sentences:
            fsentence = Sentence()
            for token in sentence:
                ftoken = Token(token['form'])
                if label:
                    ftoken.add_tag(label, token[label])
                fsentence.add_token(ftoken)
            fsentences.append(fsentence)
        return SentenceDataset(fsentences)

    def load_model(self, model_path):
        """Load a saved ``SequenceTagger`` from *model_path* into ``self.model``."""
        self.model = SequenceTagger.load(model_path)

    def train_model(self, train_path, test_path, model_dir='slot-model-pl',
                    evaluation_path='evaluation.txt'):
        """Train the slot tagger (unless already trained), load it and export metrics.

        Training is skipped when *model_dir* already exists; in that case the
        datasets are not read and no embeddings are built. Afterwards the
        model is loaded from ``best-model.pt`` if that file exists, otherwise
        from ``final-model.pt``, and the ``Results:`` section of
        ``training.log`` is copied to *evaluation_path*.

        Args:
            train_path: Path to the training set in CoNLL-U format.
            test_path: Path to the test set in CoNLL-U format.
            model_dir: Directory where flair stores the model and its log.
            evaluation_path: Output file for the extracted evaluation results.

        Raises:
            ValueError: If ``training.log`` contains no ``Results:`` section.
        """
        if not os.path.isdir(model_dir):
            self._train_tagger(train_path, test_path, model_dir)

        # An explicit existence check replaces the former bare ``except:``,
        # which also hid unrelated failures (and KeyboardInterrupt).
        best_model = os.path.join(model_dir, 'best-model.pt')
        final_model = os.path.join(model_dir, 'final-model.pt')
        self.load_model(best_model if os.path.isfile(best_model) else final_model)

        # NOTE(review): the reviewed source had its indentation flattened, so
        # it is assumed that loading and the export below were meant to run
        # even when training was skipped -- confirm.
        self._export_evaluation(os.path.join(model_dir, 'training.log'), evaluation_path)

    def _train_tagger(self, train_path, test_path, model_dir):
        """Parse both datasets, build the tagger and train it into *model_dir*."""
        fields = ['id', 'form', 'frame', 'slot']
        field_parsers = {'slot': self.nolabel2o}

        with open(train_path, encoding='utf-8') as trainfile:
            trainset = list(parse_incr(trainfile, fields=fields, field_parsers=field_parsers))
        with open(test_path, encoding='utf-8') as testfile:
            testset = list(parse_incr(testfile, fields=fields, field_parsers=field_parsers))

        # Fixed seeds so that repeated trainings are comparable. The CUDA
        # seed (0) differs from the CPU seed (42); both values are kept as
        # they were to leave previously reported results reproducible.
        random.seed(42)
        torch.manual_seed(42)

        if torch.cuda.is_available():
            torch.cuda.manual_seed(0)
            torch.cuda.manual_seed_all(0)
            torch.backends.cudnn.enabled = False
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

        corpus = Corpus(train=self.conllu2flair(trainset, 'slot'),
                        test=self.conllu2flair(testset, 'slot'))
        tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')

        embeddings = StackedEmbeddings(embeddings=[
            WordEmbeddings('pl'),
            FlairEmbeddings('pl-forward'),
            FlairEmbeddings('pl-backward'),
            CharacterEmbeddings(),
        ])
        tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
                                tag_dictionary=tag_dictionary,
                                tag_type='slot', use_crf=True)

        trainer = ModelTrainer(tagger, corpus)
        trainer.train(model_dir,
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=10,
                      train_with_dev=True)

    @staticmethod
    def _results_section(log_lines):
        """Return the lines from the last ``'Results:\\n'`` line to the end.

        Returns an empty list when no such line is present.
        """
        start = None
        for num, line in enumerate(log_lines):
            if line == 'Results:\n':
                # Keep overwriting: the last occurrence wins.
                start = num
        return [] if start is None else log_lines[start:]

    def _export_evaluation(self, log_path, evaluation_path):
        """Copy the results section of the training log into its own file.

        The output file is only opened once a results section has been found,
        so an existing evaluation file is never truncated by a failed export.
        """
        with open(log_path, encoding='utf-8') as log_file:
            results = self._results_section(log_file.readlines())
        if not results:
            raise ValueError("No 'Results:' section found in " + log_path)

        with open(evaluation_path, 'w', encoding='utf-8') as eval_file:
            eval_file.write('*** This evaluation file was generated automatically by the training script ***\n\n')
            eval_file.writelines(results)

    def predict(self, sentence):
        """Tag every whitespace-separated token of *sentence* with a slot label.

        Args:
            sentence: The utterance as a plain string.

        Returns:
            A list of ``(token, slot_tag)`` tuples in token order; an empty
            list when the utterance contains no tokens.

        Raises:
            RuntimeError: If no model has been loaded or trained yet.
        """
        if self.model is None:
            raise RuntimeError('No model available: call load_model() or train_model() first.')

        words = sentence.split()
        if not words:
            return []

        csentence = [{'form': word} for word in words]
        fsentence = self.conllu2flair([csentence])[0]
        self.model.predict(fsentence)
        return [(word, ftoken.get_tag('slot').value) for word, ftoken in zip(words, fsentence)]


# Quick manual test:
+# nlu = NLU() +# nlu.train_model('train-pl.conllu', 'test-pl.conllu') +# lub +# nlu.load_model('slot-model-pl/final-model.pt') +# print(nlu.predict("Poproszę jeden bilet na film Batman na imię Jan Kowalski")) + +# Zwrócone: +# [('Poproszę', 'O'), ('jeden', 'O'), ('bilet', 'O'), ('na', 'O'), ('film', 'O'), ('Batman', 'B-movie'), +# ('na', 'O'), ('imię', 'O'), ('Jan', 'B-name'), ('Kowalski', 'I-name')] diff --git a/NLU_lab_7-8/README.md b/NLU_lab_7-8/README.md new file mode 100644 index 0000000..5e51f9d --- /dev/null +++ b/NLU_lab_7-8/README.md @@ -0,0 +1,6 @@ +**Moduł NLU - realizacja zadania** + +- Za pomocą skryptu create_datasets.py wywołanego w konsoli z argumentem będącym ścieżką do repozytorium z anotowanymi dialogami (np. python3 create_dataset.py ../dane) automatycznie tworzone są na ich podstawie treningowy i testowy zbiór w formacie .conllu (train-pl.conllu, test-pl.conllu). +- Na obecną chwilę występują rozbieżności w anotacjach plików dokonywanych przez poszczególnych członków zespołu. W wyniku tego tworzone zbiory zawierają mniej oznaczeń slotów niż powinny, co przekłada się na mało zadowalające wyniki modelu. Dokonane zostaną poprawa anotacji i przetrenowanie modelu. +- Obecnie moduł NLU zdefiniowany jest jako klasa w pliku NLU.py. Po utworzeniu jej instancji można wytrenować model metodą train_model, podając ścieżki do zbiorów, lub załadować model z pliku metodą load_model. W przyszłości zostanie on zintegrowany ze stworzonymi później pozostałymi modułami. Na potrzeby testów analizy semantycznej przykładowych wypowiedzi można dokonywać metodą predict utworzonej instancji klasy. Kod obrazujący wykorzystanie klasy znajduje się na dole pliku NLU.py. +- Przy trenowaniu modelu dokonywana jest również jego automatyczna ewaluacja, więc nie było potrzeby tworzenia skryptu evaluate.py dokonującego osobnej ewaluacji. Wyniki automatycznej ewaluacji są za to samoistnie zapisywane do osobnego pliku evaluation.txt. 
\ No newline at end of file diff --git a/NLU_lab_7-8/evaluation.txt b/NLU_lab_7-8/evaluation.txt index 85577ce..01ce4a8 100644 --- a/NLU_lab_7-8/evaluation.txt +++ b/NLU_lab_7-8/evaluation.txt @@ -2,37 +2,31 @@ Results: - F-score (micro) 0.2609 -- F-score (macro) 0.1509 -- Accuracy 0.1648 +- F-score (macro) 0.1489 +- Accuracy 0.1538 By class: precision recall f1-score support - quantity 0.3846 0.8333 0.5263 6 - time 0.3333 0.4286 0.3750 7 - title 0.3333 0.2222 0.2667 9 - goal 0.0000 0.0000 0.0000 10 - area 0.0000 0.0000 0.0000 3 - name 0.7500 0.6000 0.6667 5 - date 0.3333 0.3333 0.3333 3 - interval 0.0000 0.0000 0.0000 1 + name 0.1429 0.2000 0.1667 5 + ticketnumber 0.5000 1.0000 0.6667 3 + movie 0.3333 0.5000 0.4000 2 seat 0.0000 0.0000 0.0000 3 - ticketnumber 0.0000 0.0000 0.0000 3 e-mail 0.0000 0.0000 0.0000 3 phone 1.0000 1.0000 1.0000 1 + title 0.0000 0.0000 0.0000 2 row 0.0000 0.0000 0.0000 2 - movie 0.0000 0.0000 0.0000 2 reducedQuantity 0.0000 0.0000 0.0000 2 - seats 0.0000 0.0000 0.0000 0 purchaseType 0.0000 0.0000 0.0000 1 bankAccountNumber 0.0000 0.0000 0.0000 1 email 0.0000 0.0000 0.0000 1 hour 0.0000 0.0000 0.0000 1 + time 0.0000 0.0000 0.0000 1 seatPlacement 0.0000 0.0000 0.0000 1 - micro avg 0.3000 0.2308 0.2609 65 - macro avg 0.1493 0.1627 0.1509 65 - weighted avg 0.2060 0.2308 0.2079 65 - samples avg 0.1648 0.1648 0.1648 65 + micro avg 0.3529 0.2069 0.2609 29 + macro avg 0.1317 0.1800 0.1489 29 + weighted avg 0.1338 0.2069 0.1598 29 + samples avg 0.1538 0.1538 0.1538 29 -2022-05-02 19:47:10,324 ---------------------------------------------------------------------------------------------------- +2022-05-06 21:01:49,500 ---------------------------------------------------------------------------------------------------- diff --git a/NLU_lab_7-8/main.py b/NLU_lab_7-8/main.py deleted file mode 100644 index 2cbb4fb..0000000 --- a/NLU_lab_7-8/main.py +++ /dev/null @@ -1,97 +0,0 @@ -from conllu import parse_incr -from flair.data import Corpus, Sentence, 
Token -from flair.datasets import SentenceDataset -from flair.embeddings import StackedEmbeddings -from flair.embeddings import WordEmbeddings -from flair.embeddings import CharacterEmbeddings -from flair.embeddings import FlairEmbeddings -from flair.models import SequenceTagger -from flair.trainers import ModelTrainer -import random -import torch -from tabulate import tabulate - -fields = ['id', 'form', 'frame', 'slot'] - - -def nolabel2o(line, i): - return 'O' if line[i] == 'NoLabel' else line[i] - - -def conllu2flair(sentences, label=None): - fsentences = [] - for sentence in sentences: - fsentence = Sentence() - for token in sentence: - ftoken = Token(token['form']) - if label: - ftoken.add_tag(label, token[label]) - fsentence.add_token(ftoken) - fsentences.append(fsentence) - return SentenceDataset(fsentences) - - -def predict(model, sentence): - csentence = [{'form': word} for word in sentence] - fsentence = conllu2flair([csentence])[0] - model.predict(fsentence) - return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)] - - -with open('train-pl.conllu', encoding='utf-8') as trainfile: - trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': nolabel2o})) -with open('test-pl.conllu', encoding='utf-8') as testfile: - testset = list(parse_incr(testfile, fields=fields, field_parsers={'slot': nolabel2o})) - -random.seed(42) -torch.manual_seed(42) - -if torch.cuda.is_available(): - torch.cuda.manual_seed(0) - torch.cuda.manual_seed_all(0) - torch.backends.cudnn.enabled = False - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True - -corpus = Corpus(train=conllu2flair(trainset, 'slot'), test=conllu2flair(testset, 'slot')) - -tag_dictionary = corpus.make_tag_dictionary(tag_type='slot') - -embedding_types = [ - WordEmbeddings('pl'), - FlairEmbeddings('pl-forward'), - FlairEmbeddings('pl-backward'), - CharacterEmbeddings(), -] - -embeddings = 
StackedEmbeddings(embeddings=embedding_types) -tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, - tag_dictionary=tag_dictionary, - tag_type='slot', use_crf=True) - - -trainer = ModelTrainer(tagger, corpus) -trainer.train('slot-model-pl', - learning_rate=0.1, - mini_batch_size=32, - max_epochs=10, - train_with_dev=True) - -try: - model = SequenceTagger.load('slot-model-pl/best-model.pt') -except: - model = SequenceTagger.load('slot-model-pl/final-model.pt') - -log_file = open('slot-model-pl/training.log', encoding='utf-8') -log_lines = log_file.readlines() -log_file.close() -with open('slot-model-pl/training.log', encoding='utf-8') as log_file, open('evaluation.txt', 'w', encoding='utf-8') \ - as eval_file: - for num, line in enumerate(log_file): - if line == 'Results:\n': - lines_to_write_start = num - eval_file.write('*** This evaluation file was generated automatically by the training script ***\n\n') - for line in log_lines[lines_to_write_start:]: - eval_file.write(line) - -print(tabulate(predict(model, 'Jeden bilet na imię Jan Kowalski na film Batman'.split())))