add readme, clean up code

This commit is contained in:
Kacper 2022-05-06 22:04:07 +02:00
parent 273f20e2a5
commit 17b941219c
4 changed files with 133 additions and 115 deletions

115
NLU_lab_7-8/NLU.py Normal file
View File

@ -0,0 +1,115 @@
from conllu import parse_incr
from flair.data import Corpus, Sentence, Token
from flair.datasets import SentenceDataset
from flair.embeddings import StackedEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
import random
import torch
import os
class NLU:
    """Slot tagger (NLU module) built on a Flair ``SequenceTagger``.

    A model is obtained either by training on CoNLL-U-style files with
    :meth:`train_model` or by restoring it from disk with :meth:`load_model`.
    :meth:`predict` then labels the tokens of a whitespace-separated utterance.
    """

    def __init__(self):
        # Set by load_model() / train_model(); None until a model is available.
        self.model = None

    def nolabel2o(self, line, i):
        """Field parser for ``parse_incr``: map the 'NoLabel' placeholder to 'O'.

        Args:
            line: Column values of a single token line.
            i: Index of the column being parsed.

        Returns:
            'O' when the column holds 'NoLabel', otherwise the column value.
        """
        return 'O' if line[i] == 'NoLabel' else line[i]

    def conllu2flair(self, sentences, label=None):
        """Convert parsed sentences into a Flair ``SentenceDataset``.

        Args:
            sentences: Iterable of sentences; each sentence is an iterable of
                token mappings that contain at least the key 'form'.
            label: Optional name of the token field to copy as a Flair tag of
                the same name. When falsy, tokens are left untagged.

        Returns:
            A ``SentenceDataset`` with one Flair ``Sentence`` per input sentence.
        """
        fsentences = []
        for sentence in sentences:
            fsentence = Sentence()
            for token in sentence:
                ftoken = Token(token['form'])
                if label:
                    ftoken.add_tag(label, token[label])
                fsentence.add_token(ftoken)
            fsentences.append(fsentence)
        return SentenceDataset(fsentences)

    def load_model(self, model_path):
        """Load a saved ``SequenceTagger`` from ``model_path`` into ``self.model``."""
        self.model = SequenceTagger.load(model_path)

    def train_model(self, train_path, test_path, model_dir='slot-model-pl',
                    evaluation_path='evaluation.txt'):
        """Train the slot tagger (unless already trained), load it and export metrics.

        Training runs only when ``model_dir`` does not exist yet; in that case
        the train/test files are parsed and a new tagger is trained into
        ``model_dir``. Afterwards the model stored in ``model_dir`` is loaded
        into ``self.model`` and the results section of ``training.log`` is
        copied to ``evaluation_path``.

        Args:
            train_path: Path to the training file (columns: id, form, frame, slot).
            test_path: Path to the test file in the same format.
            model_dir: Directory that holds (or will hold) the trained model.
            evaluation_path: Destination of the extracted evaluation report.
        """
        if not os.path.isdir(model_dir):
            self._train(train_path, test_path, model_dir)

        # Prefer the best checkpoint; fall back to the final one when the best
        # checkpoint cannot be loaded. `Exception` (not a bare except) keeps
        # KeyboardInterrupt / SystemExit from being swallowed here.
        try:
            self.load_model(os.path.join(model_dir, 'best-model.pt'))
        except Exception:
            self.load_model(os.path.join(model_dir, 'final-model.pt'))

        # Create a separate file with the metrics of the model.
        self._export_evaluation(os.path.join(model_dir, 'training.log'), evaluation_path)

    def predict(self, sentence):
        """Tag every whitespace-separated word of ``sentence`` with a slot label.

        Args:
            sentence: Utterance as a single string.

        Returns:
            List of ``(word, slot_tag)`` tuples in input order.

        Raises:
            RuntimeError: If no model has been loaded or trained yet.
        """
        if self.model is None:
            raise RuntimeError('No model available: call load_model() or train_model() first')
        words = sentence.split()
        csentence = [{'form': word} for word in words]
        fsentence = self.conllu2flair([csentence])[0]
        self.model.predict(fsentence)
        return [(word, ftoken.get_tag('slot').value) for word, ftoken in zip(words, fsentence)]

    def _read_conllu(self, path):
        """Parse one data file into a list of sentences, mapping 'NoLabel' slots to 'O'."""
        fields = ['id', 'form', 'frame', 'slot']
        with open(path, encoding='utf-8') as datafile:
            return list(parse_incr(datafile, fields=fields,
                                   field_parsers={'slot': self.nolabel2o}))

    @staticmethod
    def _seed_everything():
        """Seed the Python and torch RNGs and pin the cuDNN flags used for training."""
        random.seed(42)
        torch.manual_seed(42)
        if torch.cuda.is_available():
            # NOTE(review): the CUDA generators are seeded with 0 while the CPU
            # ones use 42; kept as in the original so results do not change.
            torch.cuda.manual_seed(0)
            torch.cuda.manual_seed_all(0)
            torch.backends.cudnn.enabled = False
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

    def _train(self, train_path, test_path, model_dir):
        """Build the corpus and tagger and train the tagger into ``model_dir``."""
        trainset = self._read_conllu(train_path)
        testset = self._read_conllu(test_path)
        self._seed_everything()

        corpus = Corpus(train=self.conllu2flair(trainset, 'slot'),
                        test=self.conllu2flair(testset, 'slot'))
        tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')
        embedding_types = [
            WordEmbeddings('pl'),
            FlairEmbeddings('pl-forward'),
            FlairEmbeddings('pl-backward'),
            CharacterEmbeddings(),
        ]
        embeddings = StackedEmbeddings(embeddings=embedding_types)
        tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
                                tag_dictionary=tag_dictionary,
                                tag_type='slot', use_crf=True)

        trainer = ModelTrainer(tagger, corpus)
        trainer.train(model_dir,
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=10,
                      train_with_dev=True)

    @staticmethod
    def _results_section(log_lines):
        """Return the lines from the last 'Results:' line onwards ([] if absent)."""
        start = None
        for num, line in enumerate(log_lines):
            if line == 'Results:\n':
                start = num
        return [] if start is None else log_lines[start:]

    def _export_evaluation(self, log_path, evaluation_path):
        """Copy the results section of the training log into ``evaluation_path``."""
        with open(log_path, encoding='utf-8') as log_file:
            log_lines = log_file.readlines()
        with open(evaluation_path, 'w', encoding='utf-8') as eval_file:
            eval_file.write('*** This evaluation file was generated automatically by the training script ***\n\n')
            # When the log has no 'Results:' line only the header is written
            # (the original raised NameError in that case).
            eval_file.writelines(self._results_section(log_lines))
# Usage example:
# nlu = NLU()
# nlu.train_model('train-pl.conllu', 'test-pl.conllu')
# or
# nlu.load_model('slot-model-pl/final-model.pt')
# print(nlu.predict("Poproszę jeden bilet na film Batman na imię Jan Kowalski"))
# Returned:
# [('Poproszę', 'O'), ('jeden', 'O'), ('bilet', 'O'), ('na', 'O'), ('film', 'O'), ('Batman', 'B-movie'),
# ('na', 'O'), ('imię', 'O'), ('Jan', 'B-name'), ('Kowalski', 'I-name')]

6
NLU_lab_7-8/README.md Normal file
View File

@ -0,0 +1,6 @@
**Moduł NLU - realizacja zadania**
- Za pomocą skryptu create_datasets.py wywołanego w konsoli z argumentem będącym ścieżką do repozytorium z anotowanymi dialogami (np. python3 create_dataset.py ../dane) automatycznie tworzone są na ich podstawie treningowy i testowy zbiór w formacie .conllu (train-pl.conllu, test-pl.conllu).
- Obecnie występują rozbieżności w anotacjach plików dokonywanych przez poszczególnych członków zespołu. W wyniku tego tworzone zbiory zawierają mniej oznaczeń slotów, niż powinny, co przekłada się na mało zadowalające wyniki modelu. Anotacje zostaną poprawione, a model zostanie wytrenowany ponownie.
- Obecnie moduł NLU zdefiniowany jest jako klasa w pliku NLU.py. Po utworzeniu jej instancji można wytrenować model metodą train_model, podając ścieżki do zbiorów treningowego i testowego, lub załadować gotowy model z pliku metodą load_model. W przyszłości moduł zostanie zintegrowany z pozostałymi modułami, które powstaną później. Do celów testowych analizy semantycznej przykładowych wypowiedzi można dokonywać na utworzonej instancji klasy. Kod obrazujący wykorzystanie klasy znajduje się na dole pliku NLU.py.
- Przy trenowaniu modelu dokonywana jest również jego automatyczna ewaluacja, więc nie było potrzeby tworzenia skryptu evaluate.py dokonującego osobnej ewaluacji. Wyniki automatycznej ewaluacji są za to samoistnie zapisywane do osobnego pliku evaluation.txt.

View File

@ -2,37 +2,31 @@
Results:
- F-score (micro) 0.2609
- F-score (macro) 0.1509
- Accuracy 0.1648
- F-score (macro) 0.1489
- Accuracy 0.1538
By class:
precision recall f1-score support
quantity 0.3846 0.8333 0.5263 6
time 0.3333 0.4286 0.3750 7
title 0.3333 0.2222 0.2667 9
goal 0.0000 0.0000 0.0000 10
area 0.0000 0.0000 0.0000 3
name 0.7500 0.6000 0.6667 5
date 0.3333 0.3333 0.3333 3
interval 0.0000 0.0000 0.0000 1
name 0.1429 0.2000 0.1667 5
ticketnumber 0.5000 1.0000 0.6667 3
movie 0.3333 0.5000 0.4000 2
seat 0.0000 0.0000 0.0000 3
ticketnumber 0.0000 0.0000 0.0000 3
e-mail 0.0000 0.0000 0.0000 3
phone 1.0000 1.0000 1.0000 1
title 0.0000 0.0000 0.0000 2
row 0.0000 0.0000 0.0000 2
movie 0.0000 0.0000 0.0000 2
reducedQuantity 0.0000 0.0000 0.0000 2
seats 0.0000 0.0000 0.0000 0
purchaseType 0.0000 0.0000 0.0000 1
bankAccountNumber 0.0000 0.0000 0.0000 1
email 0.0000 0.0000 0.0000 1
hour 0.0000 0.0000 0.0000 1
time 0.0000 0.0000 0.0000 1
seatPlacement 0.0000 0.0000 0.0000 1
micro avg 0.3000 0.2308 0.2609 65
macro avg 0.1493 0.1627 0.1509 65
weighted avg 0.2060 0.2308 0.2079 65
samples avg 0.1648 0.1648 0.1648 65
micro avg 0.3529 0.2069 0.2609 29
macro avg 0.1317 0.1800 0.1489 29
weighted avg 0.1338 0.2069 0.1598 29
samples avg 0.1538 0.1538 0.1538 29
2022-05-02 19:47:10,324 ----------------------------------------------------------------------------------------------------
2022-05-06 21:01:49,500 ----------------------------------------------------------------------------------------------------

View File

@ -1,97 +0,0 @@
from conllu import parse_incr
from flair.data import Corpus, Sentence, Token
from flair.datasets import SentenceDataset
from flair.embeddings import StackedEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
import random
import torch
from tabulate import tabulate
# Column names of the data files, passed as `fields=` to parse_incr() below.
fields = ['id', 'form', 'frame', 'slot']
def nolabel2o(line, i):
    """Field parser: return 'O' for a 'NoLabel' column value, else the value itself.

    Args:
        line: Column values of a single token line.
        i: Index of the column being parsed.
    """
    value = line[i]
    if value == 'NoLabel':
        return 'O'
    return value
def conllu2flair(sentences, label=None):
    """Wrap parsed sentences in a Flair ``SentenceDataset``.

    Each token mapping contributes one Flair ``Token`` built from its 'form'
    entry. When ``label`` is truthy, the token's ``label`` entry is attached
    as a tag of the same name.
    """

    def build_sentence(parsed_tokens):
        # One Flair Sentence per parsed sentence, tokens added in input order.
        flair_sentence = Sentence()
        for entry in parsed_tokens:
            flair_token = Token(entry['form'])
            if label:
                flair_token.add_tag(label, entry[label])
            flair_sentence.add_token(flair_token)
        return flair_sentence

    return SentenceDataset([build_sentence(parsed) for parsed in sentences])
def predict(model, sentence):
    """Tag the words of ``sentence`` with ``model`` and pair each word with its slot.

    Args:
        model: Object exposing ``predict(sentence)`` (the loaded tagger).
        sentence: Sequence of word strings.

    Returns:
        List of ``(word, slot_tag)`` tuples in input order.
    """
    dataset = conllu2flair([[{'form': word} for word in sentence]])
    tagged = dataset[0]
    model.predict(tagged)
    pairs = []
    for word, tagged_token in zip(sentence, tagged):
        pairs.append((word, tagged_token.get_tag('slot').value))
    return pairs
# --- Load the train/test splits; 'NoLabel' slots are mapped to 'O' while parsing. ---
with open('train-pl.conllu', encoding='utf-8') as trainfile:
    trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': nolabel2o}))
with open('test-pl.conllu', encoding='utf-8') as testfile:
    testset = list(parse_incr(testfile, fields=fields, field_parsers={'slot': nolabel2o}))

# --- Seed the RNGs and pin the cuDNN flags before building and training the model. ---
random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    # NOTE(review): CUDA generators are seeded with 0 while the CPU ones use 42;
    # kept unchanged so training results stay the same.
    torch.cuda.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# --- Build the corpus, the stacked embeddings and the CRF sequence tagger. ---
corpus = Corpus(train=conllu2flair(trainset, 'slot'), test=conllu2flair(testset, 'slot'))
tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')
embedding_types = [
    WordEmbeddings('pl'),
    FlairEmbeddings('pl-forward'),
    FlairEmbeddings('pl-backward'),
    CharacterEmbeddings(),
]
embeddings = StackedEmbeddings(embeddings=embedding_types)
tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type='slot', use_crf=True)

# --- Train into 'slot-model-pl'. ---
trainer = ModelTrainer(tagger, corpus)
trainer.train('slot-model-pl',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10,
              train_with_dev=True)

# Prefer the best checkpoint; fall back to the final one when it cannot be
# loaded. `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
try:
    model = SequenceTagger.load('slot-model-pl/best-model.pt')
except Exception:
    model = SequenceTagger.load('slot-model-pl/final-model.pt')

# --- Copy the results section of the training log into evaluation.txt. ---
# The log is read once, inside a context manager, instead of being opened twice.
with open('slot-model-pl/training.log', encoding='utf-8') as log_file:
    log_lines = log_file.readlines()

# Index of the last 'Results:' line; stays None when the log has no such line
# (the original left the name unbound and crashed with NameError).
lines_to_write_start = None
for num, line in enumerate(log_lines):
    if line == 'Results:\n':
        lines_to_write_start = num

with open('evaluation.txt', 'w', encoding='utf-8') as eval_file:
    eval_file.write('*** This evaluation file was generated automatically by the training script ***\n\n')
    if lines_to_write_start is not None:
        eval_file.writelines(log_lines[lines_to_write_start:])

# --- Smoke test: tag a sample utterance and print the result as a table. ---
print(tabulate(predict(model, 'Jeden bilet na imię Jan Kowalski na film Batman'.split())))