Finalne poprawki

This commit is contained in:
s495727 2024-05-10 01:25:13 +02:00
parent 846c6991e7
commit 16af0e732c
7 changed files with 952 additions and 878 deletions

View File

@ -1,15 +1,15 @@
print("Script to automatically append data to data/dialog.conllu") print("Script to automatically append data to data/dialog.conllu")
print("Start typing now. Press Ctrl+C to stop.") print("Start typing now. Press Ctrl+C to stop.")
while True: while True:
with open("data/train_dialog.conllu", "a") as f: with open("data/train_dialog.conllu", "a") as f:
text = input("Text: ") text = input("Text: ")
act = input("Intent: ") act = input("Intent: ")
slots = text.split(" ") slots = text.split(" ")
f.write( f.write(
f"\n# text: {text}\n# intent: {act}\n# slots:\n" f"\n# text: {text}\n# intent: {act}\n# slots:\n"
) )
for i, slot in enumerate(slots): for i, slot in enumerate(slots):
label = input(f"{i}/{slot} label: ") label = input(f"{i}/{slot} label: ")
f.write(f"{i+1}\t{slot}\t{act}\t{label}\n") f.write(f"{i+1}\t{slot}\t{act}\t{label}\n")
print("---") print("---")

File diff suppressed because it is too large Load Diff

View File

@ -332,11 +332,11 @@
3 w request/menu NoLabel 3 w request/menu NoLabel
4 ofercie request/menu NoLabel 4 ofercie request/menu NoLabel
# text: chciałbym 3 pizze, hawajskie duże # text: chciałbym trzy pizze, hawajskie duże
# intent: inform/order # intent: inform/order
# slots: # slots:
1 chciałbym inform/order NoLabel 1 chciałbym inform/order NoLabel
2 3 inform/order B-quantity 2 trzy inform/order B-quantity
3 pizze, inform/order B-food 3 pizze, inform/order B-food
4 hawajskie inform/order B-pizza 4 hawajskie inform/order B-pizza
5 duże inform/order B-size 5 duże inform/order B-size
@ -585,11 +585,11 @@
4 tuna inform/order B-pizza 4 tuna inform/order B-pizza
5 XL inform/order B-size 5 XL inform/order B-size
# text: wezmę 3 pizze tuna, średnią, dużą i bardzo dużą # text: wezmę 3x pizze tuna, średnią, dużą i bardzo dużą
# intent: inform/order # intent: inform/order
# slots: # slots:
1 wezmę inform/order NoLabel 1 wezmę inform/order NoLabel
2 3 inform/order B-quantity 2 3x inform/order B-quantity
3 pizze inform/order B-food 3 pizze inform/order B-food
4 tuna, inform/order B-pizza 4 tuna, inform/order B-pizza
5 średnią, inform/order B-size 5 średnią, inform/order B-size
@ -825,6 +825,14 @@
1 jakie request/ingredients NoLabel 1 jakie request/ingredients NoLabel
2 składniki request/ingredients NoLabel 2 składniki request/ingredients NoLabel
# text: co jest na pizzy
# intent: request/ingredients
# slots:
1 co request/ingredients NoLabel
2 jest request/ingredients NoLabel
3 na request/ingredients NoLabel
4 pizzy request/ingredients NoLabel
# text: jakie są napoje # text: jakie są napoje
# intent: request/drinks # intent: request/drinks
# slots: # slots:
@ -850,3 +858,54 @@
2 macie request/drinks NoLabel 2 macie request/drinks NoLabel
3 do request/drinks NoLabel 3 do request/drinks NoLabel
4 picia request/drinks NoLabel 4 picia request/drinks NoLabel
# text: czy są dostępne jakieś sosy?
# intent: request/sauce
# slots:
1 czy request/sauce NoLabel
2 są request/sauce NoLabel
3 dostępne request/sauce NoLabel
4 jakieś request/sauce NoLabel
5 sosy? request/sauce NoLabel
# text: Grzegorz Pieczarski
# intent: inform/name
# slots:
1 Grzegorz inform/name B-name
2 Pieczarski inform/name I-name
# text: Sergiusz Kaczmarek
# intent: inform/name
# slots:
1 Sergiusz inform/name B-name
2 Kaczmarek inform/name I-name
# text: jaki koszt dowozu
# intent: request/delivery-price
# slots:
1 jaki request/delivery-price NoLabel
2 koszt request/delivery-price NoLabel
3 dowozu request/delivery-price NoLabel
# text: jakie sosy w menu?
# intent: request/sauce
# slots:
1 jakie request/sauce NoLabel
2 sosy request/sauce NoLabel
3 w request/sauce NoLabel
4 menu? request/sauce NoLabel
# text: Napój pepsi i cola
# intent: inform/order
# slots:
1 Napój inform/order NoLabel
2 pepsi inform/order B-drink
3 i inform/order NoLabel
4 cola inform/order B-drink
# text: woda i sok
# intent: inform/order
# slots:
1 woda inform/order B-drink
2 i inform/order NoLabel
3 sok inform/order B-drink

View File

@ -1,45 +1,63 @@
import re import re
import os import os
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from nlu_utils import predict_multiple from nlu_utils import predict_multiple
from flair.models import SequenceTagger from flair.models import SequenceTagger
from conllu import parse_incr
def __parse_acts(acts): from flair.data import Corpus
acts_split = acts.split('&') from nlu_utils import conllu2flair, nolabel2o
remove_slot_regex = "[\(\[].*?[\)\]]"
return set(re.sub(remove_slot_regex, "", act) for act in acts_split) # Frame model evaluation
frame_model = SequenceTagger.load('frame-model-prod/best-model.pt')
def __parse_predictions(predictions): with open('data/test_dialog_46.conllu', encoding='utf-8') as trainfile:
return set(prediction.split('/')[0] for prediction in predictions) testset = list(parse_incr(trainfile, fields=['id', 'form', 'frame', 'slot'], field_parsers={}))
# Exploratory tests corpus = Corpus(test=conllu2flair(testset, "frame"))
frame_model = SequenceTagger.load('frame-model-prod/best-model.pt') result = frame_model.evaluate(corpus.test, mini_batch_size=1, gold_label_type="frame")
# slot_model = SequenceTagger.load('slot-model-prod/final-model.pt') print(result.detailed_results)
total_acts = 0 # Slot model evaluation
act_correct_predictions = 0 slot_model = SequenceTagger.load('slot-model-prod/best-model.pt')
slot_correct_predictions = 0
with open('data/test_dialog_46.conllu', encoding='utf-8') as trainfile:
for file_name in os.listdir('data'): testset = list(parse_incr(trainfile, fields=['id', 'form', 'frame', 'slot'], field_parsers={'slot': nolabel2o}))
if file_name.split('.')[-1] != 'tsv':
continue corpus = Corpus(test=conllu2flair(testset, "slot"))
result = slot_model.evaluate(corpus.test, mini_batch_size=8, gold_label_type="slot")
df = pd.read_csv(f'data/{file_name}', sep='\t', names=['kto', 'treść', 'akt']) print(result.detailed_results)
df = df[df.kto == 'user']
all_data = np.array(df) # Custom evaluation
def __parse_acts(acts):
for row in all_data: acts_split = acts.split('&')
sentence = row[1] remove_slot_regex = "[\(\[].*?[\)\]]"
acts = __parse_acts(row[2]) return set(re.sub(remove_slot_regex, "", act) for act in acts_split)
predictions_raw = predict_multiple(frame_model, sentence.split(), 'frame') def __parse_predictions(predictions):
predictions = __parse_predictions(predictions_raw) return set(prediction.split('/')[0] for prediction in predictions)
for act in acts: total_acts = 0
total_acts += 1 act_correct_predictions = 0
if act in predictions: slot_correct_predictions = 0
act_correct_predictions += 1
for file_name in os.listdir('data'):
if file_name.split('.')[-1] != 'tsv':
continue
df = pd.read_csv(f'data/{file_name}', sep='\t', names=['kto', 'treść', 'akt'])
df = df[df.kto == 'user']
all_data = np.array(df)
for row in all_data:
sentence = row[1]
acts = __parse_acts(row[2])
predictions_raw = predict_multiple(frame_model, sentence.split(), 'frame')
predictions = __parse_predictions(predictions_raw)
for act in acts:
total_acts += 1
if act in predictions:
act_correct_predictions += 1
print(f"Accuracy - predicting acts: {(act_correct_predictions / total_acts)*100} ({act_correct_predictions}/{total_acts})") print(f"Accuracy - predicting acts: {(act_correct_predictions / total_acts)*100} ({act_correct_predictions}/{total_acts})")

View File

@ -1,30 +1,30 @@
from flair.models import SequenceTagger from flair.models import SequenceTagger
from nlu_utils import predict_single, predict_multiple, predict_and_annotate from nlu_utils import predict_single, predict_multiple, predict_and_annotate
# Exploratory tests # Exploratory tests
frame_model = SequenceTagger.load('frame-model/best-model.pt') frame_model = SequenceTagger.load('frame-model/best-model.pt')
tests = [ tests = [
'chciałbym zamówić pizzę', 'chciałbym zamówić pizzę',
'na godzinę 12', 'na godzinę 12',
'prosiłbym o pizzę z pieczarkami', 'prosiłbym o pizzę z pieczarkami',
'to wszystko, jaka cena?', 'to wszystko, jaka cena?',
'ile kosztuje pizza', 'ile kosztuje pizza',
'do widzenia', 'do widzenia',
'tak', 'tak',
'nie dziękuję', 'nie dziękuję',
'dodatkowy ser', 'dodatkowy ser',
'pizzę barcelona bez cebuli', 'pizzę barcelona bez cebuli',
] ]
# print("=== Exploratory tests - frame model ===") # print("=== Exploratory tests - frame model ===")
for test in tests: for test in tests:
print(f"Sentence: {test}") print(f"Sentence: {test}")
print(f"Single prediction: {predict_single(frame_model, test.split(), 'frame')}") print(f"Single prediction: {predict_single(frame_model, test.split(), 'frame')}")
print(f"Multiple predictions: {predict_multiple(frame_model, test.split(), 'frame')}") print(f"Multiple predictions: {predict_multiple(frame_model, test.split(), 'frame')}")
print(f"Annotated sentence: {predict_and_annotate(frame_model, test.split(), 'frame')}") print(f"Annotated sentence: {predict_and_annotate(frame_model, test.split(), 'frame')}")
print("=== Exploratory tests - slot model ===") print("=== Exploratory tests - slot model ===")
slot_model = SequenceTagger.load('slot-model/final-model.pt') slot_model = SequenceTagger.load('slot-model/final-model.pt')
for test in tests: for test in tests:
print(f"Sentence: {test}") print(f"Sentence: {test}")
print(f"Prediction: {predict_and_annotate(slot_model, test.split(), 'slot')}") print(f"Prediction: {predict_and_annotate(slot_model, test.split(), 'slot')}")

View File

@ -1,46 +1,42 @@
from conllu import parse_incr from conllu import parse_incr
from flair.data import Corpus from flair.data import Corpus
from flair.embeddings import StackedEmbeddings from flair.embeddings import StackedEmbeddings
from flair.embeddings import WordEmbeddings from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings from flair.embeddings import FlairEmbeddings
from flair.models import SequenceTagger from flair.models import SequenceTagger
from flair.trainers import ModelTrainer from flair.trainers import ModelTrainer
from nlu_utils import conllu2flair, nolabel2o from nlu_utils import conllu2flair, nolabel2o
import random import torch
import torch if torch.cuda.is_available():
random.seed(42) torch.backends.cudnn.enabled = False
torch.manual_seed(42) torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
if torch.cuda.is_available():
torch.cuda.manual_seed(0) def train_model(label_type, field_parsers = {}):
torch.cuda.manual_seed_all(0) with open('data/train_dialog.conllu', encoding='utf-8') as f:
torch.backends.cudnn.enabled = False trainset = list(parse_incr(f, fields=['id', 'form', 'frame', 'slot'], field_parsers=field_parsers))
torch.backends.cudnn.benchmark = False with open('data/test_dialog_46.conllu', encoding='utf-8') as f:
torch.backends.cudnn.deterministic = True testset = list(parse_incr(f, fields=['id', 'form', 'frame', 'slot'], field_parsers=field_parsers))
breakpoint()
def train_model(label_type, field_parsers = {}): corpus = Corpus(train=conllu2flair(trainset, label_type), test=conllu2flair(testset, label_type))
with open('data/train_dialog.conllu', encoding='utf-8') as trainfile: label_dictionary = corpus.make_label_dictionary(label_type=label_type)
trainset = list(parse_incr(trainfile, fields=['id', 'form', 'frame', 'slot'], field_parsers=field_parsers))
embedding_types = [
corpus = Corpus(train=conllu2flair(trainset, label_type), test=conllu2flair(trainset, label_type)) WordEmbeddings('pl'),
label_dictionary = corpus.make_label_dictionary(label_type=label_type) FlairEmbeddings('pl-forward'),
FlairEmbeddings('pl-backward'),
embedding_types = [ CharacterEmbeddings(),
WordEmbeddings('pl'), ]
FlairEmbeddings('pl-forward'),
FlairEmbeddings('pl-backward'), embeddings = StackedEmbeddings(embeddings=embedding_types)
CharacterEmbeddings(), tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=label_dictionary, tag_type=label_type, use_crf=True, tag_format="BIO")
]
frame_trainer = ModelTrainer(tagger, corpus)
embeddings = StackedEmbeddings(embeddings=embedding_types) frame_trainer.train(f'{label_type}-model', learning_rate=0.1, mini_batch_size=16, max_epochs=75, train_with_dev=False)
tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=label_dictionary, tag_type=label_type, use_crf=True, tag_format="BIO")
if __name__ == '__main__':
frame_trainer = ModelTrainer(tagger, corpus) train_model("frame")
frame_trainer.train(f'{label_type}-model', learning_rate=0.1, mini_batch_size=32, max_epochs=75, train_with_dev=False) # train_model('slot', field_parsers={'slot': nolabel2o})
if __name__ == '__main__':
train_model("frame")
train_model('slot', field_parsers={'slot': nolabel2o})

View File

@ -1,100 +1,101 @@
from flair.data import Sentence from flair.data import Sentence
from flair.datasets import FlairDatapointDataset from flair.datasets import FlairDatapointDataset
def nolabel2o(line, i): def nolabel2o(line, i):
return 'O' if line[i] == 'NoLabel' else line[i] return 'O' if line[i] == 'NoLabel' else line[i]
def conllu2flair(sentences, label=None): def conllu2flair(sentences, label=None):
if label == "frame": if label == "frame":
return conllu2flair_frame(sentences, label) return conllu2flair_frame(sentences, label)
else: else:
return conllu2flair_slot(sentences, label) return conllu2flair_slot(sentences, label)
def conllu2flair_frame(sentences, label=None): def conllu2flair_frame(sentences, label=None):
fsentences = [] fsentences = []
for sentence in sentences: for sentence in sentences:
tokens = [token["form"] for token in sentence] tokens = [token["form"] for token in sentence]
fsentence = Sentence(' '.join(tokens), use_tokenizer=False) fsentence = Sentence(' '.join(tokens), use_tokenizer=False)
for i in range(len(fsentence)): for i in range(len(fsentence)):
fsentence[i:i+1].add_label(label, sentence[i][label]) fsentence[i:i+1].add_label(label, sentence[i][label])
fsentences.append(fsentence) fsentences.append(fsentence)
return FlairDatapointDataset(fsentences) return FlairDatapointDataset(fsentences)
def conllu2flair_slot(sentences, label=None): def conllu2flair_slot(sentences, label=None):
fsentences = [] fsentences = []
for sentence in sentences:
for sentence in sentences: fsentence = Sentence(' '.join(token['form'] for token in sentence), use_tokenizer=False)
fsentence = Sentence(' '.join(token['form'] for token in sentence), use_tokenizer=False) start_idx = None
start_idx = None end_idx = None
end_idx = None tag = None
tag = None
if label:
if label: for idx, (token, ftoken) in enumerate(zip(sentence, fsentence)):
for idx, (token, ftoken) in enumerate(zip(sentence, fsentence)): if token[label].startswith('B-'):
if token[label].startswith('B-'): if start_idx is not None:
start_idx = idx fsentence[start_idx:end_idx+1].add_label(label, tag)
end_idx = idx start_idx = idx
tag = token[label][2:] end_idx = idx
elif token[label].startswith('I-'): tag = token[label][2:]
end_idx = idx elif token[label].startswith('I-'):
elif token[label] == 'O': end_idx = idx
if start_idx is not None: elif token[label] == 'O':
fsentence[start_idx:end_idx+1].add_label(label, tag) if start_idx is not None:
start_idx = None fsentence[start_idx:end_idx+1].add_label(label, tag)
end_idx = None start_idx = None
tag = None end_idx = None
tag = None
if start_idx is not None:
fsentence[start_idx:end_idx+1].add_label(label, tag) if start_idx is not None:
fsentence[start_idx:end_idx+1].add_label(label, tag)
fsentences.append(fsentence)
return FlairDatapointDataset(fsentences) fsentences.append(fsentence)
return FlairDatapointDataset(fsentences)
def __predict(model, csentence):
fsentence = conllu2flair([csentence])[0] def __predict(model, csentence):
model.predict(fsentence) fsentence = conllu2flair([csentence])[0]
return fsentence model.predict(fsentence)
return fsentence
def __csentence(sentence, label_type):
if label_type == "frame": def __csentence(sentence, label_type):
return [{'form': word } for word in sentence] if label_type == "frame":
else: return [{'form': word } for word in sentence]
return [{'form': word, 'slot': 'O'} for word in sentence] else:
return [{'form': word, 'slot': 'O'} for word in sentence]
def predict_single(model, sentence, label_type):
csentence = __csentence(sentence, label_type) def predict_single(model, sentence, label_type):
fsentence = __predict(model, csentence) csentence = __csentence(sentence, label_type)
intent = {} fsentence = __predict(model, csentence)
intent = {}
for span in fsentence.get_spans(label_type):
tag = span.get_label(label_type).value for span in fsentence.get_spans(label_type):
if tag in intent: tag = span.get_label(label_type).value
intent[tag] += 1 if tag in intent:
else: intent[tag] += 1
intent[tag] = 1 else:
intent[tag] = 1
return max(intent, key=intent.get)
return max(intent, key=intent.get)
def predict_multiple(model, sentence, label_type):
csentence = __csentence(sentence, label_type) def predict_multiple(model, sentence, label_type):
fsentence = __predict(model, csentence) csentence = __csentence(sentence, label_type)
fsentence = __predict(model, csentence)
return set(span.get_label(label_type).value for span in fsentence.get_spans(label_type))
return set(span.get_label(label_type).value for span in fsentence.get_spans(label_type))
def predict_and_annotate(model, sentence, label_type):
csentence = __csentence(sentence, label_type) def predict_and_annotate(model, sentence, label_type):
fsentence = __predict(model, csentence) csentence = __csentence(sentence, label_type)
fsentence = __predict(model, csentence)
for span in fsentence.get_spans(label_type):
tag = span.get_label(label_type).value for span in fsentence.get_spans(label_type):
if label_type == "frame": tag = span.get_label(label_type).value
csentence[span.tokens[0].idx-1]['frame'] = tag if label_type == "frame":
else: csentence[span.tokens[0].idx-1]['frame'] = tag
csentence[span.tokens[0].idx - 1]['slot'] = f'B-{tag}' else:
for token in span.tokens[1:]: csentence[span.tokens[0].idx - 1]['slot'] = f'B-{tag}'
csentence[token.idx - 1]['slot'] = f'I-{tag}' for token in span.tokens[1:]:
csentence[token.idx - 1]['slot'] = f'I-{tag}'
return csentence return csentence