Finalne poprawki

This commit is contained in:
s495727 2024-05-10 01:25:13 +02:00
parent 846c6991e7
commit 16af0e732c
7 changed files with 952 additions and 878 deletions

View File

@ -1,15 +1,15 @@
# Interactive helper that appends hand-annotated utterances to the training
# CoNLL-U file. Each record is only written once every token has a label, so
# interrupting the script never leaves a half-written sentence in the file.
OUTPUT_PATH = "data/train_dialog.conllu"


def format_record(text, act, labelled_tokens):
    """Render one annotated utterance as a block of the training file.

    Args:
        text: The raw utterance, stored in the ``# text:`` comment line.
        act: The intent label, stored in the ``# intent:`` line and repeated
            in the third column of every token row.
        labelled_tokens: Iterable of ``(token, slot_label)`` pairs.

    Returns:
        The record as a string: a leading blank line, the three comment
        lines, then one tab-separated row per token numbered from 1.
    """
    header = f"\n# text: {text}\n# intent: {act}\n# slots:\n"
    rows = [
        f"{position}\t{token}\t{act}\t{label}\n"
        for position, (token, label) in enumerate(labelled_tokens, start=1)
    ]
    return header + "".join(rows)


def main():
    """Prompt for utterances until Ctrl+C / end of input and append them."""
    print(f"Script to automatically append data to {OUTPUT_PATH}")
    print("Start typing now. Press Ctrl+C to stop.")
    try:
        while True:
            text = input("Text: ")
            act = input("Intent: ")
            # split() without an argument ignores repeated spaces, so no
            # empty tokens end up as rows in the file.
            tokens = text.split()
            labelled = [
                (token, input(f"{i}/{token} label: "))
                for i, token in enumerate(tokens)
            ]
            # Explicit UTF-8: the utterances contain Polish characters and the
            # other scripts read this file as UTF-8.
            with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
                f.write(format_record(text, act, labelled))
            print("---")
    except (KeyboardInterrupt, EOFError):
        # Documented way to stop the script; exit without a traceback.
        print()


if __name__ == "__main__":
    main()
# Interactive helper that appends hand-annotated utterances to the training
# CoNLL-U file. Each record is only written once every token has a label, so
# interrupting the script never leaves a half-written sentence in the file.
OUTPUT_PATH = "data/train_dialog.conllu"


def format_record(text, act, labelled_tokens):
    """Render one annotated utterance as a block of the training file.

    Args:
        text: The raw utterance, stored in the ``# text:`` comment line.
        act: The intent label, stored in the ``# intent:`` line and repeated
            in the third column of every token row.
        labelled_tokens: Iterable of ``(token, slot_label)`` pairs.

    Returns:
        The record as a string: a leading blank line, the three comment
        lines, then one tab-separated row per token numbered from 1.
    """
    header = f"\n# text: {text}\n# intent: {act}\n# slots:\n"
    rows = [
        f"{position}\t{token}\t{act}\t{label}\n"
        for position, (token, label) in enumerate(labelled_tokens, start=1)
    ]
    return header + "".join(rows)


def main():
    """Prompt for utterances until Ctrl+C / end of input and append them."""
    print(f"Script to automatically append data to {OUTPUT_PATH}")
    print("Start typing now. Press Ctrl+C to stop.")
    try:
        while True:
            text = input("Text: ")
            act = input("Intent: ")
            # split() without an argument ignores repeated spaces, so no
            # empty tokens end up as rows in the file.
            tokens = text.split()
            labelled = [
                (token, input(f"{i}/{token} label: "))
                for i, token in enumerate(tokens)
            ]
            # Explicit UTF-8: the utterances contain Polish characters and the
            # other scripts read this file as UTF-8.
            with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
                f.write(format_record(text, act, labelled))
            print("---")
    except (KeyboardInterrupt, EOFError):
        # Documented way to stop the script; exit without a traceback.
        print()


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@ -332,11 +332,11 @@
3 w request/menu NoLabel
4 ofercie request/menu NoLabel
# text: chciałbym 3 pizze, hawajskie duże
# text: chciałbym trzy pizze, hawajskie duże
# intent: inform/order
# slots:
1 chciałbym inform/order NoLabel
2 3 inform/order B-quantity
2 trzy inform/order B-quantity
3 pizze, inform/order B-food
4 hawajskie inform/order B-pizza
5 duże inform/order B-size
@ -585,11 +585,11 @@
4 tuna inform/order B-pizza
5 XL inform/order B-size
# text: wezmę 3 pizze tuna, średnią, dużą i bardzo dużą
# text: wezmę 3x pizze tuna, średnią, dużą i bardzo dużą
# intent: inform/order
# slots:
1 wezmę inform/order NoLabel
2 3 inform/order B-quantity
2 3x inform/order B-quantity
3 pizze inform/order B-food
4 tuna, inform/order B-pizza
5 średnią, inform/order B-size
@ -825,6 +825,14 @@
1 jakie request/ingredients NoLabel
2 składniki request/ingredients NoLabel
# text: co jest na pizzy
# intent: request/ingredients
# slots:
1 co request/ingredients NoLabel
2 jest request/ingredients NoLabel
3 na request/ingredients NoLabel
4 pizzy request/ingredients NoLabel
# text: jakie są napoje
# intent: request/drinks
# slots:
@ -850,3 +858,54 @@
2 macie request/drinks NoLabel
3 do request/drinks NoLabel
4 picia request/drinks NoLabel
# text: czy są dostępne jakieś sosy?
# intent: request/sauce
# slots:
1 czy request/sauce NoLabel
2 są request/sauce NoLabel
3 dostępne request/sauce NoLabel
4 jakieś request/sauce NoLabel
5 sosy? request/sauce NoLabel
# text: Grzegorz Pieczarski
# intent: inform/name
# slots:
1 Grzegorz inform/name B-name
2 Pieczarski inform/name I-name
# text: Sergiusz Kaczmarek
# intent: inform/name
# slots:
1 Sergiusz inform/name B-name
2 Kaczmarek inform/name I-name
# text: jaki koszt dowozu
# intent: request/delivery-price
# slots:
1 jaki request/delivery-price NoLabel
2 koszt request/delivery-price NoLabel
3 dowozu request/delivery-price NoLabel
# text: jakie sosy w menu?
# intent: request/sauce
# slots:
1 jakie request/sauce NoLabel
2 sosy request/sauce NoLabel
3 w request/sauce NoLabel
4 menu? request/sauce NoLabel
# text: Napój pepsi i cola
# intent: inform/order
# slots:
1 Napój inform/order NoLabel
2 pepsi inform/order B-drink
3 i inform/order NoLabel
4 cola inform/order B-drink
# text: woda i sok
# intent: inform/order
# slots:
1 woda inform/order B-drink
2 i inform/order NoLabel
3 sok inform/order B-drink

View File

@ -1,45 +1,63 @@
import re
import os
import pandas as pd
import numpy as np
from nlu_utils import predict_multiple
from flair.models import SequenceTagger
# Shortest run of text that starts at "(" or "[" and ends at the next ")" or
# "]". Written as a raw string so the backslashes are regex escapes rather than
# (invalid) Python string escapes.
_SLOT_ARGS_PATTERN = re.compile(r"[\(\[].*?[\)\]]")


def __parse_acts(acts):
    """Return the set of act names found in an ``&``-separated annotation.

    Args:
        acts: Annotation string; individual acts are separated by ``&`` and
            may carry a parenthesised or bracketed argument list.

    Returns:
        A set with one entry per distinct act after the bracketed parts have
        been removed.
    """
    return {_SLOT_ARGS_PATTERN.sub("", act) for act in acts.split('&')}
def __parse_predictions(predictions):
    """Return the distinct prefixes (text before the first '/') of the predictions."""
    act_names = set()
    for label in predictions:
        act_name, _, _ = label.partition('/')
        act_names.add(act_name)
    return act_names
# Exploratory tests: compare the acts predicted by the frame model with the
# acts annotated for every user turn in the data/*.tsv transcripts.
frame_model = SequenceTagger.load('frame-model-prod/best-model.pt')
# slot_model = SequenceTagger.load('slot-model-prod/final-model.pt')

total_acts = 0
act_correct_predictions = 0
slot_correct_predictions = 0

tsv_files = [name for name in os.listdir('data') if name.split('.')[-1] == 'tsv']
for file_name in tsv_files:
    df = pd.read_csv(f'data/{file_name}', sep='\t', names=['kto', 'treść', 'akt'])
    df = df[df.kto == 'user']
    all_data = np.array(df)
    # Each row is (speaker, utterance, annotated acts).
    for _, sentence, annotated in all_data:
        expected = __parse_acts(annotated)
        predicted = __parse_predictions(
            predict_multiple(frame_model, sentence.split(), 'frame')
        )
        # Both sides are sets, so the intersection size is the number of
        # annotated acts that were also predicted.
        total_acts += len(expected)
        act_correct_predictions += len(expected & predicted)
import re
import os
import pandas as pd
import numpy as np
from nlu_utils import predict_multiple
from flair.models import SequenceTagger
from conllu import parse_incr
from flair.data import Corpus
from nlu_utils import conllu2flair, nolabel2o
# Held-out test set shared by both model evaluations below.
TEST_SET_PATH = 'data/test_dialog_46.conllu'
CONLLU_FIELDS = ['id', 'form', 'frame', 'slot']


def _evaluate_on_test_set(model, label_type, mini_batch_size, field_parsers=None):
    """Evaluate a tagger on the test file and return flair's result object.

    Args:
        model: A loaded SequenceTagger.
        label_type: Column to evaluate against, "frame" or "slot".
        mini_batch_size: Batch size passed to ``model.evaluate``.
        field_parsers: Optional per-field parsers passed to ``parse_incr``.

    Returns:
        Whatever ``model.evaluate`` returns for the test split.
    """
    with open(TEST_SET_PATH, encoding='utf-8') as test_file:
        testset = list(parse_incr(test_file, fields=CONLLU_FIELDS, field_parsers=field_parsers or {}))
    corpus = Corpus(test=conllu2flair(testset, label_type))
    return model.evaluate(corpus.test, mini_batch_size=mini_batch_size, gold_label_type=label_type)


# Frame model evaluation
frame_model = SequenceTagger.load('frame-model-prod/best-model.pt')
result = _evaluate_on_test_set(frame_model, "frame", mini_batch_size=1)
print(result.detailed_results)

# Slot model evaluation ('NoLabel' is mapped to 'O' while parsing)
slot_model = SequenceTagger.load('slot-model-prod/best-model.pt')
result = _evaluate_on_test_set(slot_model, "slot", mini_batch_size=8, field_parsers={'slot': nolabel2o})
print(result.detailed_results)
# Custom evaluation
# Shortest run of text that starts at "(" or "[" and ends at the next ")" or
# "]". Written as a raw string so the backslashes are regex escapes rather than
# (invalid) Python string escapes.
_SLOT_ARGS_PATTERN = re.compile(r"[\(\[].*?[\)\]]")


def __parse_acts(acts):
    """Return the set of act names found in an ``&``-separated annotation.

    Args:
        acts: Annotation string; individual acts are separated by ``&`` and
            may carry a parenthesised or bracketed argument list.

    Returns:
        A set with one entry per distinct act after the bracketed parts have
        been removed.
    """
    return {_SLOT_ARGS_PATTERN.sub("", act) for act in acts.split('&')}
def __parse_predictions(predictions):
    """Return the distinct prefixes (text before the first '/') of the predictions."""
    act_names = set()
    for label in predictions:
        act_name, _, _ = label.partition('/')
        act_names.add(act_name)
    return act_names
# Act-level accuracy over the user turns of every data/*.tsv transcript:
# an annotated act counts as correct when the frame model predicts it.
total_acts = 0
act_correct_predictions = 0
slot_correct_predictions = 0  # kept for compatibility; never updated here

# Sorted so the files are processed in a reproducible order.
for file_name in sorted(os.listdir('data')):
    if not file_name.endswith('.tsv'):
        continue
    df = pd.read_csv(f'data/{file_name}', sep='\t', names=['kto', 'treść', 'akt'])
    # Rows without an utterance or an annotation cannot be scored; pandas
    # reads them as NaN, which has no .split().
    df = df[df.kto == 'user'].dropna(subset=['treść', 'akt'])
    for sentence, annotated in zip(df['treść'], df['akt']):
        acts = __parse_acts(annotated)
        predictions_raw = predict_multiple(frame_model, sentence.split(), 'frame')
        predictions = __parse_predictions(predictions_raw)
        total_acts += len(acts)
        act_correct_predictions += len(acts & predictions)

if total_acts:
    print(f"Accuracy - predicting acts: {(act_correct_predictions / total_acts)*100} ({act_correct_predictions}/{total_acts})")
else:
    # No scorable rows were found; avoid dividing by zero.
    print("Accuracy - predicting acts: n/a (0/0)")

View File

@ -1,30 +1,30 @@
from flair.models import SequenceTagger
from nlu_utils import predict_single, predict_multiple, predict_and_annotate
# Exploratory tests: print the models' predictions for a few hand-picked
# utterances.
frame_model = SequenceTagger.load('frame-model/best-model.pt')

tests = [
    'chciałbym zamówić pizzę',
    'na godzinę 12',
    'prosiłbym o pizzę z pieczarkami',
    'to wszystko, jaka cena?',
    'ile kosztuje pizza',
    'do widzenia',
    'tak',
    'nie dziękuję',
    'dodatkowy ser',
    'pizzę barcelona bez cebuli',
]

# print("=== Exploratory tests - frame model ===")
for test in tests:
    words = test.split()
    print(f"Sentence: {test}")
    print(f"Single prediction: {predict_single(frame_model, words, 'frame')}")
    print(f"Multiple predictions: {predict_multiple(frame_model, words, 'frame')}")
    print(f"Annotated sentence: {predict_and_annotate(frame_model, words, 'frame')}")

print("=== Exploratory tests - slot model ===")
slot_model = SequenceTagger.load('slot-model/final-model.pt')
for test in tests:
    print(f"Sentence: {test}")
from flair.models import SequenceTagger
from nlu_utils import predict_single, predict_multiple, predict_and_annotate
# Exploratory tests: print the models' predictions for a few hand-picked
# utterances.
frame_model = SequenceTagger.load('frame-model/best-model.pt')

tests = [
    'chciałbym zamówić pizzę',
    'na godzinę 12',
    'prosiłbym o pizzę z pieczarkami',
    'to wszystko, jaka cena?',
    'ile kosztuje pizza',
    'do widzenia',
    'tak',
    'nie dziękuję',
    'dodatkowy ser',
    'pizzę barcelona bez cebuli',
]

# print("=== Exploratory tests - frame model ===")
for test in tests:
    words = test.split()
    print(f"Sentence: {test}")
    print(f"Single prediction: {predict_single(frame_model, words, 'frame')}")
    print(f"Multiple predictions: {predict_multiple(frame_model, words, 'frame')}")
    print(f"Annotated sentence: {predict_and_annotate(frame_model, words, 'frame')}")

print("=== Exploratory tests - slot model ===")
slot_model = SequenceTagger.load('slot-model/final-model.pt')
for test in tests:
    words = test.split()
    print(f"Sentence: {test}")
    print(f"Prediction: {predict_and_annotate(slot_model, words, 'slot')}")

View File

@ -1,46 +1,42 @@
from conllu import parse_incr
from flair.data import Corpus
from flair.embeddings import StackedEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from nlu_utils import conllu2flair, nolabel2o
import random
import torch
# Seed Python's and torch's random number generators.
random.seed(42)
torch.manual_seed(42)

if torch.cuda.is_available():
    # The CUDA generators are seeded with 0, not 42, as in the original setup.
    for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_cuda(0)
    # NOTE(review): the source lost its indentation; the cuDNN switches are
    # assumed to belong to this CUDA-only branch - confirm.
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
def train_model(label_type, field_parsers=None):
    """Train a flair SequenceTagger for one label column and save it.

    The training data is read from ``data/train_dialog.conllu`` (columns
    id, form, frame, slot). The trainer writes its output under
    ``f'{label_type}-model'``.

    Args:
        label_type: Column to learn, e.g. "frame" or "slot".
        field_parsers: Optional per-field parsers forwarded to
            ``conllu.parse_incr``. Defaults to no custom parsers.
    """
    # None instead of a shared mutable {} default.
    if field_parsers is None:
        field_parsers = {}

    with open('data/train_dialog.conllu', encoding='utf-8') as trainfile:
        trainset = list(parse_incr(trainfile, fields=['id', 'form', 'frame', 'slot'], field_parsers=field_parsers))

    # NOTE: the test split is built from the training sentences, so the test
    # scores reported by the trainer are measured on data the model has seen.
    corpus = Corpus(train=conllu2flair(trainset, label_type), test=conllu2flair(trainset, label_type))
    label_dictionary = corpus.make_label_dictionary(label_type=label_type)

    embedding_types = [
        WordEmbeddings('pl'),
        FlairEmbeddings('pl-forward'),
        FlairEmbeddings('pl-backward'),
        CharacterEmbeddings(),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=label_dictionary, tag_type=label_type, use_crf=True, tag_format="BIO")
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(f'{label_type}-model', learning_rate=0.1, mini_batch_size=32, max_epochs=75, train_with_dev=False)


if __name__ == '__main__':
    train_model("frame")
    # Slot labels use 'NoLabel' in the data files; map it to 'O' while parsing.
    train_model('slot', field_parsers={'slot': nolabel2o})
from conllu import parse_incr
from flair.data import Corpus
from flair.embeddings import StackedEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from nlu_utils import conllu2flair, nolabel2o
import torch
# cuDNN switches for runs on a CUDA device.
# NOTE(review): the source lost its indentation; all three assignments are
# assumed to sit inside the CUDA check - confirm.
if torch.cuda.is_available():
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.enabled = False
    cudnn_backend.benchmark = False
    cudnn_backend.deterministic = True
def train_model(label_type, field_parsers=None):
    """Train a flair SequenceTagger for one label column and save it.

    Training data comes from ``data/train_dialog.conllu`` and the test split
    from ``data/test_dialog_46.conllu`` (columns id, form, frame, slot). The
    trainer writes its output under ``f'{label_type}-model'``.

    Args:
        label_type: Column to learn, e.g. "frame" or "slot".
        field_parsers: Optional per-field parsers forwarded to
            ``conllu.parse_incr`` for both files. Defaults to no custom
            parsers.
    """
    # None instead of a shared mutable {} default.
    if field_parsers is None:
        field_parsers = {}

    fields = ['id', 'form', 'frame', 'slot']
    with open('data/train_dialog.conllu', encoding='utf-8') as f:
        trainset = list(parse_incr(f, fields=fields, field_parsers=field_parsers))
    with open('data/test_dialog_46.conllu', encoding='utf-8') as f:
        testset = list(parse_incr(f, fields=fields, field_parsers=field_parsers))

    # (A leftover breakpoint() used to stop here and blocked unattended runs.)
    corpus = Corpus(train=conllu2flair(trainset, label_type), test=conllu2flair(testset, label_type))
    label_dictionary = corpus.make_label_dictionary(label_type=label_type)

    embedding_types = [
        WordEmbeddings('pl'),
        FlairEmbeddings('pl-forward'),
        FlairEmbeddings('pl-backward'),
        CharacterEmbeddings(),
    ]
    embeddings = StackedEmbeddings(embeddings=embedding_types)

    tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=label_dictionary, tag_type=label_type, use_crf=True, tag_format="BIO")
    trainer = ModelTrainer(tagger, corpus)
    trainer.train(f'{label_type}-model', learning_rate=0.1, mini_batch_size=16, max_epochs=75, train_with_dev=False)


if __name__ == '__main__':
    train_model("frame")
    # NOTE(review): slot training is disabled in this revision; re-enable the
    # next line to retrain the slot model.
    # train_model('slot', field_parsers={'slot': nolabel2o})

View File

@ -1,100 +1,101 @@
from flair.data import Sentence
from flair.datasets import FlairDatapointDataset
def nolabel2o(line, i):
    """Field parser: return ``line[i]``, with 'NoLabel' replaced by 'O'."""
    value = line[i]
    if value == 'NoLabel':
        return 'O'
    return value
def conllu2flair(sentences, label=None):
    """Convert parsed sentences to a flair dataset.

    The "frame" label uses the per-token converter; every other value
    (including None) uses the BIO span converter.
    """
    converter = conllu2flair_frame if label == "frame" else conllu2flair_slot
    return converter(sentences, label)
def conllu2flair_frame(sentences, label=None):
    """Build a flair dataset in which every single token carries its own label.

    Args:
        sentences: Sequence of sentences; each sentence is a sequence of
            token mappings with a "form" entry and a ``label`` entry.
        label: Name of the token field to copy, also used as the flair
            label type.

    Returns:
        A FlairDatapointDataset with one Sentence per input sentence.
    """
    def to_flair(sentence):
        text = ' '.join(token["form"] for token in sentence)
        fsentence = Sentence(text, use_tokenizer=False)
        for position in range(len(fsentence)):
            # A one-token slice, labelled with that token's own value.
            fsentence[position:position + 1].add_label(label, sentence[position][label])
        return fsentence

    return FlairDatapointDataset([to_flair(sentence) for sentence in sentences])
def _bio_spans(tags):
    """Decode a sequence of BIO tags into ``(start, end, name)`` spans.

    ``end`` is inclusive. A 'B-x' tag closes any open span and opens a new
    one named 'x'; an 'I-' tag extends the open span (and is ignored when no
    span is open); any other tag closes the open span.
    """
    spans = []
    start = end = name = None
    for idx, tag in enumerate(tags):
        if tag.startswith('B-'):
            # Two adjacent spans: emit the previous one before starting anew.
            if start is not None:
                spans.append((start, end, name))
            start, end, name = idx, idx, tag[2:]
        elif tag.startswith('I-'):
            if start is not None:
                end = idx
        else:
            if start is not None:
                spans.append((start, end, name))
            start = end = name = None
    if start is not None:
        # Span that runs to the end of the sentence.
        spans.append((start, end, name))
    return spans


def conllu2flair_slot(sentences, label=None):
    """Build a flair dataset with span labels decoded from BIO token tags.

    Args:
        sentences: Sequence of sentences; each sentence is a sequence of
            token mappings with a "form" entry and, when ``label`` is
            given, a BIO tag under ``label``.
        label: Name of the BIO tag field and of the flair label type. When
            falsy, sentences are created without any labels.

    Returns:
        A FlairDatapointDataset with one Sentence per input sentence.
    """
    fsentences = []
    for sentence in sentences:
        fsentence = Sentence(' '.join(token['form'] for token in sentence), use_tokenizer=False)
        if label:
            # zip() keeps the original behaviour of stopping at the shorter of
            # the parsed sentence and the flair sentence.
            tags = [token[label] for token, _ in zip(sentence, fsentence)]
            for start, end, name in _bio_spans(tags):
                fsentence[start:end + 1].add_label(label, name)
        fsentences.append(fsentence)
    return FlairDatapointDataset(fsentences)
def __predict(model, csentence):
    """Convert one token-dict sentence to flair, run the model on it, and return it."""
    dataset = conllu2flair([csentence])
    fsentence = dataset[0]
    model.predict(fsentence)
    return fsentence
def __csentence(sentence, label_type):
    """Wrap each word in a token dict; non-frame sentences also get ``'slot': 'O'``."""
    extra_fields = {} if label_type == "frame" else {'slot': 'O'}
    return [{'form': word, **extra_fields} for word in sentence]
def predict_single(model, sentence, label_type):
    """Return the label the model predicts most often for the sentence.

    Args:
        model: Tagger exposing ``predict``.
        sentence: List of words.
        label_type: Label type to read from the predicted spans.

    Returns:
        The label value carried by the largest number of predicted spans
        (the first one encountered wins a tie), or None when the model
        predicts no span at all.
    """
    csentence = __csentence(sentence, label_type)
    fsentence = __predict(model, csentence)
    counts = {}
    for span in fsentence.get_spans(label_type):
        tag = span.get_label(label_type).value
        counts[tag] = counts.get(tag, 0) + 1
    if not counts:
        # max() on an empty dict would raise ValueError.
        return None
    return max(counts, key=counts.get)
def predict_multiple(model, sentence, label_type):
    """Return the set of distinct label values among the predicted spans."""
    fsentence = __predict(model, __csentence(sentence, label_type))
    spans = fsentence.get_spans(label_type)
    return {span.get_label(label_type).value for span in spans}
# Runs the model on a list of words and writes the predicted labels back into
# the per-token dicts built by __csentence.
def predict_and_annotate(model, sentence, label_type):
csentence = __csentence(sentence, label_type)
fsentence = __predict(model, csentence)
for span in fsentence.get_spans(label_type):
tag = span.get_label(label_type).value
# token.idx - 1 is used as the list position of the token (presumably
# because flair numbers tokens from 1 - confirm).
if label_type == "frame":
# Frame labels are stored on the first token of the span only.
csentence[span.tokens[0].idx-1]['frame'] = tag
else:
# Slot labels are written back in BIO form: B- on the first token ...
csentence[span.tokens[0].idx - 1]['slot'] = f'B-{tag}'
# ... and I- on the remaining tokens of the span.
for token in span.tokens[1:]:
csentence[token.idx - 1]['slot'] = f'I-{tag}'
from flair.data import Sentence
from flair.datasets import FlairDatapointDataset
def nolabel2o(line, i):
    """Field parser: return ``line[i]``, with 'NoLabel' replaced by 'O'."""
    value = line[i]
    if value == 'NoLabel':
        return 'O'
    return value
def conllu2flair(sentences, label=None):
    """Convert parsed sentences to a flair dataset.

    The "frame" label uses the per-token converter; every other value
    (including None) uses the BIO span converter.
    """
    converter = conllu2flair_frame if label == "frame" else conllu2flair_slot
    return converter(sentences, label)
def conllu2flair_frame(sentences, label=None):
    """Build a flair dataset in which every single token carries its own label.

    Args:
        sentences: Sequence of sentences; each sentence is a sequence of
            token mappings with a "form" entry and a ``label`` entry.
        label: Name of the token field to copy, also used as the flair
            label type.

    Returns:
        A FlairDatapointDataset with one Sentence per input sentence.
    """
    def to_flair(sentence):
        text = ' '.join(token["form"] for token in sentence)
        fsentence = Sentence(text, use_tokenizer=False)
        for position in range(len(fsentence)):
            # A one-token slice, labelled with that token's own value.
            fsentence[position:position + 1].add_label(label, sentence[position][label])
        return fsentence

    return FlairDatapointDataset([to_flair(sentence) for sentence in sentences])
def _bio_spans(tags):
    """Decode a sequence of BIO tags into ``(start, end, name)`` spans.

    ``end`` is inclusive. A 'B-x' tag closes any open span and opens a new
    one named 'x'; an 'I-' tag extends the open span (and is ignored when no
    span is open); any other tag closes the open span.
    """
    spans = []
    start = end = name = None
    for idx, tag in enumerate(tags):
        if tag.startswith('B-'):
            # Two adjacent spans: emit the previous one before starting anew.
            if start is not None:
                spans.append((start, end, name))
            start, end, name = idx, idx, tag[2:]
        elif tag.startswith('I-'):
            if start is not None:
                end = idx
        else:
            # Any tag that is neither B- nor I- is outside a span, not only 'O'.
            if start is not None:
                spans.append((start, end, name))
            start = end = name = None
    if start is not None:
        # Span that runs to the end of the sentence.
        spans.append((start, end, name))
    return spans


def conllu2flair_slot(sentences, label=None):
    """Build a flair dataset with span labels decoded from BIO token tags.

    Args:
        sentences: Sequence of sentences; each sentence is a sequence of
            token mappings with a "form" entry and, when ``label`` is
            given, a BIO tag under ``label``.
        label: Name of the BIO tag field and of the flair label type. When
            falsy, sentences are created without any labels.

    Returns:
        A FlairDatapointDataset with one Sentence per input sentence.
    """
    fsentences = []
    for sentence in sentences:
        fsentence = Sentence(' '.join(token['form'] for token in sentence), use_tokenizer=False)
        if label:
            # zip() keeps the original behaviour of stopping at the shorter of
            # the parsed sentence and the flair sentence.
            tags = [token[label] for token, _ in zip(sentence, fsentence)]
            for start, end, name in _bio_spans(tags):
                fsentence[start:end + 1].add_label(label, name)
        fsentences.append(fsentence)
    return FlairDatapointDataset(fsentences)
def __predict(model, csentence):
    """Convert one token-dict sentence to flair, run the model on it, and return it."""
    dataset = conllu2flair([csentence])
    fsentence = dataset[0]
    model.predict(fsentence)
    return fsentence
def __csentence(sentence, label_type):
    """Wrap each word in a token dict; non-frame sentences also get ``'slot': 'O'``."""
    extra_fields = {} if label_type == "frame" else {'slot': 'O'}
    return [{'form': word, **extra_fields} for word in sentence]
def predict_single(model, sentence, label_type):
    """Return the label the model predicts most often for the sentence.

    Args:
        model: Tagger exposing ``predict``.
        sentence: List of words.
        label_type: Label type to read from the predicted spans.

    Returns:
        The label value carried by the largest number of predicted spans
        (the first one encountered wins a tie), or None when the model
        predicts no span at all.
    """
    csentence = __csentence(sentence, label_type)
    fsentence = __predict(model, csentence)
    counts = {}
    for span in fsentence.get_spans(label_type):
        tag = span.get_label(label_type).value
        counts[tag] = counts.get(tag, 0) + 1
    if not counts:
        # max() on an empty dict would raise ValueError.
        return None
    return max(counts, key=counts.get)
def predict_multiple(model, sentence, label_type):
    """Return the set of distinct label values among the predicted spans."""
    fsentence = __predict(model, __csentence(sentence, label_type))
    spans = fsentence.get_spans(label_type)
    return {span.get_label(label_type).value for span in spans}
def predict_and_annotate(model, sentence, label_type):
    """Run the model on a list of words and return the annotated token dicts.

    For the "frame" label type the predicted value is stored under 'frame'
    on the first token of each span. For any other label type the span is
    written back in BIO form under 'slot': 'B-<tag>' on the first token and
    'I-<tag>' on the rest.
    """
    csentence = __csentence(sentence, label_type)
    fsentence = __predict(model, csentence)
    annotate_frame = label_type == "frame"

    for span in fsentence.get_spans(label_type):
        tag = span.get_label(label_type).value
        # token.idx - 1 is used as the list position of the token (presumably
        # because flair numbers tokens from 1 - confirm).
        first_position = span.tokens[0].idx - 1
        if annotate_frame:
            csentence[first_position]['frame'] = tag
            continue
        csentence[first_position]['slot'] = f'B-{tag}'
        # NOTE(review): the source lost its indentation; the I- loop is
        # assumed to belong to the slot branch only - confirm.
        for token in span.tokens[1:]:
            csentence[token.idx - 1]['slot'] = f'I-{tag}'

    return csentence