Finalne poprawki

2024-05-10 01:25:13 +02:00 · 2024-05-10 01:25:13 +02:00 · 16af0e732c
commit 16af0e732c
parent 846c6991e7
7 changed files with 952 additions and 878 deletions
--- a/conllu_generator.py
+++ b/conllu_generator.py
@@ -1,15 +1,15 @@
-print("Script to automatically append data to data/dialog.conllu")
-print("Start typing now. Press Ctrl+C to stop.")
-
-while True:
-    with open("data/train_dialog.conllu", "a") as f:
-        text = input("Text: ")
-        act = input("Intent: ")
-        slots = text.split(" ")
-        f.write(
-            f"\n# text: {text}\n# intent: {act}\n# slots:\n"
-        )
-        for i, slot in enumerate(slots):
-            label = input(f"{i}/{slot} label: ")
-            f.write(f"{i+1}\t{slot}\t{act}\t{label}\n")
-        print("---")
+print("Script to automatically append data to data/dialog.conllu")
+print("Start typing now. Press Ctrl+C to stop.")
+
+while True:
+    with open("data/train_dialog.conllu", "a") as f:
+        text = input("Text: ")
+        act = input("Intent: ")
+        slots = text.split(" ")
+        f.write(
+            f"\n# text: {text}\n# intent: {act}\n# slots:\n"
+        )
+        for i, slot in enumerate(slots):
+            label = input(f"{i}/{slot} label: ")
+            f.write(f"{i+1}\t{slot}\t{act}\t{label}\n")
+        print("---")
--- a/data/test_dialog_46.conllu
+++ b/data/test_dialog_46.conllu
--- a/data/train_dialog.conllu
+++ b/data/train_dialog.conllu
@@ -332,11 +332,11 @@
 3	w	request/menu	NoLabel
 4	ofercie	request/menu	NoLabel

-# text: chciałbym 3 pizze, hawajskie duże
+# text: chciałbym trzy pizze, hawajskie duże
 # intent: inform/order
 # slots:
 1	chciałbym	inform/order	NoLabel
-2	3	inform/order	B-quantity
+2	trzy	inform/order	B-quantity
 3	pizze,	inform/order	B-food
 4	hawajskie	inform/order	B-pizza
 5	duże	inform/order	B-size
@@ -585,11 +585,11 @@
 4	tuna	inform/order	B-pizza
 5	XL	inform/order	B-size

-# text: wezmę 3 pizze tuna, średnią, dużą i bardzo dużą
+# text: wezmę 3x pizze tuna, średnią, dużą i bardzo dużą
 # intent: inform/order
 # slots:
 1	wezmę	inform/order	NoLabel
-2	3	inform/order	B-quantity
+2	3x	inform/order	B-quantity
 3	pizze	inform/order	B-food
 4	tuna,	inform/order	B-pizza
 5	średnią,	inform/order	B-size
@@ -825,6 +825,14 @@
 1	jakie	request/ingredients	NoLabel
 2	składniki	request/ingredients	NoLabel

+# text: co jest na pizzy
+# intent: request/ingredients
+# slots:
+1	co	request/ingredients	NoLabel
+2	jest	request/ingredients	NoLabel
+3	na	request/ingredients	NoLabel
+4	pizzy	request/ingredients	NoLabel
+
 # text: jakie są napoje
 # intent: request/drinks
 # slots:
@@ -850,3 +858,54 @@
 2	macie	request/drinks	NoLabel
 3	do	request/drinks	NoLabel
 4	picia	request/drinks	NoLabel
+
+# text: czy są dostępne jakieś sosy?
+# intent: request/sauce
+# slots:
+1	czy	request/sauce	NoLabel
+2	są	request/sauce	NoLabel
+3	dostępne	request/sauce	NoLabel
+4	jakieś	request/sauce	NoLabel
+5	sosy?	request/sauce	NoLabel
+
+# text: Grzegorz Pieczarski
+# intent: inform/name
+# slots:
+1	Grzegorz	inform/name	B-name
+2	Pieczarski	inform/name	I-name
+
+# text: Sergiusz Kaczmarek
+# intent: inform/name
+# slots:
+1	Sergiusz	inform/name	B-name
+2	Kaczmarek	inform/name	I-name
+
+# text: jaki koszt dowozu
+# intent: request/delivery-price
+# slots:
+1	jaki	request/delivery-price	NoLabel
+2	koszt	request/delivery-price	NoLabel
+3	dowozu	request/delivery-price	NoLabel
+
+# text: jakie sosy w menu?
+# intent: request/sauce
+# slots:
+1	jakie	request/sauce	NoLabel
+2	sosy	request/sauce	NoLabel
+3	w	request/sauce	NoLabel
+4	menu?	request/sauce	NoLabel
+
+# text: Napój pepsi i cola
+# intent: inform/order
+# slots:
+1	Napój	inform/order	NoLabel
+2	pepsi	inform/order	B-drink
+3	i	inform/order	NoLabel
+4	cola	inform/order	B-drink
+
+# text: woda i sok
+# intent: inform/order
+# slots:
+1	woda	inform/order	B-drink
+2	i	inform/order	NoLabel
+3	sok	inform/order	B-drink
--- a/evaluate.py
+++ b/evaluate.py
@@ -1,45 +1,63 @@
-import re
-import os
-import pandas as pd
-import numpy as np
-from nlu_utils import predict_multiple
-from flair.models import SequenceTagger
-
-def __parse_acts(acts):
-    acts_split = acts.split('&')
-    remove_slot_regex = "[\(\[].*?[\)\]]"
-    return set(re.sub(remove_slot_regex, "", act) for act in acts_split)
-
-def __parse_predictions(predictions):
-    return set(prediction.split('/')[0] for prediction in predictions)
-
-# Exploratory tests
-frame_model = SequenceTagger.load('frame-model-prod/best-model.pt')
-# slot_model = SequenceTagger.load('slot-model-prod/final-model.pt')
-
-total_acts = 0
-act_correct_predictions = 0
-slot_correct_predictions = 0
-
-for file_name in os.listdir('data'):
-    if file_name.split('.')[-1] != 'tsv':
-        continue
-
-    df = pd.read_csv(f'data/{file_name}', sep='\t', names=['kto', 'treść', 'akt'])
-    df = df[df.kto == 'user']
-    all_data = np.array(df)
-
-    for row in all_data:
-        sentence = row[1]
-        acts = __parse_acts(row[2])
-
-        predictions_raw = predict_multiple(frame_model, sentence.split(), 'frame')
-        predictions = __parse_predictions(predictions_raw)
-
-        for act in acts:
-            total_acts += 1
-            if act in predictions:
-                act_correct_predictions += 1
-                
-
+import re
+import os
+import pandas as pd
+import numpy as np
+from nlu_utils import predict_multiple
+from flair.models import SequenceTagger
+from conllu import parse_incr
+from flair.data import Corpus
+from nlu_utils import conllu2flair, nolabel2o
+
+# Frame model evaluation
+frame_model = SequenceTagger.load('frame-model-prod/best-model.pt')
+with open('data/test_dialog_46.conllu', encoding='utf-8') as trainfile:
+    testset = list(parse_incr(trainfile, fields=['id', 'form', 'frame', 'slot'], field_parsers={}))
+
+corpus = Corpus(test=conllu2flair(testset, "frame"))
+result = frame_model.evaluate(corpus.test, mini_batch_size=1, gold_label_type="frame")
+print(result.detailed_results)
+
+# Slot model evaluation
+slot_model = SequenceTagger.load('slot-model-prod/best-model.pt')
+
+with open('data/test_dialog_46.conllu', encoding='utf-8') as trainfile:
+    testset = list(parse_incr(trainfile, fields=['id', 'form', 'frame', 'slot'], field_parsers={'slot': nolabel2o}))
+
+corpus = Corpus(test=conllu2flair(testset, "slot"))
+result = slot_model.evaluate(corpus.test, mini_batch_size=8, gold_label_type="slot")
+print(result.detailed_results)
+
+# Custom evaluation
+def __parse_acts(acts):
+    acts_split = acts.split('&')
+    remove_slot_regex = "[\(\[].*?[\)\]]"
+    return set(re.sub(remove_slot_regex, "", act) for act in acts_split)
+
+def __parse_predictions(predictions):
+    return set(prediction.split('/')[0] for prediction in predictions)
+
+total_acts = 0
+act_correct_predictions = 0
+slot_correct_predictions = 0
+
+for file_name in os.listdir('data'):
+    if file_name.split('.')[-1] != 'tsv':
+        continue
+
+    df = pd.read_csv(f'data/{file_name}', sep='\t', names=['kto', 'treść', 'akt'])
+    df = df[df.kto == 'user']
+    all_data = np.array(df)
+
+    for row in all_data:
+        sentence = row[1]
+        acts = __parse_acts(row[2])
+
+        predictions_raw = predict_multiple(frame_model, sentence.split(), 'frame')
+        predictions = __parse_predictions(predictions_raw)
+
+        for act in acts:
+            total_acts += 1
+            if act in predictions:
+                act_correct_predictions += 1
+
 print(f"Accuracy - predicting acts: {(act_correct_predictions / total_acts)*100} ({act_correct_predictions}/{total_acts})")
--- a/nlu_tests.py
+++ b/nlu_tests.py
@@ -1,30 +1,30 @@
-from flair.models import SequenceTagger
-from nlu_utils import predict_single, predict_multiple, predict_and_annotate
-
-# Exploratory tests
-frame_model = SequenceTagger.load('frame-model/best-model.pt')
-tests = [
-    'chciałbym zamówić pizzę',
-    'na godzinę 12',
-    'prosiłbym o pizzę z pieczarkami',
-    'to wszystko, jaka cena?',
-    'ile kosztuje pizza',
-    'do widzenia',
-    'tak',
-    'nie dziękuję',
-    'dodatkowy ser',
-    'pizzę barcelona bez cebuli',
-]
-
-# print("=== Exploratory tests - frame model ===")
-for test in tests:
-    print(f"Sentence: {test}")
-    print(f"Single prediction: {predict_single(frame_model, test.split(), 'frame')}")
-    print(f"Multiple predictions: {predict_multiple(frame_model, test.split(), 'frame')}")
-    print(f"Annotated sentence: {predict_and_annotate(frame_model, test.split(), 'frame')}")
-
-print("=== Exploratory tests - slot model ===")
-slot_model = SequenceTagger.load('slot-model/final-model.pt')
-for test in tests:
-    print(f"Sentence: {test}")
+from flair.models import SequenceTagger
+from nlu_utils import predict_single, predict_multiple, predict_and_annotate
+
+# Exploratory tests
+frame_model = SequenceTagger.load('frame-model/best-model.pt')
+tests = [
+    'chciałbym zamówić pizzę',
+    'na godzinę 12',
+    'prosiłbym o pizzę z pieczarkami',
+    'to wszystko, jaka cena?',
+    'ile kosztuje pizza',
+    'do widzenia',
+    'tak',
+    'nie dziękuję',
+    'dodatkowy ser',
+    'pizzę barcelona bez cebuli',
+]
+
+# print("=== Exploratory tests - frame model ===")
+for test in tests:
+    print(f"Sentence: {test}")
+    print(f"Single prediction: {predict_single(frame_model, test.split(), 'frame')}")
+    print(f"Multiple predictions: {predict_multiple(frame_model, test.split(), 'frame')}")
+    print(f"Annotated sentence: {predict_and_annotate(frame_model, test.split(), 'frame')}")
+
+print("=== Exploratory tests - slot model ===")
+slot_model = SequenceTagger.load('slot-model/final-model.pt')
+for test in tests:
+    print(f"Sentence: {test}")
    print(f"Prediction: {predict_and_annotate(slot_model, test.split(), 'slot')}")
--- a/nlu_train.py
+++ b/nlu_train.py
@@ -1,46 +1,42 @@
-from conllu import parse_incr
-from flair.data import Corpus
-from flair.embeddings import StackedEmbeddings
-from flair.embeddings import WordEmbeddings
-from flair.embeddings import CharacterEmbeddings
-from flair.embeddings import FlairEmbeddings
-from flair.models import SequenceTagger
-from flair.trainers import ModelTrainer
-from nlu_utils import conllu2flair, nolabel2o
-
-import random
-import torch
-random.seed(42)
-torch.manual_seed(42)
-
-if torch.cuda.is_available():
-    torch.cuda.manual_seed(0)
-    torch.cuda.manual_seed_all(0)
-    torch.backends.cudnn.enabled = False
-    torch.backends.cudnn.benchmark = False
-    torch.backends.cudnn.deterministic = True
-
-
-def train_model(label_type, field_parsers = {}):
-    with open('data/train_dialog.conllu', encoding='utf-8') as trainfile:
-        trainset = list(parse_incr(trainfile, fields=['id', 'form', 'frame', 'slot'], field_parsers=field_parsers))
-
-    corpus = Corpus(train=conllu2flair(trainset, label_type), test=conllu2flair(trainset, label_type))
-    label_dictionary = corpus.make_label_dictionary(label_type=label_type)
-
-    embedding_types = [
-        WordEmbeddings('pl'),
-        FlairEmbeddings('pl-forward'),
-        FlairEmbeddings('pl-backward'),
-        CharacterEmbeddings(),
-    ]
-
-    embeddings = StackedEmbeddings(embeddings=embedding_types)
-    tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=label_dictionary, tag_type=label_type, use_crf=True, tag_format="BIO")
-                
-    frame_trainer = ModelTrainer(tagger, corpus)
-    frame_trainer.train(f'{label_type}-model', learning_rate=0.1, mini_batch_size=32, max_epochs=75, train_with_dev=False)
-
-if __name__ == '__main__':
-    train_model("frame")
-    train_model('slot', field_parsers={'slot': nolabel2o})
+from conllu import parse_incr
+from flair.data import Corpus
+from flair.embeddings import StackedEmbeddings
+from flair.embeddings import WordEmbeddings
+from flair.embeddings import CharacterEmbeddings
+from flair.embeddings import FlairEmbeddings
+from flair.models import SequenceTagger
+from flair.trainers import ModelTrainer
+from nlu_utils import conllu2flair, nolabel2o
+
+import torch
+if torch.cuda.is_available():
+    torch.backends.cudnn.enabled = False
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True
+
+def train_model(label_type, field_parsers = {}):
+    with open('data/train_dialog.conllu', encoding='utf-8') as f:
+        trainset = list(parse_incr(f, fields=['id', 'form', 'frame', 'slot'], field_parsers=field_parsers))
+    with open('data/test_dialog_46.conllu', encoding='utf-8') as f:
+        testset = list(parse_incr(f, fields=['id', 'form', 'frame', 'slot'], field_parsers=field_parsers))
+
+    breakpoint()
+    corpus = Corpus(train=conllu2flair(trainset, label_type), test=conllu2flair(testset, label_type))
+    label_dictionary = corpus.make_label_dictionary(label_type=label_type)
+
+    embedding_types = [
+        WordEmbeddings('pl'),
+        FlairEmbeddings('pl-forward'),
+        FlairEmbeddings('pl-backward'),
+        CharacterEmbeddings(),
+    ]
+
+    embeddings = StackedEmbeddings(embeddings=embedding_types)
+    tagger = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=label_dictionary, tag_type=label_type, use_crf=True, tag_format="BIO")
+                
+    frame_trainer = ModelTrainer(tagger, corpus)
+    frame_trainer.train(f'{label_type}-model', learning_rate=0.1, mini_batch_size=16, max_epochs=75, train_with_dev=False)
+
+if __name__ == '__main__':
+    train_model("frame")
+    # train_model('slot', field_parsers={'slot': nolabel2o})
--- a/nlu_utils.py
+++ b/nlu_utils.py
@@ -1,100 +1,101 @@
-from flair.data import Sentence
-from flair.datasets import FlairDatapointDataset
-
-def nolabel2o(line, i):
-    return 'O' if line[i] == 'NoLabel' else line[i]
-
-def conllu2flair(sentences, label=None):
-    if label == "frame":
-        return conllu2flair_frame(sentences, label)
-    else:
-        return conllu2flair_slot(sentences, label)
-
-def conllu2flair_frame(sentences, label=None):
-    fsentences = []
-    for sentence in sentences:
-        tokens = [token["form"] for token in sentence]
-        fsentence = Sentence(' '.join(tokens), use_tokenizer=False)
-
-        for i in range(len(fsentence)):
-            fsentence[i:i+1].add_label(label, sentence[i][label])
-
-        fsentences.append(fsentence)
-
-    return FlairDatapointDataset(fsentences)
-
-def conllu2flair_slot(sentences, label=None):
-    fsentences = []
-
-    for sentence in sentences:
-        fsentence = Sentence(' '.join(token['form'] for token in sentence), use_tokenizer=False)
-        start_idx = None
-        end_idx = None
-        tag = None
-
-        if label:
-            for idx, (token, ftoken) in enumerate(zip(sentence, fsentence)):
-                if token[label].startswith('B-'):
-                    start_idx = idx
-                    end_idx = idx
-                    tag = token[label][2:]
-                elif token[label].startswith('I-'):
-                    end_idx = idx
-                elif token[label] == 'O':
-                    if start_idx is not None:
-                        fsentence[start_idx:end_idx+1].add_label(label, tag)
-                        start_idx = None
-                        end_idx = None
-                        tag = None
-
-            if start_idx is not None:
-                fsentence[start_idx:end_idx+1].add_label(label, tag)
-
-        fsentences.append(fsentence)
-    return FlairDatapointDataset(fsentences)
-
-def __predict(model, csentence):
-    fsentence = conllu2flair([csentence])[0]
-    model.predict(fsentence)
-    return fsentence
-
-def __csentence(sentence, label_type):
-    if label_type == "frame":
-        return [{'form': word } for word in sentence]
-    else:
-        return [{'form': word, 'slot': 'O'} for word in sentence]
-
-def predict_single(model, sentence, label_type):
-    csentence = __csentence(sentence, label_type)
-    fsentence = __predict(model, csentence)
-    intent = {}
-
-    for span in fsentence.get_spans(label_type):
-        tag = span.get_label(label_type).value
-        if tag in intent:
-            intent[tag] += 1
-        else:
-            intent[tag] = 1
-
-    return max(intent, key=intent.get)
-
-def predict_multiple(model, sentence, label_type):
-    csentence = __csentence(sentence, label_type)
-    fsentence = __predict(model, csentence)
-
-    return set(span.get_label(label_type).value for span in fsentence.get_spans(label_type))
-
-def predict_and_annotate(model, sentence, label_type):
-    csentence = __csentence(sentence, label_type)
-    fsentence = __predict(model, csentence)
-
-    for span in fsentence.get_spans(label_type):
-        tag = span.get_label(label_type).value
-        if label_type == "frame":
-            csentence[span.tokens[0].idx-1]['frame'] = tag
-        else:
-            csentence[span.tokens[0].idx - 1]['slot'] = f'B-{tag}'
-            for token in span.tokens[1:]:
-                csentence[token.idx - 1]['slot'] = f'I-{tag}'
-
+from flair.data import Sentence
+from flair.datasets import FlairDatapointDataset
+
+def nolabel2o(line, i):
+    return 'O' if line[i] == 'NoLabel' else line[i]
+
+def conllu2flair(sentences, label=None):
+    if label == "frame":
+        return conllu2flair_frame(sentences, label)
+    else:
+        return conllu2flair_slot(sentences, label)
+
+def conllu2flair_frame(sentences, label=None):
+    fsentences = []
+    for sentence in sentences:
+        tokens = [token["form"] for token in sentence]
+        fsentence = Sentence(' '.join(tokens), use_tokenizer=False)
+
+        for i in range(len(fsentence)):
+            fsentence[i:i+1].add_label(label, sentence[i][label])
+
+        fsentences.append(fsentence)
+
+    return FlairDatapointDataset(fsentences)
+
+def conllu2flair_slot(sentences, label=None):
+    fsentences = []
+    for sentence in sentences:
+        fsentence = Sentence(' '.join(token['form'] for token in sentence), use_tokenizer=False)
+        start_idx = None
+        end_idx = None
+        tag = None
+
+        if label:
+            for idx, (token, ftoken) in enumerate(zip(sentence, fsentence)):
+                if token[label].startswith('B-'):
+                    if start_idx is not None:
+                        fsentence[start_idx:end_idx+1].add_label(label, tag)
+                    start_idx = idx
+                    end_idx = idx
+                    tag = token[label][2:]
+                elif token[label].startswith('I-'):
+                    end_idx = idx
+                elif token[label] == 'O':
+                    if start_idx is not None:
+                        fsentence[start_idx:end_idx+1].add_label(label, tag)
+                        start_idx = None
+                        end_idx = None
+                        tag = None
+
+            if start_idx is not None:
+                fsentence[start_idx:end_idx+1].add_label(label, tag)
+
+        fsentences.append(fsentence)
+    return FlairDatapointDataset(fsentences)
+
+def __predict(model, csentence):
+    fsentence = conllu2flair([csentence])[0]
+    model.predict(fsentence)
+    return fsentence
+
+def __csentence(sentence, label_type):
+    if label_type == "frame":
+        return [{'form': word } for word in sentence]
+    else:
+        return [{'form': word, 'slot': 'O'} for word in sentence]
+
+def predict_single(model, sentence, label_type):
+    csentence = __csentence(sentence, label_type)
+    fsentence = __predict(model, csentence)
+    intent = {}
+
+    for span in fsentence.get_spans(label_type):
+        tag = span.get_label(label_type).value
+        if tag in intent:
+            intent[tag] += 1
+        else:
+            intent[tag] = 1
+
+    return max(intent, key=intent.get)
+
+def predict_multiple(model, sentence, label_type):
+    csentence = __csentence(sentence, label_type)
+    fsentence = __predict(model, csentence)
+
+    return set(span.get_label(label_type).value for span in fsentence.get_spans(label_type))
+
+def predict_and_annotate(model, sentence, label_type):
+    csentence = __csentence(sentence, label_type)
+    fsentence = __predict(model, csentence)
+
+    for span in fsentence.get_spans(label_type):
+        tag = span.get_label(label_type).value
+        if label_type == "frame":
+            csentence[span.tokens[0].idx-1]['frame'] = tag
+        else:
+            csentence[span.tokens[0].idx - 1]['slot'] = f'B-{tag}'
+            for token in span.tokens[1:]:
+                csentence[token.idx - 1]['slot'] = f'I-{tag}'
+
    return csentence