From b9410fe8de1675f6a6596c0b76e5832c271c219b Mon Sep 17 00:00:00 2001 From: Adrian Charkiewicz Date: Wed, 25 May 2022 13:12:41 +0200 Subject: [PATCH] initial generowanie odpowiedzi --- ...g-semantyczny-uczenie(zmodyfikowany).ipynb | 1362 ++++++++--------- lab/11-generowanie-odpowiedzi.ipynb | 433 ++++++ 2 files changed, 1113 insertions(+), 682 deletions(-) create mode 100644 lab/11-generowanie-odpowiedzi.ipynb diff --git a/lab/08-parsing-semantyczny-uczenie(zmodyfikowany).ipynb b/lab/08-parsing-semantyczny-uczenie(zmodyfikowany).ipynb index 5bdbcf1..cb10ad7 100644 --- a/lab/08-parsing-semantyczny-uczenie(zmodyfikowany).ipynb +++ b/lab/08-parsing-semantyczny-uczenie(zmodyfikowany).ipynb @@ -1,684 +1,682 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": false - }, - "source": [ - "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", - "
\n", - "

Systemy Dialogowe

\n", - "

8. Parsing semantyczny z wykorzystaniem technik uczenia maszynowego [laboratoria]

\n", - "

Marek Kubis (2021)

\n", - "
\n", - "\n", - "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Parsing semantyczny z wykorzystaniem technik uczenia maszynowego\n", - "================================================================\n", - "\n", - "Wprowadzenie\n", - "------------\n", - "Problem wykrywania slotów i ich wartości w wypowiedziach użytkownika można sformułować jako zadanie\n", - "polegające na przewidywaniu dla poszczególnych słów etykiet wskazujących na to czy i do jakiego\n", - "slotu dane słowo należy.\n", - "\n", - "> chciałbym zarezerwować stolik na jutro**/day** na godzinę dwunastą**/hour** czterdzieści**/hour** pięć**/hour** na pięć**/size** osób\n", - "\n", - "Granice slotów oznacza się korzystając z wybranego schematu etykietowania.\n", - "\n", - "### Schemat IOB\n", - "\n", - "| Prefix | Znaczenie |\n", - "|:------:|:---------------------------|\n", - "| I | wnętrze slotu (inside) |\n", - "| O | poza slotem (outside) |\n", - "| B | początek slotu (beginning) |\n", - "\n", - "> chciałbym zarezerwować stolik na jutro**/B-day** na godzinę dwunastą**/B-hour** czterdzieści**/I-hour** pięć**/I-hour** na pięć**/B-size** osób\n", - "\n", - "### Schemat IOBES\n", - "\n", - "| Prefix | Znaczenie |\n", - "|:------:|:---------------------------|\n", - "| I | wnętrze slotu (inside) |\n", - "| O | poza slotem (outside) |\n", - "| B | początek slotu (beginning) |\n", - "| E | koniec slotu (ending) |\n", - "| S | pojedyncze słowo (single) |\n", - "\n", - "> chciałbym zarezerwować stolik na jutro**/S-day** na godzinę dwunastą**/B-hour** czterdzieści**/I-hour** pięć**/E-hour** na pięć**/S-size** osób\n", - "\n", - "Jeżeli dla tak sformułowanego zadania przygotujemy zbiór danych\n", - "złożony z wypowiedzi użytkownika z oznaczonymi slotami (tzw. 
*zbiór uczący*),\n", - "to możemy zastosować techniki (nadzorowanego) uczenia maszynowego w celu zbudowania modelu\n", - "annotującego wypowiedzi użytkownika etykietami slotów.\n", - "\n", - "Do zbudowania takiego modelu można wykorzystać między innymi:\n", - "\n", - " 1. warunkowe pola losowe (Lafferty i in.; 2001),\n", - "\n", - " 2. rekurencyjne sieci neuronowe, np. sieci LSTM (Hochreiter i Schmidhuber; 1997),\n", - "\n", - " 3. transformery (Vaswani i in., 2017).\n", - "\n", - "Przykład\n", - "--------\n", - "Skorzystamy ze zbioru danych przygotowanego przez Schustera (2019)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zbiór ten gromadzi wypowiedzi w trzech językach opisane slotami dla dwunastu ram należących do trzech dziedzin `Alarm`, `Reminder` oraz `Weather`. Dane wczytamy korzystając z biblioteki [conllu](https://pypi.org/project/conllu/)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# text: halo\t\t\t\n", - "\n", - "# intent: hello\t\t\t\n", - "\n", - "# slots: \t\t\t\n", - "\n", - "1\thalo\thello\tNoLabel\n", - "\n", - "\t\t\t\n", - "\n", - "# text: chaciałbym pójść na premierę filmu jakie premiery są w tym tygodniu\t\t\t\n", - "\n", - "# intent: reqmore\t\t\t\n", - "\n", - "# slots: \t\t\t\n", - "\n", - "1\tchaciałbym\treqmore\tNoLabel\n", - "\n", - "2\tpójść\treqmore\tNoLabel\n", - "\n", - "3\tna\treqmore\tNoLabel\n", - "\n", - "4\tpremierę\treqmore\tNoLabel\n", - "\n", - "5\tfilmu\treqmore\tNoLabel\n", - "\n", - "6\tjakie\treqmore\tB-goal\n", - "\n", - "7\tpremiery\treqmore\tI-goal\n", - "\n" - ] - } - ], - "source": [ - "from conllu import parse_incr\n", - "fields = ['id', 'form', 'frame', 'slot']\n", - "\n", - "def nolabel2o(line, i):\n", - " return 'O' if line[i] == 'NoLabel' else line[i]\n", - "# pathTrain = '../tasks/zad8/en/train-en.conllu'\n", - "# pathTest = '../tasks/zad8/en/test-en.conllu'\n", - 
"\n", - "pathTrain = '../tasks/zad8/pl/train.conllu'\n", - "pathTest = '../tasks/zad8/pl/test.conllu'\n", - "\n", - "with open(pathTrain, encoding=\"UTF-8\") as trainfile:\n", - " i=0\n", - " for line in trainfile:\n", - " print(line)\n", - " i+=1\n", - " if i==15: break \n", - " trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': nolabel2o}))\n", - "with open(pathTest, encoding=\"UTF-8\") as testfile:\n", - " testset = list(parse_incr(testfile, fields=fields, field_parsers={'slot': nolabel2o}))\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zobaczmy kilka przykładowych wypowiedzi z tego zbioru." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
1wybieraminformO
2batmana informB-title
" - ], - "text/plain": [ - "'\\n\\n\\n\\n\\n
1wybieraminformO
2batmana informB-title
'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from tabulate import tabulate\n", - "tabulate(trainset[1], tablefmt='html')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
1chcę informO
2zarezerwowaćinformB-goal
3bilety informO
" - ], - "text/plain": [ - "'\\n\\n\\n\\n\\n\\n
1chcę informO
2zarezerwowaćinformB-goal
3bilety informO
'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tabulate(trainset[16], tablefmt='html')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
1chciałbym informO
2anulować informO
3rezerwacjęinformO
4biletu informO
" - ], - "text/plain": [ - "'\\n\\n\\n\\n\\n\\n\\n
1chciałbym informO
2anulować informO
3rezerwacjęinformO
4biletu informO
'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tabulate(trainset[20], tablefmt='html')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Budując model skorzystamy z architektury opartej o rekurencyjne sieci neuronowe\n", - "zaimplementowanej w bibliotece [flair](https://github.com/flairNLP/flair) (Akbik i in. 2018)." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from flair.data import Corpus, Sentence, Token\n", - "from flair.datasets import SentenceDataset\n", - "from flair.embeddings import StackedEmbeddings\n", - "from flair.embeddings import WordEmbeddings\n", - "from flair.embeddings import CharacterEmbeddings\n", - "from flair.embeddings import FlairEmbeddings\n", - "from flair.models import SequenceTagger\n", - "from flair.trainers import ModelTrainer\n", - "from flair.datasets import DataLoader\n", - "\n", - "# determinizacja obliczeń\n", - "import random\n", - "import torch\n", - "random.seed(42)\n", - "torch.manual_seed(42)\n", - "\n", - "if torch.cuda.is_available():\n", - " torch.cuda.manual_seed(0)\n", - " torch.cuda.manual_seed_all(0)\n", - " torch.backends.cudnn.enabled = False\n", - " torch.backends.cudnn.benchmark = False\n", - " torch.backends.cudnn.deterministic = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dane skonwertujemy do formatu wykorzystywanego przez `flair`, korzystając z następującej funkcji." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Corpus: 346 train + 38 dev + 32 test sentences\n", - "Dictionary with 78 tags: , O, O/reqmore, B-interval/reqmore, I-interval/reqmore, O/inform, B-title/inform, B-date/inform, I-date/inform, B-time/inform, B-quantity/inform, B-area/inform, I-area/inform, B-goal/inform, O/bye, O/hello, O/reqmore inform, B-goal/reqmore inform, I-goal/reqmore inform, B-date/reqmore inform, B-interval/reqmore inform, O/null, O/help, B-goal/reqmore, I-goal/reqmore, B-title/reqmore, B-title/reqmore inform, I-title/reqmore inform, O/ack, O/reqalts\n" - ] - } - ], - "source": [ - "def conllu2flair(sentences, label1=None, label2=None):\n", - " fsentences = []\n", - "\n", - " for sentence in sentences:\n", - " fsentence = Sentence()\n", - "\n", - " for token in sentence:\n", - " ftoken = Token(token['form'])\n", - "\n", - " if label1:\n", - " if label2:\n", - " ftoken.add_tag(label1, token[label1] + \"/\" + token[label2])\n", - " else:\n", - " ftoken.add_tag(label1, token[label1])\n", - " \n", - " fsentence.add_token(ftoken)\n", - "\n", - " fsentences.append(fsentence)\n", - "\n", - " return SentenceDataset(fsentences)\n", - "\n", - "corpus = Corpus(train=conllu2flair(trainset, 'slot', \"frame\"), test=conllu2flair(testset, 'slot', \"frame\"))\n", - "print(corpus)\n", - "tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')\n", - "print(tag_dictionary)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nasz model będzie wykorzystywał wektorowe reprezentacje słów (zob. [Word Embeddings](https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md))." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "embedding_types = [\n", - " WordEmbeddings('pl'),\n", - " FlairEmbeddings('polish-forward'),\n", - " FlairEmbeddings('polish-backward'),\n", - " CharacterEmbeddings(),\n", - "]\n", - "\n", - "embeddings = StackedEmbeddings(embeddings=embedding_types)\n", - "tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,\n", - " tag_dictionary=tag_dictionary,\n", - " tag_type='slot', use_crf=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zobaczmy jak wygląda architektura sieci neuronowej, która będzie odpowiedzialna za przewidywanie\n", - "slotów w wypowiedziach." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SequenceTagger(\n", - " (embeddings): StackedEmbeddings(\n", - " (list_embedding_0): WordEmbeddings('pl')\n", - " (list_embedding_1): FlairEmbeddings(\n", - " (lm): LanguageModel(\n", - " (drop): Dropout(p=0.25, inplace=False)\n", - " (encoder): Embedding(1602, 100)\n", - " (rnn): LSTM(100, 2048)\n", - " (decoder): Linear(in_features=2048, out_features=1602, bias=True)\n", - " )\n", - " )\n", - " (list_embedding_2): FlairEmbeddings(\n", - " (lm): LanguageModel(\n", - " (drop): Dropout(p=0.25, inplace=False)\n", - " (encoder): Embedding(1602, 100)\n", - " (rnn): LSTM(100, 2048)\n", - " (decoder): Linear(in_features=2048, out_features=1602, bias=True)\n", - " )\n", - " )\n", - " (list_embedding_3): CharacterEmbeddings(\n", - " (char_embedding): Embedding(275, 25)\n", - " (char_rnn): LSTM(25, 25, bidirectional=True)\n", - " )\n", - " )\n", - " (word_dropout): WordDropout(p=0.05)\n", - " (locked_dropout): LockedDropout(p=0.5)\n", - " (embedding2nn): Linear(in_features=4446, out_features=4446, bias=True)\n", - " (rnn): LSTM(4446, 256, batch_first=True, bidirectional=True)\n", - " (linear): 
Linear(in_features=512, out_features=78, bias=True)\n", - " (beta): 1.0\n", - " (weights): None\n", - " (weight_tensor) None\n", - ")\n" - ] - } - ], - "source": [ - "print(tagger)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Wykonamy dziesięć iteracji (epok) uczenia a wynikowy model zapiszemy w katalogu `slot-model`." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "modelPath = 'slot-model/final-model.pt'\n", - "\n", - "from os.path import exists\n", - "\n", - "fileExists = exists(modelPath)\n", - "\n", - "if(not fileExists):\n", - " trainer = ModelTrainer(tagger, corpus)\n", - " trainer.train('slot-model',\n", - " learning_rate=0.1,\n", - " mini_batch_size=32,\n", - " max_epochs=10,\n", - " train_with_dev=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Jakość wyuczonego modelu możemy ocenić, korzystając z zaraportowanych powyżej metryk, tj.:\n", - "\n", - " - *tp (true positives)*\n", - "\n", - " > liczba słów oznaczonych w zbiorze testowym etykietą $e$, które model oznaczył tą etykietą\n", - "\n", - " - *fp (false positives)*\n", - "\n", - " > liczba słów nieoznaczonych w zbiorze testowym etykietą $e$, które model oznaczył tą etykietą\n", - "\n", - " - *fn (false negatives)*\n", - "\n", - " > liczba słów oznaczonych w zbiorze testowym etykietą $e$, którym model nie nadał etykiety $e$\n", - "\n", - " - *precision*\n", - "\n", - " > $$\\frac{tp}{tp + fp}$$\n", - "\n", - " - *recall*\n", - "\n", - " > $$\\frac{tp}{tp + fn}$$\n", - "\n", - " - $F_1$\n", - "\n", - " > $$\\frac{2 \\cdot precision \\cdot recall}{precision + recall}$$\n", - "\n", - " - *micro* $F_1$\n", - "\n", - " > $F_1$ w którym $tp$, $fp$ i $fn$ są liczone łącznie dla wszystkich etykiet, tj. 
$tp = \\sum_{e}{{tp}_e}$, $fn = \\sum_{e}{{fn}_e}$, $fp = \\sum_{e}{{fp}_e}$\n", - "\n", - " - *macro* $F_1$\n", - "\n", - " > średnia arytmetyczna z $F_1$ obliczonych dla poszczególnych etykiet z osobna.\n", - "\n", - "Wyuczony model możemy wczytać z pliku korzystając z metody `load`." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-22 15:25:19,970 loading file slot-model/final-model.pt\n" - ] - } - ], - "source": [ - "model = SequenceTagger.load(modelPath)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Wczytany model możemy wykorzystać do przewidywania slotów w wypowiedziach użytkownika, korzystając\n", - "z przedstawionej poniżej funkcji `predict`." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('co', 'O/reqmore'), ('gracie', 'O/reqmore'), ('obecnie', 'O/reqmore')]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def predict(model, sentence):\n", - " csentence = [{'form': word} for word in sentence]\n", - " fsentence = conllu2flair([csentence])[0]\n", - " model.predict(fsentence)\n", - " return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)]\n", - "\n", - "predict(model, 'co gracie obecnie'.split())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Jak pokazuje przykład poniżej model wyuczony tylko na 100 przykładach popełnia w dosyć prostej\n", - "wypowiedzi błąd etykietując słowo `alarm` tagiem `B-weather/noun`." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
kiedy O/reqmore
gracieO/reqmore
film O/reqmore
zorro O/reqmore
" - ], - "text/plain": [ - "'\\n\\n\\n\\n\\n\\n\\n
kiedy O/reqmore
gracieO/reqmore
film O/reqmore
zorro O/reqmore
'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tabulate(predict(model, 'kiedy gracie film zorro'.split()), tablefmt='html')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'testset' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mc:\\Develop\\wmi\\AITECH\\sem1\\Systemy dialogowe\\lab\\08-parsing-semantyczny-uczenie(zmodyfikowany).ipynb Cell 25'\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mrecall: \u001b[39m\u001b[39m\"\u001b[39m, recallScore)\n\u001b[0;32m 38\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mf1: \u001b[39m\u001b[39m\"\u001b[39m, f1Score)\n\u001b[1;32m---> 40\u001b[0m \u001b[39meval\u001b[39;49m()\n", - "\u001b[1;32mc:\\Develop\\wmi\\AITECH\\sem1\\Systemy dialogowe\\lab\\08-parsing-semantyczny-uczenie(zmodyfikowany).ipynb Cell 25'\u001b[0m in \u001b[0;36meval\u001b[1;34m()\u001b[0m\n\u001b[0;32m 14\u001b[0m fp \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m 15\u001b[0m fn \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[1;32m---> 16\u001b[0m sentences \u001b[39m=\u001b[39m [sentence \u001b[39mfor\u001b[39;00m sentence \u001b[39min\u001b[39;00m testset]\n\u001b[0;32m 17\u001b[0m \u001b[39mfor\u001b[39;00m sentence \u001b[39min\u001b[39;00m sentences:\n\u001b[0;32m 18\u001b[0m \u001b[39m# get sentence as terms list\u001b[39;00m\n\u001b[0;32m 19\u001b[0m termsList \u001b[39m=\u001b[39m [w[\u001b[39m\"\u001b[39m\u001b[39mform\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39mfor\u001b[39;00m w \u001b[39min\u001b[39;00m sentence]\n", - "\u001b[1;31mNameError\u001b[0m: name 'testset' is not 
defined" - ] - } - ], - "source": [ - "# evaluation\n", - "\n", - "def precision(tpScore, fpScore):\n", - " return float(tpScore) / (tpScore + fpScore)\n", - "\n", - "def recall(tpScore, fnScore):\n", - " return float(tpScore) / (tpScore + fnScore)\n", - "\n", - "def f1(precision, recall):\n", - " return 2 * precision * recall/(precision + recall)\n", - "\n", - "def eval():\n", - " tp = 0\n", - " fp = 0\n", - " fn = 0\n", - " sentences = [sentence for sentence in testset]\n", - " for sentence in sentences:\n", - " # get sentence as terms list\n", - " termsList = [w[\"form\"] for w in sentence]\n", - " # predict tags\n", - " predTags = [tag[1] for tag in predict(model, termsList)]\n", - " \n", - " expTags = [token[\"slot\"] + \"/\" + token[\"frame\"] for token in sentence]\n", - " for i in range(len(predTags)):\n", - " if (expTags[i][0] == \"O\" and expTags[i] != predTags[i]):\n", - " fp += 1\n", - " elif ((expTags[i][0] != \"O\") & (predTags[i][0] == \"O\")):\n", - " fn += 1\n", - " elif ((expTags[i][0] != \"O\") & (predTags[i] == expTags[i])):\n", - " tp += 1\n", - "\n", - " precisionScore = precision(tp, fp)\n", - " recallScore = recall(tp, fn)\n", - " f1Score = f1(precisionScore, recallScore)\n", - " print(\"stats: \")\n", - " print(\"precision: \", precisionScore)\n", - " print(\"recall: \", recallScore)\n", - " print(\"f1: \", f1Score)\n", - "\n", - "eval()\n", - "\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Literatura\n", - "----------\n", - " 1. Sebastian Schuster, Sonal Gupta, Rushin Shah, Mike Lewis, Cross-lingual Transfer Learning for Multilingual Task Oriented Dialog. NAACL-HLT (1) 2019, pp. 3795-3805\n", - " 2. John D. Lafferty, Andrew McCallum, and Fernando C. N. Pereira. 2001. Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data. In Proceedings of the Eighteenth International Conference on Machine Learning (ICML '01). 
Morgan Kaufmann Publishers Inc., San Francisco, CA, USA, 282–289, https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers\n", - " 3. Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long Short-Term Memory. Neural Comput. 9, 8 (November 15, 1997), 1735–1780, https://doi.org/10.1162/neco.1997.9.8.1735\n", - " 4. Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin, Attention is All you Need, NIPS 2017, pp. 5998-6008, https://arxiv.org/abs/1706.03762\n", - " 5. Alan Akbik, Duncan Blythe, Roland Vollgraf, Contextual String Embeddings for Sequence Labeling, Proceedings of the 27th International Conference on Computational Linguistics, pp. 1638–1649, https://www.aclweb.org/anthology/C18-1139.pdf\n" - ] - } - ], - "metadata": { - "author": "Marek Kubis", - "email": "mkubis@amu.edu.pl", - "interpreter": { - "hash": "2f9d6cf1e3d8195079a65c851de355134a77367bcd714b1a5d498c42d3c07114" - }, - "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python", - "notebook_metadata_filter": "-all" - }, - "kernelspec": { - "display_name": "Python 3.8.3 64-bit", - "language": "python", - "name": "python3" - }, - "lang": "pl", - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - }, - "subtitle": "8.Parsing semantyczny z wykorzystaniem technik uczenia maszynowego[laboratoria]", - "title": "Systemy Dialogowe", - "year": "2021" - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Systemy Dialogowe

\n", + "

8. Parsing semantyczny z wykorzystaniem technik uczenia maszynowego [laboratoria]

\n", + "

Marek Kubis (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Parsing semantyczny z wykorzystaniem technik uczenia maszynowego\n", + "================================================================\n", + "\n", + "Wprowadzenie\n", + "------------\n", + "Problem wykrywania slotów i ich wartości w wypowiedziach użytkownika można sformułować jako zadanie\n", + "polegające na przewidywaniu dla poszczególnych słów etykiet wskazujących na to czy i do jakiego\n", + "slotu dane słowo należy.\n", + "\n", + "> chciałbym zarezerwować stolik na jutro**/day** na godzinę dwunastą**/hour** czterdzieści**/hour** pięć**/hour** na pięć**/size** osób\n", + "\n", + "Granice slotów oznacza się korzystając z wybranego schematu etykietowania.\n", + "\n", + "### Schemat IOB\n", + "\n", + "| Prefix | Znaczenie |\n", + "|:------:|:---------------------------|\n", + "| I | wnętrze slotu (inside) |\n", + "| O | poza slotem (outside) |\n", + "| B | początek slotu (beginning) |\n", + "\n", + "> chciałbym zarezerwować stolik na jutro**/B-day** na godzinę dwunastą**/B-hour** czterdzieści**/I-hour** pięć**/I-hour** na pięć**/B-size** osób\n", + "\n", + "### Schemat IOBES\n", + "\n", + "| Prefix | Znaczenie |\n", + "|:------:|:---------------------------|\n", + "| I | wnętrze slotu (inside) |\n", + "| O | poza slotem (outside) |\n", + "| B | początek slotu (beginning) |\n", + "| E | koniec slotu (ending) |\n", + "| S | pojedyncze słowo (single) |\n", + "\n", + "> chciałbym zarezerwować stolik na jutro**/S-day** na godzinę dwunastą**/B-hour** czterdzieści**/I-hour** pięć**/E-hour** na pięć**/S-size** osób\n", + "\n", + "Jeżeli dla tak sformułowanego zadania przygotujemy zbiór danych\n", + "złożony z wypowiedzi użytkownika z oznaczonymi slotami (tzw. 
*zbiór uczący*),\n", + "to możemy zastosować techniki (nadzorowanego) uczenia maszynowego w celu zbudowania modelu\n", + "annotującego wypowiedzi użytkownika etykietami slotów.\n", + "\n", + "Do zbudowania takiego modelu można wykorzystać między innymi:\n", + "\n", + " 1. warunkowe pola losowe (Lafferty i in.; 2001),\n", + "\n", + " 2. rekurencyjne sieci neuronowe, np. sieci LSTM (Hochreiter i Schmidhuber; 1997),\n", + "\n", + " 3. transformery (Vaswani i in., 2017).\n", + "\n", + "Przykład\n", + "--------\n", + "Skorzystamy ze zbioru danych przygotowanego przez Schustera (2019)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zbiór ten gromadzi wypowiedzi w trzech językach opisane slotami dla dwunastu ram należących do trzech dziedzin `Alarm`, `Reminder` oraz `Weather`. Dane wczytamy korzystając z biblioteki [conllu](https://pypi.org/project/conllu/)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# text: halo\t\t\t\n", + "\n", + "# intent: hello\t\t\t\n", + "\n", + "# slots: \t\t\t\n", + "\n", + "1\thalo\thello\tNoLabel\n", + "\n", + "\t\t\t\n", + "\n", + "# text: chaciałbym pójść na premierę filmu jakie premiery są w tym tygodniu\t\t\t\n", + "\n", + "# intent: reqmore\t\t\t\n", + "\n", + "# slots: \t\t\t\n", + "\n", + "1\tchaciałbym\treqmore\tNoLabel\n", + "\n", + "2\tpójść\treqmore\tNoLabel\n", + "\n", + "3\tna\treqmore\tNoLabel\n", + "\n", + "4\tpremierę\treqmore\tNoLabel\n", + "\n", + "5\tfilmu\treqmore\tNoLabel\n", + "\n", + "6\tjakie\treqmore\tB-goal\n", + "\n", + "7\tpremiery\treqmore\tI-goal\n", + "\n" + ] + } + ], + "source": [ + "from conllu import parse_incr\n", + "fields = ['id', 'form', 'frame', 'slot']\n", + "\n", + "def nolabel2o(line, i):\n", + " return 'O' if line[i] == 'NoLabel' else line[i]\n", + "# pathTrain = '../tasks/zad8/en/train-en.conllu'\n", + "# pathTest = '../tasks/zad8/en/test-en.conllu'\n", + 
"\n", + "pathTrain = '../tasks/zad8/pl/train.conllu'\n", + "pathTest = '../tasks/zad8/pl/test.conllu'\n", + "\n", + "with open(pathTrain, encoding=\"UTF-8\") as trainfile:\n", + " i=0\n", + " for line in trainfile:\n", + " print(line)\n", + " i+=1\n", + " if i==15: break \n", + " trainset = list(parse_incr(trainfile, fields=fields, field_parsers={'slot': nolabel2o}))\n", + "with open(pathTest, encoding=\"UTF-8\") as testfile:\n", + " testset = list(parse_incr(testfile, fields=fields, field_parsers={'slot': nolabel2o}))\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zobaczmy kilka przykładowych wypowiedzi z tego zbioru." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
1wybieraminformO
2batmana informB-title
" + ], + "text/plain": [ + "'\\n\\n\\n\\n\\n
1wybieraminformO
2batmana informB-title
'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from tabulate import tabulate\n", + "tabulate(trainset[1], tablefmt='html')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
1chcę informO
2zarezerwowaćinformB-goal
3bilety informO
" + ], + "text/plain": [ + "'\\n\\n\\n\\n\\n\\n
1chcę informO
2zarezerwowaćinformB-goal
3bilety informO
'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tabulate(trainset[16], tablefmt='html')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
1chciałbym informO
2anulować informO
3rezerwacjęinformO
4biletu informO
" + ], + "text/plain": [ + "'\\n\\n\\n\\n\\n\\n\\n
1chciałbym informO
2anulować informO
3rezerwacjęinformO
4biletu informO
'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tabulate(trainset[20], tablefmt='html')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Budując model skorzystamy z architektury opartej o rekurencyjne sieci neuronowe\n", + "zaimplementowanej w bibliotece [flair](https://github.com/flairNLP/flair) (Akbik i in. 2018)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from flair.data import Corpus, Sentence, Token\n", + "from flair.datasets import SentenceDataset\n", + "from flair.embeddings import StackedEmbeddings\n", + "from flair.embeddings import WordEmbeddings\n", + "from flair.embeddings import CharacterEmbeddings\n", + "from flair.embeddings import FlairEmbeddings\n", + "from flair.models import SequenceTagger\n", + "from flair.trainers import ModelTrainer\n", + "from flair.datasets import DataLoader\n", + "\n", + "# determinizacja obliczeń\n", + "import random\n", + "import torch\n", + "random.seed(42)\n", + "torch.manual_seed(42)\n", + "\n", + "if torch.cuda.is_available():\n", + " torch.cuda.manual_seed(0)\n", + " torch.cuda.manual_seed_all(0)\n", + " torch.backends.cudnn.enabled = False\n", + " torch.backends.cudnn.benchmark = False\n", + " torch.backends.cudnn.deterministic = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dane skonwertujemy do formatu wykorzystywanego przez `flair`, korzystając z następującej funkcji." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corpus: 346 train + 38 dev + 32 test sentences\n", + "Dictionary with 78 tags: , O, O/reqmore, B-interval/reqmore, I-interval/reqmore, O/inform, B-title/inform, B-date/inform, I-date/inform, B-time/inform, B-quantity/inform, B-area/inform, I-area/inform, B-goal/inform, O/bye, O/hello, O/reqmore inform, B-goal/reqmore inform, I-goal/reqmore inform, B-date/reqmore inform, B-interval/reqmore inform, O/null, O/help, B-goal/reqmore, I-goal/reqmore, B-title/reqmore, B-title/reqmore inform, I-title/reqmore inform, O/ack, O/reqalts\n" + ] + } + ], + "source": [ + "def conllu2flair(sentences, label1=None, label2=None):\n", + " fsentences = []\n", + "\n", + " for sentence in sentences:\n", + " fsentence = Sentence()\n", + "\n", + " for token in sentence:\n", + " ftoken = Token(token['form'])\n", + "\n", + " if label1:\n", + " if label2:\n", + " ftoken.add_tag(label1, token[label1] + \"/\" + token[label2])\n", + " else:\n", + " ftoken.add_tag(label1, token[label1])\n", + " \n", + " fsentence.add_token(ftoken)\n", + "\n", + " fsentences.append(fsentence)\n", + "\n", + " return SentenceDataset(fsentences)\n", + "\n", + "corpus = Corpus(train=conllu2flair(trainset, 'slot', \"frame\"), test=conllu2flair(testset, 'slot', \"frame\"))\n", + "print(corpus)\n", + "tag_dictionary = corpus.make_tag_dictionary(tag_type='slot')\n", + "print(tag_dictionary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nasz model będzie wykorzystywał wektorowe reprezentacje słów (zob. [Word Embeddings](https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md))." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_types = [\n", + " WordEmbeddings('pl'),\n", + " FlairEmbeddings('polish-forward'),\n", + " FlairEmbeddings('polish-backward'),\n", + " CharacterEmbeddings(),\n", + "]\n", + "\n", + "embeddings = StackedEmbeddings(embeddings=embedding_types)\n", + "tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,\n", + " tag_dictionary=tag_dictionary,\n", + " tag_type='slot', use_crf=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zobaczmy jak wygląda architektura sieci neuronowej, która będzie odpowiedzialna za przewidywanie\n", + "slotów w wypowiedziach." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SequenceTagger(\n", + " (embeddings): StackedEmbeddings(\n", + " (list_embedding_0): WordEmbeddings('pl')\n", + " (list_embedding_1): FlairEmbeddings(\n", + " (lm): LanguageModel(\n", + " (drop): Dropout(p=0.25, inplace=False)\n", + " (encoder): Embedding(1602, 100)\n", + " (rnn): LSTM(100, 2048)\n", + " (decoder): Linear(in_features=2048, out_features=1602, bias=True)\n", + " )\n", + " )\n", + " (list_embedding_2): FlairEmbeddings(\n", + " (lm): LanguageModel(\n", + " (drop): Dropout(p=0.25, inplace=False)\n", + " (encoder): Embedding(1602, 100)\n", + " (rnn): LSTM(100, 2048)\n", + " (decoder): Linear(in_features=2048, out_features=1602, bias=True)\n", + " )\n", + " )\n", + " (list_embedding_3): CharacterEmbeddings(\n", + " (char_embedding): Embedding(275, 25)\n", + " (char_rnn): LSTM(25, 25, bidirectional=True)\n", + " )\n", + " )\n", + " (word_dropout): WordDropout(p=0.05)\n", + " (locked_dropout): LockedDropout(p=0.5)\n", + " (embedding2nn): Linear(in_features=4446, out_features=4446, bias=True)\n", + " (rnn): LSTM(4446, 256, batch_first=True, bidirectional=True)\n", + " (linear): 
Linear(in_features=512, out_features=78, bias=True)\n", + " (beta): 1.0\n", + " (weights): None\n", + " (weight_tensor) None\n", + ")\n" + ] + } + ], + "source": [ + "print(tagger)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wykonamy dziesięć iteracji (epok) uczenia a wynikowy model zapiszemy w katalogu `slot-model`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "modelPath = 'slot-model/final-model.pt'\n", + "\n", + "from os.path import exists\n", + "\n", + "fileExists = exists(modelPath)\n", + "\n", + "if(not fileExists):\n", + " trainer = ModelTrainer(tagger, corpus)\n", + " trainer.train('slot-model',\n", + " learning_rate=0.1,\n", + " mini_batch_size=32,\n", + " max_epochs=10,\n", + " train_with_dev=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Jakość wyuczonego modelu możemy ocenić, korzystając z zaraportowanych powyżej metryk, tj.:\n", + "\n", + " - *tp (true positives)*\n", + "\n", + " > liczba słów oznaczonych w zbiorze testowym etykietą $e$, które model oznaczył tą etykietą\n", + "\n", + " - *fp (false positives)*\n", + "\n", + " > liczba słów nieoznaczonych w zbiorze testowym etykietą $e$, które model oznaczył tą etykietą\n", + "\n", + " - *fn (false negatives)*\n", + "\n", + " > liczba słów oznaczonych w zbiorze testowym etykietą $e$, którym model nie nadał etykiety $e$\n", + "\n", + " - *precision*\n", + "\n", + " > $$\\frac{tp}{tp + fp}$$\n", + "\n", + " - *recall*\n", + "\n", + " > $$\\frac{tp}{tp + fn}$$\n", + "\n", + " - $F_1$\n", + "\n", + " > $$\\frac{2 \\cdot precision \\cdot recall}{precision + recall}$$\n", + "\n", + " - *micro* $F_1$\n", + "\n", + " > $F_1$ w którym $tp$, $fp$ i $fn$ są liczone łącznie dla wszystkich etykiet, tj. 
$tp = \\sum_{e}{{tp}_e}$, $fn = \\sum_{e}{{fn}_e}$, $fp = \\sum_{e}{{fp}_e}$\n", + "\n", + " - *macro* $F_1$\n", + "\n", + " > średnia arytmetyczna z $F_1$ obliczonych dla poszczególnych etykiet z osobna.\n", + "\n", + "Wyuczony model możemy wczytać z pliku korzystając z metody `load`." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-05-22 15:25:19,970 loading file slot-model/final-model.pt\n" + ] + } + ], + "source": [ + "model = SequenceTagger.load(modelPath)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wczytany model możemy wykorzystać do przewidywania slotów w wypowiedziach użytkownika, korzystając\n", + "z przedstawionej poniżej funkcji `predict`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('co', 'O/reqmore'), ('gracie', 'O/reqmore'), ('obecnie', 'O/reqmore')]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def predict(model, sentence):\n", + " csentence = [{'form': word} for word in sentence]\n", + " fsentence = conllu2flair([csentence])[0]\n", + " model.predict(fsentence)\n", + " return [(token, ftoken.get_tag('slot').value) for token, ftoken in zip(sentence, fsentence)]\n", + "\n", + "predict(model, 'co gracie obecnie'.split())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Jak pokazuje przykład poniżej, model wyuczony tylko na 346 przykładach popełnia w dosyć prostej\n", + "wypowiedzi błąd, etykietując słowo `zorro` tagiem `O/reqmore` zamiast tagiem tytułu filmu (np. `B-title/reqmore`)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
kiedy O/reqmore
gracieO/reqmore
film O/reqmore
zorro O/reqmore
" + ], + "text/plain": [ + "'\\n\\n\\n\\n\\n\\n\\n
kiedy O/reqmore
gracieO/reqmore
film O/reqmore
zorro O/reqmore
'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tabulate(predict(model, 'kiedy gracie film zorro'.split()), tablefmt='html')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'testset' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Develop\\wmi\\AITECH\\sem1\\Systemy dialogowe\\lab\\08-parsing-semantyczny-uczenie(zmodyfikowany).ipynb Cell 25'\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 37\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mrecall: \u001b[39m\u001b[39m\"\u001b[39m, recallScore)\n\u001b[0;32m 38\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mf1: \u001b[39m\u001b[39m\"\u001b[39m, f1Score)\n\u001b[1;32m---> 40\u001b[0m \u001b[39meval\u001b[39;49m()\n", + "\u001b[1;32mc:\\Develop\\wmi\\AITECH\\sem1\\Systemy dialogowe\\lab\\08-parsing-semantyczny-uczenie(zmodyfikowany).ipynb Cell 25'\u001b[0m in \u001b[0;36meval\u001b[1;34m()\u001b[0m\n\u001b[0;32m 14\u001b[0m fp \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[0;32m 15\u001b[0m fn \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[1;32m---> 16\u001b[0m sentences \u001b[39m=\u001b[39m [sentence \u001b[39mfor\u001b[39;00m sentence \u001b[39min\u001b[39;00m testset]\n\u001b[0;32m 17\u001b[0m \u001b[39mfor\u001b[39;00m sentence \u001b[39min\u001b[39;00m sentences:\n\u001b[0;32m 18\u001b[0m \u001b[39m# get sentence as terms list\u001b[39;00m\n\u001b[0;32m 19\u001b[0m termsList \u001b[39m=\u001b[39m [w[\u001b[39m\"\u001b[39m\u001b[39mform\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39mfor\u001b[39;00m w \u001b[39min\u001b[39;00m sentence]\n", + "\u001b[1;31mNameError\u001b[0m: name 'testset' is not 
defined" + ] + } + ], + "source": [ + "# evaluation\n", + "\n", + "def precision(tpScore, fpScore):\n", + "    # tp / (tp + fp); 0.0 instead of ZeroDivisionError when no positive label was emitted\n", + "    denominator = tpScore + fpScore\n", + "    return float(tpScore) / denominator if denominator else 0.0\n", + "\n", + "def recall(tpScore, fnScore):\n", + "    # tp / (tp + fn); 0.0 instead of ZeroDivisionError when no positive label was expected\n", + "    denominator = tpScore + fnScore\n", + "    return float(tpScore) / denominator if denominator else 0.0\n", + "\n", + "def f1(precision, recall):\n", + "    # harmonic mean of precision and recall; 0.0 when both are 0\n", + "    denominator = precision + recall\n", + "    return 2 * precision * recall / denominator if denominator else 0.0\n", + "\n", + "def _count_errors(expTags, predTags):\n", + "    # Returns (tp, fp, fn) for one sentence; tags starting with \"O\" form the negative class.\n", + "    tp = fp = fn = 0\n", + "    for expTag, predTag in zip(expTags, predTags):\n", + "        if expTag == predTag:\n", + "            # matching \"O\" tags are true negatives and are not counted\n", + "            if not expTag.startswith(\"O\"):\n", + "                tp += 1\n", + "        elif expTag.startswith(\"O\"):\n", + "            # the model emitted a label that the reference does not contain\n", + "            fp += 1\n", + "        elif predTag.startswith(\"O\"):\n", + "            # the model missed a slot that the reference contains\n", + "            fn += 1\n", + "        else:\n", + "            # wrong slot label: spurious for the predicted label, missed for the expected one\n", + "            # (this case was previously not counted at all, which inflated both scores)\n", + "            fp += 1\n", + "            fn += 1\n", + "    return tp, fp, fn\n", + "\n", + "def eval(sentences=None, tagger=None):\n", + "    # Prints micro-averaged precision, recall and F1 of `tagger` on `sentences`.\n", + "    # Without arguments it falls back to the notebook-level `testset` and `model`.\n", + "    # NOTE: the name shadows the built-in `eval`; it is kept so that existing calls still work.\n", + "    if sentences is None:\n", + "        sentences = testset\n", + "    if tagger is None:\n", + "        tagger = model\n", + "\n", + "    tp = fp = fn = 0\n", + "    for sentence in sentences:\n", + "        # get sentence as terms list\n", + "        termsList = [w[\"form\"] for w in sentence]\n", + "        # predict tags\n", + "        predTags = [tag[1] for tag in predict(tagger, termsList)]\n", + "        # reference tags use the same slot/frame format as the training labels\n", + "        expTags = [token[\"slot\"] + \"/\" + token[\"frame\"] for token in sentence]\n", + "\n", + "        sentenceTp, sentenceFp, sentenceFn = _count_errors(expTags, predTags)\n", + "        tp += sentenceTp\n", + "        fp += sentenceFp\n", + "        fn += sentenceFn\n", + "\n", + "    precisionScore = precision(tp, fp)\n", + "    recallScore = recall(tp, fn)\n", + "    f1Score = f1(precisionScore, recallScore)\n", + "    print(\"stats: \")\n", + "    print(\"precision: \", precisionScore)\n", + "    print(\"recall: \", recallScore)\n", + "    print(\"f1: \", f1Score)\n", + "\n", + "eval()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Literatura\n", + "----------\n", + " 1. Sebastian Schuster, Sonal Gupta, Rushin Shah, Mike Lewis, Cross-lingual Transfer Learning for Multilingual Task Oriented Dialog. NAACL-HLT (1) 2019, pp. 3795-3805\n", + " 2. John D. Lafferty, Andrew McCallum, and Fernando C. N. Pereira. 2001. Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data. In Proceedings of the Eighteenth International Conference on Machine Learning (ICML '01). 
Morgan Kaufmann Publishers Inc., San Francisco, CA, USA, 282–289, https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers\n", + " 3. Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long Short-Term Memory. Neural Comput. 9, 8 (November 15, 1997), 1735–1780, https://doi.org/10.1162/neco.1997.9.8.1735\n", + " 4. Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin, Attention is All you Need, NIPS 2017, pp. 5998-6008, https://arxiv.org/abs/1706.03762\n", + " 5. Alan Akbik, Duncan Blythe, Roland Vollgraf, Contextual String Embeddings for Sequence Labeling, Proceedings of the 27th International Conference on Computational Linguistics, pp. 1638–1649, https://www.aclweb.org/anthology/C18-1139.pdf\n" + ] + } + ], + "metadata": { + "author": "Marek Kubis", + "email": "mkubis@amu.edu.pl", + "interpreter": { + "hash": "2f9d6cf1e3d8195079a65c851de355134a77367bcd714b1a5d498c42d3c07114" + }, + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "subtitle": "8.Parsing semantyczny z wykorzystaniem technik uczenia maszynowego[laboratoria]", + "title": "Systemy Dialogowe", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/lab/11-generowanie-odpowiedzi.ipynb b/lab/11-generowanie-odpowiedzi.ipynb new file mode 100644 index 0000000..20c19d2 --- /dev/null +++ b/lab/11-generowanie-odpowiedzi.ipynb @@ -0,0 +1,433 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 
1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Systemy Dialogowe

\n", + "

11. Generowanie odpowiedzi [laboratoria]

\n", + "

Marek Kubis (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generowanie odpowiedzi\n", + "======================\n", + "\n", + "W systemie dialogowym taktyka prowadzenia dialogu odpowiada za wyznaczanie aktów systemowych, czyli wskazanie tego **co ma zostać przez system wypowiedziane** i/lub wykonane.\n", + "Zadaniem modułu generowania odpowiedzi jest zamiana aktów dialogowych na wypowiedzi w języku\n", + "naturalnym, czyli wskazanie tego **w jaki sposób** ma zostać wypowiedziane to co ma zostać\n", + "wypowiedziane.\n", + "\n", + "Generowanie odpowiedzi przy użyciu szablonów\n", + "--------------------------------------------\n", + "Podstawowe narzędzie wykorzystywane w modułach generowania odpowiedzi stanowią szablony tekstowe\n", + "interpolujące zmienne. W Pythonie mechanizm ten jest dostępny za pośrednictwem\n", + "[f-stringów](https://docs.python.org/3/reference/lexical_analysis.html#f-strings), metody\n", + "[format](https://docs.python.org/3/library/string.html#formatstrings) oraz zewnętrznych bibliotek takich, jak [Jinja2](https://jinja.palletsprojects.com/).\n", + "\n", + "O ile podejście wykorzystujące wbudowane mechanizmy języka Python sprawdza się w prostych\n", + "przypadkach..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "def nlg(system_act):\n", + " domain, intent, slot, value = system_act\n", + "\n", + " if intent == 'Inform' and slot == 'Phone':\n", + " return f'Numer telefonu to {value}'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nlg(['Hotel', 'Inform', 'Phone', '1234567890'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 0 + }, + "source": [ + "... 
to trzeba mieć świadomość, że w toku prac nad agentem dialogowym może być konieczne\n", + "uwzględnienie m.in.:\n", + "\n", + " 1. szablonów zależnych od wartości slotów" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "def nlg(system_act):\n", + " domain, intent, slot, value = system_act\n", + "\n", + " if domain == 'Restaurant' and intent == 'Inform' and slot == 'Count':\n", + " if value == 0:\n", + " return f'Nie znalazłem restauracji spełniających podane kryteria.'\n", + " elif value == 1:\n", + " return f'Znalazłem jedną restaurację spełniającą podane kryteria.'\n", + " elif value <= 4:\n", + " return f'Znalazłem {value} restauracje spełniające podane kryteria.'\n", + " elif value <= 9:\n", + " return f'Znalazłem {value} restauracji spełniających podane kryteria.'\n", + " else:\n", + " return f'Znalazłem wiele restauracji spełniających podane kryteria.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "nlg(['Restaurant', 'Inform', 'Count', 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "nlg(['Restaurant', 'Inform', 'Count', 1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "nlg(['Restaurant', 'Inform', 'Count', 2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "nlg(['Restaurant', 'Inform', 'Count', 6])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nlg(['Restaurant', 'Inform', 'Count', 100])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 0 + }, + "source": [ + " 2. 
wielu wariantów tej samej wypowiedzi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "import random\n", + "\n", + "def nlg(system_act):\n", + " domain, intent, slot, value = system_act\n", + "\n", + " if intent == 'Affirm':\n", + " r = random.randint(1, 3)\n", + "\n", + " if r == 1:\n", + " return 'Tak'\n", + " elif r == 2:\n", + " return 'Zgadza się'\n", + " else:\n", + " return 'Potwierdzam'\n", + "\n", + "nlg(['Hotel', 'Affirm', '', ''])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 0 + }, + "source": [ + " 3. wielojęzycznego interfejsu użytkownika" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "def nlg_en(system_act):\n", + " domain, intent, slot, value = system_act\n", + "\n", + " if domain == 'Hotel' and intent == 'Request' and slot == 'CreditCardNo':\n", + " return 'What is your credit card number?'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nlg_en(['Hotel', 'Request', 'CreditCardNo', '?'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "lines_to_next_cell": 0 + }, + "source": [ + "Generowanie odpowiedzi z wykorzystaniem uczenia maszynowego\n", + "-----------------------------------------------------------\n", + "Obok mechanizmu szablonów do generowania odpowiedzi można również\n", + "stosować techniki uczenia maszynowego.\n", + "Zagadnienie to stanowiło\n", + "przedmiot konkursu [E2E NLG Challenge](http://www.macs.hw.ac.uk/InteractionLab/E2E/) (Novikova i in., 2017).\n", + "Przyjrzyjmy się danym, jakie udostępnili organizatorzy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p l10\n", + "!curl -L -C - https://github.com/tuetschek/e2e-dataset/releases/download/v1.0.0/e2e-dataset.zip -o l10/e2e-dataset.zip\n", + "!unzip l10/e2e-dataset.zip -d l10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "trainset = pd.read_csv('l10/e2e-dataset/trainset.csv')\n", + "trainset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zadanie\n", + "-------\n", + "Zaimplementować moduł generowania odpowiedzi obejmujący akty systemowe występujące w zgromadzonym korpusie.\n", + "\n", + "Termin: 1.06.2022, godz. 23:59.\n", + "\n", + "Literatura\n", + "----------\n", + " 1. Jekaterina Novikova, Ondřej Dušek, Verena Rieser, The E2E Dataset: New Challenges For End-to-End Generation, Proceedings of the SIGDIAL 2017 Conference, pages 201-206, Saarbrücken, Germany https://arxiv.org/pdf/1706.09254.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv('../data/dialog-17-04-03.tsv', sep='\\t', header=None)\n", + "df.columns = ['user','text','data']\n", + "df= df[df.user=='system']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(axis=1, labels=['user'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
usertextdata
1systemWitamy w internetowym systemie rezerwacji Nach...welcomemsg()
3systemSystem Nachos obsługuje następujące kina: Mult...select(location)
\n", + "
" + ], + "text/plain": [ + " user text data\n", + "1 system Witamy w internetowym systemie rezerwacji Nach... welcomemsg()\n", + "3 system System Nachos obsługuje następujące kina: Mult... select(location)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def nlg(system_act):\n", + "    # Turns a system act given as [intent, slot, value] into a Polish utterance.\n", + "    # Returns None for acts that have no template yet.\n", + "    intent, slot, value = system_act\n", + "\n", + "    if intent == 'welcomemsg':\n", + "        return 'Witamy w internetowym systemie rezerwacji Nachos, w czym mogę pomóc?'\n", + "    elif intent == 'inform':\n", + "        # TODO: add templates for the inform slots (e.g. quantity, time).\n", + "        # The restaurant templates pasted from the tutorial were dropped: they read an\n", + "        # undefined `domain` and tested intent == 'Inform', which this branch never satisfies.\n", + "        return None\n", + "\n", + "    return None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Scratch note (previously bare text, which is a SyntaxError): an act string such as\n", + "#   inform(quantity=2) AND inform(time=12:00)\n", + "# presumably maps to one [intent, slot, value] list per act, e.g.:\n", + "['inform','quantity','2']" + ] + } + ], + "metadata": { + "author": "Marek Kubis", + "email": "mkubis@amu.edu.pl", + "jupytext": { + "cell_metadata_filter": "-all", + "main_language": "python", + "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "subtitle": "11.Generowanie odpowiedzi[laboratoria]", + "title": "Systemy Dialogowe", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +}