systemy_dialogowe/slots.ipynb at master

Maciej Tyczynski fbd54e4bcd added notebook with predicting slots - currently it predicts only padding tokens. Need fixing

2023-05-05 10:29:40 +02:00

33 KiB

Raw Permalink Blame History

import pandas as pd
import numpy as np
import tokenization

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_addons as tfa

import sklearn
from sklearn.model_selection import train_test_split
import glob
import os

c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow_addons\utils\tfa_eol_msg.py:23: UserWarning: 

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

  warnings.warn(

path = os.getcwd()+'\data'
tsv_files = glob.glob(os.path.join(path, "*.tsv"))

dfs = []
for filename in tsv_files:
    df = pd.read_csv(filename, index_col=None, header=None, delimiter='\t',names=["speaker", "sentence", "dialogue_act"])
    dfs.append(df)

combined_df = pd.concat(dfs, axis=0, ignore_index=True)

combined_df

	speaker	sentence	dialogue_act
0	user	Co proszę?	null()/hello()
1	system	Witam w systemie rezerwacji hotelu. Gdzie chci...	welcomemsg()
2	user	W jakim kraju/B-country mogę zarezerwować hotel?	help(country)
3	system	Mamy szeroki wybór hoteli na całym świecie.	expl-conf()
4	user	Przedstaw proszę oferty z obszaru Górnego Kara...	request(country=Górny Karabuch)
...	...	...	...
347	system	Okej w takim razie, proponuję ten sam hotel w ...	offer(price=110, date=02.07.2023- 08.07.2023)
348	user	Jak najbardziej. Proszę o zarezerwowanie/B-res...	confirm()
349	system	Dobrze, numer rezerwacji to 912312. Dokładny A...	inform(reservation_number=912312, address=3 ma...
350	user	Nie, dziękuję i życzę miłego dnia	negate()&thankyou()&bye()
351	system	Dziękuję bardzo wzajemnie.	thankyou()

352 rows × 3 columns

def extract_labels(sentence):
    tokens = sentence.split()
    labels = []
    for token in tokens:
        parts = token.split("/")
        if len(parts) > 1:
            label = parts[1]
            if label.startswith('B-'):
                labels.append(label[0:])
            elif label.startswith('I-'):
                labels.append(label[0:])
            else:
                labels.append('O')
        else:
            labels.append('O')
    return labels

labels = combined_df['sentence'].apply(extract_labels)

labels_list = [label for sentence in labels for label in sentence ]

unique_labels = set(labels_list)

unique_labels

{'B-alternative',
 'B-animal',
 'B-area',
 'B-area,',
 'B-area.',
 'B-available?',
 'B-beggining',
 'B-checkin?',
 'B-city',
 'B-city,',
 'B-city.',
 'B-confirmation',
 'B-country',
 'B-country,',
 'B-country?',
 'B-date',
 'B-date?',
 'B-day',
 'B-day-11',
 'B-day-28',
 'B-days',
 'B-days,',
 'B-days.',
 'B-email',
 'B-facilities',
 'B-facilities,',
 'B-facilities.',
 'B-facilities...',
 'B-facilities?',
 'B-finish',
 'B-first',
 'B-hotel',
 'B-hotel.',
 'B-hotel?',
 'B-insurance',
 'B-insurance?',
 'B-location',
 'B-month',
 'B-month.',
 'B-month?',
 'B-next',
 'B-nights',
 'B-nights,',
 'B-number_of_rooms',
 'B-payment',
 'B-payment?',
 'B-people',
 'B-people?',
 'B-per_night',
 'B-per_night.',
 'B-price',
 'B-price!',
 'B-price.',
 'B-price?',
 'B-reservation',
 'B-reservation_number',
 'B-room_size',
 'B-room_size,',
 'B-room_size.',
 'B-room_size?',
 'B-rooms',
 'B-sickness',
 'B-size',
 'B-size,',
 'B-size.',
 'B-size?',
 'B-stars',
 'B-stars?',
 'B-sum',
 'B-week',
 'B-weekend',
 'B-weekend.',
 'B-weekend?',
 'B-year',
 'B-year.',
 'I-area',
 'I-country',
 'I-country.',
 'I-day',
 'I-day.',
 'I-days',
 'I-facilities',
 'I-facilities.',
 'I-hotel',
 'I-location,',
 'I-month',
 'I-payment',
 'I-perperson',
 'I-room_size',
 'I-year',
 'O'}

num_labels = unique_labels.__len__()+1

OTHER_LABEL = "pad"
label_map = {label: i for i, label in enumerate(unique_labels, start=1)}
label_map[OTHER_LABEL] = 0

# Convert the flattened labels array to a numpy array of integers

label_map["pad"]

label_map['O']

labels

0                                                 [O, O]
1                            [O, O, O, O, O, O, O, O, O]
2                             [O, O, B-country, O, O, O]
3                                  [O, O, O, O, O, O, O]
4                          [O, O, O, O, O, O, B-country]
                             ...                        
347    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
348                    [O, O, O, O, B-reservation, O, O]
349    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
350                                   [O, O, O, O, O, O]
351                                            [O, O, O]
Name: sentence, Length: 352, dtype: object

from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model = AutoModel.from_pretrained("allegro/herbert-base-cased")

c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

# 1. Preprocess the data
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
# Initialize the tokenizer
# Define a maximum sequence length
max_length = 128

# Tokenize the text
tokens = tokenizer(combined_df["sentence"].tolist(), padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")

# Create attention masks
attention_masks = tokens["attention_mask"]

# Truncate or pad the labels to match the sequence length
labels = [[label_map.get(l, 0) for l in lab] for lab in labels]
labels = pad_sequences(labels, maxlen=max_length, value=0, dtype=np.int32, truncating='post', padding='post')
labels = torch.tensor(labels, dtype=torch.long)
# Convert the preprocessed data into a PyTorch Dataset
dataset = TensorDataset(tokens["input_ids"], attention_masks, labels)



# 2. Define the NER model

from transformers import AutoModelForTokenClassification

# Load the pre-trained model
model = AutoModelForTokenClassification.from_pretrained("allegro/herbert-base-cased", num_labels=num_labels)

# 3. Train the NER model


# Convert the preprocessed data into a PyTorch Dataset
dataset = dataset = TensorDataset(
    tokens["input_ids"],
    attention_masks,
    torch.tensor(labels, dtype=torch.long)
)

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
C:\Users\macty\AppData\Local\Temp\ipykernel_1684\3997969718.py:42: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  torch.tensor(labels, dtype=torch.long)

# Define the training parameters
batch_size = 8
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataset) // batch_size + 1)
loss_fn = torch.nn.CrossEntropyLoss()

# Define the training loop
def train(model, dataloader, optimizer, scheduler, loss_fn):
    model.train()
    for batch in dataloader:
        inputs = {key: value.to(model.device) for key, value in dict(zip(["input_ids", "attention_mask"], batch)).items()}
        labels = batch[2].to(model.device)
        optimizer.zero_grad()

        outputs = model(**inputs, labels=labels)
        loss = loss_fn(outputs.logits.view(-1, num_labels), labels.view(-1))

        loss.backward()

        optimizer.step()
        scheduler.step()

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
train_inputs, test_inputs, train_labels, test_labels = train_test_split(tokens["input_ids"], labels, test_size=0.2)

train_masks, test_masks, _, _ = train_test_split(attention_masks, tokens["input_ids"], test_size=0.2)

# Convert the preprocessed data into PyTorch Dataloaders
train_dataset = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_inputs, test_masks, torch.tensor(test_labels))
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
C:\Users\macty\AppData\Local\Temp\ipykernel_1684\2322694894.py:30: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  train_dataset = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels))
C:\Users\macty\AppData\Local\Temp\ipykernel_1684\2322694894.py:33: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  test_dataset = TensorDataset(test_inputs, test_masks, torch.tensor(test_labels))

labels

tensor([[55, 55,  0,  ...,  0,  0,  0],
        [55, 55, 55,  ...,  0,  0,  0],
        [55, 55, 73,  ...,  0,  0,  0],
        ...,
        [55, 55, 55,  ...,  0,  0,  0],
        [55, 55, 55,  ...,  0,  0,  0],
        [55, 55, 55,  ...,  0,  0,  0]])


# Train the model
epochs = 3
for epoch in range(epochs):
    train(model, train_dataloader, optimizer, scheduler, loss_fn)
    # Evaluate the model
    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            inputs = {key: value.to(model.device) for key, value in dict(zip(["input_ids", "attention_mask"], batch)).items()}
            labels = batch[2].to(model.device)
            outputs = model(**inputs)
            predictions = outputs.logits.argmax(dim=-1)

            # Calculate the accuracy
            accuracy = (predictions == labels).float().mean().item()

            print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.88
Accuracy: 0.95
Accuracy: 0.92
Accuracy: 0.94
Accuracy: 0.93
Accuracy: 0.94
Accuracy: 0.94
Accuracy: 0.92
Accuracy: 0.90
Accuracy: 0.88
Accuracy: 0.95
Accuracy: 0.92
Accuracy: 0.94
Accuracy: 0.93
Accuracy: 0.94
Accuracy: 0.94
Accuracy: 0.92
Accuracy: 0.90
Accuracy: 0.88
Accuracy: 0.95
Accuracy: 0.92
Accuracy: 0.94
Accuracy: 0.93
Accuracy: 0.94
Accuracy: 0.94
Accuracy: 0.92
Accuracy: 0.90

predictions[1]

[1;31m---------------------------------------------------------------------------[0m
[1;31mIndexError[0m                                Traceback (most recent call last)
Cell [1;32mIn[271], line 1[0m
[1;32m----> 1[0m predictions[[39m1[39;49m]

[1;31mIndexError[0m: index 1 is out of bounds for dimension 0 with size 1

labels

tensor([[25, 25, 25, 25, 25, 25, 25, 84,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 84,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 25, 25, 25, 25, 15, 25, 25, 32,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 75,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 25, 25, 66,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])

# Define the sentence
sentence = "Hej, chciałbym zamówić pokój w Poznaniu na termin 25.03 - 17.04"

# Tokenize the sentence
input_ids = tokenizer.encode(sentence, add_special_tokens=True, return_tensors="pt")

# Create the attention mask
attention_mask = torch.ones_like(input_ids)

# Make the prediction
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = outputs.logits.argmax(dim=-1)
index_label_map = {v: k for k, v in label_map.items()}
# Decode the predicted labels
predicted_labels = [index_label_map[label] for label in predictions[0].tolist()]

# Print the predicted labels
print(predicted_labels)

['pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad']

label_map["O"]

predictions[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

[(0, 0), (0, 2), (3, 9), (9, 10), (0, 5)]

33 KiB Raw Permalink Blame History Unescape Escape

33 KiB

Raw Permalink Blame History