systemy_dialogowe/slots.ipynb


import pandas as pd
import numpy as np
import tokenization

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_addons as tfa

import sklearn
from sklearn.model_selection import train_test_split
import glob
import os
c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow_addons\utils\tfa_eol_msg.py:23: UserWarning: 

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

  warnings.warn(
path = os.path.join(os.getcwd(), 'data')
tsv_files = glob.glob(os.path.join(path, "*.tsv"))
dfs = []
for filename in tsv_files:
    df = pd.read_csv(filename, index_col=None, header=None, delimiter='\t',names=["speaker", "sentence", "dialogue_act"])
    dfs.append(df)
combined_df = pd.concat(dfs, axis=0, ignore_index=True)
combined_df
speaker sentence dialogue_act
0 user Co proszę? null()/hello()
1 system Witam w systemie rezerwacji hotelu. Gdzie chci... welcomemsg()
2 user W jakim kraju/B-country mogę zarezerwować hotel? help(country)
3 system Mamy szeroki wybór hoteli na całym świecie. expl-conf()
4 user Przedstaw proszę oferty z obszaru Górnego Kara... request(country=Górny Karabuch)
... ... ... ...
347 system Okej w takim razie, proponuję ten sam hotel w ... offer(price=110, date=02.07.2023- 08.07.2023)
348 user Jak najbardziej. Proszę o zarezerwowanie/B-res... confirm()
349 system Dobrze, numer rezerwacji to 912312. Dokładny A... inform(reservation_number=912312, address=3 ma...
350 user Nie, dziękuję i życzę miłego dnia negate()&thankyou()&bye()
351 system Dziękuję bardzo wzajemnie. thankyou()

352 rows × 3 columns

def extract_labels(sentence):
    # Pull the slot tags out of the inline "token/B-slot" annotations;
    # every token without a B-/I- tag gets the outside label 'O'.
    labels = []
    for token in sentence.split():
        parts = token.split("/")
        if len(parts) > 1 and parts[1].startswith(('B-', 'I-')):
            labels.append(parts[1])
        else:
            labels.append('O')
    return labels
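
# Quick illustration of the annotation format used above (sentence taken from the
# corpus): the slot tag rides on the token itself after a slash.
print(extract_labels("W jakim kraju/B-country mogę zarezerwować hotel?"))
# -> ['O', 'O', 'B-country', 'O', 'O', 'O']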
labels = combined_df['sentence'].apply(extract_labels)
labels_list = [label for sentence in labels for label in sentence ]
unique_labels = set(labels_list)
unique_labels
{'B-alternative',
 'B-animal',
 'B-area',
 'B-area,',
 'B-area.',
 'B-available?',
 'B-beggining',
 'B-checkin?',
 'B-city',
 'B-city,',
 'B-city.',
 'B-confirmation',
 'B-country',
 'B-country,',
 'B-country?',
 'B-date',
 'B-date?',
 'B-day',
 'B-day-11',
 'B-day-28',
 'B-days',
 'B-days,',
 'B-days.',
 'B-email',
 'B-facilities',
 'B-facilities,',
 'B-facilities.',
 'B-facilities...',
 'B-facilities?',
 'B-finish',
 'B-first',
 'B-hotel',
 'B-hotel.',
 'B-hotel?',
 'B-insurance',
 'B-insurance?',
 'B-location',
 'B-month',
 'B-month.',
 'B-month?',
 'B-next',
 'B-nights',
 'B-nights,',
 'B-number_of_rooms',
 'B-payment',
 'B-payment?',
 'B-people',
 'B-people?',
 'B-per_night',
 'B-per_night.',
 'B-price',
 'B-price!',
 'B-price.',
 'B-price?',
 'B-reservation',
 'B-reservation_number',
 'B-room_size',
 'B-room_size,',
 'B-room_size.',
 'B-room_size?',
 'B-rooms',
 'B-sickness',
 'B-size',
 'B-size,',
 'B-size.',
 'B-size?',
 'B-stars',
 'B-stars?',
 'B-sum',
 'B-week',
 'B-weekend',
 'B-weekend.',
 'B-weekend?',
 'B-year',
 'B-year.',
 'I-area',
 'I-country',
 'I-country.',
 'I-day',
 'I-day.',
 'I-days',
 'I-facilities',
 'I-facilities.',
 'I-hotel',
 'I-location,',
 'I-month',
 'I-payment',
 'I-perperson',
 'I-room_size',
 'I-year',
 'O'}
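Many of the labels above differ only by trailing punctuation (e.g. 'B-country' vs 'B-country,' and 'B-country?'), because the tags are attached to raw whitespace tokens. A small normalization sketch (not applied here, so the label set and indices below stay exactly as shown):

import re

# hypothetical clean-up: strip trailing punctuation so variants collapse into one tag
normalized_labels = {re.sub(r"[.,?!]+$", "", label) for label in unique_labels}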
num_labels = len(unique_labels) + 1
OTHER_LABEL = "pad"
label_map = {label: i for i, label in enumerate(unique_labels, start=1)}
label_map[OTHER_LABEL] = 0

# Check the indices assigned to the padding and 'O' labels
label_map["pad"]
0
label_map['O']
55
labels
0                                                 [O, O]
1                            [O, O, O, O, O, O, O, O, O]
2                             [O, O, B-country, O, O, O]
3                                  [O, O, O, O, O, O, O]
4                          [O, O, O, O, O, O, B-country]
                             ...                        
347    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
348                    [O, O, O, O, B-reservation, O, O]
349    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
350                                   [O, O, O, O, O, O]
351                                            [O, O, O]
Name: sentence, Length: 352, dtype: object
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model = AutoModel.from_pretrained("allegro/herbert-base-cased")
c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
# 1. Preprocess the data
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
# Initialize the tokenizer
# Define a maximum sequence length
max_length = 128

# Tokenize the text
tokens = tokenizer(combined_df["sentence"].tolist(), padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")

# Create attention masks
attention_masks = tokens["attention_mask"]

# Truncate or pad the labels to match the sequence length
labels = [[label_map.get(l, 0) for l in lab] for lab in labels]
labels = pad_sequences(labels, maxlen=max_length, value=0, dtype=np.int32, truncating='post', padding='post')
labels = torch.tensor(labels, dtype=torch.long)
# Convert the preprocessed data into a PyTorch Dataset
dataset = TensorDataset(tokens["input_ids"], attention_masks, labels)
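
# Note (sketch, not part of the original pipeline): the word-level labels above are
# padded independently of the subword tokenization, and the sentences still carry
# the inline "/B-..." markers, so tokens and labels are not guaranteed to line up.
# One way to align them, assuming the checkpoint loads a fast tokenizer (word_ids
# is only available there); tokenize_and_align is a hypothetical helper:
def tokenize_and_align(sentence, word_labels, max_length=128):
    words = [w.split("/")[0] for w in sentence.split()]  # drop the inline markers
    enc = tokenizer(words, is_split_into_words=True, padding='max_length',
                    truncation=True, max_length=max_length, return_tensors="pt")
    aligned = []
    for word_id in enc.word_ids(batch_index=0):
        if word_id is None:
            aligned.append(0)  # special/padding positions keep the "pad" index
        else:
            aligned.append(label_map.get(word_labels[word_id], label_map['O']))
    return enc, aligned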



# 2. Define the NER model

from transformers import AutoModelForTokenClassification

# Load the pre-trained model
model = AutoModelForTokenClassification.from_pretrained("allegro/herbert-base-cased", num_labels=num_labels)

# 3. Train the NER model


# Convert the preprocessed data into a PyTorch Dataset
dataset = TensorDataset(
    tokens["input_ids"],
    attention_masks,
    labels,  # already a torch.LongTensor, so no extra torch.tensor(...) wrap
)


Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# Define the training parameters
batch_size = 8
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataset) // batch_size + 1)
loss_fn = torch.nn.CrossEntropyLoss()
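
# Sketch (assumption, not the original setup): with several epochs the linear
# schedule is usually sized to the total number of optimizer updates, otherwise
# the learning rate decays to zero after roughly one pass over the data; the
# total below would then be passed as num_training_steps.
steps_per_epoch = len(dataset) // batch_size + 1
total_training_steps = 3 * steps_per_epoch  # 3 epochs are used further below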

# Define the training loop
def train(model, dataloader, optimizer, scheduler, loss_fn):
    model.train()
    for batch in dataloader:
        input_ids, attention_mask, labels = (t.to(model.device) for t in batch)
        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # the model already returns a loss when labels are passed; the explicit
        # cross-entropy below recomputes it over every position, padding included
        loss = loss_fn(outputs.logits.view(-1, num_labels), labels.view(-1))

        loss.backward()

        optimizer.step()
        scheduler.step()
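
# Note (sketch, not the original setup): index 0 is a real "pad" class in label_map,
# so the cross-entropy above also scores the padded positions, which dominate each
# 128-token row. A common alternative is to mark those positions with -100, which
# CrossEntropyLoss ignores by default:
masked_labels = labels.clone()
masked_labels[attention_masks == 0] = -100
loss_fn_ignoring_pad = torch.nn.CrossEntropyLoss(ignore_index=-100)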

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# split inputs, masks and labels in a single call so the rows stay aligned
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    tokens["input_ids"], attention_masks, labels, test_size=0.2)

# Convert the preprocessed data into PyTorch Dataloaders
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
labels
tensor([[55, 55,  0,  ...,  0,  0,  0],
        [55, 55, 55,  ...,  0,  0,  0],
        [55, 55, 73,  ...,  0,  0,  0],
        ...,
        [55, 55, 55,  ...,  0,  0,  0],
        [55, 55, 55,  ...,  0,  0,  0],
        [55, 55, 55,  ...,  0,  0,  0]])

# Train the model
epochs = 3
for epoch in range(epochs):
    train(model, train_dataloader, optimizer, scheduler, loss_fn)
    # Evaluate the model
    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = (t.to(model.device) for t in batch)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.logits.argmax(dim=-1)

            # Calculate the accuracy
            accuracy = (predictions == labels).float().mean().item()

            print(f"Accuracy: {accuracy:.2f}")
Accuracy: 0.88
Accuracy: 0.95
Accuracy: 0.92
Accuracy: 0.94
Accuracy: 0.93
Accuracy: 0.94
Accuracy: 0.94
Accuracy: 0.92
Accuracy: 0.90
Accuracy: 0.88
Accuracy: 0.95
Accuracy: 0.92
Accuracy: 0.94
Accuracy: 0.93
Accuracy: 0.94
Accuracy: 0.94
Accuracy: 0.92
Accuracy: 0.90
Accuracy: 0.88
Accuracy: 0.95
Accuracy: 0.92
Accuracy: 0.94
Accuracy: 0.93
Accuracy: 0.94
Accuracy: 0.94
Accuracy: 0.92
Accuracy: 0.90
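The accuracies above are computed over all 128 positions of each sequence, most of which are padding (label 0), so the numbers are dominated by the pad class. A quick sketch of accuracy restricted to non-pad positions, reusing predictions and labels from the last evaluated batch:

mask = labels != 0  # keep only positions with a real (non-pad) label
masked_accuracy = (predictions[mask] == labels[mask]).float().mean().item()
print(f"Non-pad accuracy (last batch): {masked_accuracy:.2f}")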
predictions[1]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[271], line 1
----> 1 predictions[1]

IndexError: index 1 is out of bounds for dimension 0 with size 1
labels
tensor([[25, 25, 25, 25, 25, 25, 25, 84,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 84,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 25, 25, 25, 25, 15, 25, 25, 32,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 25, 25, 25, 25, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 75,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [25, 25, 25, 25, 25, 25, 66,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
# Define the sentence
sentence = "Hej, chciałbym zamówić pokój w Poznaniu na termin 25.03 - 17.04"

# Tokenize the sentence
input_ids = tokenizer.encode(sentence, add_special_tokens=True, return_tensors="pt")

# Create the attention mask
attention_mask = torch.ones_like(input_ids)

# Make the prediction
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = outputs.logits.argmax(dim=-1)
index_label_map = {v: k for k, v in label_map.items()}
# Decode the predicted labels
predicted_labels = [index_label_map[label] for label in predictions[0].tolist()]

# Print the predicted labels
print(predicted_labels)


['pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad']
label_map["O"]
25
predictions[0]
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
[(0, 0), (0, 2), (3, 9), (9, 10), (0, 5)]
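To read subword predictions back as character spans of the input sentence, a fast tokenizer can return offset mappings; a minimal sketch, assuming the tokenizer here is a fast one (with the collapsed all-pad predictions above it prints nothing):

enc = tokenizer(sentence, return_offsets_mapping=True, return_tensors="pt")
offsets = enc["offset_mapping"][0].tolist()
for (start, end), label_id in zip(offsets, predictions[0].tolist()):
    label = index_label_map[label_id]
    if label not in ("pad", "O") and start != end:
        print(sentence[start:end], label)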