import pandas as pd
import numpy as np
import tokenization
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_addons as tfa
import sklearn
from sklearn.model_selection import train_test_split
import glob
import os
c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflow_addons\utils\tfa_eol_msg.py:23: UserWarning: TensorFlow Addons (TFA) has ended development and introduction of new features. TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024. Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). For more information see: https://github.com/tensorflow/addons/issues/2807 warnings.warn(
path = os.path.join(os.getcwd(), 'data')
tsv_files = glob.glob(os.path.join(path, "*.tsv"))
dfs = []
for filename in tsv_files:
df = pd.read_csv(filename, index_col=None, header=None, delimiter='\t',names=["speaker", "sentence", "dialogue_act"])
dfs.append(df)
combined_df = pd.concat(dfs, axis=0, ignore_index=True)
combined_df
|     | speaker | sentence | dialogue_act |
| --- | --- | --- | --- |
| 0 | user | Co proszę? | null()/hello() |
| 1 | system | Witam w systemie rezerwacji hotelu. Gdzie chci... | welcomemsg() |
| 2 | user | W jakim kraju/B-country mogę zarezerwować hotel? | help(country) |
| 3 | system | Mamy szeroki wybór hoteli na całym świecie. | expl-conf() |
| 4 | user | Przedstaw proszę oferty z obszaru Górnego Kara... | request(country=Górny Karabuch) |
| ... | ... | ... | ... |
| 347 | system | Okej w takim razie, proponuję ten sam hotel w ... | offer(price=110, date=02.07.2023- 08.07.2023) |
| 348 | user | Jak najbardziej. Proszę o zarezerwowanie/B-res... | confirm() |
| 349 | system | Dobrze, numer rezerwacji to 912312. Dokładny A... | inform(reservation_number=912312, address=3 ma... |
| 350 | user | Nie, dziękuję i życzę miłego dnia | negate()&thankyou()&bye() |
| 351 | system | Dziękuję bardzo wzajemnie. | thankyou() |

352 rows × 3 columns
def extract_labels(sentence):
    """Extract word-level BIO labels from inline "word/B-slot" annotations."""
    tokens = sentence.split()
    labels = []
    for token in tokens:
        parts = token.split("/")
        if len(parts) > 1:
            label = parts[1]
            # Keep the annotation only if it is a B-/I- slot tag, otherwise mark the word as outside
            if label.startswith('B-') or label.startswith('I-'):
                labels.append(label)
            else:
                labels.append('O')
        else:
            labels.append('O')
    return labels
labels = combined_df['sentence'].apply(extract_labels)
labels_list = [label for sentence in labels for label in sentence ]
unique_labels = set(labels_list)
unique_labels
{'B-alternative', 'B-animal', 'B-area', 'B-area,', 'B-area.', 'B-available?', 'B-beggining', 'B-checkin?', 'B-city', 'B-city,', 'B-city.', 'B-confirmation', 'B-country', 'B-country,', 'B-country?', 'B-date', 'B-date?', 'B-day', 'B-day-11', 'B-day-28', 'B-days', 'B-days,', 'B-days.', 'B-email', 'B-facilities', 'B-facilities,', 'B-facilities.', 'B-facilities...', 'B-facilities?', 'B-finish', 'B-first', 'B-hotel', 'B-hotel.', 'B-hotel?', 'B-insurance', 'B-insurance?', 'B-location', 'B-month', 'B-month.', 'B-month?', 'B-next', 'B-nights', 'B-nights,', 'B-number_of_rooms', 'B-payment', 'B-payment?', 'B-people', 'B-people?', 'B-per_night', 'B-per_night.', 'B-price', 'B-price!', 'B-price.', 'B-price?', 'B-reservation', 'B-reservation_number', 'B-room_size', 'B-room_size,', 'B-room_size.', 'B-room_size?', 'B-rooms', 'B-sickness', 'B-size', 'B-size,', 'B-size.', 'B-size?', 'B-stars', 'B-stars?', 'B-sum', 'B-week', 'B-weekend', 'B-weekend.', 'B-weekend?', 'B-year', 'B-year.', 'I-area', 'I-country', 'I-country.', 'I-day', 'I-day.', 'I-days', 'I-facilities', 'I-facilities.', 'I-hotel', 'I-location,', 'I-month', 'I-payment', 'I-perperson', 'I-room_size', 'I-year', 'O'}
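The label set above shows that trailing punctuation stays glued to the slot names ('B-country,', 'B-price!', 'B-facilities...'), so a single slot ends up spread over several classes. A small optional normalization step, not part of the original pipeline, that would collapse those variants:

import string

def normalize_label(label):
    """Strip trailing punctuation so 'B-country,' and 'B-country' become one class."""
    label = label.rstrip(string.punctuation)
    return label if label.startswith(('B-', 'I-')) else 'O'

normalize_label('B-country,')   # -> 'B-country'
normalize_label('B-price!')     # -> 'B-price'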
num_labels = len(unique_labels) + 1  # +1 reserves index 0 for the padding label
OTHER_LABEL = "pad"
label_map = {label: i for i, label in enumerate(unique_labels, start=1)}
label_map[OTHER_LABEL] = 0
# Sanity check: the reserved padding label maps to index 0
label_map["pad"]
0
label_map['O']
55
labels
0      [O, O]
1      [O, O, O, O, O, O, O, O, O]
2      [O, O, B-country, O, O, O]
3      [O, O, O, O, O, O, O]
4      [O, O, O, O, O, O, B-country]
                     ...
347    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
348    [O, O, O, O, B-reservation, O, O]
349    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
350    [O, O, O, O, O, O]
351    [O, O, O]
Name: sentence, Length: 352, dtype: object
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model = AutoModel.from_pretrained("allegro/herbert-base-cased")
c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
# 1. Preprocess the data
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
# Initialize the tokenizer
# Define a maximum sequence length
max_length = 128
# Tokenize the text
tokens = tokenizer(combined_df["sentence"].tolist(), padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")
# Create attention masks
attention_masks = tokens["attention_mask"]
# Truncate or pad the labels to match the sequence length
labels = [[label_map.get(l, 0) for l in lab] for lab in labels]
labels = pad_sequences(labels, maxlen=max_length, value=0, dtype=np.int32, truncating='post', padding='post')
labels = torch.tensor(labels, dtype=torch.long)
# Convert the preprocessed data into a PyTorch Dataset
dataset = TensorDataset(tokens["input_ids"], attention_masks, labels)
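One caveat in this preprocessing: the labels from `extract_labels` are per word, while the HerBERT tokenizer splits words into subword pieces (and the raw sentences still carry the inline `/B-...` annotations), so label positions and `input_ids` positions drift apart as soon as any word is split. Below is a minimal sketch, not part of the original pipeline, of aligning word-level labels to subword tokens via the fast tokenizer's `word_ids()`; the surface word is recovered with `token.split("/")[0]`.

def align_labels_to_tokens(annotated_sentence, tokenizer, label_map, max_length=128, pad_id=0):
    """Tokenize pre-split words and give every subword piece the label of its source word."""
    words = [tok.split("/")[0] for tok in annotated_sentence.split()]   # strip inline annotations
    word_labels = extract_labels(annotated_sentence)                    # one label per word
    encoding = tokenizer(words, is_split_into_words=True, padding='max_length',
                         truncation=True, max_length=max_length)
    aligned = []
    for word_idx in encoding.word_ids():        # word_ids() requires a fast tokenizer
        if word_idx is None:                    # special tokens and padding positions
            aligned.append(pad_id)
        else:
            aligned.append(label_map.get(word_labels[word_idx], pad_id))
    return encoding, aligned

# Usage sketch: enc, lab = align_labels_to_tokens(combined_df["sentence"][2], tokenizer, label_map)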
# 2. Define the NER model
from transformers import AutoModelForTokenClassification
# Load the pre-trained model
model = AutoModelForTokenClassification.from_pretrained("allegro/herbert-base-cased", num_labels=num_labels)
# 3. Train the NER model
# Convert the preprocessed data into a PyTorch Dataset
dataset = TensorDataset(
    tokens["input_ids"],
    attention_masks,
    labels,  # already a torch.long tensor, so no torch.tensor() re-wrap is needed
)
Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.sso.sso_relationship.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# Define the training parameters
batch_size = 8
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
epochs = 3
# The linear decay schedule should span all optimizer steps across all epochs,
# otherwise the learning rate reaches zero after the first epoch.
num_training_steps = epochs * (len(dataset) // batch_size + 1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
loss_fn = torch.nn.CrossEntropyLoss()
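Note that `CrossEntropyLoss()` also scores the padding positions (label id 0), and padding dominates a 128-token sequence, which skews both the loss and the accuracies printed later. A hedged alternative, assuming id 0 is used only for padding as defined above:

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)  # skip padded positions in the loss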
# Define the training loop
def train(model, dataloader, optimizer, scheduler, loss_fn):
    model.train()
    for batch in dataloader:
        # Each batch is (input_ids, attention_mask, labels)
        input_ids, attention_mask, labels = (t.to(model.device) for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Flatten (batch, seq_len, num_labels) -> (batch*seq_len, num_labels) for the loss
        loss = loss_fn(outputs.logits.view(-1, num_labels), labels.view(-1))
        loss.backward()
        optimizer.step()
        scheduler.step()
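`train()` sends every batch to `model.device`, which stays on the CPU unless the model is moved explicitly; an optional sketch for picking up a GPU when one is available:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # batches then follow the model through model.device in train()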
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
# Split inputs, attention masks, and labels in a single call so the three tensors stay row-aligned
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    tokens["input_ids"], attention_masks, labels, test_size=0.2)
# Convert the preprocessed data into PyTorch Dataloaders
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
c:\Users\macty\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
labels
tensor([[55, 55, 0, ..., 0, 0, 0], [55, 55, 55, ..., 0, 0, 0], [55, 55, 73, ..., 0, 0, 0], ..., [55, 55, 55, ..., 0, 0, 0], [55, 55, 55, ..., 0, 0, 0], [55, 55, 55, ..., 0, 0, 0]])
# Train the model (epochs defined above with the scheduler)
for epoch in range(epochs):
    train(model, train_dataloader, optimizer, scheduler, loss_fn)

    # Evaluate the model after each epoch
    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = (t.to(model.device) for t in batch)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.logits.argmax(dim=-1)
            # Accuracy over all positions, padding included
            accuracy = (predictions == labels).float().mean().item()
            print(f"Accuracy: {accuracy:.2f}")
Accuracy: 0.88 Accuracy: 0.95 Accuracy: 0.92 Accuracy: 0.94 Accuracy: 0.93 Accuracy: 0.94 Accuracy: 0.94 Accuracy: 0.92 Accuracy: 0.90 Accuracy: 0.88 Accuracy: 0.95 Accuracy: 0.92 Accuracy: 0.94 Accuracy: 0.93 Accuracy: 0.94 Accuracy: 0.94 Accuracy: 0.92 Accuracy: 0.90 Accuracy: 0.88 Accuracy: 0.95 Accuracy: 0.92 Accuracy: 0.94 Accuracy: 0.93 Accuracy: 0.94 Accuracy: 0.94 Accuracy: 0.92 Accuracy: 0.90
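These accuracies are computed over all 128 positions per sequence, most of which are padding (label 0) that the model predicts trivially, which is why the numbers look high even though the real predictions below all come out as 'pad'. A sketch, not in the original loop, of restricting the metric to real tokens via the attention mask:

def masked_accuracy(predictions, labels, attention_mask):
    """Accuracy over non-padding positions only."""
    mask = attention_mask.bool()
    correct = (predictions == labels) & mask
    return correct.sum().item() / mask.sum().item()

# Inside the evaluation loop: accuracy = masked_accuracy(predictions, labels, attention_mask)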
predictions[1]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[271], line 1
----> 1 predictions[1]

IndexError: index 1 is out of bounds for dimension 0 with size 1
labels
tensor([[25, 25, 25, 25, 25, 25, 25, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [25, 25, 25, 25, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [25, 25, 25, 25, 25, 25, 25, 25, 15, 25, 25, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [25, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [25, 25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [25, 25, 25, 25, 25, 25, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
# Define the sentence
sentence = "Hej, chciałbym zamówić pokój w Poznaniu na termin 25.03 - 17.04"
# Tokenize the sentence
input_ids = tokenizer.encode(sentence, add_special_tokens=True, return_tensors="pt")
# Create the attention mask
attention_mask = torch.ones_like(input_ids)
# Make the prediction
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = outputs.logits.argmax(dim=-1)
index_label_map = {v: k for k, v in label_map.items()}
# Decode the predicted labels
predicted_labels = [index_label_map[label] for label in predictions[0].tolist()]
# Print the predicted labels
print(predicted_labels)
['pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad']
label_map["O"]
25
predictions[0]
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
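Every position is predicted as label id 0 ('pad'), i.e. the classifier has collapsed onto the padding class that dominates the training targets. To inspect which label each subword actually received, the ids can be paired with the decoded tokens; a small inspection sketch using the names defined above:

# Pair each subword token with its predicted label
tokens_out = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
for tok, pred in zip(tokens_out, predictions[0].tolist()):
    print(f"{tok:>15}  {index_label_map[pred]}")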
[(0, 0), (0, 2), (3, 9), (9, 10), (0, 5)]