AMUseBot/ai_talks/AMUseBotBackend/utils/chatbot_prototype.ipynb

Download CookDial from GitHub

! git clone https://github.com/YiweiJiang2015/CookDial.git

Or download CookDial from Google Drive

# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
path_to_file = "/content/CookDial"
path_to_output = "/content/drive/MyDrive/CookDial"

Restore CookDial from MyDrive

import zipfile
with zipfile.ZipFile(path_to_output + ".zip","r") as zip_ref:
    zip_ref.extractall(path_to_file)

Archive CookDial to MyDrive

import shutil

shutil.make_archive(path_to_output, 'zip', path_to_file)
'/content/drive/MyDrive/CookDial.zip'

Count the dialogue files

import os

APP_FOLDER = '/content/CookDial/data/dialog'
totalFiles = 0
for base, _, files in os.walk(APP_FOLDER):
    print('Searching in : ', base)
    totalFiles += len(files)

print('Total number of files', totalFiles)
Searching in :  /content/CookDial/data/dialog
Total number of files 260

Read the data and extract user intents

import re

# each utterance's annotation string contains an "intent" field; this regex
# captures its value
pattern = re.compile(r"\"intent\": \"([^\"]*)", re.IGNORECASE)

def parse_annotation(annotation):
    """Extract the intent value and normalize it to a #-joined label."""
    result = re.search(pattern, annotation)
    value = result.group(1)
    value = value.replace(";", "")
    value = value.replace(" ", "#")
    return value
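A quick sanity check of parse_annotation (the annotation string below is a hypothetical example in the CookDial annotation style, not taken from the data):

# hypothetical annotation string; the regex extracts the intent field and
# the replacements turn "confirm; req_duration" into "confirm#req_duration"
example_annotation = '{"intent": "confirm; req_duration"}'
print(parse_annotation(example_annotation))  # confirm#req_duration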
import json
import pandas as pd

utt_dict = {'label': [], 'sentence': []}

for number in range(totalFiles):
    with open(os.path.join(APP_FOLDER, f"{number:03d}.1.json")) as f:
        data = json.load(f)
        for row in data['messages']:
            # keep only user utterances that carry a non-empty intent
            if not row["bot"]:
                parsed_ann = parse_annotation(row["annotations"])
                if parsed_ann:
                    utt_dict["label"].append(parsed_ann)
                    utt_dict["sentence"].append(row["utterance"].lower())

IntentDataFrame = pd.DataFrame(utt_dict)
IntentDataFrame.sample(n=5)
      label  sentence
2260     10  ok good! i am ready to start now.
4431     17  great. the chicken are added back. can i eat it?
4425     19  ok. done. how long to wait from now?
2246     29  ok nice, for how long should they cook?
141       5  ok. i have added the baking powder to the bowl.
print("There are {} rows and {} columns".format(IntentDataFrame.shape[0], IntentDataFrame.shape[1]))
There are 4610 rows and 2 columns
# explore unique labels
print(IntentDataFrame.label.unique())
['greeting#req_start' 'req_temperature' 'thank#req_instruction'
 'confirm#req_instruction' 'req_repeat' 'confirm' 'confirm#req_repeat'
 'negate#thank' 'negate' 'req_amount' 'req_instruction'
 'confirm#req_parallel_action' 'req_amount#req_ingredient' 'thank#confirm'
 'req_use_all' 'thank' 'other' 'confirm#req_is_recipe_finished' 'req_tool'
 'confirm#req_duration' 'confirm#thank' 'affirm#req_instruction'
 'req_repeat#confirm' 'confirm#req_temperature'
 'confirm#req_is_recipe_ongoing' 'req_ingredient' 'confirm#req_amount'
 'thank#confirm#req_instruction' 'thank#req_repeat' 'req_duration'
 'thank#req_duration' 'confirm#thank#req_instruction'
 'thank#confirm#req_is_recipe_finished' 'req_repeat#thank'
 'greeting#req_title' 'req_start' 'confirm#other' 'affirm'
 'confirm#req_start' 'confirm#req_duration#req_is_recipe_finished'
 'affirm#req_amount' 'req_ingredient_list' 'thank#goodbye'
 'req_parallel_action' 'confirm#goodbye' 'affirm#req_ingredient'
 'thank#req_ingredient' 'thank#confirm#req_ingredient'
 'req_ingredient_list_length' 'other#req_instruction' 'affirm#req_start'
 'thank#req_is_recipe_ongoing' 'req_is_recipe_ongoing' 'goodbye'
 'req_ingredient_list#confirm' 'affirm#thank#other'
 'req_repeat#req_amount' 'other#req_repeat' 'confirm#req_tool'
 'req_is_recipe_finished' 'thank#req_parallel_action'
 'affirm#req_ingredient_list' 'confirm#req_ingredient' 'affirm#confirm'
 'confirm#req_ingredient_list_ends' 'req_title' 'req_ingredient_list_ends'
 'req_substitute' 'negate#req_instruction' 'thank#req_is_recipe_finished'
 'thank#req_ingredient_list' 'affirm#thank' 'thank#req_tool'
 'affirm#req_ingredient_list_length' 'confirm#req_substitute'
 'affirm#other' 'confirm#req_instruction#req_duration'
 'req_ingredient_list#req_ingredient_list_length' 'confirm#affirm'
 'affirm#thank#req_ingredient' 'confirm#req_use_all'
 'req_amount#req_substitute' 'req_instruction#req_duration'
 'negate#confirm#req_instruction' 'thank#other' 'greeting'
 'other#req_temperature' 'req_ingredient_list_length#confirm'
 'thank#confirm#req_duration' 'greeting#req_ingredient_list'
 'thank#req_amount']
# explore which labels are the most and least common
IntentDataFrame.label.value_counts()
confirm#req_instruction    1222
confirm                     407
req_instruction             320
thank                       225
greeting#req_title          216
                           ... 
other#req_instruction         1
req_repeat#thank              1
confirm#req_start             1
confirm#goodbye               1
thank#req_amount              1
Name: label, Length: 91, dtype: int64
# optionally drop rows with multiple (#-joined) labels:
# IntentDataFrame = IntentDataFrame[IntentDataFrame["label"].str.contains("#") == False]
# IntentDataFrame.label.value_counts()

Preprocessing

!pip install datasets
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (2022.11.0)
Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets) (1.3.5)
Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (6.0)
Requirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets) (0.3.6)
Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets) (3.8.3)
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
Requirement already satisfied: packaging in /usr/local/lib/python3.8/dist-packages (from datasets) (21.3)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from datasets) (1.21.6)
Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (2.25.1)
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (9.0.0)
Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (4.64.1)
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.1)
Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.3)
Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (4.0.2)
Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (6.0.4)
Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (2.1.1)
Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.8.2)
Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (22.2.0)
Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from huggingface-hub<1.0.0,>=0.2.0->datasets) (3.9.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub<1.0.0,>=0.2.0->datasets) (4.4.0)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.8/dist-packages (from packaging->datasets) (3.0.9)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (2.10)
Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (4.0.0)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (2022.12.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (1.24.3)
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2022.7)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2.8.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)
Installing collected packages: xxhash, urllib3, multiprocess, responses, huggingface-hub, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed datasets-2.8.0 huggingface-hub-0.11.1 multiprocess-0.70.14 responses-0.18.0 urllib3-1.26.14 xxhash-3.2.0
!pip install transformers
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from transformers) (6.0)
Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.8/dist-packages (from transformers) (4.64.1)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers) (3.9.0)
Requirement already satisfied: huggingface-hub<1.0,>=0.10.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (0.11.1)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (2022.6.2)
Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from transformers) (2.25.1)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (1.21.6)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (21.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub<1.0,>=0.10.0->transformers) (4.4.0)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.8/dist-packages (from packaging>=20.0->transformers) (3.0.9)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2022.12.7)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2.10)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (1.26.14)
Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (4.0.0)
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.2 transformers-4.25.1
# Imports (this cell was adapted from a Kaggle notebook template;
# see the training script linked at the bottom)

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import os
import datasets  # Hugging Face datasets library
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# replace the label strings with label indices
unique_labels = IntentDataFrame.label.unique()
LabelToIndex = {label: i for i, label in enumerate(unique_labels)}

IntentDataFrame["label"] = IntentDataFrame["label"].map(LabelToIndex)
train_data = IntentDataFrame.sample(frac=0.8, random_state=25)
test_data = IntentDataFrame.drop(train_data.index)

train_data = datasets.Dataset.from_pandas(train_data)
test_data = datasets.Dataset.from_pandas(test_data)

print(f"No. of training examples: {train_data.shape[0]}")
print(f"No. of testing examples: {test_data.shape[0]}")
No. of training examples: 3688
No. of testing examples: 922
# Load the tokenizer for the "distilbert-base-uncased" checkpoint

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]
Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]
Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]
Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]
# Tokenize the train and test datasets
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding=True)

tokenize_train = train_data.map(preprocess_function, batched=True)
tokenize_test = test_data.map(preprocess_function, batched=True)
  0%|          | 0/4 [00:00<?, ?ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
# Collator that dynamically pads each batch to its longest sequence

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Build the classification model on top of the pretrained checkpoint

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(unique_labels))
Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
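Optionally, the real label names could be baked into the config at load time, so a saved checkpoint would not fall back to the generic LABEL_0..LABEL_90 names seen in the config dump further below (a sketch of the id2label/label2id keyword arguments accepted by from_pretrained; not what was executed in this run):

# sketch, not executed here: attach human-readable label names to the config
# so a reloaded checkpoint reports e.g. "confirm#req_instruction" instead of
# "LABEL_12"; the mappings are derived from LabelToIndex defined above
id2label = {index: label for label, index in LabelToIndex.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=LabelToIndex,
)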
from datasets import load_metric
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
<ipython-input-30-7d137328fd2b>:2: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
  metric = load_metric('accuracy')
Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]
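As the FutureWarning notes, load_metric is deprecated; the equivalent call via the 🤗 Evaluate library would be (a sketch; the evaluate package may need a !pip install evaluate first in this environment):

# sketch of the replacement suggested by the FutureWarning above; the
# returned metric exposes the same compute(predictions=..., references=...)
import evaluate
metric = evaluate.load('accuracy')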
# Fine-tune the model
training_args = TrainingArguments(
    output_dir="/content/results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_train,
    eval_dataset=tokenize_test,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


trainer.train()
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
***** Running training *****
  Num examples = 3688
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9220
  Number of trainable parameters = 67023451
[9220/9220 10:23, Epoch 20/20]
Epoch  Training Loss  Validation Loss  Accuracy
    1         No log         1.174354  0.822126
    2       0.066100         1.392280  0.784165
    3       0.061900         1.246237  0.823210
    4       0.056900         1.324116  0.813449
    5       0.089800         1.304054  0.819957
    6       0.103200         1.212749  0.818872
    7       0.093600         1.229501  0.819957
    8       0.098000         1.279489  0.817787
    9       0.083700         1.188151  0.821041
   10       0.068900         1.274577  0.816703
   11       0.063100         1.275792  0.824295
   12       0.063700         1.263834  0.825380
   13       0.063700         1.323240  0.812364
   14       0.055600         1.266973  0.825380
   15       0.049200         1.295590  0.824295
   16       0.049600         1.288514  0.829718
   17       0.044600         1.282528  0.824295
   18       0.041200         1.285815  0.823210
   19       0.050200         1.290950  0.821041
   20       0.042100         1.288992  0.819957

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-500
Configuration saved in /content/results/checkpoint-500/config.json
Model weights saved in /content/results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-1000
Configuration saved in /content/results/checkpoint-1000/config.json
Model weights saved in /content/results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-1500
Configuration saved in /content/results/checkpoint-1500/config.json
Model weights saved in /content/results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-1500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-2000
Configuration saved in /content/results/checkpoint-2000/config.json
Model weights saved in /content/results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-2000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-2500
Configuration saved in /content/results/checkpoint-2500/config.json
Model weights saved in /content/results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-2500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-3000
Configuration saved in /content/results/checkpoint-3000/config.json
Model weights saved in /content/results/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-3000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-3000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-3500
Configuration saved in /content/results/checkpoint-3500/config.json
Model weights saved in /content/results/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-3500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-3500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-4000
Configuration saved in /content/results/checkpoint-4000/config.json
Model weights saved in /content/results/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-4000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-4000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-4500
Configuration saved in /content/results/checkpoint-4500/config.json
Model weights saved in /content/results/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-4500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-4500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-5000
Configuration saved in /content/results/checkpoint-5000/config.json
Model weights saved in /content/results/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-5000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-5500
Configuration saved in /content/results/checkpoint-5500/config.json
Model weights saved in /content/results/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-5500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-5500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-6000
Configuration saved in /content/results/checkpoint-6000/config.json
Model weights saved in /content/results/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-6000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-6000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-6500
Configuration saved in /content/results/checkpoint-6500/config.json
Model weights saved in /content/results/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-6500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-6500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-7000
Configuration saved in /content/results/checkpoint-7000/config.json
Model weights saved in /content/results/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-7000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-7000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-7500
Configuration saved in /content/results/checkpoint-7500/config.json
Model weights saved in /content/results/checkpoint-7500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-7500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-7500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-8000
Configuration saved in /content/results/checkpoint-8000/config.json
Model weights saved in /content/results/checkpoint-8000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-8000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-8000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-8500
Configuration saved in /content/results/checkpoint-8500/config.json
Model weights saved in /content/results/checkpoint-8500/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-8500/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-8500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8
Saving model checkpoint to /content/results/checkpoint-9000
Configuration saved in /content/results/checkpoint-9000/config.json
Model weights saved in /content/results/checkpoint-9000/pytorch_model.bin
tokenizer config file saved in /content/results/checkpoint-9000/tokenizer_config.json
Special tokens file saved in /content/results/checkpoint-9000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence. If __index_level_0__, sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 922
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


TrainOutput(global_step=9220, training_loss=0.06487055326490754, metrics={'train_runtime': 624.1342, 'train_samples_per_second': 118.18, 'train_steps_per_second': 14.772, 'total_flos': 869726868928704.0, 'train_loss': 0.06487055326490754, 'epoch': 20.0})
def SentenceClassifier(InputSentence):
    """ Take a sentence as input and return the predicted intent label
    
    dependencies: tokenizer, trainer
    """
    
    def preprocess_function(examples):
        return tokenizer(examples["sentence"], truncation=True, padding=True)
    
    # the input is kept as a Dataset, which allows the same code to answer
    # many questions at once
    InputSentenceDFData = {'sentence': [InputSentence]}
    InputSentenceDataFrame = pd.DataFrame(data=InputSentenceDFData)
    InputSentenceDataset = datasets.Dataset.from_pandas(InputSentenceDataFrame)
    Tokenised_InputSentence = InputSentenceDataset.map(preprocess_function, batched=False)
    
    LabelScores = trainer.predict(Tokenised_InputSentence)
    BestLabel = LabelScores.predictions.argmax(1)
    
    # decode the predicted index back to its label name
    OutputLabelName = list(LabelToIndex.keys())[list(LabelToIndex.values()).index(BestLabel[0])]
    
    return OutputLabelName
InputSentence = "yes please".lower()
OutputLabel = SentenceClassifier(InputSentence)
print(f'Your question was: "{InputSentence}". It was classified as: "{OutputLabel}"')
  0%|          | 0/1 [00:00<?, ?ex/s]
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8
Your question was: "yes please". It was classified as: "affirm"
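Because the input is kept as a Dataset, the same pattern extends to many sentences in one trainer.predict call (a sketch reusing tokenizer and trainer from above; BatchSentenceClassifier is a name introduced here):

def BatchSentenceClassifier(InputSentences):
    """ Classify several sentences at once (hypothetical helper, same
    dependencies as SentenceClassifier: tokenizer, trainer)
    """
    InputDataset = datasets.Dataset.from_pandas(
        pd.DataFrame({'sentence': [s.lower() for s in InputSentences]}))
    Tokenised = InputDataset.map(
        lambda examples: tokenizer(examples["sentence"], truncation=True, padding=True),
        batched=True)
    BestLabels = trainer.predict(Tokenised).predictions.argmax(1)
    IndexToLabel = {index: label for label, index in LabelToIndex.items()}
    return [IndexToLabel[int(i)] for i in BestLabels]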
# Save the model and tokenizer locally
ModelPath = "/content/CookDial/working/model/"
TokenizerPath = "/content/CookDial/working/tokenizer/"

os.makedirs(ModelPath, exist_ok=True)
os.makedirs(TokenizerPath, exist_ok=True)

if os.path.isdir(ModelPath):
    model.save_pretrained(ModelPath)
    print("model ok")
if os.path.isdir(TokenizerPath):
    tokenizer.save_pretrained(TokenizerPath)
    print("tokenizer ok")
Configuration saved in /content/CookDial/working/model/config.json
Model weights saved in /content/CookDial/working/model/pytorch_model.bin
tokenizer config file saved in /content/CookDial/working/tokenizer/tokenizer_config.json
Special tokens file saved in /content/CookDial/working/tokenizer/special_tokens_map.json
model ok
tokenizer ok
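To keep the fine-tuned model across Colab sessions, the Drive-archive pattern from the top of the notebook can be reused (a sketch; the target name intent_model.zip is an assumption):

# sketch: archive the saved model and tokenizer to Drive, mirroring the
# "Archive CookDial to MyDrive" cell above; the target path is an assumption
shutil.make_archive("/content/drive/MyDrive/intent_model", 'zip',
                    "/content/CookDial/working")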
# Load the model and tokenizer from the local path
LocalModel = AutoModelForSequenceClassification.from_pretrained(ModelPath,num_labels=len(unique_labels))
LocalTokenizer = AutoTokenizer.from_pretrained(TokenizerPath)
loading configuration file /content/CookDial/working/model/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/CookDial/working/model/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_31",
    "32": "LABEL_32",
    "33": "LABEL_33",
    "34": "LABEL_34",
    "35": "LABEL_35",
    "36": "LABEL_36",
    "37": "LABEL_37",
    "38": "LABEL_38",
    "39": "LABEL_39",
    "40": "LABEL_40",
    "41": "LABEL_41",
    "42": "LABEL_42",
    "43": "LABEL_43",
    "44": "LABEL_44",
    "45": "LABEL_45",
    "46": "LABEL_46",
    "47": "LABEL_47",
    "48": "LABEL_48",
    "49": "LABEL_49",
    "50": "LABEL_50",
    "51": "LABEL_51",
    "52": "LABEL_52",
    "53": "LABEL_53",
    "54": "LABEL_54",
    "55": "LABEL_55",
    "56": "LABEL_56",
    "57": "LABEL_57",
    "58": "LABEL_58",
    "59": "LABEL_59",
    "60": "LABEL_60",
    "61": "LABEL_61",
    "62": "LABEL_62",
    "63": "LABEL_63",
    "64": "LABEL_64",
    "65": "LABEL_65",
    "66": "LABEL_66",
    "67": "LABEL_67",
    "68": "LABEL_68",
    "69": "LABEL_69",
    "70": "LABEL_70",
    "71": "LABEL_71",
    "72": "LABEL_72",
    "73": "LABEL_73",
    "74": "LABEL_74",
    "75": "LABEL_75",
    "76": "LABEL_76",
    "77": "LABEL_77",
    "78": "LABEL_78",
    "79": "LABEL_79",
    "80": "LABEL_80",
    "81": "LABEL_81",
    "82": "LABEL_82",
    "83": "LABEL_83",
    "84": "LABEL_84",
    "85": "LABEL_85",
    "86": "LABEL_86",
    "87": "LABEL_87",
    "88": "LABEL_88",
    "89": "LABEL_89",
    "90": "LABEL_90"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LABEL_16": 16,
    "LABEL_17": 17,
    "LABEL_18": 18,
    "LABEL_19": 19,
    "LABEL_2": 2,
    "LABEL_20": 20,
    "LABEL_21": 21,
    "LABEL_22": 22,
    "LABEL_23": 23,
    "LABEL_24": 24,
    "LABEL_25": 25,
    "LABEL_26": 26,
    "LABEL_27": 27,
    "LABEL_28": 28,
    "LABEL_29": 29,
    "LABEL_3": 3,
    "LABEL_30": 30,
    "LABEL_31": 31,
    "LABEL_32": 32,
    "LABEL_33": 33,
    "LABEL_34": 34,
    "LABEL_35": 35,
    "LABEL_36": 36,
    "LABEL_37": 37,
    "LABEL_38": 38,
    "LABEL_39": 39,
    "LABEL_4": 4,
    "LABEL_40": 40,
    "LABEL_41": 41,
    "LABEL_42": 42,
    "LABEL_43": 43,
    "LABEL_44": 44,
    "LABEL_45": 45,
    "LABEL_46": 46,
    "LABEL_47": 47,
    "LABEL_48": 48,
    "LABEL_49": 49,
    "LABEL_5": 5,
    "LABEL_50": 50,
    "LABEL_51": 51,
    "LABEL_52": 52,
    "LABEL_53": 53,
    "LABEL_54": 54,
    "LABEL_55": 55,
    "LABEL_56": 56,
    "LABEL_57": 57,
    "LABEL_58": 58,
    "LABEL_59": 59,
    "LABEL_6": 6,
    "LABEL_60": 60,
    "LABEL_61": 61,
    "LABEL_62": 62,
    "LABEL_63": 63,
    "LABEL_64": 64,
    "LABEL_65": 65,
    "LABEL_66": 66,
    "LABEL_67": 67,
    "LABEL_68": 68,
    "LABEL_69": 69,
    "LABEL_7": 7,
    "LABEL_70": 70,
    "LABEL_71": 71,
    "LABEL_72": 72,
    "LABEL_73": 73,
    "LABEL_74": 74,
    "LABEL_75": 75,
    "LABEL_76": 76,
    "LABEL_77": 77,
    "LABEL_78": 78,
    "LABEL_79": 79,
    "LABEL_8": 8,
    "LABEL_80": 80,
    "LABEL_81": 81,
    "LABEL_82": 82,
    "LABEL_83": 83,
    "LABEL_84": 84,
    "LABEL_85": 85,
    "LABEL_86": 86,
    "LABEL_87": 87,
    "LABEL_88": 88,
    "LABEL_89": 89,
    "LABEL_9": 9,
    "LABEL_90": 90
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vocab_size": 30522
}

loading weights file /content/CookDial/working/model/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at /content/CookDial/working/model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.
loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
def LocalSentenceClassifier(InputSentence):
    """ Take a sentence as input and return the predicted intent label
    
    dependencies: LocalTokenizer, LocalModel
    LocalTokenizer and LocalModel are used instead of the in-memory tokenizer
    and model, to make sure this function works with the artifacts that were
    saved and loaded locally
    """
    
    trainer = Trainer(
        model=LocalModel,
        args=training_args,
        train_dataset=tokenize_train,
        # eval_dataset and compute_metrics are not needed for prediction
        tokenizer=LocalTokenizer,
        data_collator=data_collator,
    )
    
    def preprocess_function(examples):
        return LocalTokenizer(examples["sentence"], truncation=True, padding=True)
    
    # the input is kept as a Dataset, which allows the same code to answer
    # many questions at once
    InputSentenceDFData = {'sentence': [InputSentence]}
    InputSentenceDataFrame = pd.DataFrame(data=InputSentenceDFData)
    InputSentenceDataset = datasets.Dataset.from_pandas(InputSentenceDataFrame)
    Tokenised_InputSentence = InputSentenceDataset.map(preprocess_function, batched=False)
    
    LabelScores = trainer.predict(Tokenised_InputSentence)
    BestLabel = LabelScores.predictions.argmax(1)
    
    # decode the predicted index back to its label name
    OutputLabelName = list(LabelToIndex.keys())[list(LabelToIndex.values()).index(BestLabel[0])]
    
    return OutputLabelName
InputSentence = "ok next step"
OutputLabel = LocalSentenceClassifier(InputSentence)
print(f'Your question was: "{InputSentence}". It was classified as: "{OutputLabel}"')
  0%|          | 0/1 [00:00<?, ?ex/s]
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8
Your question was: "ok next step". It was classified as: "req_instruction"
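For one-off predictions the Trainer machinery is not strictly needed; a plain forward pass through the loaded model gives the same label (a sketch assuming torch, which the Trainer already depends on; DirectSentenceClassifier is a name introduced here):

import torch

def DirectSentenceClassifier(InputSentence):
    # sketch: bypass the Trainer and call the locally loaded model directly
    # (hypothetical helper, not part of the original notebook)
    inputs = LocalTokenizer(InputSentence.lower(), return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = LocalModel(**inputs).logits
    best = int(logits.argmax(dim=1))
    return list(LabelToIndex.keys())[list(LabelToIndex.values()).index(best)]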
# training script used: https://www.kaggle.com/code/philanoe/intent-classifier-training