# AMUseBot/ai_talks/AMUseBotBackend/utils/preprocess_data.py

import glob
import sys
import json
import pandas as pd

dataset_path = sys.argv[1]  # Directory containing CookDial's dialogue JSON files

# First pass: gather every intent label attached to a user message, derive the
# label -> index and index -> label tables, and write the latter to disk so the
# NLU side can decode predictions.
all_intents = set()

for dialogue_path in glob.glob(f'{dataset_path}/*.json'):
    with open(dialogue_path, encoding='utf-8') as dialogue_file:
        dialogue = json.load(dialogue_file)
    for turn in dialogue['messages']:
        # Explicit comparison kept on purpose: a null flag (None) is not
        # treated as a user turn, whereas `not turn['bot']` would accept it.
        if turn['bot'] == False:
            # The annotations field is itself a JSON-encoded string whose
            # 'intent' entry is a ';'-separated list of labels.
            raw_labels = json.loads(turn['annotations'])['intent']
            stripped_labels = (label.strip() for label in raw_labels.split(';'))
            all_intents.update(label for label in stripped_labels if label)

# Extra label that never occurs in the annotations; used for recipe title recognition.
all_intents.add('choose_recipe')

# Alphabetical order gives every label a stable index across runs.
intent2int = {label: index for index, label in enumerate(sorted(all_intents))}
int2intent = {index: label for label, index in intent2int.items()}
with open('intent_dict.json', 'w', encoding='utf-8') as f:
    json.dump(int2intent, f)


# 2nd pass - preprocess dialogue data for training
# Every user message becomes one row: its utterance text plus a multi-hot
# vector (length = number of known intents) with a 1 at the index of each
# intent annotated for that message.
preprocessed_data = []

for file in glob.glob(f'{dataset_path}/*.json'):
    with open(file, encoding='utf-8') as dial_file:
        dial_data = json.load(dial_file)
    for message in dial_data['messages']:
        # Explicit comparison kept: a null 'bot' flag is not treated as a user turn.
        if message['bot'] == False:
            annotations = json.loads(message['annotations'])
            # Filter out *all* empty fragments, mirroring the first pass.
            # list.remove('') would raise ValueError when the annotation has
            # no empty fragment (e.g. no trailing ';') and would leave extra
            # empties behind when there are several.
            intents = [
                intent
                for intent in (part.strip() for part in annotations['intent'].split(';'))
                if intent != ''
            ]
            intents_multi_hot = [0] * len(all_intents)
            for intent in intents:
                intents_multi_hot[intent2int[intent]] = 1
            preprocessed_data.append([message['utterance'], intents_multi_hot])

# Naming the columns on construction (rather than passing header= to to_csv)
# yields the same file for non-empty data and still writes a header-only CSV
# when no user messages were found, instead of failing on a column-count mismatch.
preprocessed_data_df = pd.DataFrame(preprocessed_data, columns=['utterance', 'intents'])
preprocessed_data_df.to_csv('preprocessed_data.csv', index=False, sep=';')