AMUseBot/ai_talks/AMUseBotBackend/utils/preprocess_data.py

47 lines
1.8 KiB
Python
Raw Permalink Normal View History

2023-06-05 21:23:33 +02:00
import glob
import sys
import json
import pandas as pd
# Preprocess CookDial dialogue files for NLU intent classification.
#
# Usage: python preprocess_data.py <path to CookDial's dialog subdirectory>
#
# Outputs (written to the current working directory):
#   intent_dict.json      - integer -> intent name mapping, for NLU use
#   preprocessed_data.csv - ';'-separated rows of utterance and multi-hot intents


def load_user_turns(dataset_path):
    """Read every ``*.json`` dialogue file and collect the user messages.

    Args:
        dataset_path: Directory holding the dialogue JSON files.

    Returns:
        A list of ``(utterance, intents)`` tuples, one per message whose
        ``'bot'`` flag is false. ``intents`` is the list of stripped,
        non-empty labels from the message's ';'-separated ``'intent'``
        annotation.
    """
    turns = []
    for path in glob.glob(f'{dataset_path}/*.json'):
        with open(path, encoding='utf-8') as dial_file:
            dial_data = json.load(dial_file)
        for message in dial_data['messages']:
            if message['bot']:
                continue
            # 'annotations' is itself a JSON-encoded string inside the file.
            annotations = json.loads(message['annotations'])
            labels = (part.strip() for part in annotations['intent'].split(';'))
            # Drop *all* empty segments: the intent string may or may not end
            # with ';' and may contain doubled separators. list.remove('')
            # would raise when there is none and leave extras when several.
            intents = [label for label in labels if label]
            turns.append((message['utterance'], intents))
    return turns


def build_intent_index(turns):
    """Map every intent name to a stable integer id.

    Args:
        turns: ``(utterance, intents)`` tuples as returned by
            ``load_user_turns``.

    Returns:
        Dict from intent name to its index in the alphabetically sorted list
        of all intents, including the special ``'choose_recipe'`` intent.
    """
    all_intents = {intent for _, intents in turns for intent in intents}
    all_intents.add('choose_recipe')  # Special intent for recipe title recognition
    return {intent: index for index, intent in enumerate(sorted(all_intents))}


def encode_turns(turns, intent2int):
    """Turn each user turn into an ``[utterance, multi_hot_intents]`` row.

    Args:
        turns: ``(utterance, intents)`` tuples as returned by
            ``load_user_turns``.
        intent2int: Intent name -> index mapping covering every intent in
            ``turns``.

    Returns:
        A list of two-element lists; the second element is a 0/1 list of
        length ``len(intent2int)`` with ones at the indices of the turn's
        intents.
    """
    rows = []
    for utterance, intents in turns:
        intents_multi_hot = [0] * len(intent2int)
        for intent in intents:
            intents_multi_hot[intent2int[intent]] = 1
        rows.append([utterance, intents_multi_hot])
    return rows


def main(argv=None):
    """Run the preprocessing and write both output files.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first item is the path to CookDial's dialog
            subdirectory.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('usage: preprocess_data.py <path-to-CookDial-dialog-directory>')
    dataset_path = args[0]

    turns = load_user_turns(dataset_path)

    # Save the integer -> intent dict for NLU use. json.dump writes the
    # integer keys as strings.
    intent2int = build_intent_index(turns)
    int2intent = {v: k for k, v in intent2int.items()}
    with open('intent_dict.json', 'w', encoding='utf-8') as f:
        json.dump(int2intent, f)

    # Naming the columns on the frame (rather than via to_csv's header list)
    # yields the same header line and still works when there are no rows.
    preprocessed_data_df = pd.DataFrame(
        encode_turns(turns, intent2int), columns=['utterance', 'intents']
    )
    preprocessed_data_df.to_csv('preprocessed_data.csv', index=False, sep=';')


if __name__ == '__main__':
    main()