JARVIS/get_acts.py

63 lines
1.8 KiB
Python
Raw Normal View History

import os
import pandas as pd
from collections import defaultdict
2024-05-09 23:42:05 +02:00
import unicodedata
def get_act_name_and_slots(act="request(cuisine)"):
    """Split a dialogue-act annotation into its act name and slot list.

    An annotation looks like ``name(slot1,slot2)`` or just ``name``. When
    several acts are joined with ``&`` only the first one is parsed, so the
    result is always a single ``(name, slots)`` pair.

    Args:
        act: Act annotation string, e.g. ``"inform(cuisine,area)"``.

    Returns:
        A ``(act_name, slots)`` tuple. ``slots`` is an empty list when the
        act has no parentheses or nothing between them. Surrounding
        whitespace is removed from the name and from every slot.
    """
    # NOTE(review): compound acts ("a(x)&b(y)") collapse to their first
    # component; confirm this is the intended reading of the annotations.
    first_act = act.split("&")[0].strip()

    # str.split always returns at least one element, so a missing "(" shows
    # up as a single-element list rather than as an IndexError.
    pieces = first_act.split("(")
    if len(pieces) == 1:
        return first_act, []

    act_name = pieces[0].strip()
    # Everything up to the first ")" is the comma-separated slot list; a
    # missing ")" simply means the rest of the text is used.
    inside = pieces[1].split(")")[0]
    # Drop empty entries so "hello()" yields no slots instead of [""].
    slots = [slot.strip() for slot in inside.split(",") if slot.strip()]
    return act_name, slots
# Load every regular file in the "data" directory as a headerless,
# tab-separated table. os.listdir returns names in arbitrary order, so the
# names are sorted to make the order of the collected examples (and of the
# generated output) reproducible between runs.
data_paths = (
    os.path.join("data", name) for name in sorted(os.listdir("data"))
)
data_files = [
    pd.read_csv(path, sep='\t', header=None)
    for path in data_paths
    if os.path.isfile(path)  # skip sub-directories
]
def _to_ascii(text):
    """Return *text* with diacritics folded away and non-ASCII dropped.

    NFKD splits accented letters into base letter + combining mark; the
    ASCII encode with "ignore" then discards the marks. Characters that have
    no ASCII decomposition (for example "ł") are removed entirely.
    """
    return unicodedata.normalize('NFKD', text).encode(
        'ASCII', 'ignore').decode('utf-8')


# Per-act accumulator: "msg" collects the user utterances annotated with the
# act, "slots" collects every slot seen for it (duplicates are kept).
acts = defaultdict(lambda: {"msg": [], "slots": []})
for df in data_files:
    # Only three-column tables (agent, message, act) can be interpreted.
    if len(df.columns) != 3:
        continue
    df.columns = ["agent", "message", "act"]

    # Take the act and the message from the SAME user rows. Zipping the full
    # act column against user-only messages would pair utterances with the
    # wrong acts as soon as the table contains non-user rows.
    # Rows with an empty message or act are read by pandas as NaN and cannot
    # be normalised or parsed, so they are skipped.
    user_rows = df[df['agent'] == "user"].dropna(subset=["message", "act"])

    for act, speech in zip(user_rows["act"], user_rows["message"]):
        # str() guards against cells pandas parsed as numbers.
        act_name, slots = get_act_name_and_slots(str(act))
        acts[act_name]["slots"] += [_to_ascii(slot) for slot in slots]
        acts[act_name]["msg"].append(_to_ascii(str(speech)))
# Console report: one section per act with its example count and the
# collected utterances. The acts listed below are left out of the report.
hidden_acts = ("hello", "welcomemsg", "ack", "rezerwuj")

print("Acts:")
for act_label, bucket in acts.items():
    if act_label in hidden_acts:
        continue
    examples = bucket["msg"]
    print(f"{act_label}: {len(examples)} examples")
    for example in examples:
        print(f"\t{example}")
    # Visual gap between two act sections.
    print("\n\n")
print()
2024-05-09 23:42:05 +02:00
# Export the collected acts: the frame has one column per act and the rows
# "msg" / "slots"; orient="index" writes it keyed by those row labels.
output_path = "acts.json"
df_act = pd.DataFrame(data=acts)
df_act.to_json(path_or_buf=output_path, orient="index")
print("Acts saved to acts.json")