"""Collect user dialogue acts from tab-separated dialogue files.

Every regular file in ``data/`` is read as a headerless TSV.  Files with
exactly three columns (agent, message, act) contribute their *user* turns:
each user message is grouped under the name of its dialogue act, together
with the act's slots.  The grouped result is printed and saved to
``acts.json``.
"""
import os
import unicodedata
from collections import defaultdict

import pandas as pd

DATA_DIR = "data"
OUTPUT_PATH = "acts.json"
COLUMN_NAMES = ("agent", "message", "act")
# Acts that are aggregated and saved but left out of the printed report.
SKIPPED_IN_REPORT = ("hello", "welcomemsg", "ack", "rezerwuj")


def _to_ascii(text):
    """Return *text* NFKD-normalised with every non-ASCII character dropped.

    Accented letters keep their base letter (``ę`` -> ``e``); characters
    without a decomposition (for example ``ł``) are removed entirely.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')


def get_act_name_and_slots(act="request(cuisine)"):
    """Split a dialogue-act string into its name and list of slots.

    Args:
        act: Act string such as ``"inform(food=pizza,price=cheap)"`` or a
            bare name such as ``"hello"``.

    Returns:
        A ``(act_name, slots)`` tuple.  ``slots`` holds the comma-separated
        items found between the parentheses, stripped of surrounding
        whitespace, with empty items removed; it is ``[]`` when the act has
        no parentheses or nothing inside them.

    For compound acts joined with ``&`` only the first component is parsed.
    NOTE(review): this mirrors how the original early ``return`` appeared to
    behave -- confirm that ignoring the remaining components is intended.
    """
    first_act = act.split("&", 1)[0]
    act_name, paren, remainder = first_act.partition("(")
    if not paren:
        return act_name, []
    # A missing closing parenthesis is tolerated: take everything after "(".
    inner = remainder.split(")", 1)[0]
    stripped = (slot.strip() for slot in inner.split(","))
    return act_name, [slot for slot in stripped if slot]


def load_data_files(data_dir=DATA_DIR):
    """Read every regular file in *data_dir* as a headerless TSV DataFrame."""
    frames = []
    # Sorted so that the order of collected messages is reproducible;
    # os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, filename)
        if os.path.isfile(path):
            frames.append(pd.read_csv(path, sep='\t', header=None))
    return frames


def collect_acts(data_frames):
    """Group user messages and slots by dialogue-act name.

    Args:
        data_frames: Iterable of DataFrames.  Frames that do not have
            exactly three columns are skipped; the three columns are
            interpreted as agent, message and act.  The input frames are
            not modified.

    Returns:
        A ``defaultdict`` mapping act name to ``{"msg": [...], "slots": [...]}``
        where both lists contain ASCII-folded strings.
    """
    acts = defaultdict(lambda: {"msg": [], "slots": []})
    for df in data_frames:
        if len(df.columns) != len(COLUMN_NAMES):
            continue
        frame = df.copy()
        frame.columns = list(COLUMN_NAMES)
        # Select the user rows first so that every message is paired with
        # the act from its own row, then drop rows lacking either value.
        user_rows = frame[frame["agent"] == "user"].dropna(
            subset=["message", "act"])
        for act, speech in zip(user_rows["act"], user_rows["message"]):
            act_name, slots = get_act_name_and_slots(str(act))
            acts[act_name]["slots"].extend(_to_ascii(slot) for slot in slots)
            acts[act_name]["msg"].append(_to_ascii(str(speech)))
    return acts


def print_acts(acts):
    """Print each act with its example messages, omitting the skipped acts."""
    print("Acts:")
    for act, collected in acts.items():
        if act in SKIPPED_IN_REPORT:
            continue
        print(f"{act}: {len(collected['msg'])} examples")
        for sentence in collected["msg"]:
            print(f"\t{sentence}")
        print("\n\n")
    print()


def save_acts(acts, path=OUTPUT_PATH):
    """Write the collected acts to *path* as index-oriented JSON."""
    df_act = pd.DataFrame(acts)
    df_act.to_json(path, orient="index")
    print(f"Acts saved to {path}")


def main():
    """Load the dialogue files, aggregate user acts, report and save them."""
    acts = collect_acts(load_data_files())
    print_acts(acts)
    save_acts(acts)


if __name__ == "__main__":
    main()