2024-05-09 23:34:11 +02:00
|
|
|
import os
|
|
|
|
import pandas as pd
|
|
|
|
from collections import defaultdict
|
2024-05-09 23:42:05 +02:00
|
|
|
import unicodedata
|
2024-05-09 23:34:11 +02:00
|
|
|
|
|
|
|
|
|
|
|
def get_act_name_and_slots(act="request(cuisine)"):
    """Split a dialogue-act string into its name and its list of slots.

    The expected shape is ``name(slot1,slot2,...)``. When several acts are
    joined with ``&`` only the first one is parsed; the remaining ones are
    ignored.

    Args:
        act: Raw act annotation, e.g. ``"inform(cuisine=italian,area=centre)"``.

    Returns:
        A ``(act_name, slots)`` tuple. ``slots`` is a list of the
        comma-separated items found between the first ``(`` and the next
        ``)``, with surrounding whitespace removed. It is empty when the act
        has no parentheses or nothing between them.
    """
    first_act = act.split("&")[0]

    act_name, *after_paren = first_act.split("(")
    if not after_paren:
        # No "(" at all: the whole component is the act name.
        return first_act, []

    inner = after_paren[0].split(")")[0]
    # "hello()" would otherwise produce [""], i.e. one bogus empty slot.
    slots = [slot.strip() for slot in inner.split(",")]
    return act_name, [slot for slot in slots if slot]
|
|
|
|
|
|
|
|
|
|
|
|
# Load every regular file directly inside "data" as a header-less,
# tab-separated table. Sub-directories are skipped.
data_files = []

# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which files are processed (and therefore the generated output) reproducible.
for filename in sorted(os.listdir("data")):
    f = os.path.join("data", filename)
    if not os.path.isfile(f):
        continue
    data_files.append(pd.read_csv(f, sep="\t", header=None))
|
|
|
|
|
|
|
|
def _to_ascii(text):
    """Return *text* with accents stripped and other non-ASCII characters dropped.

    NOTE: characters without an NFKD decomposition (for example "ł") are
    removed entirely rather than transliterated.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return decomposed.encode("ASCII", "ignore").decode("utf-8")


# Maps an act name to the user utterances ("msg") and slots ("slots")
# collected for it across all loaded files.
acts = defaultdict(lambda: {"msg": [], "slots": []})

for df in data_files:
    # Only three-column tables are processed; anything else is skipped.
    if len(df.columns) != 3:
        continue
    df.columns = ["agent", "message", "act"]

    # Take the act and the message from the SAME user rows. Zipping the act
    # column of every row against the messages of user rows only would pair
    # utterances with acts that belong to other rows.
    # Rows with a missing message or act are dropped: pandas reads empty
    # cells as NaN, which cannot be normalised or parsed as text.
    user_rows = df[df["agent"] == "user"].dropna(subset=["message", "act"])

    for act, speech in zip(user_rows["act"], user_rows["message"]):
        act_name, slots = get_act_name_and_slots(act)
        collected = acts[act_name]
        collected["slots"] += [_to_ascii(slot) for slot in slots]
        collected["msg"].append(_to_ascii(speech))
|
|
|
|
|
|
|
|
# Console summary: for every collected act, except the few skipped below,
# print how many utterances it has followed by the utterances themselves.
print("Acts:")
for act_name, collected in acts.items():
    if act_name in ("hello", "welcomemsg", "ack", "rezerwuj"):
        continue
    messages = collected["msg"]
    print(f"{act_name}: {len(messages)} examples")
    for message in messages:
        print(f"\t{message}")
    # Blank lines separate one act's listing from the next.
    print("\n\n")
print()
|
2024-05-09 23:42:05 +02:00
|
|
|
|
|
|
|
# Persist the collected acts: build a DataFrame from the act -> {"msg", "slots"}
# mapping and write it to acts.json, keyed by the frame's index labels.
df_act = pd.DataFrame(data=acts)
df_act.to_json(path_or_buf="acts.json", orient="index")
print("Acts saved to acts.json")
|