# JARVIS/evaluate.py
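"""Evaluate the book.jsgf grammar against the dialogue transcripts in data/.

Prints overall recognition accuracy plus per-act precision and recall.
Run from the project root (the script reads 'book.jsgf' and 'data/' via
relative paths):  python evaluate.py
"""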


import os
import string
from collections import defaultdict

import jsgf
import pandas as pd
from unidecode import unidecode

def decode_prompt(prompt):
    """Normalise an utterance: strip diacritics and remove ASCII punctuation."""
    prompt_decoded = unidecode(prompt)
    translator = str.maketrans('', '', string.punctuation)
    return prompt_decoded.translate(translator)
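
# Illustrative example of the normalisation above (input invented for clarity):
#   decode_prompt("Proszę, otwórz książkę!")  ->  "Prosze otworz ksiazke"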

grammar = jsgf.parse_grammar_file('book.jsgf')

# Load every dialogue transcript from the data/ directory.
data_files = []
for filename in os.listdir("data"):
    f = os.path.join("data", filename)
    if os.path.isfile(f):
        data_files.append(pd.read_csv(f, sep='\t', header=None))
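
# Assumed transcript layout (headerless TSV, two or three columns); the row
# below is illustrative, not taken from the data:
#   user<TAB>poproszę tę książkę<TAB>request(book)
#   system<TAB>...<TAB>...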

recognized = 0
unrecognized = 0
acts_recognized = defaultdict(int)
acts_not_recognized = defaultdict(int)

for df in data_files:
    if len(df.columns) == 3:
        df.columns = ["agent", "message", "act"]
    elif len(df.columns) == 2:
        df.columns = ["agent", "message"]
    else:
        continue
    user_speech_rows = df[df['agent'] == "user"]
    user_speeches = user_speech_rows["message"]

    # An utterance counts as recognised if any grammar rule matches its
    # normalised form.
    parsed = user_speeches.apply(
        lambda x: bool(grammar.find_matching_rules(decode_prompt(x))))
    true_count = parsed.sum()
    false_count = len(parsed) - true_count
    recognized += true_count
    unrecognized += false_count

    # Tally per-act counts over the user rows so they stay aligned with
    # `parsed`; files without an act column are skipped here.
    if "act" in df.columns:
        for (_, row), correct in zip(user_speech_rows.iterrows(), parsed):
            act = row['act'].split('(')[0]
            acts_recognized[act] += int(correct)
            acts_not_recognized[act] += int(not correct)

print(f"Recognized user utterances: {recognized}")
print(f"Unrecognized user utterances: {unrecognized}")
print(f"Accuracy: {recognized/(recognized+unrecognized)}")

# Per-act precision and recall. Note: every recognised utterance belonging to
# a different act is counted as a false positive for the act at hand.
precision_per_class = {}
recall_per_class = {}
for act in acts_recognized.keys():
    true_positives = acts_recognized[act]
    false_negatives = acts_not_recognized[act]
    false_positives = recognized - true_positives
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0
    precision_per_class[act] = precision
    recall_per_class[act] = recall

# Macro-averages over the observed acts.
average_precision = sum(precision_per_class.values()) / len(precision_per_class)
average_recall = sum(recall_per_class.values()) / len(recall_per_class)

print("\nPrecision per class:")
for act, precision in precision_per_class.items():
print(f"{act}: {precision}")
print("\nRecall per class:")
for act, recall in recall_per_class.items():
print(f"{act}: {recall}")
print(f"\nAverage Precision: {average_precision}")
print(f"Average Recall: {average_recall}")