diff --git a/script.sh b/script.sh
index 31867ab..33c6798 100755
--- a/script.sh
+++ b/script.sh
@@ -2,4 +2,8 @@ awk -f get_2nd.awk < wikiniews_results.tsv > hypotheses.txt
 awk -f get_3rd.awk < wikiniews_results.tsv > references.txt
 awk -f txt_to_tsr_converter.awk < references.txt > references.trn
 awk -f txt_to_tsr_converter.awk < hypotheses.txt > hypotheses.trn
-sclite -f 0 -r references.trn trn -h hypotheses.trn trn -e utf-8 -i rm -o all -p > tmp_metrics
\ No newline at end of file
+sclite -f 0 -r references.trn trn -h hypotheses.trn trn -e utf-8 -i rm -o all -p > tmp_metrics
+
+# xml_parser.py reads tmp_metrics and writes WERs; paste appends it as a column.
+python3 ./xml_parser.py
+paste wikiniews_results.tsv WERs > wikiniews_results_with_WERs.tsv
diff --git a/xml_parser.py b/xml_parser.py
new file mode 100644
index 0000000..7a88503
--- /dev/null
+++ b/xml_parser.py
@@ -0,0 +1,48 @@
+"""Summarise the alignment results stored in ``tmp_metrics``.
+
+For every child of the first element under the XML root, the number of
+text lines starting with ``C`` divided by the element's ``word_cnt``
+attribute is written to ``WERs`` (one value per line).  The mean of those
+ratios and the fraction of elements whose ratio is exactly one are
+written to ``statistics``.
+"""
+
+import xml.etree.ElementTree as ET
+import re
+import sys
+
+# Lines of an element's text that begin with "C" are counted as correct words.
+correct_word_regex = re.compile(r'^C', flags=re.MULTILINE)
+
+tree = ET.parse('tmp_metrics')
+root = tree.getroot()
+
+speaker = root[0]
+
+SRR_counter = 0
+WER_sum = 0
+count = 0
+
+# The context manager closes (and flushes) the file even if an element is
+# malformed, so the rows written so far are never lost.
+with open("WERs", 'w') as partial_outputs:
+    for child in speaker:
+        count += 1
+        all_words = int(child.attrib["word_cnt"])
+        # An element without text yields None; treat it as having no matches.
+        correct = len(correct_word_regex.findall(child.text or ''))
+        # NOTE(review): this is the share of correct words (higher is
+        # better) even though it is reported under the "WER" label - confirm.
+        ratio = correct / all_words
+        partial_outputs.write(str(ratio) + '\n')
+        WER_sum += ratio
+        if correct == all_words:
+            SRR_counter += 1
+
+if count == 0:
+    # Averaging over zero elements would raise ZeroDivisionError below.
+    sys.exit("xml_parser.py: tmp_metrics contains no scored elements")
+
+with open("statistics", 'w') as statistics:
+    statistics.write("WER: " + str(WER_sum / count) + '\n')
+    statistics.write("SRR: " + str(SRR_counter / count))