Added parser and called it in script.sh

This commit is contained in:
Kacper Kurzeja 2019-04-26 15:12:51 +02:00
parent e5db46805c
commit 65718229aa
2 changed files with 41 additions and 1 deletion

View File

@ -2,4 +2,7 @@ awk -f get_2nd.awk < wikiniews_results.tsv > hypotheses.txt
# Run get_3rd.awk over the results TSV and save its output as references.txt.
awk -f get_3rd.awk < wikiniews_results.tsv > references.txt
# Convert both text files with txt_to_tsr_converter.awk into the .trn files
# that are handed to sclite below.
awk -f txt_to_tsr_converter.awk < references.txt > references.trn
awk -f txt_to_tsr_converter.awk < hypotheses.txt > hypotheses.trn
# Score hypotheses.trn against references.trn with sclite; its stdout is
# captured in tmp_metrics, which xml_parser.py reads afterwards.
# NOTE(review): the identical sclite command appears twice in a row here. This
# looks like the removed/added pair of a diff whose +/- markers were lost
# (presumably a trailing-newline change) — confirm before running both.
sclite -f 0 -r references.trn trn -h hypotheses.trn trn -e utf-8 -i rm -o all -p > tmp_metrics
sclite -f 0 -r references.trn trn -h hypotheses.trn trn -e utf-8 -i rm -o all -p > tmp_metrics
# xml_parser.py parses tmp_metrics and writes the files "WERs" (one value per
# line) and "statistics".
python3 ./xml_parser.py
# Join each line of the results TSV with the corresponding line of WERs,
# producing the TSV with the extra column.
paste wikiniews_results.tsv WERs > wikiniews_results_with_WERs.tsv

37
xml_parser.py Normal file
View File

@ -0,0 +1,37 @@
import xml.etree.ElementTree as ET
import re
# Parse the report that script.sh redirects from sclite into 'tmp_metrics'.
tree = ET.parse('tmp_metrics')
root = tree.getroot()
# The first child of the root holds the per-segment elements iterated below.
speaker = root[0]

# Every line of an element's text that starts with 'C' is counted as one
# correctly recognised word (MULTILINE makes '^' match at each line start).
correct_word_pattern = r'^C'
correct_word_regex = re.compile(correct_word_pattern, flags=re.MULTILINE)

SRR_counter = 0  # segments in which every word was counted as correct
WER_sum = 0      # running sum of the per-segment ratios
count = 0        # number of segments seen

# 'with' guarantees the file is flushed and closed even if a segment is
# malformed and the loop raises.
with open("WERs", 'w') as partial_outputs:
    for child in speaker:
        count = count + 1
        all_words = int(child.attrib["word_cnt"])
        # An element without any text has .text == None; treat that as
        # "no correct words" instead of crashing inside findall().
        correct = correct_word_regex.findall(child.text or '')
        # NOTE(review): a segment with word_cnt == 0 still raises
        # ZeroDivisionError here — the intended score for it is unclear.
        # NOTE(review): this is the share of correct words, although the
        # output labels it "WER" — confirm the intended metric.
        ratio = len(correct) / all_words
        # One line per segment, so script.sh can paste this file next to the
        # results TSV.
        partial_outputs.write(str(ratio) + '\n')
        WER_sum = WER_sum + ratio
        if len(correct) == all_words:
            SRR_counter = SRR_counter + 1
        # Debug aid, kept disabled:
        # if len(correct) != all_words:
        #     print(child.text)
        #     print(len(correct), '/', all_words)

# Averages over all segments.
# NOTE(review): an empty report (count == 0) raises ZeroDivisionError here.
with open("statistics", 'w') as statistics:
    statistics.write("WER: " + str(WER_sum/count) + '\n')
    statistics.write("SRR: " + str(SRR_counter/count))