diff --git a/Dockerfile b/Dockerfile
index 9752dd3..80d1d98 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,7 @@ RUN apt update -y && apt install -y make
 RUN apt install -y git
 RUN apt install -y gcc
 RUN apt install -y gawk
+RUN apt install -y python3
 RUN gcc --version
 RUN apt install -y build-essential
 RUN git clone https://github.com/usnistgov/SCTK.git
diff --git a/Jenkinsfile b/Jenkinsfile
index 7e466e9..fc0b987 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -11,5 +11,7 @@ node{
     sh 'awk -f file3.awk < reference.txt > reference.trn'
     sh 'awk -f file3.awk < hypothesis.txt > hypothesis.trn'
     sh 'sclite -f 0 -r reference.trn trn -h hypothesis.trn trn -e utf-8 -i rm -o all -p > tmp_metrics'
+    sh 'python3 ./xml_parser.py'
+    sh 'paste wikiniews_results.tsv WERs > wikiniews_results_with_WERs.tsv'
     }
 }
\ No newline at end of file
diff --git a/xml_parser.py b/xml_parser.py
new file mode 100644
index 0000000..13016ed
--- /dev/null
+++ b/xml_parser.py
@@ -0,0 +1,37 @@
+"""Parse the sclite XML report in 'tmp_metrics' and emit recognition metrics.
+
+Writes one correct-word ratio per utterance to 'WERs' (consumed by the
+Jenkins 'paste' step) and the aggregate WER/SRR figures to 'statistics'.
+"""
+import xml.etree.ElementTree as ET
+import re
+
+tree = ET.parse('tmp_metrics')
+root = tree.getroot()
+
+speaker = root[0]
+
+# In the sclite alignment text, each word occupies its own line and lines
+# beginning with 'C' mark correctly recognized words (hence MULTILINE).
+correct_word_regex = re.compile(r'^C', flags=re.MULTILINE)
+
+SRR_counter = 0   # utterances recognized with every word correct
+WER_sum = 0.0     # running sum of per-utterance correct-word ratios
+count = 0         # number of utterances processed
+
+# NOTE(review): the value written is the correct-word ratio (accuracy),
+# not a conventional error rate -- kept as-is to match downstream consumers.
+with open("WERs", 'w') as partial_outputs:
+    for child in speaker:
+        count += 1
+        all_words = int(child.attrib["word_cnt"])
+        correct = correct_word_regex.findall(child.text)
+        ratio = len(correct) / all_words
+        partial_outputs.write(str(ratio) + '\n')
+        WER_sum += ratio
+        if len(correct) == all_words:
+            SRR_counter += 1
+
+with open("statistics", 'w') as statistics:
+    statistics.write("WER: " + str(WER_sum / count) + '\n')
+    statistics.write("SRR: " + str(SRR_counter / count))
\ No newline at end of file