Added parser and called it in script.sh

2019-04-26 15:12:51 +02:00 · 2019-04-26 15:12:51 +02:00 · 65718229aa
commit 65718229aa
parent e5db46805c
2 changed files with 41 additions and 1 deletions
--- a/script.sh
+++ b/script.sh
@ -2,4 +2,7 @@ awk -f get_2nd.awk < wikiniews_results.tsv > hypotheses.txt
 awk -f get_3rd.awk < wikiniews_results.tsv > references.txt
 awk -f txt_to_tsr_converter.awk < references.txt > references.trn
 awk -f txt_to_tsr_converter.awk < hypotheses.txt > hypotheses.trn
-sclite -f 0 -r references.trn trn -h hypotheses.trn trn -e utf-8 -i rm -o all -p > tmp_metrics
+sclite -f 0 -r references.trn trn -h hypotheses.trn trn -e utf-8 -i rm -o all -p > tmp_metrics
+
+python3 ./xml_parser.py
+paste wikiniews_results.tsv WERs > wikiniews_results_with_WERs.tsv
--- a/xml_parser.py
+++ b/xml_parser.py
@ -0,0 +1,37 @@
+import xml.etree.ElementTree as ET
+import re
+
+tree = ET.parse('tmp_metrics')
+root = tree.getroot()
+
+speaker = root[0]
+
+correct_word_pattern = r'^C'
+correct_word_regex = re.compile(correct_word_pattern, flags=re.MULTILINE)
+
+partial_outputs = open("WERs", 'w')
+
+SRR_counter = 0
+WER_sum = 0
+count = 0
+
+for child in speaker:
+    count = count + 1
+    all_words = int(child.attrib["word_cnt"])
+    correct = correct_word_regex.findall(child.text)
+    partial_outputs.write(str(len(correct)/all_words) + '\n')
+    WER_sum = WER_sum + len(correct)/all_words
+    if len(correct) == all_words:
+        SRR_counter = SRR_counter + 1
+
+    """if len(correct) != all_words:
+        print(child.text)
+    print(len(correct), '/', all_words)"""
+
+statistics = open("statistics", 'w')
+
+statistics.write("WER: " + str(WER_sum/count) + '\n')
+statistics.write("SRR: " + str(SRR_counter/count))
+
+statistics.close()
+partial_outputs.close()