metryki10

2019-04-26 15:20:10 +02:00 · 2019-04-26 15:20:10 +02:00 · 2eaf6584d1
commit 2eaf6584d1
parent 0619df827c
3 changed files with 40 additions and 0 deletions
--- a/1
+++ b/1
@ -4,6 +4,7 @@ RUN apt update -y && apt install -y make
 RUN apt install -y git
 RUN apt install -y gcc
 RUN apt install -y gawk
 RUN apt install -y python3
 RUN gcc --version
 RUN apt install -y build-essential
 RUN git clone https://github.com/usnistgov/SCTK.git
--- a/2
+++ b/2
@ -11,5 +11,7 @@ node{
        sh 'awk -f file3.awk < reference.txt > reference.trn'
        sh 'awk -f file3.awk < hypothesis.txt > hypothesis.trn'
        sh 'sclite -f 0 -r reference.trn trn -h hypothesis.trn trn -e utf-8 -i rm -o all -p > tmp_metrics'
        sh 'python3 ./xml_parser.py'
        sh 'paste wikiniews_results.tsv WERs > wikiniews_results_with_WERs.tsv'
    }
 }
--- a/parser.py
+++ b/parser.py
@ -0,0 +1,37 @@
 import xml.etree.ElementTree as ET
 import re
 # Lines of a segment's text that start with "C" are counted as correctly
 # recognised words (presumably sclite alignment records -- TODO confirm format).
 correct_word_pattern = r'^C'
 correct_word_regex = re.compile(correct_word_pattern, flags=re.MULTILINE)


 def _count_words(segment):
     """Return ``(correct, total)`` word counts for one segment element.

     ``total`` comes from the element's ``word_cnt`` attribute; ``correct`` is
     the number of lines in the element text that start with ``C``.
     """
     total = int(segment.attrib["word_cnt"])
     # ElementTree reports ``text`` as None for an element without text,
     # which ``findall`` would reject with a TypeError.
     correct = len(correct_word_regex.findall(segment.text or ""))
     return correct, total


 def main(metrics_path="tmp_metrics", per_segment_path="WERs",
          summary_path="statistics"):
     """Compute per-segment and aggregate scores from an XML metrics file.

     Reads the first child of the XML root in *metrics_path* and, for each of
     its child elements, writes one line to *per_segment_path* holding the
     fraction ``correct / word_cnt``.  The mean of those fractions ("WER") and
     the share of segments in which every word was correct ("SRR") are written
     to *summary_path*.

     NOTE(review): the value reported under the "WER" label is the fraction of
     *correct* words, not an error rate; the label is kept unchanged because
     downstream steps may depend on it.

     Args:
         metrics_path: XML file to read.
         per_segment_path: output file, one fraction per line, in input order.
         summary_path: output file holding the two aggregate lines.

     Raises:
         KeyError: a segment element has no ``word_cnt`` attribute.
         IndexError: the XML root has no child element.
         xml.etree.ElementTree.ParseError: the input is not well-formed XML.
     """
     speaker = ET.parse(metrics_path).getroot()[0]

     ratios = []
     fully_correct = 0
     for segment in speaker:
         correct, total = _count_words(segment)
         if total == 0:
             # A zero-word segment has no defined fraction.  Emit 0.0 so the
             # per-segment file keeps exactly one row per input segment, and
             # do not count the segment as fully recognised.
             ratios.append(0.0)
             continue
         ratios.append(correct / total)
         if correct == total:
             fully_correct += 1

     segment_count = len(ratios)
     # With no segments both averages are undefined; report 0.0 instead of
     # failing with ZeroDivisionError.
     mean_ratio = sum(ratios) / segment_count if segment_count else 0.0
     recognised_share = fully_correct / segment_count if segment_count else 0.0

     # Context managers guarantee both files are flushed and closed even if
     # a write fails.
     with open(per_segment_path, 'w', encoding="utf-8") as partial_outputs:
         partial_outputs.writelines(str(ratio) + '\n' for ratio in ratios)
     with open(summary_path, 'w', encoding="utf-8") as statistics:
         statistics.write("WER: " + str(mean_ratio) + '\n')
         statistics.write("SRR: " + str(recognised_share))


 if __name__ == "__main__":
     main()