# dataset_stats.py
# Provenance: added by commit 208799af8531a2cbd166dfaeea42ff22cdfa7567
# ("data stats", Alagris, 2021-03-22).
"""Collect length and character-frequency statistics for a TSV dataset.

When run as a script this module reads three files from the current
working directory:

* ``in_alphabet``      -- every character in the file becomes a counter key
                          for the input side.
* ``out_alphabet``     -- same, for the output side.
* ``preprocessed.tsv`` -- one record per line, ``<input>\\t<output>``.

and writes the collected ``key=value`` lines to ``stats.txt``.
"""

# Starting value for the running minimum lengths.  Kept from the original
# script so that ``stats.txt`` is unchanged for existing datasets: an empty
# dataset reports this value, and it also caps the reported minimum.
_LEN_MIN_SENTINEL = 99999


def compute_stats(lines, in_alphabet, out_alphabet):
    """Return the dataset statistics as an ordered list of ``(key, value)``.

    Args:
        lines: iterable of record strings, each ``<input>\\t<output>``.  The
            output field is stripped of surrounding whitespace (this removes
            the trailing newline); the input field is used as-is.
        in_alphabet: iterable of characters allowed on the input side.
        out_alphabet: iterable of characters allowed on the output side.

    Returns:
        A list of ``(key, value)`` pairs in the order they are written to
        the stats file.

    Raises:
        ValueError: a line does not consist of exactly two tab-separated
            fields.
        KeyError: a record contains a character that is missing from the
            corresponding alphabet.
    """
    in_count = dict.fromkeys(in_alphabet, 0)
    out_count = dict.fromkeys(out_alphabet, 0)

    records = 0
    in_total = out_total = 0
    in_max = out_max = 0
    in_min = out_min = _LEN_MIN_SENTINEL

    for line_no, line in enumerate(lines, 1):
        fields = line.split('\t')
        if len(fields) != 2:
            raise ValueError(
                'line {}: expected 2 tab-separated fields, got {}'.format(
                    line_no, len(fields)))
        source, target = fields[0], fields[1].strip()

        # Plain dict indexing on purpose: a character outside the alphabet
        # raises KeyError instead of being counted silently.
        for letter in source:
            in_count[letter] += 1
        for letter in target:
            out_count[letter] += 1

        out_total += len(target)
        out_max = max(out_max, len(target))
        out_min = min(out_min, len(target))

        in_total += len(source)
        in_max = max(in_max, len(source))
        # Track the running minimum.  The earlier version compared against
        # the running maximum here, so it always ended up reporting the
        # length of the last record instead of the shortest one.
        in_min = min(in_min, len(source))

        records += 1

    # Key spellings (including "occurences") are part of the stats.txt
    # format and are kept byte-for-byte.
    return [
        ('records_count', records),
        ('out_len_total', out_total),
        ('out_len_max', out_max),
        ('out_len_min', out_min),
        ('in_len_total', in_total),
        ('in_len_max', in_max),
        ('in_len_min', in_min),
        ('in_letter_occurences', in_count),
        ('out_letter_occurences', out_count),
    ]


def write_stats(stats, path):
    """Write ``(key, value)`` pairs to *path*, one ``key=value`` per line."""
    with open(path, 'w') as file:
        file.writelines(
            '{}={}\n'.format(key, value) for key, value in stats)


def main():
    """Read the alphabets and the dataset, then write ``stats.txt``."""
    with open('in_alphabet') as file:
        in_alphabet = file.read()

    with open('out_alphabet') as file:
        out_alphabet = file.read()

    with open('preprocessed.tsv') as dataset:
        stats = compute_stats(dataset, in_alphabet, out_alphabet)

    write_stats(stats, 'stats.txt')


if __name__ == '__main__':
    main()