"""Compute summary statistics for a tab-separated dataset.

Reads the input and output alphabets from ``in_alphabet`` and
``out_alphabet`` (every character of each file is one symbol), then scans
``preprocessed.tsv``, where each line must contain exactly one tab
separating the input text from the output text.  The record count,
total/max/min lengths and per-symbol occurrence counts are written to
``stats.txt`` as ``key=value`` lines.

A symbol that is missing from the corresponding alphabet file raises
``KeyError``; a line without exactly one tab raises ``ValueError``.
"""
import re
import requests

with open('in_alphabet') as alphabet_file:
    in_alph = list(alphabet_file.read())

with open('out_alphabet') as alphabet_file:
    out_alph = list(alphabet_file.read())

# Occurrence counters, pre-seeded with every known symbol so that symbols
# which never occur are still reported with a count of 0.
IN_COUNT = dict.fromkeys(in_alph, 0)
OUT_COUNT = dict.fromkeys(out_alph, 0)

DATASET_SIZE = 0

OUT_LEN_TOTAL = 0
OUT_LEN_MAX = 0
# Sentinel start value for the running minimum; it is reported as-is when
# the dataset is empty.
OUT_LEN_MIN = 99999

IN_LEN_TOTAL = 0
IN_LEN_MAX = 0
IN_LEN_MIN = 99999

with open('preprocessed.tsv') as dataset:
    for record in dataset:
        in_text, out_text = record.split('\t')
        # Only the output side carries the trailing newline of the record.
        out_text = out_text.strip()

        for letter in in_text:
            IN_COUNT[letter] += 1
        for letter in out_text:
            OUT_COUNT[letter] += 1

        out_len = len(out_text)
        OUT_LEN_TOTAL += out_len
        OUT_LEN_MAX = max(OUT_LEN_MAX, out_len)
        OUT_LEN_MIN = min(OUT_LEN_MIN, out_len)

        in_len = len(in_text)
        IN_LEN_TOTAL += in_len
        IN_LEN_MAX = max(IN_LEN_MAX, in_len)
        # Track the minimum against the running minimum; comparing against
        # IN_LEN_MAX (as the code previously did) loses the true minimum.
        IN_LEN_MIN = min(IN_LEN_MIN, in_len)

        DATASET_SIZE += 1

with open('stats.txt', 'w+') as stats_file:
    stats_file.write(f'records_count={DATASET_SIZE}\n')
    stats_file.write(f'out_len_total={OUT_LEN_TOTAL}\n')
    stats_file.write(f'out_len_max={OUT_LEN_MAX}\n')
    stats_file.write(f'out_len_min={OUT_LEN_MIN}\n')
    stats_file.write(f'in_len_total={IN_LEN_TOTAL}\n')
    stats_file.write(f'in_len_max={IN_LEN_MAX}\n')
    stats_file.write(f'in_len_min={IN_LEN_MIN}\n')
    stats_file.write(f'in_letter_occurences={IN_COUNT}\n')
    stats_file.write(f'out_letter_occurences={OUT_COUNT}\n')