data stats
This commit is contained in:
parent
faaa014167
commit
208799af85
47
dataset_stats.py
Normal file
47
dataset_stats.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Initial value for the running minimum lengths; it is what gets reported as
# the minimum when the dataset contains no rows.
LEN_MIN_SENTINEL = 99999


def load_alphabet(path):
    """Return every character of the file at *path* as a list.

    The file is read verbatim, so any newline characters it contains are
    returned as alphabet entries as well.
    """
    with open(path) as file:
        return list(file.read())


def compute_stats(lines, in_alph, out_alph):
    """Compute character-frequency and length statistics for a dataset.

    Args:
        lines: iterable of records, each of the form ``input<TAB>output``.
            The output part is stripped of surrounding whitespace; the
            input part is used as-is.
        in_alph: characters allowed in the input part.
        out_alph: characters allowed in the output part.

    Returns:
        A dict whose keys, in insertion order, are ``records_count``,
        ``out_len_total``, ``out_len_max``, ``out_len_min``,
        ``in_len_total``, ``in_len_max``, ``in_len_min``,
        ``in_letter_occurences`` and ``out_letter_occurences``.

    Raises:
        ValueError: if a record does not contain exactly one tab.
        KeyError: if a record contains a character missing from the
            corresponding alphabet.
    """
    in_count = dict.fromkeys(in_alph, 0)
    out_count = dict.fromkeys(out_alph, 0)
    dataset_size = 0
    out_len_total = 0
    out_len_max = 0
    out_len_min = LEN_MIN_SENTINEL
    in_len_total = 0
    in_len_max = 0
    in_len_min = LEN_MIN_SENTINEL

    for line in lines:
        in_text, out_text = line.split('\t')
        out_text = out_text.strip()

        for letter in in_text:
            in_count[letter] += 1
        for letter in out_text:
            out_count[letter] += 1

        out_len = len(out_text)
        out_len_total += out_len
        out_len_max = max(out_len_max, out_len)
        out_len_min = min(out_len_min, out_len)

        in_len = len(in_text)
        in_len_total += in_len
        in_len_max = max(in_len_max, in_len)
        # Compare against the running minimum; the earlier version compared
        # against the running maximum and so reported a wrong in_len_min.
        in_len_min = min(in_len_min, in_len)

        dataset_size += 1

    return {
        'records_count': dataset_size,
        'out_len_total': out_len_total,
        'out_len_max': out_len_max,
        'out_len_min': out_len_min,
        'in_len_total': in_len_total,
        'in_len_max': in_len_max,
        'in_len_min': in_len_min,
        'in_letter_occurences': in_count,
        'out_letter_occurences': out_count,
    }


def write_stats(stats, path='stats.txt'):
    """Write *stats* to *path* as one ``key=value`` line per entry."""
    with open(path, 'w') as file:
        file.writelines(
            key + '=' + str(value) + '\n' for key, value in stats.items()
        )


def main():
    """Read the alphabets and the dataset, then write ``stats.txt``."""
    in_alph = load_alphabet('in_alphabet')
    out_alph = load_alphabet('out_alphabet')
    with open('preprocessed.tsv') as dataset:
        stats = compute_stats(dataset, in_alph, out_alph)
    write_stats(stats, 'stats.txt')


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user