wiki-historian/challam_year_prediction_on_roberta_base_model/01_create_datasets.py

17 lines
1007 B
Python

# Convert the challenge splits (../<split>/in.tsv and, where available,
# ../<split>/expected.tsv) into tab-separated "<split>_huggingface_format.csv"
# files with the columns:
#   year_start_float, year_end_float, year_middle_float, year_middle_int, text
import os
from itertools import zip_longest

# Header row shared by every generated file.
_HEADER = 'year_start_float\tyear_end_float\tyear_middle_float\tyear_middle_int\ttext\n'
# Label columns written for splits that have no expected.tsv.
_PLACEHOLDER_LABELS = '0.0\t0.0\t0.0\t0'
_LABELLED_SPLITS = ('train', 'dev-0')
_UNLABELLED_SPLITS = ('test-A',)
# Sentinel marking "this file ran out of lines" in zip_longest.
_MISSING = object()


def _format_labelled_row(line_in, line_exp):
    """Return one output row for an input line and its "start,end" label line.

    The start/end values are copied verbatim from ``line_exp``; the middle is
    their arithmetic mean, written both as a float and rounded with ``round``.
    ``line_in`` is appended unchanged, including its line terminator.

    Raises:
        ValueError: if ``line_exp`` does not hold exactly two comma-separated
            values or they are not parseable as floats.
    """
    year_start, year_end = line_exp.rstrip().split(',')
    year_middle = (float(year_start) + float(year_end)) / 2
    return f'{year_start}\t{year_end}\t{year_middle}\t{round(year_middle)}\t{line_in}'


def _convert_labelled_split(in_path, expected_path, out_path):
    """Write ``out_path`` from the line-aligned ``in_path`` and ``expected_path``.

    Raises:
        ValueError: if the two input files have a different number of lines.
    """
    with open(in_path, encoding='utf-8') as f_in, \
            open(expected_path, encoding='utf-8') as f_exp, \
            open(out_path, 'w', encoding='utf-8') as f_hf:
        f_hf.write(_HEADER)
        pairs = zip_longest(f_in, f_exp, fillvalue=_MISSING)
        for line_no, (line_in, line_exp) in enumerate(pairs, start=1):
            # A plain zip() would silently drop the surplus lines and hide a
            # misaligned dataset; fail loudly instead.
            if line_in is _MISSING or line_exp is _MISSING:
                raise ValueError(
                    f'{in_path} and {expected_path} differ in length '
                    f'(mismatch at line {line_no})'
                )
            f_hf.write(_format_labelled_row(line_in, line_exp))


def _convert_unlabelled_split(in_path, out_path):
    """Write ``out_path`` from ``in_path`` using placeholder label columns."""
    with open(in_path, encoding='utf-8') as f_in, \
            open(out_path, 'w', encoding='utf-8') as f_hf:
        f_hf.write(_HEADER)
        for line_in in f_in:
            f_hf.write(_PLACEHOLDER_LABELS + '\t' + line_in)


def main(data_root='..', out_dir='.'):
    """Generate the ``*_huggingface_format.csv`` file for every split.

    Args:
        data_root: directory containing one sub-directory per split.
        out_dir: directory the generated files are written to.
    """
    for split in _LABELLED_SPLITS:
        _convert_labelled_split(
            os.path.join(data_root, split, 'in.tsv'),
            os.path.join(data_root, split, 'expected.tsv'),
            os.path.join(out_dir, f'{split}_huggingface_format.csv'),
        )
    for split in _UNLABELLED_SPLITS:
        _convert_unlabelled_split(
            os.path.join(data_root, split, 'in.tsv'),
            os.path.join(out_dir, f'{split}_huggingface_format.csv'),
        )


if __name__ == '__main__':
    main()