diff --git a/regular_roberta_from_scratch/00_create_datasets.py b/regular_roberta_from_scratch/00_create_datasets.py index 026cf0b..80a0f97 100644 --- a/regular_roberta_from_scratch/00_create_datasets.py +++ b/regular_roberta_from_scratch/00_create_datasets.py @@ -31,7 +31,7 @@ for dataset in 'train', 'dev-0': text = text.replace('\\n','') text_splitted = text.split(' ') - for i in range(0, len(text_splitted), 400): - text_chunk = ' '.join(text_splitted[i:i+400]) + for i in range(0, len(text_splitted), 450): + text_chunk = ' '.join(text_splitted[i:i+450]) f_hf.write(text_chunk +'\n') diff --git a/roberta_with_year_from_scratch/00_create_datasets.py b/roberta_with_year_from_scratch/00_create_datasets.py index 09dc41b..03171f9 100644 --- a/roberta_with_year_from_scratch/00_create_datasets.py +++ b/roberta_with_year_from_scratch/00_create_datasets.py @@ -31,8 +31,8 @@ for dataset in 'train', 'dev-0': text = text.replace('\\n','') text_splitted = text.split(' ') - for i in range(0, len(text_splitted), 400): - text_chunk = ' '.join(text_splitted[i:i+400]) + for i in range(0, len(text_splitted), 450): + text_chunk = ' '.join(text_splitted[i:i+450]) text_chunk = 'year : ' + year + ' month : ' + month + ' day ' + day_of_month + ' weekday : ' + weekday + ' ' + text_chunk f_hf.write(text_chunk +'\n')