chron-am-roberta/regular_roberta_from_scratch/00_create_datasets.py
Jakub Pokrywka 00fec94240 init
2021-11-06 15:41:54 +01:00

38 lines
1.4 KiB
Python

import datetime
from tqdm import tqdm
# Disabled: conversion of test-A, whose in.tsv rows have three tab-separated
# fields (year_cont, date, text) with a %Y%m%d date; text is written unchunked.
#with open('../test-A/in.tsv','r') as f_in, open(f'./test-A_in.csv', 'w') as f_hf:
#    f_hf.write('text\n')
#    for line_in in f_in:
#        year_cont, date, text = line_in.rstrip('\n').split('\t')
#        d = datetime.datetime.strptime(date,"%Y%m%d")
#        day_of_year = str(d.timetuple().tm_yday)
#        day_of_month = str(d.day)
#        month = str(d.month)
#        year = str(d.year)
#        weekday = str(d.weekday())
#        day_of_year = str(d.timetuple().tm_yday)
#        f_hf.write(text +'\n')
def split_into_chunks(text, chunk_size=400):
    """Split *text* into pieces of at most *chunk_size* space-separated words.

    The split is on single space characters, so consecutive spaces produce
    empty "words" that still count towards the chunk size; joining the
    returned chunks with a single space reproduces *text* exactly.  An empty
    string yields one empty chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings, in their original order.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')
    words = text.split(' ')
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


def convert_dataset(dataset, chunk_size=400):
    """Convert ``../<dataset>/in.tsv`` into ``./<dataset>_in.csv``.

    Each input row must hold exactly eight tab-separated fields; the third is
    a timestamp and the last is the document text.  The output starts with a
    ``text`` header line, followed by one line per chunk of at most
    *chunk_size* words.

    Args:
        dataset: Name of the dataset directory, e.g. ``'train'``.
        chunk_size: Maximum number of words written per output line.

    Raises:
        ValueError: If a row does not have eight fields or its timestamp does
            not match ``%Y-%m-%d %H:%M:%S``.
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # default encoding.
    with open(f'../{dataset}/in.tsv', encoding='utf-8') as f_in, \
            open(f'./{dataset}_in.csv', 'w', encoding='utf-8') as f_hf:
        f_hf.write('text\n')
        for line_in in tqdm(f_in):
            _, _, date, _, _, _, _, text = line_in.rstrip('\n').split('\t')
            # Parsed purely as validation: a malformed timestamp still aborts
            # the run.  The calendar fields formerly derived from it were
            # never written anywhere, so they are no longer computed.
            datetime.datetime.strptime(date[:19], "%Y-%m-%d %H:%M:%S")
            # Drop literal backslash-n sequences (escaped line breaks) so each
            # chunk stays on a single output line.
            text = text.replace('\\n', '')
            # NOTE(review): chunks are written unquoted; confirm the downstream
            # CSV reader copes with commas and quote characters in the text.
            f_hf.writelines(
                chunk + '\n' for chunk in split_into_chunks(text, chunk_size)
            )


if __name__ == '__main__':
    for dataset in 'train', 'dev-0':
        convert_dataset(dataset)