"""Convert the tab-separated `in.tsv` inputs into one-column CSV files.

For each dataset directory (`train`, `dev-0`) the script reads
`../<dataset>/in.tsv`, takes the text column of every row, removes the
literal backslash-n escapes from it, splits it into pieces of at most
CHUNK_SIZE space-separated tokens and writes each piece as its own line to
`./<dataset>_in.csv` under a `text` header line.
"""
import datetime

from tqdm import tqdm

# Disabled conversion of the test-A split, kept for reference.
#with open('../test-A/in.tsv','r') as f_in, open(f'./test-A_in.csv', 'w') as f_hf:
#    f_hf.write('text\n')
#    for line_in in f_in:
#        year_cont, date, text = line_in.rstrip('\n').split('\t')
#        d = datetime.datetime.strptime(date,"%Y%m%d")
#        day_of_year = str(d.timetuple().tm_yday)
#        day_of_month = str(d.day)
#        month = str(d.month)
#        year = str(d.year)
#        weekday = str(d.weekday())
#        day_of_year = str(d.timetuple().tm_yday)
#        f_hf.write(text +'\n')

# Maximum number of space-separated tokens written per output line.
CHUNK_SIZE = 400

# Layout of the first 19 characters of the timestamp column.
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


def _split_into_chunks(text, chunk_size=CHUNK_SIZE):
    """Yield *text* in pieces of at most *chunk_size* space-separated tokens.

    The text is split on single spaces only, so consecutive spaces produce
    empty tokens that count towards the chunk size. An empty string yields
    exactly one empty chunk.
    """
    tokens = text.split(' ')
    for start in range(0, len(tokens), chunk_size):
        yield ' '.join(tokens[start:start + chunk_size])


def _convert_dataset(dataset):
    """Write `./<dataset>_in.csv` from `../<dataset>/in.tsv`.

    Every input row must contain exactly eight tab-separated fields; the
    third is a timestamp and the last is the text. Only the text is written.

    Raises:
        ValueError: if a row does not have eight fields or its timestamp
            does not match DATE_FORMAT.
        OSError: if the input cannot be read or the output cannot be written.
    """
    # NOTE(review): both files use the platform default encoding -- confirm
    # whether an explicit encoding (e.g. UTF-8) should be passed to open().
    with open(f'../{dataset}/in.tsv') as f_in, open(f'./{dataset}_in.csv', 'w') as f_hf:
        f_hf.write('text\n')
        for line_in in tqdm(f_in):
            _, _, date, _, _, _, _, text = line_in.rstrip('\n').split('\t')

            # The parsed value is not used; the call is kept so that a row
            # with a malformed timestamp still raises ValueError as before.
            datetime.datetime.strptime(date[:19], DATE_FORMAT)

            # Drop literal backslash-n escape sequences (two characters).
            # NOTE(review): they are deleted, not replaced by a space, so the
            # words on either side are joined -- confirm this is intended.
            text = text.replace('\\n', '')

            for text_chunk in _split_into_chunks(text):
                f_hf.write(text_chunk + '\n')


for dataset in 'train', 'dev-0':
    _convert_dataset(dataset)