39 lines
1.5 KiB
Python
39 lines
1.5 KiB
Python
|
import datetime
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
#with open('../test-A/in.tsv','r') as f_in, open(f'./test-A_in.csv', 'w') as f_hf:
#    f_hf.write('text\n')
#    for line_in in f_in:
#        year_cont, date, text = line_in.rstrip('\n').split('\t')
#        d = datetime.datetime.strptime(date,"%Y%m%d")
#        day_of_year = str(d.timetuple().tm_yday)
#        day_of_month = str(d.day)
#        month = str(d.month)
#        year = str(d.year)
#        weekday = str(d.weekday())
#        day_of_year = str(d.timetuple().tm_yday)
#        f_hf.write(text +'\n')
|
||
|
|
||
|
|
||
|
# Convert the 'train' and 'dev-0' splits from ../<split>/in.tsv into
# ./<split>_in.csv.  The output starts with a 'text' header line; every input
# row is then cut into chunks of at most WORDS_PER_CHUNK space-separated
# tokens, and each chunk is written on its own line, prefixed with the year,
# month, day and weekday of the row's timestamp.
#
# NOTE(review): chunks are written verbatim (no CSV quoting/escaping), so text
# containing commas or double quotes may confuse a strict CSV parser --
# confirm how the resulting file is consumed.
WORDS_PER_CHUNK = 400

for dataset in 'train', 'dev-0':
    # Explicit UTF-8 so reading/writing does not depend on the platform's
    # locale encoding (assumes the TSV files are UTF-8 -- TODO confirm).
    with open(f'../{dataset}/in.tsv', encoding='utf-8') as f_in, \
            open(f'./{dataset}_in.csv', 'w', encoding='utf-8') as f_hf:
        f_hf.write('text\n')
        for line_in in tqdm(f_in):
            # Exactly eight tab-separated columns are expected (anything else
            # raises ValueError); only the 3rd (timestamp) and the 8th (text)
            # are used.
            _, _, date, _, _, _, _, text = line_in.rstrip('\n').split('\t')

            # Only the first 19 characters ("YYYY-MM-DD HH:MM:SS") are parsed;
            # whatever follows them in the column is ignored.
            d = datetime.datetime.strptime(date[:19], "%Y-%m-%d %H:%M:%S")

            # Built once per row and shared by all of the row's chunks.
            # weekday() is 0 for Monday ... 6 for Sunday.
            prefix = (
                f'year : {d.year} month : {d.month} day {d.day}'
                f' weekday : {d.weekday()} '
            )

            # Remove literal backslash-n sequences (two characters), not real
            # newline characters.
            text = text.replace('\\n', '')

            words = text.split(' ')
            for start in range(0, len(words), WORDS_PER_CHUNK):
                chunk = ' '.join(words[start:start + WORDS_PER_CHUNK])
                f_hf.write(prefix + chunk + '\n')
|
||
|
|