roberta_year_text_only_day
This commit is contained in:
parent
d969cafc74
commit
2c2d075f53
20962
dev-0/out.tsv
20962
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
40
roberta_year_text_only_day/01_create_datasets.py
Normal file
40
roberta_year_text_only_day/01_create_datasets.py
Normal file
@ -0,0 +1,40 @@
|
||||
import datetime
|
||||
from config import LABELS_DICT
|
||||
|
||||
# Build the unlabeled test-A file for the Hugging Face pipeline: a single
# ``text`` column where every document is prefixed with "day: <day of month>".
with open('../test-A/in.tsv', 'r') as f_in, \
        open('./test-A_huggingface_format_year_as_text.csv', 'w') as f_hf:
    f_hf.write('text\n')
    for line_in in f_in:
        # Each input row must have exactly three tab-separated fields; only the
        # date (YYYYMMDD) and the text are used. A malformed row raises
        # ValueError here, exactly as before.
        _, date, text = line_in.rstrip('\n').split('\t')
        # strptime also validates the date; the day is written without
        # zero padding (e.g. "day: 7").
        day_of_month = str(datetime.datetime.strptime(date, "%Y%m%d").day)
        f_hf.write('day: ' + day_of_month + ' ' + text + '\n')
|
||||
|
||||
|
||||
# Build the labeled train / dev-0 files for the Hugging Face pipeline:
# two tab-separated columns, ``text`` (prefixed with "day: <day of month>")
# and ``label`` (integer id looked up in LABELS_DICT).
for dataset in 'train', 'dev-0':
    with open(f'../{dataset}/in.tsv') as f_in, \
            open(f'../{dataset}/expected.tsv') as f_exp, \
            open(f'./{dataset}_huggingface_format_year_as_text.csv', 'w') as f_hf:
        f_hf.write('text\tlabel\n')
        # NOTE: zip() stops at the shorter file, so a length mismatch between
        # in.tsv and expected.tsv is silently truncated (unchanged behaviour).
        for line_in, line_exp in zip(f_in, f_exp):
            # An unknown label string raises KeyError.
            label = str(LABELS_DICT[line_exp.rstrip('\n')])
            # Each input row must have exactly three tab-separated fields; only
            # the date (YYYYMMDD) and the text are used.
            _, date, text = line_in.rstrip('\n').split('\t')
            # strptime also validates the date; the day is written without
            # zero padding (e.g. "day: 7").
            day_of_month = str(datetime.datetime.strptime(date, "%Y%m%d").day)
            f_hf.write('day: ' + day_of_month + ' ' + text + '\t' + label + '\n')
|
||||
|
24
roberta_year_text_only_day/04_predict.py
Normal file
24
roberta_year_text_only_day/04_predict.py
Normal file
@ -0,0 +1,24 @@
|
||||
import pickle
|
||||
from config import LABELS_LIST, MODEL
|
||||
from transformers import AutoTokenizer
|
||||
from tqdm import tqdm
|
||||
|
||||
device = 'cuda'
model_path = './roberta-ireland'

from transformers import AutoModelForSequenceClassification

# Imported here because this script section is the only user of torch.
import torch

# Load the fine-tuned classifier and move it to the configured device; the
# tokenizer comes from the base checkpoint named in config.MODEL.
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()  # inference only: make sure dropout is disabled
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Write one predicted label per input row to ../<dataset>/out.tsv.
for dataset in ('dev-0', 'test-A'):
    with open(f'./{dataset}_huggingface_format_year_as_text.csv') as f_in, \
            open(f'../{dataset}/out.tsv', 'w') as f_out:
        next(f_in)  # skip the header row
        for line_in in tqdm(f_in):
            # The dev-0 file has two columns ("text<TAB>label") while test-A
            # has only "text". Keep just the first column so the gold label is
            # never fed to the model.
            text = line_in.rstrip('\n').split('\t')[0]
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
            # No gradients are needed for prediction; skip autograd bookkeeping.
            with torch.no_grad():
                outputs = model(**inputs)
            # outputs[0] holds the logits; one row per input (batch of 1 here).
            probs = outputs[0].softmax(1)
            prediction = LABELS_LIST[probs.argmax(1).item()]
            f_out.write(prediction + '\n')
|
||||
|
8
roberta_year_text_only_day/config.py
Normal file
8
roberta_year_text_only_day/config.py
Normal file
@ -0,0 +1,8 @@
|
||||
# Class names in label-id order: the position in this list is the integer id
# used in the training files and returned by the classifier.
LABELS_LIST = [
    'positive',
    'negative',
]

# Reverse lookup (class name -> integer id), derived from LABELS_LIST so the
# two mappings cannot drift apart.
LABELS_DICT = {name: index for index, name in enumerate(LABELS_LIST)}

# Base checkpoint name; the prediction script loads its tokenizer from here.
MODEL = 'roberta-base'
|
12
roberta_year_text_only_day/run.sh
Normal file
12
roberta_year_text_only_day/run.sh
Normal file
@ -0,0 +1,12 @@
|
||||
# Fine-tune roberta-base with run_glue.py on the generated train file,
# evaluating on the dev-0 file every 10000 steps. Output goes to
# ./roberta-ireland, with a checkpoint saved every 10000 steps.
python run_glue.py \
    --model_name_or_path roberta-base \
    --train_file ./train_huggingface_format_year_as_text.csv \
    --validation_file ./dev-0_huggingface_format_year_as_text.csv \
    --do_train \
    --max_seq_length 64 \
    --per_device_train_batch_size 32 \
    --learning_rate 2e-5 \
    --num_train_epochs 3 \
    --output_dir ./roberta-ireland \
    --save_steps 10000 \
    --eval_steps 10000 \
    --evaluation_strategy steps
|
20718
test-A/out.tsv
20718
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user