roberta_no_year_from_scratch
parent a8d1b7f154
commit e984dbd303
21654  dev-0/out.tsv
File diff suppressed because it is too large
17  roberta_no_year_from_scratch/01_create_datasets.py  Normal file
@@ -0,0 +1,17 @@
from config import LABELS_DICT

# test-A has no expected labels: write only a 'text' column for prediction.
with open('../test-A/in.tsv', 'r') as f_in, open('./test-A_huggingface_format.csv', 'w') as f_hf:
    f_hf.write('text\n')
    for line_in in f_in:
        _, _, text = line_in.split('\t')
        f_hf.write(text)  # text is the last field and keeps its trailing newline


# train and dev-0 have expected labels: write 'text' and 'label' columns,
# mapping the label strings to integers via LABELS_DICT.
for dataset in 'train', 'dev-0':
    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/expected.tsv') as f_exp, open(f'./{dataset}_huggingface_format.csv', 'w') as f_hf:
        f_hf.write('text\tlabel\n')
        for line_in, line_exp in zip(f_in, f_exp):
            label = LABELS_DICT[line_exp.rstrip('\n')]
            _, _, text = line_in.rstrip('\n').split('\t')
            f_hf.write(text + '\t' + str(label) + '\n')
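
The files written above are tab-separated even though they carry a .csv extension. A minimal sanity check, not part of this commit and assuming pandas is available, could look like:

import pandas as pd

# Illustrative check only: the generated "CSV" files are tab-separated.
for name in ('train', 'dev-0'):
    df = pd.read_csv(f'./{name}_huggingface_format.csv', sep='\t', quoting=3)  # 3 = csv.QUOTE_NONE
    print(name, df.shape, df['label'].value_counts().to_dict())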
24  roberta_no_year_from_scratch/04_predict.py  Normal file
@@ -0,0 +1,24 @@
import pickle
from config import LABELS_LIST, MODEL
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

device = 'cuda'
model_path = './roberta-ireland'

model = AutoModelForSequenceClassification.from_pretrained(model_path).cuda()
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Predict a label for every line of dev-0 and test-A, one output line per input line.
for dataset in ('dev-0', 'test-A'):
    with open(f'../{dataset}/in.tsv') as f_in, open(f'../{dataset}/out.tsv', 'w') as f_out:
        for line_in in tqdm(f_in, total=150_000):  # total is only a progress-bar estimate
            _, _, text = line_in.split('\t')
            text = text.rstrip('\n')
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
            outputs = model(**inputs)
            probs = outputs[0].softmax(1)
            prediction = LABELS_LIST[probs.argmax(1)]
            f_out.write(prediction + '\n')
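
The loop above runs each forward pass with gradient tracking enabled. A sketch of the same prediction step under torch.no_grad(), reusing model, tokenizer, text, device, and LABELS_LIST from the file above (an assumed optimization, not part of this commit):

import torch

# Hypothetical variant: no_grad avoids building the autograd graph,
# which cuts memory use during pure prediction.
with torch.no_grad():
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
    probs = model(**inputs).logits.softmax(1)
    prediction = LABELS_LIST[probs.argmax(1).item()]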
8  roberta_no_year_from_scratch/config.py  Normal file
@@ -0,0 +1,8 @@
LABELS_DICT = {'positive': 0,
               'negative': 1}


LABELS_LIST = ['positive',
               'negative']

MODEL = 'roberta-base'
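
04_predict.py maps argmax indices back through LABELS_LIST, so its order has to mirror LABELS_DICT. A tiny consistency check, added here only for illustration, could be:

from config import LABELS_DICT, LABELS_LIST

# Illustrative assertion, not part of the commit: position i in LABELS_LIST
# must be the label that LABELS_DICT maps to i, otherwise predictions flip.
assert all(LABELS_DICT[label] == i for i, label in enumerate(LABELS_LIST))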
12  roberta_no_year_from_scratch/run.sh  Normal file
@@ -0,0 +1,12 @@
python run_glue.py --model_name_or_path roberta-base \
    --train_file ./train_huggingface_format.csv \
    --validation_file ./dev-0_huggingface_format.csv \
    --do_train \
    --max_seq_length 64 \
    --per_device_train_batch_size 32 \
    --learning_rate 2e-5 \
    --num_train_epochs 3 \
    --output_dir ./roberta-ireland \
    --save_steps=10000 \
    --eval_steps=10000 \
    --evaluation_strategy steps
22198  test-A/out.tsv
File diff suppressed because it is too large