roberta_base_finetune_existing

kubapok 2021-06-19 13:25:42 +02:00
parent 0a1b9e7815
commit cde1e52f78
8 changed files with 71250 additions and 71130 deletions

File diff suppressed because it is too large

join.py Normal file

@@ -0,0 +1,16 @@
def process(f_in_path, f_exp_path, f_whole_path):
    # Stitch each left/right context pair from in.tsv together with the
    # expected middle word from expected.tsv into one plain-text corpus file.
    with open(f_in_path) as f_in, open(f_exp_path) as f_exp, open(f_whole_path, 'w') as f_whole:
        for line_in, line_exp in zip(f_in, f_exp):
            _, _, left, right = line_in.rstrip('\n').split('\t')
            middle = line_exp.rstrip('\n')
            text = left + ' ' + middle + ' ' + right
            text = text.replace('\\n', '\n') + '\n\n'
            f_whole.write(text)

process('train/in.tsv', 'train/expected.tsv', 'train/wiki.train.raw')
process('dev-0/in.tsv', 'dev-0/expected.tsv', 'dev-0/wiki.valid.raw')
process('dev-0/in.tsv', 'dev-0/expected.tsv', 'dev-0/wiki.test.raw')
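For reference, join.py assumes each in.tsv line has four tab-separated fields with the left and right context in the last two, and that the matching expected.tsv line holds the missing middle word. A minimal sketch of that joining step on a single hypothetical line (the field contents are made up):

# Sketch of the join on one hypothetical example line (made-up content).
line_in = "some-id\tmeta\tThe quick brown\tjumps over the lazy dog\n"
line_exp = "fox\n"
_, _, left, right = line_in.rstrip('\n').split('\t')
middle = line_exp.rstrip('\n')
print(left + ' ' + middle + ' ' + right)
# -> The quick brown fox jumps over the lazy dog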


@@ -0,0 +1 @@
(cd .. ; python join.py ; cp train/wiki.train.raw roberta_base_finetune/wikitext-103-raw/ ; cp dev-0/wiki.{valid,test}.raw roberta_base_finetune/wikitext-103-raw/ )


@@ -0,0 +1,12 @@
mkdir -p gpt2_bpe
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
for SPLIT in train valid test; do \
python -m multiprocessing_bpe_encoder \
--encoder-json gpt2_bpe/encoder.json \
--vocab-bpe gpt2_bpe/vocab.bpe \
--inputs wikitext-103-raw/wiki.${SPLIT}.raw \
--outputs wikitext-103-raw/wiki.${SPLIT}.bpe \
--keep-empty \
--workers 10; \
done
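Since the encoder runs with --keep-empty, each .bpe file should line up one-to-one with its .raw source; a small sanity check one could run afterwards (a sketch, assuming the paths used above):

# Sketch: verify raw and BPE-encoded splits have matching line counts.
for split in ('train', 'valid', 'test'):
    raw = sum(1 for _ in open(f'wikitext-103-raw/wiki.{split}.raw'))
    bpe = sum(1 for _ in open(f'wikitext-103-raw/wiki.{split}.bpe'))
    print(split, raw, bpe, 'OK' if raw == bpe else 'MISMATCH')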


@@ -0,0 +1,9 @@
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
--only-source \
--srcdict gpt2_bpe/dict.txt \
--trainpref wikitext-103-raw/wiki.train.bpe \
--validpref wikitext-103-raw/wiki.valid.bpe \
--testpref wikitext-103-raw/wiki.test.bpe \
--destdir data-bin/wikitext-103 \
--workers 10


@@ -0,0 +1,20 @@
TOTAL_UPDATES=12500000 # Total number of training steps
WARMUP_UPDATES=1000 # Warmup the learning rate over this many updates
PEAK_LR=0.0005 # Peak learning rate, adjust as needed
TOKENS_PER_SAMPLE=512 # Max sequence length
MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)
MAX_SENTENCES=4 # Number of sequences per batch (batch size)
UPDATE_FREQ=64          # Accumulate gradients over 64 steps (64x larger effective batch)
DATA_DIR=data-bin/wikitext-103
ulimit -n 4096
fairseq-train --fp16 $DATA_DIR \
--task masked_lm --criterion masked_lm \
--arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \
--optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--batch-size $MAX_SENTENCES --update-freq $UPDATE_FREQ \
--max-update $TOTAL_UPDATES --log-format simple --log-interval 1 \
--restore-file roberta.base/model.pt
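fairseq-train writes checkpoints to its default checkpoints/ directory (no --save-dir is passed); the prediction script below loads the finetuned model from a 'checkpoint_final' directory, presumably the final checkpoint copied next to the dictionary. A hedged sketch of an equivalent explicit load; the directory layout here is an assumption, not part of the commit's scripts:

# Sketch: loading the finetuned checkpoint explicitly (paths are assumptions).
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained(
    'checkpoints',                              # default --save-dir of fairseq-train
    checkpoint_file='checkpoint_best.pt',       # or checkpoint_last.pt
    data_name_or_path='data-bin/wikitext-103',  # supplies the dictionary
)
roberta.eval()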


@@ -0,0 +1,62 @@
import os
import torch
from tqdm import tqdm
from fairseq.models.roberta import RobertaModel

roberta = RobertaModel.from_pretrained('checkpoint_final')
roberta.eval()
roberta.cuda()
# warm-up / smoke test of the mask-filling interface
preds = roberta.fill_mask('I like <mask> and apples', topk=3)
#import pdb; pdb.set_trace()

# Inputs with these ids raise a CUDA RuntimeError from which
# the process does not recover, so they are skipped below.
BLACKLIST = ['aeeadb08042bbd49dcbefcefa1f13806',
             '01ba303704bb62bcb59f8cb7cb5663d7',
             '98bdfa711364f45f1bcffb1359793614',
             'a9da7950abcbd531a5207c04c3bdc840',
             '4cd7f730ee72451406afa89c5c6431d6',
             ]

def predict(f_in_path, f_out_path):
    f_in = open(f_in_path, 'r', newline='\n')
    f_out = open(f_out_path, 'w', newline='\n')
    for line in tqdm(f_in, total=88000):
        id, _, before, after = line.rstrip('\n').split('\t')
        before = before.replace('\\n', '\n')
        after = after.replace('\\n', '\n')
        # could be improved to work on spm tokens instead of assuming
        # whitespace-separated tokens
        before = ' '.join(before.split(' ')[-40:])
        after = ' '.join(after.split(' ')[:40])
        input = before + ' <mask> ' + after
        try:
            if id in BLACKLIST:
                f_out.write(':1\n')
                continue
            preds = roberta.fill_mask(input, topk=10)
            hyps = []
            probs_sum = 0.0
            for pred in preds:
                if pred[2] == '<unk>':
                    continue
                hyps.append(pred[2].strip() + ':' + str(pred[1]))
                probs_sum += pred[1]
            hyps.append(':' + str(1 - probs_sum))
            preds_line = ' '.join(hyps)
            f_out.write(preds_line + '\n')
        except RuntimeError:
            import pdb; pdb.set_trace()
            print('RUNTIMEERROR')
            f_out.write(':1\n')
    f_in.close()
    f_out.close()

predict('../dev-0/in.tsv', '../dev-0/out.tsv')
predict('../test-A/in.tsv', '../test-A/out.tsv')
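Each line written to out.tsv above is a space-separated list of token:probability pairs from fill_mask, closed by a bare ':rest' entry holding the leftover probability mass; a minimal sketch of that formatting on hypothetical prediction tuples:

# Sketch: assembling one output line from made-up fill_mask results.
preds = [('...', 0.5, ' oranges'), ('...', 0.25, ' pears')]  # (text, prob, token), hypothetical
hyps = [p[2].strip() + ':' + str(p[1]) for p in preds]
hyps.append(':' + str(1 - sum(p[1] for p in preds)))
print(' '.join(hyps))
# -> oranges:0.5 pears:0.25 :0.25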

File diff suppressed because it is too large