from scratch RoBERTa MLM + classifier
This commit is contained in:
parent ddce23e0d4
commit 2033259867
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

set -e
set -x

# Create SPM vocab (run once to train the BPE model, then leave commented out)
# spm_train --input=data/train/in.tsv --model_prefix=vocab_spm_bpe --model_type=bpe --vocab_size=50000 --pad_id 1 --bos_id 2 --eos_id 3

spm_encode --model vocab_spm_bpe.model < data/train/in.tsv > data/train.txt
spm_encode --model vocab_spm_bpe.model < data/dev-0/in.tsv > data/valid.txt
spm_encode --model vocab_spm_bpe.model < data/test-A/in.tsv > data/test.txt
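
A quick way to verify the trained vocab before binarization is to round-trip a sentence through the model in Python; a minimal sketch, assuming vocab_spm_bpe.model sits in the working directory (the sample sentence is made up):

import sentencepiece as spm

# Load the trained BPE model (assumes vocab_spm_bpe.model is in the CWD)
sp = spm.SentencePieceProcessor()
sp.Load('vocab_spm_bpe.model')

sample = 'this is a quick tokenization sanity check'
pieces = sp.EncodeAsPieces(sample)
print(pieces)                      # e.g. ['▁this', '▁is', '▁a', ...]
print(sp.DecodePieces(pieces))     # should round-trip to the input
assert sp.GetPieceSize() == 50000  # matches --vocab_size above
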
@@ -0,0 +1,19 @@
#!/usr/bin/env bash

set -e
set -x

TEXT=data
fairseq-preprocess \
    --only-source --nwordssrc 50000 \
    --trainpref $TEXT/train.txt \
    --validpref $TEXT/valid.txt \
    --destdir data-bin/classifier-spm-bpe/input0 \
    --workers 8

fairseq-preprocess \
    --only-source \
    --trainpref $TEXT/train/expected.tsv \
    --validpref $TEXT/dev-0/expected.tsv \
    --destdir data-bin/classifier-spm-bpe/label \
    --workers 8
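
The sentence_prediction task pairs input0 and label line by line, so the tokenized inputs and the expected.tsv files must have exactly the same number of lines. A minimal pre-flight check, assuming the paths used above:

# Each tokenized input line needs a matching label line.
# Paths follow the preprocessing script above.
pairs = [
    ('data/train.txt', 'data/train/expected.tsv'),
    ('data/valid.txt', 'data/dev-0/expected.tsv'),
]

def count_lines(path):
    with open(path, 'rt') as f:
        return sum(1 for _ in f)

for inputs, labels in pairs:
    n_in, n_lab = count_lines(inputs), count_lines(labels)
    assert n_in == n_lab, f'{inputs}: {n_in} lines vs {labels}: {n_lab} lines'
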
@@ -0,0 +1,13 @@
#!/usr/bin/env bash

set -e
set -x

TEXT=data
fairseq-preprocess \
    --only-source --nwordssrc 50000 \
    --trainpref $TEXT/train.txt \
    --validpref $TEXT/valid.txt \
    --testpref $TEXT/test.txt \
    --destdir data-bin/lm-spm-bpe \
    --workers 8
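
Note that this run and the classifier's input0 run each build their own dictionary from the same data/train.txt with the same --nwordssrc, so they should come out identical; if they ever diverged, fine-tuning would map tokens onto the wrong pretrained embeddings. Passing --srcdict data-bin/lm-spm-bpe/dict.txt to the classifier preprocessing would rule this out entirely; a minimal check, assuming both data-bin directories already exist:

# Fine-tuning reuses the pretrained embeddings, so the classifier dict
# must match the LM dict exactly (paths assume both preprocessing runs).
with open('data-bin/lm-spm-bpe/dict.txt', 'rt') as f:
    lm_dict = f.read()
with open('data-bin/classifier-spm-bpe/input0/dict.txt', 'rt') as f:
    clf_dict = f.read()
assert lm_dict == clf_dict, 'dictionaries diverged; rerun preprocessing with --srcdict'
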
@@ -0,0 +1,23 @@
#!/usr/bin/env bash

set -e
set -x

TOTAL_UPDATES=200_000    # Total number of training steps == 10 epochs (1 epoch = 20_000)
WARMUP_UPDATES=8_000     # Warmup the learning rate over this many updates
PEAK_LR=0.0001           # Peak learning rate, adjust as needed
TOKENS_PER_SAMPLE=256    # Max sequence length
MAX_POSITIONS=256        # Num. positional embeddings (usually same as above)
MAX_SENTENCES=50         # Number of sequences per batch (batch size)
UPDATE_FREQ=1            # Gradient accumulation steps (multiplies the effective batch size)

DATA_DIR=data-bin/lm-spm-bpe

fairseq-train $DATA_DIR \
    --fp16 --task masked_lm --criterion masked_lm --save-dir checkpoints/lm_roberta_small \
    --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \
    --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
    --dropout 0.1 --attention-dropout 0.1 --encoder-layers 8 --encoder-embed-dim 512 --encoder-ffn-embed-dim 2048 --encoder-attention-heads 8 --weight-decay 0.01 \
    --max-sentences $MAX_SENTENCES --max-positions $MAX_POSITIONS --update-freq $UPDATE_FREQ \
    --max-epoch 10 --log-format tqdm --log-interval 1 --save-interval-updates 5000 --keep-interval-updates 5 --skip-invalid-size-inputs-valid-test
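
Once a checkpoint exists, a quick smoke test of the pretrained MLM is to load it through the hub interface and fill in a masked token; a minimal sketch, assuming the paths above and the same sentencepiece_vocab kwarg the prediction script below uses (the example sentence is made up):

from fairseq.models.roberta import RobertaModel

# Load the pretrained MLM checkpoint (paths assume the scripts above).
roberta = RobertaModel.from_pretrained(
    model_name_or_path='checkpoints/lm_roberta_small',
    data_name_or_path='data-bin/lm-spm-bpe',
    sentencepiece_vocab='vocab_spm_bpe.model',
    checkpoint_file='checkpoint_best.pt',
    bpe='sentencepiece',
)
roberta.eval()

# fill_mask returns (filled_sentence, score, predicted_token) tuples.
print(roberta.fill_mask('the weather today is <mask> .', topk=3))
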
@@ -0,0 +1,36 @@
#!/usr/bin/env bash

set -e
set -x

TOTAL_NUM_UPDATES=600_000  # Total number of training steps == 10 epochs (1 epoch = 60_000)
WARMUP_UPDATES=24_000      # Warmup the learning rate over this many updates
PEAK_LR=0.0001             # Peak learning rate, adjust as needed
HEAD_NAME='he_she'         # Custom name for the classification head
TOKENS_PER_SAMPLE=256      # Max sequence length
NUM_CLASSES=2              # Number of classes for the classification task
MAX_SENTENCES=50           # Batch size
UPDATE_FREQ=1              # Gradient accumulation steps (multiplies the effective batch size)

MODEL_PATH='checkpoints/lm_roberta_small/checkpoint_best.pt'
DATA_DIR=data-bin/classifier-spm-bpe

fairseq-train $DATA_DIR \
    --restore-file "$MODEL_PATH" \
    --fp16 --max-sentences $MAX_SENTENCES --max-positions $TOKENS_PER_SAMPLE --update-freq $UPDATE_FREQ \
    --max-tokens 32768 --save-dir checkpoints/lm_roberta_small_finetune \
    --task sentence_prediction \
    --reset-optimizer --reset-dataloader --reset-meters \
    --required-batch-size-multiple 1 \
    --init-token 0 --separator-token 2 \
    --arch roberta \
    --criterion sentence_prediction \
    --classification-head-name "$HEAD_NAME" \
    --num-classes $NUM_CLASSES \
    --dropout 0.1 --attention-dropout 0.1 --encoder-layers 8 --encoder-embed-dim 512 --encoder-ffn-embed-dim 2048 --encoder-attention-heads 8 \
    --weight-decay 0.1 --clip-norm 0.0 \
    --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
    --lr-scheduler polynomial_decay --lr $PEAK_LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
    --max-epoch 10 --log-format tqdm --log-interval 1 --save-interval-updates 15000 --keep-interval-updates 5 --skip-invalid-size-inputs-valid-test \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --find-unused-parameters
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import torch
from fairseq.models.roberta import RobertaModel
from tqdm import tqdm

if __name__ == '__main__':
    roberta = RobertaModel.from_pretrained(
        model_name_or_path='checkpoints/lm_roberta_small_finetune',
        data_name_or_path='data-bin/classifier-spm-bpe',
        sentencepiece_vocab='vocab_spm_bpe.model',
        checkpoint_file='checkpoint_best.pt',
        bpe='sentencepiece',
    )

    roberta.cuda()
    roberta.eval()

    max_seq = 256
    batch_size = 15
    pad_index = roberta.task.source_dictionary.pad()

    for dir_test in ['dev-0', 'dev-1', 'test-A']:
        lines = []
        with open(f'data/{dir_test}/in.tsv', 'rt') as f:
            for line in tqdm(f, desc=f'Reading {dir_test}'):
                # Tokenize and truncate to the maximum sequence length
                tokens = roberta.encode(line.rstrip('\n'))[:max_seq]
                lines.append(tokens)

        predictions = []
        for i in tqdm(range(0, len(lines), batch_size), desc='Processing'):
            batch_text = lines[i: i + batch_size]
            # Get max length of batch
            max_len = max(tokens.size(0) for tokens in batch_text)

            # Create tensor filled with the padding index
            input_tensor = torch.LongTensor(len(batch_text), max_len).fill_(pad_index)
            # Fill tensor with tokens; use j so the outer batch counter i is not shadowed
            for j, tokens in enumerate(batch_text):
                input_tensor[j][:tokens.size(0)] = tokens

            with torch.no_grad():
                # Head name must match --classification-head-name used during fine-tuning
                raw_prediction = roberta.predict('he_she', input_tensor)
            # predict() returns log-probabilities; exp of column 1 is the
            # probability of the second class (M class)
            out_tensor = torch.exp(raw_prediction[:, 1])
            for line_prediction in out_tensor:
                predictions.append(line_prediction.item())

        with open(f'data/{dir_test}/out.tsv', 'wt') as fw:
            fw.write('\n'.join(f'{p:.8f}' for p in predictions) + '\n')
(4 file diffs suppressed because they are too large; 1 binary file not shown)