Compare commits
1 Commits
master
...
roberta_cl
Author | SHA1 | Date |
---|---|---|
Karol Kaczmarek | 6756e6e54c |
|
@ -0,0 +1,11 @@
|
|||
#!/usr/bin/env bash
# Tokenize the raw inputs with the SentencePiece BPE model.
# Each data/<split>/in.tsv is piped through spm_encode into data/<name>.txt.

set -ex   # stop at the first failing command and echo every command

# Create spm vocab
# spm_train --input=train/in.tsv --model_prefix=vocab_spm_bpe --model_type=bpe --vocab_size=50000 --pad_id 1 --bos_id 2 --eos_id 3

SPM_MODEL=vocab_spm_bpe.model

# "<input split directory>:<output file stem>" pairs, encoded in this order.
for mapping in train:train dev-0:valid test-A:test; do
    split_dir=${mapping%%:*}
    out_stem=${mapping#*:}
    spm_encode --model "$SPM_MODEL" < "data/${split_dir}/in.tsv" > "data/${out_stem}.txt"
done
|
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env bash
# Binarize the classifier data with fairseq-preprocess:
#   input0 <- encoded sentences (train.txt / valid.txt)
#   label  <- expected.tsv label files (train / dev-0)

set -ex   # stop at the first failing command and echo every command

# NOTE: TEXT ends with a slash, so the expanded paths contain "//"
# (e.g. data//train.txt); this is kept exactly as in the original invocation.
TEXT=data/
DEST_DIR=data-bin/classifier-spm-bpe
NUM_WORKERS=8

# Sentences: source side only, with the 50000-word limit passed via --nwordssrc.
fairseq-preprocess \
    --only-source --nwordssrc 50000 \
    --trainpref "$TEXT/train.txt" \
    --validpref "$TEXT/valid.txt" \
    --destdir "$DEST_DIR/input0" \
    --workers "$NUM_WORKERS"

# Labels: source side only, no vocabulary limit given.
fairseq-preprocess \
    --only-source \
    --trainpref "$TEXT/train/expected.tsv" \
    --validpref "$TEXT/dev-0/expected.tsv" \
    --destdir "$DEST_DIR/label" \
    --workers "$NUM_WORKERS"
|
|
@ -0,0 +1,35 @@
|
|||
#!/usr/bin/env bash
# Fine-tune a RoBERTa checkpoint on the sentence_prediction task.
# Training starts from MODEL_PATH (--restore-file) and reads binarized data
# from DATA_DIR; checkpoints go to checkpoints/lm_roberta_small_finetune.

set -ex   # stop at the first failing command and echo every command

TOTAL_NUM_UPDATES=600_000   # Total number of training steps == 10 epochs (1 epoch = 60_000 updates)
WARMUP_UPDATES=24_000       # Warm up the learning rate over this many updates
PEAK_LR=0.0001              # Peak learning rate, adjust as needed
HEAD_NAME='he_she'          # Custom name for the classification head
                            # (defined here but not passed to fairseq-train below)
TOKENS_PER_SAMPLE=256       # Max sequence length
NUM_CLASSES=2               # Number of classes for the classification task
MAX_SENTENCES=50            # Batch size
UPDATE_FREQ=1               # Value passed to --update-freq; raise it to increase the batch size

MODEL_PATH='checkpoints/lm_roberta_small/checkpoint_best.pt'
DATA_DIR=data-bin/classifier-spm-bpe

# Arguments are collected in an array so each group can carry its own comment;
# order and values are exactly those of the single-command form.
train_args=(
    "$DATA_DIR"
    --restore-file "$MODEL_PATH"

    # Batching and checkpoint location
    --fp16 --max-sentences "$MAX_SENTENCES" --max-positions "$TOKENS_PER_SAMPLE" --update-freq "$UPDATE_FREQ"
    --max-tokens 32768 --save-dir checkpoints/lm_roberta_small_finetune

    # Task setup; optimizer, dataloader and meter state from the restored
    # checkpoint are reset
    --task sentence_prediction
    --reset-optimizer --reset-dataloader --reset-meters
    --required-batch-size-multiple 1
    --init-token 0 --separator-token 2

    # Model
    --arch roberta
    --criterion sentence_prediction
    --num-classes "$NUM_CLASSES"
    --dropout 0.1 --attention-dropout 0.1 --encoder-layers 8 --encoder-embed-dim 512 --encoder-ffn-embed-dim 2048 --encoder-attention-heads 8

    # Optimization and learning-rate schedule
    --weight-decay 0.1 --clip-norm 0.0
    --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06
    --lr-scheduler polynomial_decay --lr "$PEAK_LR" --total-num-update "$TOTAL_NUM_UPDATES" --warmup-updates "$WARMUP_UPDATES"

    # Run length, logging and checkpoint selection
    --max-epoch 10 --log-format tqdm --log-interval 1 --save-interval-updates 15000 --keep-interval-updates 5 --skip-invalid-size-inputs-valid-test
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric
    --find-unused-parameters
)

fairseq-train "${train_args[@]}"
|
|
@ -0,0 +1,53 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import torch
|
||||
from fairseq.models.roberta import RobertaModel
|
||||
from tqdm import tqdm
|
||||
|
||||
# Checkpoint directory suffixes to evaluate, and the data splits to score.
MODEL_EPOCHS = ('epoch10', 'epoch20')
TEST_DIRS = ('dev-0', 'dev-1', 'test-A')

MAX_SEQ = 256  # encoded inputs are cut to this many tokens
BATCH_SIZE = 15  # number of inputs per predict() call


def pad_batch(batch, pad_index):
    """Stack 1-D token tensors into one right-padded 2-D long tensor.

    Args:
        batch: non-empty sequence of 1-D integer tensors (one per input line).
        pad_index: value used to fill positions past the end of each row.

    Returns:
        A ``(len(batch), longest_row)`` tensor of dtype ``torch.long``.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    max_len = max(tokens.size(0) for tokens in batch)
    padded = torch.full((len(batch), max_len), pad_index, dtype=torch.long)
    for row, tokens in enumerate(batch):
        padded[row, :tokens.size(0)] = tokens
    return padded


def load_model(model_epoch):
    """Load the classifier checkpoint for ``model_epoch`` on the GPU in eval mode."""
    roberta = RobertaModel.from_pretrained(
        model_name_or_path=f'checkpoints/classifier_roberta_small_{model_epoch}',
        data_name_or_path='data-bin/classifier-spm-bpe',
        sentencepiece_vocab='vocab_spm_bpe.model',
        checkpoint_file='checkpoint_best.pt',
        bpe='sentencepiece',
    )
    roberta.cuda()
    roberta.eval()
    return roberta


def predict_file(roberta, dir_test, max_seq=MAX_SEQ, batch_size=BATCH_SIZE):
    """Score every line of ``data/<dir_test>/in.tsv``.

    Args:
        roberta: model returned by :func:`load_model`.
        dir_test: name of the split directory under ``data/``.
        max_seq: maximum number of tokens kept per encoded line.
        batch_size: number of lines sent to the model at once.

    Returns:
        A list with one float per input line: ``exp`` of column 1 of the
        model output, i.e. the probability of the second class (M class).
    """
    pad_index = roberta.task.source_dictionary.pad()

    # NOTE(review): the slice keeps only the first max_seq tokens, so anything
    # encode() places after that position is dropped for long lines -- confirm
    # this matches how the model was trained.
    with open(f'data/{dir_test}/in.tsv', 'rt', encoding='utf-8') as f:
        lines = [
            roberta.encode(line.rstrip('\n'))[:max_seq]
            for line in tqdm(f, desc=f'Reading {dir_test}')
        ]

    predictions = []
    for start in tqdm(range(0, len(lines), batch_size), desc='Processing'):
        input_tensor = pad_batch(lines[start:start + batch_size], pad_index)

        with torch.no_grad():
            raw_prediction = roberta.predict('sentence_classification_head', input_tensor)

        # Column 1 is the second class (M class); exp() turns the raw output
        # (presumably log-probabilities) into a probability.
        predictions.extend(torch.exp(raw_prediction[:, 1]).tolist())

    return predictions


def main():
    """Write one prediction file per (model epoch, data split) pair."""
    for model_epoch in MODEL_EPOCHS:
        roberta = load_model(model_epoch)

        for dir_test in TEST_DIRS:
            predictions = predict_file(roberta, dir_test)

            # One value per line, 8 decimal places, no trailing newline
            # (same format the script has always produced).
            with open(f'data/{dir_test}/out-epoch={model_epoch}.tsv', 'wt', encoding='utf-8') as fw:
                fw.write('\n'.join(f'{p:.8f}' for p in predictions))


if __name__ == '__main__':
    main()
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Loading…
Reference in New Issue