roberta_base
commit 049966f426 (parent ddce23e0d4)

@@ -0,0 +1,8 @@
#!/usr/bin/env bash

set -e
set -x

wget https://github.com/sdadas/polish-roberta/releases/download/models/roberta_base_fairseq.zip

unzip roberta_base_fairseq.zip -d roberta_base_fairseq
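
The later steps assume the archive unpacks into roberta_base_fairseq/ with model.pt, dict.txt and sentencepiece.bpe.model inside (those are the paths the remaining scripts use). An optional sanity check, sketched under that assumption:

    for f in model.pt dict.txt sentencepiece.bpe.model; do
        # fail early if an expected file is missing from the unpacked archive
        test -f "roberta_base_fairseq/$f" || { echo "missing roberta_base_fairseq/$f" >&2; exit 1; }
    done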

@@ -0,0 +1,10 @@
#!/usr/bin/env bash

set -e
set -x

spm_encode --model=roberta_base_fairseq/sentencepiece.bpe.model < data/train/in.tsv > data/train.input0.spm
spm_encode --model=roberta_base_fairseq/sentencepiece.bpe.model < data/dev-0/in.tsv > data/dev.input0.spm  # named dev.input0.spm so the fairseq-preprocess step finds it

cp data/dev-0/expected.tsv data/dev.label
cp data/train/expected.tsv data/train.label
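
A quick optional consistency check (a sketch): the encoded inputs and the copied labels should have the same number of lines per split.

    for split in train dev; do
        # compare line counts of the SentencePiece-encoded inputs and the labels
        wc -l "data/${split}.input0.spm" "data/${split}.label"
    done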

@@ -0,0 +1,18 @@
#!/usr/bin/env bash

set -e
set -x

fairseq-preprocess \
    --only-source \
    --trainpref "data/train.input0.spm" \
    --validpref "data/dev.input0.spm" \
    --destdir "data-bin/input0" \
    --workers 4 --srcdict roberta_base_fairseq/dict.txt

fairseq-preprocess \
    --only-source \
    --trainpref "data/train.label" \
    --validpref "data/dev.label" \
    --destdir "data-bin/label" \
    --workers 4
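
The sentence_prediction task reads the binarized inputs and labels from the input0/ and label/ subdirectories of the directory handed to fairseq-train (data-bin/ here). A minimal optional check of what the two runs produced:

    # each fairseq-preprocess run writes a dict.txt plus the binarized train/valid files into its --destdir
    ls -l data-bin/input0 data-bin/label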

@@ -0,0 +1,31 @@
#!/usr/bin/env bash

TOTAL_NUM_UPDATES=1000000000000000  # Deliberately huge, so the polynomial LR schedule barely decays; training length is controlled by --max-epoch below.
WARMUP_UPDATES=216085               # Linear LR warmup over this many updates.
LR=1e-05                            # Peak LR for the polynomial LR scheduler.
HEAD_NAME=hesaid                    # Custom name for the classification head.
NUM_CLASSES=2                       # Number of classes for the classification task.
MAX_SENTENCES=35                    # Batch size (sentences per batch).
ROBERTA_PATH="roberta_base_fairseq/model.pt"

fairseq-train data-bin/ \
    --restore-file $ROBERTA_PATH \
    --max-positions 512 \
    --max-sentences $MAX_SENTENCES \
    --max-tokens 8192 \
    --task sentence_prediction \
    --reset-optimizer --reset-dataloader --reset-meters \
    --required-batch-size-multiple 2 \
    --init-token 0 --separator-token 2 \
    --arch roberta_base \
    --criterion sentence_prediction \
    --classification-head-name $HEAD_NAME \
    --num-classes $NUM_CLASSES \
    --dropout 0.1 --attention-dropout 0.1 \
    --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
    --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
    --max-epoch 5 --log-format tqdm --log-interval 1 --save-interval-updates 15000 --keep-interval-updates 5 --skip-invalid-size-inputs-valid-test \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --find-unused-parameters \
    --update-freq 1
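
With no --save-dir given, fairseq-train writes checkpoints to ./checkpoints by default, and --best-checkpoint-metric accuracy selects checkpoint_best.pt; the prediction script below loads exactly that file. A trivial post-training check, as a sketch:

    # the best-by-accuracy checkpoint the prediction step expects
    test -f checkpoints/checkpoint_best.pt && echo "checkpoint_best.pt ready for prediction"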

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import torch
from fairseq.models.roberta import RobertaModel
from tqdm import tqdm

if __name__ == '__main__':
    roberta = RobertaModel.from_pretrained(
        model_name_or_path='checkpoints',
        data_name_or_path='data-bin',
        sentencepiece_vocab='roberta_base_fairseq/sentencepiece.bpe.model',
        checkpoint_file='checkpoint_best.pt',
        bpe='sentencepiece',
    )

    roberta.cuda()
    roberta.eval()

    max_seq = 512
    batch_size = 5
    pad_index = roberta.task.source_dictionary.pad()

    for dir_test in ['dev-0', 'dev-1', 'test-A']:
        lines = []
        with open(f'data/{dir_test}/in.tsv', 'rt') as f:
            for line in tqdm(f, desc=f'Reading {dir_test}'):
                line = roberta.encode(line.rstrip('\n'))[:max_seq]
                lines.append(line)

        predictions = []
        for i in tqdm(range(0, len(lines), batch_size), desc='Processing'):
            batch_text = lines[i: i + batch_size]
            # Get max length of batch
            max_len = max([tokens.size(0) for tokens in batch_text])

            # Create empty tensor filled with the padding index
            input_tensor = torch.LongTensor(len(batch_text), max_len).fill_(pad_index)
            # Fill tensor with tokens (j indexes rows within the batch)
            for j, tokens in enumerate(batch_text):
                input_tensor[j][:tokens.size(0)] = tokens

            with torch.no_grad():
                raw_prediction = roberta.predict('hesaid', input_tensor)
            # predict() returns log-probabilities; exp() gives the probability of the second class (M class)
            out_tensor = torch.exp(raw_prediction[:, 1])
            for line_prediction in out_tensor:
                # Store the scalar probability for this example
                predictions.append(line_prediction.item())

        with open(f'data/{dir_test}/out.tsv', 'wt') as fw:
            fw.write('\n'.join([f'{p:.8f}' for p in predictions]) + '\n')
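
Each out.tsv should end up with exactly one probability per input line. A quick optional line-count comparison after prediction (a sketch, run from the repository root):

    for d in dev-0 dev-1 test-A; do
        # every input line should have a corresponding prediction line
        wc -l "data/$d/in.tsv" "data/$d/out.tsv"
    done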

(Three additional file diffs suppressed because they are too large to display.)