Compare commits


No commits in common. "roberta_classifier_only" and "master" have entirely different histories.

12 changed files with 0 additions and 907190 deletions

@@ -1,11 +0,0 @@
#!/usr/bin/env bash
set -e
set -x
# Train the SentencePiece BPE vocabulary (run once):
# spm_train --input=train/in.tsv --model_prefix=vocab_spm_bpe --model_type=bpe --vocab_size=50000 --pad_id 1 --bos_id 2 --eos_id 3
# Encode the raw train/dev/test inputs with the trained model
spm_encode --model vocab_spm_bpe.model < data/train/in.tsv > data/train.txt
spm_encode --model vocab_spm_bpe.model < data/dev-0/in.tsv > data/valid.txt
spm_encode --model vocab_spm_bpe.model < data/test-A/in.tsv > data/test.txt
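
A quick way to sanity-check the trained BPE model before binarizing, as a minimal sketch that is not part of the diff; it assumes a recent sentencepiece Python package and the vocab_spm_bpe.model produced by the commented spm_train call above, and the sample sentence is made up:

# sanity_check_spm.py (hypothetical helper, not part of the diff)
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file='vocab_spm_bpe.model')
print(sp.get_piece_size())                                    # should report 50000
print(sp.encode('she walked to the station', out_type=str))   # BPE pieces for a sample line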

@@ -1,19 +0,0 @@
#!/usr/bin/env bash
set -e
set -x
TEXT=data
# Binarize the SentencePiece-encoded input text (input0)
fairseq-preprocess \
    --only-source --nwordssrc 50000 \
    --trainpref $TEXT/train.txt \
    --validpref $TEXT/valid.txt \
    --destdir data-bin/classifier-spm-bpe/input0 \
    --workers 8
# Binarize the classification labels
fairseq-preprocess \
    --only-source \
    --trainpref $TEXT/train/expected.tsv \
    --validpref $TEXT/dev-0/expected.tsv \
    --destdir data-bin/classifier-spm-bpe/label \
    --workers 8
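
To confirm that binarization produced the expected vocabularies, a minimal sketch (not part of the diff) assuming the destdirs used above and an installed fairseq:

# inspect_binarized.py (hypothetical helper, not part of the diff)
from fairseq.data import Dictionary

src_dict = Dictionary.load('data-bin/classifier-spm-bpe/input0/dict.txt')
label_dict = Dictionary.load('data-bin/classifier-spm-bpe/label/dict.txt')
print(len(src_dict))       # input vocabulary size: ~50000 plus special symbols
print(label_dict.symbols)  # special symbols followed by the label values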

@@ -1,35 +0,0 @@
#!/usr/bin/env bash
set -e
set -x
TOTAL_NUM_UPDATES=600_000    # Total number of training updates == 10 epochs (1 epoch = 60_000 updates)
WARMUP_UPDATES=24_000        # Warm up the learning rate over this many updates
PEAK_LR=0.0001               # Peak learning rate, adjust as needed
HEAD_NAME='he_she'           # Custom name for the classification head (not passed below, so the default head name is used)
TOKENS_PER_SAMPLE=256        # Max sequence length
NUM_CLASSES=2                # Number of classes for the classification task
MAX_SENTENCES=50             # Batch size (sentences per GPU)
UPDATE_FREQ=1                # Gradient accumulation factor (effective batch size multiplier)
MODEL_PATH='checkpoints/lm_roberta_small/checkpoint_best.pt'
DATA_DIR=data-bin/classifier-spm-bpe
fairseq-train $DATA_DIR \
    --restore-file "$MODEL_PATH" \
    --fp16 --max-sentences $MAX_SENTENCES --max-positions $TOKENS_PER_SAMPLE --update-freq $UPDATE_FREQ \
    --max-tokens 32768 --save-dir checkpoints/lm_roberta_small_finetune \
    --task sentence_prediction \
    --reset-optimizer --reset-dataloader --reset-meters \
    --required-batch-size-multiple 1 \
    --init-token 0 --separator-token 2 \
    --arch roberta \
    --criterion sentence_prediction \
    --num-classes $NUM_CLASSES \
    --dropout 0.1 --attention-dropout 0.1 --encoder-layers 8 --encoder-embed-dim 512 --encoder-ffn-embed-dim 2048 --encoder-attention-heads 8 \
    --weight-decay 0.1 --clip-norm 0.0 \
    --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
    --lr-scheduler polynomial_decay --lr $PEAK_LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
    --max-epoch 10 --log-format tqdm --log-interval 1 --save-interval-updates 15000 --keep-interval-updates 5 --skip-invalid-size-inputs-valid-test \
    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
    --find-unused-parameters
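
After training, the stored hyperparameters and model weights can be checked directly from the checkpoint. A minimal sketch (not part of the diff), assuming the --save-dir above; older fairseq checkpoints keep an 'args' namespace while newer ones keep a 'cfg' object:

# inspect_checkpoint.py (hypothetical helper, not part of the diff)
import torch

state = torch.load('checkpoints/lm_roberta_small_finetune/checkpoint_best.pt',
                   map_location='cpu')
print(state.get('args') or state.get('cfg'))   # recorded training configuration
print(list(state['model'].keys())[:5])         # first few parameter names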

@@ -1,53 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import torch
from fairseq.models.roberta import RobertaModel
from tqdm import tqdm

if __name__ == '__main__':
    for model_epoch in ['epoch10', 'epoch20']:
        roberta = RobertaModel.from_pretrained(
            model_name_or_path=f'checkpoints/classifier_roberta_small_{model_epoch}',
            data_name_or_path='data-bin/classifier-spm-bpe',
            sentencepiece_vocab='vocab_spm_bpe.model',
            checkpoint_file='checkpoint_best.pt',
            bpe='sentencepiece',
        )
        roberta.cuda()
        roberta.eval()
        max_seq = 256
        batch_size = 15
        pad_index = roberta.task.source_dictionary.pad()
        for dir_test in ['dev-0', 'dev-1', 'test-A']:
            lines = []
            with open(f'data/{dir_test}/in.tsv', 'rt') as f:
                for line in tqdm(f, desc=f'Reading {dir_test}'):
                    # BPE-encode and truncate to the maximum sequence length
                    lines.append(roberta.encode(line.rstrip('\n'))[:max_seq])
            predictions = []
            for i in tqdm(range(0, len(lines), batch_size), desc='Processing'):
                batch_text = lines[i: i + batch_size]
                # Pad every sample in the batch to the longest sequence
                max_len = max(tokens.size(0) for tokens in batch_text)
                input_tensor = torch.LongTensor(len(batch_text), max_len).fill_(pad_index)
                for j, tokens in enumerate(batch_text):
                    input_tensor[j][:tokens.size(0)] = tokens
                with torch.no_grad():
                    # predict() returns log-probabilities for the classification head
                    raw_prediction = roberta.predict('sentence_classification_head', input_tensor)
                # Probability of the second class (index 1, the 'M' class)
                out_tensor = torch.exp(raw_prediction[:, 1])
                for line_prediction in out_tensor:
                    predictions.append(line_prediction.item())
            with open(f'data/{dir_test}/out-epoch={model_epoch}.tsv', 'wt') as fw:
                fw.write('\n'.join(f'{p:.8f}' for p in predictions))
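
The written probabilities can then be scored against the gold labels of dev-0. A minimal sketch (not part of the diff), assuming scikit-learn is available and that expected.tsv holds one binary label per line whose positive value corresponds to the index-1 class written by the predictor; the '1'/'M' mapping below is an assumption and should be adjusted to the real label format:

# score_dev.py (hypothetical helper, not part of the diff)
from sklearn.metrics import roc_auc_score

with open('data/dev-0/expected.tsv') as f:
    gold = [1 if line.strip() in ('1', 'M') else 0 for line in f]   # assumed label format
with open('data/dev-0/out-epoch=epoch10.tsv') as f:
    probs = [float(line) for line in f]
print('ROC AUC:', roc_auc_score(gold, probs))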

File diffs suppressed because they are too large: seven files, among them dict.txt (49,996 lines).

Binary file not shown.