pl changes

Commit c50cf6b0f2 (parent bddaf93fcf)
Author: zrostek, 2023-05-10 13:19:19 +00:00
5 changed files with 50018 additions and 23 deletions

Changed file: training configuration (YAML)

@@ -10,7 +10,7 @@ predict_only: false
 seed: 2137
 model:
-  name: 'google/t5-v1_1-base'
+  name: 'allegro/plt5-base'
   checkpoint_path: ''
   dropout: 0.0
   random_init: true
@@ -25,9 +25,9 @@ data:
 optim:
   name: adamwscale
   base_lr: 2e-2
-  batch_size: 144
+  batch_size: 64
   total_steps: 65536
-  epochs: -1 # If it's > 0 it overwrites total_steps
+  epochs: 1 # If it's > 0 it overwrites total_steps
   warmup_steps: 10000
   lr_scheduler: cosine
   weight_decay: 0.0
@@ -36,7 +36,7 @@ optim:
   final_cosine: 1e-5
 eval:
-  every_steps: 100000 # Don't eval
+  every_steps: 10000
   steps: 500
 checkpoint:
@@ -48,7 +48,7 @@ logging:
   project:
   api_token:
   tags:
-  every_steps: 100
+  every_steps: 1000
   grad_l2: true
   weights_l2: true
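
For reference, a minimal sanity check of the edited values, assuming the file is plain YAML; 'config.yaml' below is a placeholder, since the actual config filename is not shown in this diff.

import yaml

# Placeholder filename; substitute the repository's actual config file.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["name"])        # 'allegro/plt5-base'
print(cfg["optim"]["batch_size"])  # 64
print(cfg["optim"]["epochs"])      # 1 -> overrides total_steps (see get_dataloaders below)
print(cfg["eval"]["every_steps"])  # 10000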

Changed file: data and training utilities (Python)

@@ -46,7 +46,7 @@ def get_config(args):
 def get_tokenizer(args):
     tokenizer = AutoTokenizer.from_pretrained(
-        args.model.name,
+        '/mnt/gpu_data1/zrostek/nanoT5/tokenizer',
         use_fast=True
     )
     tokenizer.model_max_length = int(1e9)
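
The tokenizer is now loaded from a hard-coded local directory instead of the Hub model name. A minimal standalone sketch of that load, assuming the directory holds the spiece.model trained by tokenizer/train_tokenizer.sh plus the JSON files that script downloads (tokenizer_config.json, special_tokens_map.json, config.json), which lets AutoTokenizer resolve the T5 tokenizer class.

from transformers import AutoTokenizer

# Path copied from the diff; the directory is assumed to contain spiece.model,
# tokenizer_config.json, special_tokens_map.json and config.json
# (see tokenizer/train_tokenizer.sh below).
tokenizer = AutoTokenizer.from_pretrained(
    "/mnt/gpu_data1/zrostek/nanoT5/tokenizer",
    use_fast=True,
)

print(len(tokenizer))                            # roughly the 50k pieces trained below, plus special tokens
print(tokenizer.tokenize("Litwo! Ojczyzno moja!"))
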
@@ -56,14 +56,13 @@ def get_tokenizer(args):
 def load_dataset_splits(args):
     if args.mode == 'pt':
+        data_files = {}
+        data_files["train"] = "/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_in_filtered_3.csv"
+        data_files["validation"] = '/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/dev-0_in_filtered_3.csv'
         dataset = datasets.load_dataset(
-            'c4',
-            'en',
-            streaming=True,
-        )
-        dataset = dataset.remove_columns(
-            ['timestamp', 'url']
+            'csv',
+            data_files=data_files,
+            sep='\t',
         )
         dataset_splits = {
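
The pre-training data source changes from streaming English C4 to two local tab-separated files. A minimal standalone sketch of the new load, with the paths copied from the diff; the files are assumed to contain a 'text' column, since the tokenization step later removes it.

import datasets

# Paths copied from the diff; both files are assumed to be TSVs with a 'text' column.
data_files = {
    "train": "/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_in_filtered_3.csv",
    "validation": "/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/dev-0_in_filtered_3.csv",
}

# The 'csv' builder forwards sep to pandas.read_csv, so sep='\t' parses the files as TSV.
dataset = datasets.load_dataset("csv", data_files=data_files, sep="\t")
print(dataset)  # DatasetDict with 'train' and 'validation' splits

Unlike the streaming setup, this yields regular map-style datasets, which is presumably why the n_shards assertion and the buffered shuffle are dropped in the hunks below.
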
@@ -71,9 +70,6 @@ def load_dataset_splits(args):
             'test': dataset['validation'],
         }
-        assert (
-            dataset['train'].n_shards == 1024
-        ), "We want to have many shards for efficient processing with num_workes in PyTorch dataloader"
     elif args.mode == 'ft':
         dataset_splits = datasets.load_dataset(
             args.data.exec_file_path,
@@ -117,7 +113,6 @@ def process_dataset(dataset_splits, args, tokenizer):
                 remove_columns=['text'],
             )
-            dataset_split = dataset_split.shuffle(buffer_size=10_000, seed=args.seed)
             final_datasets[split] = dataset_split
     elif args.mode == 'ft':
         final_datasets = dataset_splits
@@ -176,11 +171,6 @@ def get_dataloaders(tokenizer, config, args):
         shuffle = (split == 'train') and not is_iterable
-        if args.mode == 'ft' and split == 'train':
-            assert shuffle is True
-        else:
-            assert shuffle is False
         dataloaders[split] = DataLoader(
             dataset[split],
             shuffle=shuffle,
@@ -199,7 +189,7 @@ def get_dataloaders(tokenizer, config, args):
     if args.optim.epochs > 0:
         assert not is_iterable
-        args.optim.total_steps = len(dataloaders['train']) * args.optim.epochs
+        args.optim.total_steps = (len(dataloaders['train']) // args.optim.grad_acc) * args.optim.epochs
     # We increase eval BS by 2, so decrease number of eval steps
     args.eval.corrected_steps = args.eval.steps / 2
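
A worked example of the new total_steps formula; the dataloader length and grad_acc value are illustrative assumptions, only epochs=1 comes from the config change above.

# Illustrative numbers only: with gradient accumulation, one optimizer step
# consumes grad_acc dataloader batches, hence the integer division.
num_train_batches = 12_800   # len(dataloaders['train']), assumed
grad_acc = 4                 # args.optim.grad_acc, assumed
epochs = 1                   # from the config change above

total_steps = (num_train_batches // grad_acc) * epochs
print(total_steps)           # 3200 optimizer steps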

tokenizer/spiece.model (new binary file; contents not shown)

tokenizer/spiece.vocab (new file, 50000 lines; diff suppressed because it is too large)

tokenizer/train_tokenizer.sh (new executable file, 5 lines)

@@ -0,0 +1,5 @@
+wget https://huggingface.co/allegro/plt5-base/raw/main/tokenizer_config.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/special_tokens_map.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/config.json
+spm_train --input='/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_tok_2.csv' --model_prefix='spiece' --vocab_size=50000 --character_coverage=1.0 --model_type=unigram --train_extremely_large_corpus=true
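
A quick sanity check of the trained model before pointing the training code at the tokenizer directory; this assumes the sentencepiece Python package is installed and that spiece.model (the --model_prefix output above) sits in the current directory.

import sentencepiece as spm

# 'spiece.model' is the --model_prefix output of the spm_train call above.
sp = spm.SentencePieceProcessor(model_file="spiece.model")

print(sp.get_piece_size())                              # should match --vocab_size=50000
print(sp.encode("Litwo! Ojczyzno moja!", out_type=str))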