pl changes

zrostek 2023-05-10 13:19:19 +00:00
parent bddaf93fcf
commit c50cf6b0f2
5 changed files with 50018 additions and 23 deletions


@@ -10,7 +10,7 @@ predict_only: false
 seed: 2137
 model:
-  name: 'google/t5-v1_1-base'
+  name: 'allegro/plt5-base'
   checkpoint_path: ''
   dropout: 0.0
   random_init: true
@@ -25,9 +25,9 @@ data:
 optim:
   name: adamwscale
   base_lr: 2e-2
-  batch_size: 144
+  batch_size: 64
   total_steps: 65536
-  epochs: -1 # If it's > 0 it overwrites total_steps
+  epochs: 1 # If it's > 0 it overwrites total_steps
   warmup_steps: 10000
   lr_scheduler: cosine
   weight_decay: 0.0
@@ -36,7 +36,7 @@ optim:
   final_cosine: 1e-5
 eval:
-  every_steps: 100000 # Don't eval
+  every_steps: 10000
   steps: 500
 checkpoint:
@@ -48,7 +48,7 @@ logging:
     project:
     api_token:
     tags:
-  every_steps: 100
+  every_steps: 1000
   grad_l2: true
   weights_l2: true

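The model switch above keeps random_init: true, so only the allegro/plt5-base architecture and vocabulary size are reused and the weights are trained from scratch. A minimal sketch of what that combination amounts to in transformers (illustrative only, not necessarily the exact code path in this repo):

    from transformers import AutoConfig, T5ForConditionalGeneration

    # random_init: true -> build a plT5-base-sized model from its config alone,
    # without loading the pretrained weights
    config = AutoConfig.from_pretrained('allegro/plt5-base')
    model = T5ForConditionalGeneration(config)  # randomly initialised

    # with random_init: false one would instead load the published checkpoint:
    # model = T5ForConditionalGeneration.from_pretrained('allegro/plt5-base')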

@@ -46,7 +46,7 @@ def get_config(args):
 def get_tokenizer(args):
     tokenizer = AutoTokenizer.from_pretrained(
-        args.model.name,
+        '/mnt/gpu_data1/zrostek/nanoT5/tokenizer',
         use_fast=True
     )
     tokenizer.model_max_length = int(1e9)
@@ -56,14 +56,13 @@ def get_tokenizer(args):
 def load_dataset_splits(args):
     if args.mode == 'pt':
+        data_files = {}
+        data_files["train"] = "/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_in_filtered_3.csv"
+        data_files["validation"] = '/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/dev-0_in_filtered_3.csv'
         dataset = datasets.load_dataset(
-            'c4',
-            'en',
-            streaming=True,
-        )
-        dataset = dataset.remove_columns(
-            ['timestamp', 'url']
+            'csv',
+            data_files=data_files,
+            sep='\t',
         )
         dataset_splits = {
@@ -71,9 +70,6 @@
             'test': dataset['validation'],
         }
-        assert (
-            dataset['train'].n_shards == 1024
-        ), "We want to have many shards for efficient processing with num_workes in PyTorch dataloader"
     elif args.mode == 'ft':
         dataset_splits = datasets.load_dataset(
             args.data.exec_file_path,
@@ -117,7 +113,6 @@ def process_dataset(dataset_splits, args, tokenizer):
                 remove_columns=['text'],
             )
-            dataset_split = dataset_split.shuffle(buffer_size=10_000, seed=args.seed)
             final_datasets[split] = dataset_split
     elif args.mode == 'ft':
         final_datasets = dataset_splits
@@ -176,11 +171,6 @@ def get_dataloaders(tokenizer, config, args):
         shuffle = (split == 'train') and not is_iterable
-        if args.mode == 'ft' and split == 'train':
-            assert shuffle is True
-        else:
-            assert shuffle is False
         dataloaders[split] = DataLoader(
             dataset[split],
             shuffle=shuffle,
@@ -199,7 +189,7 @@ def get_dataloaders(tokenizer, config, args):
         if args.optim.epochs > 0:
             assert not is_iterable
-            args.optim.total_steps = len(dataloaders['train']) * args.optim.epochs
+            args.optim.total_steps = (len(dataloaders['train']) // args.optim.grad_acc) * args.optim.epochs
         # We increase eval BS by 2, so decrease number of eval steps
         args.eval.corrected_steps = args.eval.steps / 2

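Two of the changes above deserve a note. Pre-training data is no longer streamed from C4: load_dataset_splits now reads two local tab-separated files through the datasets 'csv' builder, which also explains why the iterable-only shuffle(buffer_size=...) call and the n_shards assertion were dropped. A self-contained sketch of the new loading call, with placeholder file names standing in for the /mnt/gpu_data1/... paths in the diff and assuming the files carry a 'text' column (process_dataset still removes 'text' after tokenization):

    import datasets

    # placeholder paths; the commit points at TSV files under the diachronia-year-prediction project
    data_files = {
        'train': 'train_in_filtered_3.csv',
        'validation': 'dev-0_in_filtered_3.csv',
    }
    # the 'csv' builder forwards sep='\t' to pandas.read_csv, so tab-separated files load fine;
    # without streaming=True the result is a regular map-style DatasetDict
    dataset = datasets.load_dataset('csv', data_files=data_files, sep='\t')
    print(dataset['train'].column_names)  # expected to include 'text'

The other change is the total_steps formula: grad_acc consecutive batches form one optimizer update, so counting optimizer steps means dividing the number of batches by grad_acc. For example, a train loader of 10,000 batches with grad_acc = 2 and epochs = 1 gives total_steps = (10000 // 2) * 1 = 5000.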
tokenizer/spiece.model: new binary file (content not shown)

tokenizer/spiece.vocab: new file, 50000 lines (diff too large to display)

tokenizer/train_tokenizer.sh: new executable file, 5 lines

@@ -0,0 +1,5 @@
+wget https://huggingface.co/allegro/plt5-base/raw/main/tokenizer_config.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/special_tokens_map.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/config.json
+spm_train --input='/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_tok_2.csv' --model_prefix='spiece' --vocab_size=50000 --character_coverage=1.0 --model_type=unigram --train_extremely_large_corpus=true
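train_tokenizer.sh assembles the new tokenizer directory: spm_train produces spiece.model and spiece.vocab (a 50,000-piece unigram SentencePiece model), and the three wget calls pull plT5-base's tokenizer_config.json, special_tokens_map.json and config.json next to them, so the directory looks like a regular Hugging Face checkpoint; the modified get_tokenizer then loads it by path. A hedged sanity check, assuming that directory layout and an illustrative local path:

    from transformers import AutoTokenizer

    # directory written by train_tokenizer.sh (spiece.model plus the downloaded json files)
    tokenizer = AutoTokenizer.from_pretrained('tokenizer/', use_fast=True)
    print(len(tokenizer))                            # roughly the 50000 trained pieces plus special tokens
    print(tokenizer.tokenize('Zażółć gęślą jaźń'))   # Polish text should segment into sensible subwords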