pl changes

This commit is contained in:
parent bddaf93fcf
commit c50cf6b0f2
@@ -10,7 +10,7 @@ predict_only: false
 seed: 2137

 model:
-  name: 'google/t5-v1_1-base'
+  name: 'allegro/plt5-base'
   checkpoint_path: ''
   dropout: 0.0
   random_init: true
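Note: with random_init: true a few lines below, the checkpoint name appears to be used only to pull the architecture and config; the weights are re-initialized before pre-training. A minimal sketch to sanity-check that the new checkpoint resolves on the Hub (assumes transformers is installed; the sample sentence is illustrative):

    # Minimal sketch: confirm allegro/plt5-base loads; not the training code itself.
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    tokenizer = AutoTokenizer.from_pretrained('allegro/plt5-base')
    model = AutoModelForSeq2SeqLM.from_pretrained('allegro/plt5-base')
    print(model.num_parameters())
    print(tokenizer.tokenize('Ala ma kota.'))  # Polish sample sentence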
@@ -25,9 +25,9 @@ data:
 optim:
   name: adamwscale
   base_lr: 2e-2
-  batch_size: 144
+  batch_size: 64
   total_steps: 65536
-  epochs: -1 # If it's > 0 it overwrites total_steps
+  epochs: 1 # If it's > 0 it overwrites total_steps
   warmup_steps: 10000
   lr_scheduler: cosine
   weight_decay: 0.0
@@ -36,7 +36,7 @@ optim:
   final_cosine: 1e-5

 eval:
-  every_steps: 100000 # Don't eval
+  every_steps: 10000
   steps: 500

 checkpoint:
@@ -48,7 +48,7 @@ logging:
   project:
   api_token:
   tags:
-  every_steps: 100
+  every_steps: 1000
   grad_l2: true
   weights_l2: true

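Taken together, the config now pre-trains plT5-base for exactly one epoch at batch size 64, evaluates every 10000 steps, and logs every 1000. A minimal sketch of reading such a YAML with OmegaConf (an assumption: nanoT5-style setups resolve their config through Hydra/OmegaConf; 'default.yaml' is a hypothetical filename):

    # Minimal sketch: load the YAML above and access its nested keys.
    from omegaconf import OmegaConf

    args = OmegaConf.load('default.yaml')  # hypothetical path to the config above
    print(args.model.name)                 # 'allegro/plt5-base'
    print(args.optim.batch_size)           # 64
    print(args.optim.epochs)               # 1 -> overrides total_steps downstream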
@@ -46,7 +46,7 @@ def get_config(args):

 def get_tokenizer(args):
     tokenizer = AutoTokenizer.from_pretrained(
-        args.model.name,
+        '/mnt/gpu_data1/zrostek/nanoT5/tokenizer',
         use_fast=True
     )
     tokenizer.model_max_length = int(1e9)
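from_pretrained accepts a local directory as well as a Hub id, so the hard-coded path works as long as the directory contains the files produced by tokenizer/train_tokenizer.sh below (spiece.model, tokenizer_config.json, special_tokens_map.json, config.json). A minimal sketch with a relative path standing in for the absolute one:

    # Minimal sketch: load a tokenizer from a local directory instead of the Hub.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('tokenizer/', use_fast=True)
    tokenizer.model_max_length = int(1e9)  # suppress max-length warnings, as in the diff
    print(tokenizer.tokenize('Zażółć gęślą jaźń'))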
@@ -56,14 +56,13 @@ def get_tokenizer(args):

 def load_dataset_splits(args):
     if args.mode == 'pt':
+        data_files = {}
+        data_files["train"] = "/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_in_filtered_3.csv"
+        data_files["validation"] = '/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/dev-0_in_filtered_3.csv'
         dataset = datasets.load_dataset(
-            'c4',
-            'en',
-            streaming=True,
-        )
-
-        dataset = dataset.remove_columns(
-            ['timestamp', 'url']
+            'csv',
+            data_files=data_files,
+            sep='\t',
         )

         dataset_splits = {
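The 'csv' builder forwards extra keyword arguments such as sep to pandas.read_csv, so the tab-separated files load without preprocessing. A self-contained sketch with hypothetical file names in place of the absolute paths above:

    # Minimal sketch: load tab-separated train/validation splits with datasets.
    import datasets

    data_files = {
        'train': 'train.tsv',        # hypothetical stand-ins for the paths above
        'validation': 'dev.tsv',
    }
    dataset = datasets.load_dataset('csv', data_files=data_files, sep='\t')
    print(dataset['train'][0])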
@@ -71,9 +70,6 @@ def load_dataset_splits(args):
             'test': dataset['validation'],
         }

-        assert (
-            dataset['train'].n_shards == 1024
-        ), "We want to have many shards for efficient processing with num_workes in PyTorch dataloader"
     elif args.mode == 'ft':
         dataset_splits = datasets.load_dataset(
             args.data.exec_file_path,
@@ -117,7 +113,6 @@ def process_dataset(dataset_splits, args, tokenizer):
                 remove_columns=['text'],
             )

-            dataset_split = dataset_split.shuffle(buffer_size=10_000, seed=args.seed)
             final_datasets[split] = dataset_split
     elif args.mode == 'ft':
         final_datasets = dataset_splits
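Dropping the buffer-based shuffle here (and the n_shards assert above) is consistent with the loader change: without streaming=True, load_dataset('csv', ...) returns a map-style Dataset, whose shuffle takes no buffer_size; only an IterableDataset shuffles through a buffer. A sketch of the distinction (to_iterable_dataset requires a recent datasets release):

    # Minimal sketch: shuffle semantics for map-style vs streaming datasets.
    import datasets

    ds = datasets.Dataset.from_dict({'text': ['a', 'b', 'c']})
    ds = ds.shuffle(seed=2137)                        # global, index-based shuffle

    ids = ds.to_iterable_dataset()                    # streaming-style view
    ids = ids.shuffle(seed=2137, buffer_size=10_000)  # approximate buffered shuffle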
@@ -176,11 +171,6 @@ def get_dataloaders(tokenizer, config, args):

         shuffle = (split == 'train') and not is_iterable

-        if args.mode == 'ft' and split == 'train':
-            assert shuffle is True
-        else:
-            assert shuffle is False
-
         dataloaders[split] = DataLoader(
             dataset[split],
             shuffle=shuffle,
@@ -199,7 +189,7 @@ def get_dataloaders(tokenizer, config, args):

     if args.optim.epochs > 0:
         assert not is_iterable
-        args.optim.total_steps = len(dataloaders['train']) * args.optim.epochs
+        args.optim.total_steps = (len(dataloaders['train']) // args.optim.grad_acc) * args.optim.epochs

     # We increase eval BS by 2, so decrease number of eval steps
     args.eval.corrected_steps = args.eval.steps / 2
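The corrected formula accounts for gradient accumulation: one optimizer step consumes grad_acc dataloader batches, so an epoch yields len(dataloader) // grad_acc optimizer steps, and the old expression overcounted by a factor of grad_acc. Worked arithmetic with illustrative numbers (grad_acc=2 is an assumed value, not taken from the diff):

    # Minimal sketch: optimizer steps per epoch under gradient accumulation.
    num_batches = 10_000             # stand-in for len(dataloaders['train'])
    grad_acc = 2                     # assumed; args.optim.grad_acc in the code
    epochs = 1                       # from the config change above

    total_steps = (num_batches // grad_acc) * epochs
    print(total_steps)               # 5000 optimizer steps, not 10000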
tokenizer/spiece.model (new binary file; binary file not shown)
tokenizer/spiece.vocab (new file, 50000 lines; diff suppressed because it is too large)
tokenizer/train_tokenizer.sh (new executable file)
@@ -0,0 +1,5 @@
+wget https://huggingface.co/allegro/plt5-base/raw/main/tokenizer_config.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/special_tokens_map.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/config.json
+
+spm_train --input='/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_tok_2.csv' --model_prefix='spiece' --vocab_size=50000 --character_coverage=1.0 --model_type=unigram --train_extremely_large_corpus=true
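The spiece.model written by spm_train is the same SentencePiece format T5 tokenizers consume, so it can be wrapped directly; a minimal sketch (assumes sentencepiece and transformers are installed and the script above has been run inside tokenizer/):

    # Minimal sketch: wrap the freshly trained SentencePiece model as a T5 tokenizer.
    from transformers import T5Tokenizer

    tokenizer = T5Tokenizer('tokenizer/spiece.model')  # slow tokenizer; vocab_file arg
    print(tokenizer.vocab_size)                        # ~50000 plus sentinel tokens
    print(tokenizer.tokenize('Rok tysiąc osiemset pięćdziesiąty'))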