pl changes

commit c50cf6b0f2 (parent bddaf93fcf)
Training config (YAML):

@@ -10,7 +10,7 @@ predict_only: false
 seed: 2137
 
 model:
-  name: 'google/t5-v1_1-base'
+  name: 'allegro/plt5-base'
   checkpoint_path: ''
   dropout: 0.0
   random_init: true
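The base model switches from English T5 to the Polish plT5 checkpoint. Since `random_init: true` stays set, only the architecture of `allegro/plt5-base` is borrowed, not its pretrained weights. A minimal sketch of what that implies; the class choice here is an assumption, since nanoT5 builds its model in its own helper code:

```python
# Sketch: with random_init: true, only the 'allegro/plt5-base' architecture is used;
# weights start from scratch. (T5ForConditionalGeneration is an assumption here;
# nanoT5 constructs the model in its own get_model-style helper.)
from transformers import AutoConfig, T5ForConditionalGeneration

config = AutoConfig.from_pretrained('allegro/plt5-base')
model = T5ForConditionalGeneration(config)  # fresh random weights, plT5-base shapes
```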
@@ -25,9 +25,9 @@ data:
 optim:
   name: adamwscale
   base_lr: 2e-2
-  batch_size: 144
+  batch_size: 64
   total_steps: 65536
-  epochs: -1 # If it's > 0 it overwrites total_steps
+  epochs: 1 # If it's > 0 it overwrites total_steps
   warmup_steps: 10000
   lr_scheduler: cosine
   weight_decay: 0.0
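With `epochs: 1` set, the configured `total_steps: 65536` becomes inert: it is recomputed from the dataloader length at startup (see the `get_dataloaders` hunk below). One thing worth sanity-checking, sketched here with a hypothetical corpus size, is that a single epoch still outlasts the 10000 warmup steps of the cosine schedule:

```python
# Hypothetical sanity check; the real corpus size is not part of this diff.
num_train_examples = 1_000_000             # placeholder
batch_size = 64                            # optim.batch_size after this change
grad_acc = 1                               # optim.grad_acc (value not shown in the diff)
steps_per_epoch = num_train_examples // batch_size // grad_acc
total_steps = steps_per_epoch * 1          # epochs: 1 overwrites total_steps: 65536
assert total_steps > 10_000, "cosine schedule would never finish its warmup"
```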
@@ -36,7 +36,7 @@ optim:
   final_cosine: 1e-5
 
 eval:
-  every_steps: 100000 # Don't eval
+  every_steps: 10000
   steps: 500
 
 checkpoint:
@@ -48,7 +48,7 @@ logging:
   project:
   api_token:
   tags:
-  every_steps: 100
+  every_steps: 1000
   grad_l2: true
   weights_l2: true
 
Training utilities (Python):

@@ -46,7 +46,7 @@ def get_config(args):
 
 def get_tokenizer(args):
     tokenizer = AutoTokenizer.from_pretrained(
-        args.model.name,
+        '/mnt/gpu_data1/zrostek/nanoT5/tokenizer',
         use_fast=True
     )
     tokenizer.model_max_length = int(1e9)
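The tokenizer is now loaded from a hardcoded local directory instead of the model name on the Hub. For this to work, the directory must hold the `spiece.model` produced by `tokenizer/train_tokenizer.sh` (added below) together with the plT5 tokenizer configs that script downloads. A quick smoke test, assuming that layout:

```python
# Smoke test for the local tokenizer directory (the path is the one hardcoded in the diff).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    '/mnt/gpu_data1/zrostek/nanoT5/tokenizer',
    use_fast=True,
)
print(len(tokenizer))  # expect ~50000 pieces, plus any sentinel tokens from the configs
print(tokenizer.tokenize("Przykładowe polskie zdanie."))
```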
@@ -56,14 +56,13 @@ def get_tokenizer(args):
 
 def load_dataset_splits(args):
     if args.mode == 'pt':
+        data_files = {}
+        data_files["train"] = "/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_in_filtered_3.csv"
+        data_files["validation"] = '/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/dev-0_in_filtered_3.csv'
         dataset = datasets.load_dataset(
-            'c4',
-            'en',
-            streaming=True,
+            'csv',
+            data_files=data_files,
+            sep='\t',
         )
-
-        dataset = dataset.remove_columns(
-            ['timestamp', 'url']
-        )
 
         dataset_splits = {
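The streaming C4 source is replaced by two local files. Despite the `.csv` extension they are tab-separated, hence `sep='\t'`, which the `csv` builder forwards to `pandas.read_csv`. This also changes the dataset's type: `streaming=True` produced an `IterableDataset`, while the `csv` builder yields an ordinary map-style `Dataset`, which is why the shard-count assertion and the buffer shuffle disappear in the hunks below. A minimal stand-alone version, with placeholder relative paths in place of the absolute ones from the diff:

```python
# Sketch of the new loading path; file names are placeholders for the absolute
# paths used in the diff. The files are TSVs despite the .csv extension.
import datasets

data_files = {
    "train": "train_in_filtered_3.csv",
    "validation": "dev-0_in_filtered_3.csv",
}
dataset = datasets.load_dataset('csv', data_files=data_files, sep='\t')
print(dataset['train'].column_names)  # must include 'text' for the tokenization map
```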
@@ -71,9 +70,6 @@ def load_dataset_splits(args):
             'test': dataset['validation'],
         }
 
-        assert (
-            dataset['train'].n_shards == 1024
-        ), "We want to have many shards for efficient processing with num_workes in PyTorch dataloader"
     elif args.mode == 'ft':
         dataset_splits = datasets.load_dataset(
             args.data.exec_file_path,
@@ -117,7 +113,6 @@ def process_dataset(dataset_splits, args, tokenizer):
                 remove_columns=['text'],
             )
 
-            dataset_split = dataset_split.shuffle(buffer_size=10_000, seed=args.seed)
             final_datasets[split] = dataset_split
     elif args.mode == 'ft':
         final_datasets = dataset_splits
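`shuffle(buffer_size=..., seed=...)` is the approximate, buffer-based shuffle of streaming `IterableDataset`s; a map-style dataset from the `csv` builder does not take a `buffer_size`, and shuffling now happens in the `DataLoader` instead (next hunk). For reference, the two APIs in recent versions of the datasets library, sketched on a toy dataset:

```python
# Map-style vs streaming shuffles in the datasets library (toy example).
import datasets

ds = datasets.Dataset.from_dict({'text': ['a', 'b', 'c']})
ds_shuffled = ds.shuffle(seed=2137)                       # global shuffle, no buffer needed

it = ds.to_iterable_dataset()
it_shuffled = it.shuffle(buffer_size=10_000, seed=2137)   # streaming: buffer-based
```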
@@ -176,11 +171,6 @@ def get_dataloaders(tokenizer, config, args):
 
         shuffle = (split == 'train') and not is_iterable
 
-        if args.mode == 'ft' and split == 'train':
-            assert shuffle is True
-        else:
-            assert shuffle is False
-
         dataloaders[split] = DataLoader(
             dataset[split],
             shuffle=shuffle,
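These assertions encoded the old invariant that only fine-tuning data is map-style, so pre-training on streaming C4 always had `shuffle` False. With the csv-backed corpus, `is_iterable` is False in 'pt' mode too, so the train split now legitimately gets `shuffle=True` and the check had to go:

```python
# Why the old assertions no longer hold (values are illustrative):
is_iterable = False                                  # csv-backed Dataset is map-style
for split in ('train', 'test'):
    shuffle = (split == 'train') and not is_iterable
    # 'pt' mode now behaves like 'ft': shuffle is True for train, False otherwise
```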
@@ -199,7 +189,7 @@ def get_dataloaders(tokenizer, config, args):
 
     if args.optim.epochs > 0:
         assert not is_iterable
-        args.optim.total_steps = len(dataloaders['train']) * args.optim.epochs
+        args.optim.total_steps = (len(dataloaders['train']) // args.optim.grad_acc) * args.optim.epochs
 
     # We increase eval BS by 2, so decrease number of eval steps
     args.eval.corrected_steps = args.eval.steps / 2
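This fixes the epoch-to-steps conversion under gradient accumulation: one optimizer step consumes `grad_acc` dataloader batches, so the old formula overcounted total steps by that factor, stretching the LR schedule past the end of training. A worked example with hypothetical numbers:

```python
# Hypothetical numbers to show the off-by-grad_acc factor being fixed.
batches_per_epoch = 10_000         # len(dataloaders['train'])
grad_acc = 2                       # args.optim.grad_acc
epochs = 1

old_total = batches_per_epoch * epochs                 # 10000 "steps": too many
new_total = (batches_per_epoch // grad_acc) * epochs   # 5000 optimizer steps: correct
```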
tokenizer/spiece.model — new binary file (contents not shown).
tokenizer/spiece.vocab — new file, 50000 lines (diff suppressed: too large).
tokenizer/train_tokenizer.sh — new executable file, 5 lines:
@@ -0,0 +1,5 @@
+wget https://huggingface.co/allegro/plt5-base/raw/main/tokenizer_config.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/special_tokens_map.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/config.json
+
+spm_train --input='/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_tok_2.csv' --model_prefix='spiece' --vocab_size=50000 --character_coverage=1.0 --model_type=unigram --train_extremely_large_corpus=true
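The script reuses plT5's tokenizer and model config JSONs as-is and trains a fresh 50k-piece unigram SentencePiece model on the Polish corpus; `character_coverage=1.0` keeps every character, which matters for Polish diacritics. The same training call through sentencepiece's Python API, as a sketch (the script itself uses the `spm_train` CLI, and the input path is shortened to a placeholder):

```python
# Python-API equivalent of the spm_train invocation above (a sketch; the script
# itself calls the spm_train CLI binary).
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='train_tok_2.csv',             # placeholder for the absolute path in the script
    model_prefix='spiece',               # writes spiece.model and spiece.vocab
    vocab_size=50000,
    character_coverage=1.0,              # keep all characters (Polish diacritics)
    model_type='unigram',
    train_extremely_large_corpus=True,
)
```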