pl changes

zrostek 2023-05-10 13:19:19 +00:00
parent bddaf93fcf
commit c50cf6b0f2
5 changed files with 50018 additions and 23 deletions


@@ -10,7 +10,7 @@ predict_only: false
 seed: 2137
 model:
-  name: 'google/t5-v1_1-base'
+  name: 'allegro/plt5-base'
   checkpoint_path: ''
   dropout: 0.0
   random_init: true
@@ -25,9 +25,9 @@ data:
 optim:
   name: adamwscale
   base_lr: 2e-2
-  batch_size: 144
+  batch_size: 64
   total_steps: 65536
-  epochs: -1 # If it's > 0 it overwrites total_steps
+  epochs: 1 # If it's > 0 it overwrites total_steps
   warmup_steps: 10000
   lr_scheduler: cosine
   weight_decay: 0.0
@@ -36,7 +36,7 @@ optim:
   final_cosine: 1e-5
 eval:
-  every_steps: 100000 # Don't eval
+  every_steps: 10000
   steps: 500
 checkpoint:
@@ -48,7 +48,7 @@ logging:
     project:
     api_token:
     tags:
-  every_steps: 100
+  every_steps: 1000
   grad_l2: true
   weights_l2: true

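The model switch above keeps random_init: true, so only the allegro/plt5-base architecture and vocabulary size are reused and the weights are trained from scratch. A minimal sketch of what that combination amounts to in transformers (illustrative only, not necessarily the exact code path in this repo):

    from transformers import AutoConfig, T5ForConditionalGeneration

    # random_init: true -> build a plT5-base-sized model from its config alone,
    # without loading the pretrained weights
    config = AutoConfig.from_pretrained('allegro/plt5-base')
    model = T5ForConditionalGeneration(config)  # randomly initialised

    # with random_init: false one would instead load the published checkpoint:
    # model = T5ForConditionalGeneration.from_pretrained('allegro/plt5-base')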

@@ -46,7 +46,7 @@ def get_config(args):
 def get_tokenizer(args):
     tokenizer = AutoTokenizer.from_pretrained(
-        args.model.name,
+        '/mnt/gpu_data1/zrostek/nanoT5/tokenizer',
         use_fast=True
     )
     tokenizer.model_max_length = int(1e9)
@@ -56,14 +56,13 @@ def get_tokenizer(args):
 def load_dataset_splits(args):
     if args.mode == 'pt':
+        data_files = {}
+        data_files["train"] = "/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_in_filtered_3.csv"
+        data_files["validation"] = '/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/dev-0_in_filtered_3.csv'
         dataset = datasets.load_dataset(
-            'c4',
-            'en',
-            streaming=True,
-        )
-        dataset = dataset.remove_columns(
-            ['timestamp', 'url']
+            'csv',
+            data_files=data_files,
+            sep='\t',
         )
         dataset_splits = {
@@ -71,9 +70,6 @@
             'test': dataset['validation'],
         }
-        assert (
-            dataset['train'].n_shards == 1024
-        ), "We want to have many shards for efficient processing with num_workes in PyTorch dataloader"
     elif args.mode == 'ft':
         dataset_splits = datasets.load_dataset(
             args.data.exec_file_path,
@@ -117,7 +113,6 @@ def process_dataset(dataset_splits, args, tokenizer):
                 remove_columns=['text'],
             )
-            dataset_split = dataset_split.shuffle(buffer_size=10_000, seed=args.seed)
             final_datasets[split] = dataset_split
     elif args.mode == 'ft':
         final_datasets = dataset_splits
@@ -176,11 +171,6 @@ def get_dataloaders(tokenizer, config, args):
         shuffle = (split == 'train') and not is_iterable
-        if args.mode == 'ft' and split == 'train':
-            assert shuffle is True
-        else:
-            assert shuffle is False
         dataloaders[split] = DataLoader(
             dataset[split],
             shuffle=shuffle,
@@ -199,7 +189,7 @@ def get_dataloaders(tokenizer, config, args):
         if args.optim.epochs > 0:
             assert not is_iterable
-            args.optim.total_steps = len(dataloaders['train']) * args.optim.epochs
+            args.optim.total_steps = (len(dataloaders['train']) // args.optim.grad_acc) * args.optim.epochs
         # We increase eval BS by 2, so decrease number of eval steps
         args.eval.corrected_steps = args.eval.steps / 2

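Two of the changes above deserve a note. Pre-training data is no longer streamed from C4: load_dataset_splits now reads two local tab-separated files through the datasets 'csv' builder, which also explains why the iterable-only shuffle(buffer_size=...) call and the n_shards assertion were dropped. A self-contained sketch of the new loading call, with placeholder file names standing in for the /mnt/gpu_data1/... paths in the diff and assuming the files carry a 'text' column (process_dataset still removes 'text' after tokenization):

    import datasets

    # placeholder paths; the commit points at TSV files under the diachronia-year-prediction project
    data_files = {
        'train': 'train_in_filtered_3.csv',
        'validation': 'dev-0_in_filtered_3.csv',
    }
    # the 'csv' builder forwards sep='\t' to pandas.read_csv, so tab-separated files load fine;
    # without streaming=True the result is a regular map-style DatasetDict
    dataset = datasets.load_dataset('csv', data_files=data_files, sep='\t')
    print(dataset['train'].column_names)  # expected to include 'text'

The other change is the total_steps formula: grad_acc consecutive batches form one optimizer update, so counting optimizer steps means dividing the number of batches by grad_acc. For example, a train loader of 10,000 batches with grad_acc = 2 and epochs = 1 gives total_steps = (10000 // 2) * 1 = 5000.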
tokenizer/spiece.model: new binary file (content not shown)

tokenizer/spiece.vocab: new file, 50000 lines (diff too large to display)

tokenizer/train_tokenizer.sh: new executable file, 5 lines

@@ -0,0 +1,5 @@
+wget https://huggingface.co/allegro/plt5-base/raw/main/tokenizer_config.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/special_tokens_map.json
+wget https://huggingface.co/allegro/plt5-base/raw/main/config.json
+spm_train --input='/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_tok_2.csv' --model_prefix='spiece' --vocab_size=50000 --character_coverage=1.0 --model_type=unigram --train_extremely_large_corpus=true
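train_tokenizer.sh assembles the new tokenizer directory: spm_train produces spiece.model and spiece.vocab (a 50,000-piece unigram SentencePiece model), and the three wget calls pull plT5-base's tokenizer_config.json, special_tokens_map.json and config.json next to them, so the directory looks like a regular Hugging Face checkpoint; the modified get_tokenizer then loads it by path. A hedged sanity check, assuming that directory layout and an illustrative local path:

    from transformers import AutoTokenizer

    # directory written by train_tokenizer.sh (spiece.model plus the downloaded json files)
    tokenizer = AutoTokenizer.from_pretrained('tokenizer/', use_fast=True)
    print(len(tokenizer))                            # roughly the 50000 trained pieces plus special tokens
    print(tokenizer.tokenize('Zażółć gęślą jaźń'))   # Polish text should segment into sensible subwords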