This commit is contained in:
Patryk Bartkowiak 2025-01-03 06:16:19 +00:00 committed by Patryk
parent 3d6826f058
commit 35e5d3e8fa
2 changed files with 15 additions and 16 deletions

View File

@@ -1,21 +1,21 @@
 {
-"project": "test",
-"run_name": "no-sinusoidal",
-"dataset": "patrykbart/codeparrot-clean-no-comments-starencoder",
-"output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
-"extra_embeddings": true,
+"project": "runpod",
+"run_name": "original",
+"dataset": "patrykbart/codeparrot-clean-no-comments-starencoder-small",
+"output_dir": "./outputs/long-no-comments-starencoder-original",
+"extra_embeddings": false,
 "seed": 420,
 "mlm_probability": 0.15,
-"batch_size": 32,
-"epochs": 1,
-"eval_every": 10000,
+"batch_size": 192,
+"epochs": 3,
+"eval_every": 2500,
 "learning_rate": 5e-4,
 "weight_decay": 0.1,
 "max_grad_norm": 1.0,
-"warmup_steps": 1000,
+"warmup_steps": 500,
 "bf16": true,
-"logging_steps": 500,
+"logging_steps": 100,
 "valid_size": 0.05,
 "test_size": 0.05,
-"num_samples": 1000
-}
+"num_samples": -1
+}

View File

@@ -3,7 +3,7 @@ import json
 import logging
 import zipfile
 from pathlib import Path
-from datasets import load_from_disk, DatasetDict
+from datasets import load_from_disk, DatasetDict, load_dataset
 from transformers import (
 AutoConfig,
 AutoTokenizer,
@@ -43,7 +43,6 @@ def main():
 # Setup paths
 current_dir = Path(__file__).parent
 config = load_config(current_dir / 'config.json')
-data_dir = Path(config['data_dir'])
 output_dir = Path(config['output_dir'])
 # Set seed
@@ -56,7 +55,7 @@ def main():
 wandb.save(current_dir / file)
 # Simplified dataset splitting
-dataset = load_dataset(config['dataset'], cache_dir="./data")
+dataset = load_dataset(config['dataset'], split='train')
 if config['num_samples'] > 0:
 dataset = dataset.select(range(config['num_samples']))
 train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
@@ -157,4 +156,4 @@ def main():
 logger.info('Training completed!')
 if __name__ == '__main__':
-main()
+main()