diff --git a/code/src/config.json b/code/src/config.json
index dc4706c..1428ca7 100644
--- a/code/src/config.json
+++ b/code/src/config.json
@@ -1,7 +1,7 @@
 {
     "project": "test",
     "run_name": "no-sinusoidal",
-    "data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/",
+    "dataset": "patrykbart/codeparrot-clean-no-comments-starencoder",
     "output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
     "extra_embeddings": true,
     "seed": 420,
diff --git a/code/src/training.py b/code/src/training.py
index 2cf41d5..267b01a 100644
--- a/code/src/training.py
+++ b/code/src/training.py
@@ -56,7 +56,9 @@ def main():
         wandb.save(current_dir / file)
 
     # Simplified dataset splitting
-    dataset = load_from_disk(data_dir)
+    # Request a single split: without `split=`, load_dataset returns a DatasetDict, which has
+    # neither .select() nor .train_test_split(). NOTE(review): assumes the hub split is "train".
+    dataset = load_dataset(config['dataset'], split="train", cache_dir="./data")
     if config['num_samples'] > 0:
         dataset = dataset.select(range(config['num_samples']))
     train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])