Use Hugging Face (hf) to load the online dataset

This commit is contained in:
Patryk Bartkowiak 2025-01-03 06:12:17 +00:00
parent dfb1e669bd
commit 3d6826f058
2 changed files with 2 additions and 2 deletions

View File

@@ -1,7 +1,7 @@
{ {
"project": "test", "project": "test",
"run_name": "no-sinusoidal", "run_name": "no-sinusoidal",
"data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/", "dataset": "patrykbart/codeparrot-clean-no-comments-starencoder",
"output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal", "output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
"extra_embeddings": true, "extra_embeddings": true,
"seed": 420, "seed": 420,

View File

@@ -56,7 +56,7 @@ def main():
wandb.save(current_dir / file) wandb.save(current_dir / file)
# Simplified dataset splitting # Simplified dataset splitting
dataset = load_from_disk(data_dir) dataset = load_dataset(config['dataset'], cache_dir="./data")
if config['num_samples'] > 0: if config['num_samples'] > 0:
dataset = dataset.select(range(config['num_samples'])) dataset = dataset.select(range(config['num_samples']))
train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size']) train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])