Use Hugging Face `load_dataset` to load the dataset from the Hub instead of from a local directory
This commit is contained in:
parent
dfb1e669bd
commit
3d6826f058
@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"project": "test",
|
"project": "test",
|
||||||
"run_name": "no-sinusoidal",
|
"run_name": "no-sinusoidal",
|
||||||
"data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/",
|
"dataset": "patrykbart/codeparrot-clean-no-comments-starencoder",
|
||||||
"output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
|
"output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
|
||||||
"extra_embeddings": true,
|
"extra_embeddings": true,
|
||||||
"seed": 420,
|
"seed": 420,
|
||||||
|
@ -56,7 +56,7 @@ def main():
|
|||||||
wandb.save(current_dir / file)
|
wandb.save(current_dir / file)
|
||||||
|
|
||||||
# Simplified dataset splitting
|
# Simplified dataset splitting
|
||||||
dataset = load_from_disk(data_dir)
|
dataset = load_dataset(config['dataset'], cache_dir="./data")
|
||||||
if config['num_samples'] > 0:
|
if config['num_samples'] > 0:
|
||||||
dataset = dataset.select(range(config['num_samples']))
|
dataset = dataset.select(range(config['num_samples']))
|
||||||
train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
|
train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
|
||||||
|
Loading…
Reference in New Issue
Block a user