From 3b93a7cc8a60c5a2618ad246781f8de2cad2db34 Mon Sep 17 00:00:00 2001
From: Patryk Bartkowiak
Date: Tue, 7 Jan 2025 15:59:30 +0000
Subject: [PATCH] prepared for run by prof Filip Gralinski (done)

---
 code/src/training.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/src/training.py b/code/src/training.py
index dc343f9..0fa7b41 100644
--- a/code/src/training.py
+++ b/code/src/training.py
@@ -7,7 +7,7 @@ import logging
 import numpy as np
 from pathlib import Path
 from safetensors.torch import load_file
-from datasets import load_from_disk, DatasetDict
+from datasets import load_from_disk, DatasetDict, load_dataset
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -58,7 +58,7 @@ def main():
     if config['extra_embeddings']:
         wandb.save(current_dir / 'tree_starencoder.py')
 
-    dataset = load_from_disk(data_dir)
+    dataset = load_dataset("patrykbart/codeparrot-clean-no-comments-starencoder-small", split='train', num_proc=16, cache_dir=data_dir.parent)
     if config['num_samples'] > 0:
         dataset = dataset.select(range(config['num_samples']))
     train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])