Compare commits

...

3 Commits

Author SHA1 Message Date
Patryk Bartkowiak
35e5d3e8fa original 2025-01-03 10:16:18 +00:00
Patryk Bartkowiak
3d6826f058 using hf to load online dataset 2025-01-03 06:12:17 +00:00
Patryk Bartkowiak
dfb1e669bd ready for runpod 2025-01-02 20:36:05 +00:00
5 changed files with 47 additions and 250 deletions

View File

@ -1,18 +1,19 @@
{
"extra_embeddings": true,
"run_name": "no-sinusoidal",
"data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/",
"output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
"project": "runpod",
"run_name": "original",
"dataset": "patrykbart/codeparrot-clean-no-comments-starencoder-small",
"output_dir": "./outputs/long-no-comments-starencoder-original",
"extra_embeddings": false,
"seed": 420,
"mlm_probability": 0.15,
"batch_size": 32,
"batch_size": 192,
"epochs": 3,
"eval_every": 10000,
"eval_every": 2500,
"learning_rate": 5e-4,
"weight_decay": 0.1,
"max_grad_norm": 1.0,
"warmup_steps": 1000,
"fp16": true,
"warmup_steps": 500,
"bf16": true,
"logging_steps": 100,
"valid_size": 0.05,
"test_size": 0.05,

View File

@ -1,118 +0,0 @@
import json
import logging
import multiprocessing
from pathlib import Path
from datasets import load_from_disk
# Script-wide logging: timestamped INFO-level records, plus this module's logger.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def load_node_types_from_json(json_path: Path):
    """
    Build the node-type vocabulary from a Tree-sitter `node_types.json` file.

    Every string stored under a "type" key, at any nesting depth of the JSON
    document, is collected. The names are sorted and numbered from 1, and
    "<UNK>" is reserved as ID 0.

    Args:
        json_path (Path): Location of the `node_types.json` file.

    Returns:
        dict: Mapping from node type string to its integer ID.

    Raises:
        FileNotFoundError: If `json_path` does not exist.
    """
    if not json_path.exists():
        raise FileNotFoundError(f"{json_path} not found.")

    logger.info(f"Loading node types from {json_path}...")
    with open(json_path, "r", encoding="utf-8") as handle:
        grammar = json.load(handle)

    # Walk the whole JSON tree with an explicit work stack; visiting order is
    # irrelevant because the names end up in a set that is sorted afterwards.
    found = set()
    pending = [grammar]
    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
        elif isinstance(node, dict):
            type_name = node.get("type")
            if isinstance(type_name, str):
                found.add(type_name)
            pending.extend(node.values())

    # "<UNK>" takes index 0; real node types follow in sorted order from 1.
    node_type2id = {"<UNK>": 0}
    node_type2id.update(
        (name, idx) for idx, name in enumerate(sorted(found), start=1)
    )

    logger.info(f"Loaded {len(node_type2id)} node types, including UNK.")
    return node_type2id
def encode_node_types(examples, node_type2id):
    """
    Batched map function: translate node type strings into integer IDs.

    Reads `examples["node_types"]` (one list of node type names per example)
    and stores the parallel lists of IDs under `examples["node_types_encoded"]`.
    Entries that are None or 'ERROR' are mapped to the '<UNK>' ID.

    Args:
        examples: Batch dict containing a "node_types" column.
        node_type2id: Mapping from node type string to integer ID.

    Returns:
        The same `examples` object, with "node_types_encoded" added.

    Raises:
        KeyError: If a node type is missing from `node_type2id`.
    """

    def to_id(label):
        # Missing / unparsable nodes share the unknown-token ID.
        if label is None or label == 'ERROR':
            return node_type2id['<UNK>']
        return node_type2id[label]

    all_ids = []
    for labels in examples["node_types"]:
        try:
            ids = [to_id(label) for label in labels]
        except KeyError as e:
            raise KeyError(f"Unknown node type encountered: {e}")
        all_ids.append(ids)

    examples["node_types_encoded"] = all_ids
    return examples
def main():
    """
    Main script to load, process, and save a dataset with node types encoded as integers.

    Reads the padded dataset from `../data/codeparrot-clean-parsed-starencoder-classes-padded`,
    builds the node-type vocabulary from `node_types.json` next to this script,
    writes that vocabulary as `node_type2id.json` into the output directory,
    adds a `node_types_encoded` column via `encode_node_types`, and saves the
    result to `../data/codeparrot-clean-parsed-starencoder-classes-encoded`.
    """
    # ------------------------------------------------------------------------------
    # 1. Setup paths & load dataset
    # ------------------------------------------------------------------------------
    current_dir = Path(__file__).parent
    input_dir = current_dir.parent / "data" / "codeparrot-clean-parsed-starencoder-classes-padded"
    output_dir = current_dir.parent / "data" / "codeparrot-clean-parsed-starencoder-classes-encoded"
    node_types_path = current_dir / "node_types.json"
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Loading dataset from {input_dir}...")
    dataset = load_from_disk(str(input_dir))
    logger.info("Dataset loaded.")

    # Leave one core free and cap at 32 workers. Clamp to at least 1 because
    # cpu_count() - 1 is 0 on a single-core machine and map() rejects num_proc <= 0.
    num_proc = max(1, min(multiprocessing.cpu_count() - 1, 32))
    logger.info(f"Using {num_proc} processes.")

    # ------------------------------------------------------------------------------
    # 2. Load node types from JSON
    # ------------------------------------------------------------------------------
    node_type2id = load_node_types_from_json(node_types_path)
    logger.info(f"Loaded {len(node_type2id)} node types.")

    # Persist the vocabulary next to the encoded dataset so the IDs can be decoded later.
    # Explicit UTF-8 mirrors how node_types.json is read.
    with open(output_dir / "node_type2id.json", "w", encoding="utf-8") as f:
        json.dump(node_type2id, f)

    # ------------------------------------------------------------------------------
    # 3. Convert node types in the dataset to integer IDs
    # ------------------------------------------------------------------------------
    logger.info("Converting node type strings to integer IDs...")
    dataset = dataset.map(
        lambda examples: encode_node_types(examples, node_type2id),
        batched=True,
        num_proc=num_proc,
        desc="Encoding node types to integer IDs",
    )

    # ------------------------------------------------------------------------------
    # 4. Save the modified dataset to disk
    # ------------------------------------------------------------------------------
    logger.info(f"Saving updated dataset to {output_dir}...")
    dataset.save_to_disk(str(output_dir))
    logger.info("Dataset saved successfully.")


if __name__ == "__main__":
    main()

View File

@ -120,7 +120,7 @@ def main():
# Setup paths
current_dir = Path(__file__).parent
config = load_config(current_dir / 'eval_config.json')
model_dir = Path(config['model_dir']) / 'final-model'
model_dir = Path(config['model_dir'])
data_dir = Path(config['data_dir'])
results_dir = Path(config['model_dir']) / 'evaluation_results'
results_dir.mkdir(exist_ok=True)
@ -133,7 +133,7 @@ def main():
model_config.max_position_embeddings = 1024
if config['extra_embeddings']:
model = TreeStarEncoderForPreTraining(config=model_config, log=False)
model = TreeStarEncoderForPreTraining(config=model_config)
else:
model = AutoModelForMaskedLM.from_config(model_config)

View File

@ -1,77 +0,0 @@
import logging
from pathlib import Path
from datasets import load_from_disk
from transformers import AutoTokenizer
import multiprocessing
# Script-wide logging: timestamped INFO-level records, plus this module's logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def pad_and_save_dataset(input_dir, output_dir, tokenizer_name='bigcode/starencoder', max_length=512):
    """
    Pad every example of a processed dataset to a fixed length and save it.

    The token IDs are decoded back to text and re-tokenized with
    `padding='max_length'` and `truncation=True`; the per-token side columns
    (`depths`, `sibling_idxs`, `node_types`, `node_texts`) are brought to the
    same length with filler values (-1, -1, None and '' respectively).

    Args:
        input_dir: Directory of the processed dataset (loaded with `load_from_disk`).
        output_dir: Directory the padded dataset is written to.
        tokenizer_name: Tokenizer identifier passed to `AutoTokenizer.from_pretrained`.
        max_length: Target length of every sequence column.
    """
    # Load the processed dataset
    logger.info(f"Loading processed dataset from {input_dir}...")
    dataset = load_from_disk(input_dir)
    logger.info(f"Loaded dataset with {len(dataset)} examples")

    # Initialize tokenizer; EOS doubles as the padding token
    logger.info("Initializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer.pad_token = tokenizer.eos_token
    logger.info("Loaded StarEncoder tokenizer")

    # Leave one core free and cap at 32 workers. Clamp to at least 1 because
    # cpu_count() - 1 is 0 on a single-core machine and map() rejects num_proc <= 0.
    num_proc = max(1, min(multiprocessing.cpu_count() - 1, 32))
    logger.info(f"Using {num_proc} processes")

    def fit_to_length(seq, fill):
        # Clip first so rows longer than max_length end up the same length as
        # the truncated input_ids instead of staying over-long.
        clipped = seq[:max_length]
        return clipped + [fill] * (max_length - len(clipped))

    # Define a function to pad the sequences
    def pad_sequences(batch):
        # Convert input_ids back to text so the tokenizer can pad/truncate them.
        # NOTE(review): assumes decode -> re-encode keeps tokens aligned with the
        # per-token side columns - TODO confirm.
        texts = tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True)

        # Use the tokenizer's __call__ method for padding
        padded_inputs = tokenizer(
            texts,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
            truncation=True
        )

        # Bring the other fields to max_length with their filler values
        return {
            'input_ids': padded_inputs['input_ids'].tolist(),
            'attention_mask': padded_inputs['attention_mask'].tolist(),
            'depths': [fit_to_length(seq, -1) for seq in batch['depths']],
            'sibling_idxs': [fit_to_length(seq, -1) for seq in batch['sibling_idxs']],
            'node_types': [fit_to_length(seq, None) for seq in batch['node_types']],
            'node_texts': [fit_to_length(seq, '') for seq in batch['node_texts']]
        }

    # Apply padding
    logger.info("Applying padding to dataset...")
    padded_dataset = dataset.map(
        pad_sequences,
        batched=True,
        desc="Padding dataset",
        num_proc=num_proc
    )

    # Save the padded dataset
    logger.info(f"Saving padded dataset to {output_dir}...")
    padded_dataset.save_to_disk(output_dir)
    logger.info(f"Saved padded dataset to {output_dir}")


if __name__ == "__main__":
    current_dir = Path(__file__).parent
    input_dir = current_dir.parent / 'data' / 'codeparrot-clean-parsed-starencoder-classes'
    output_dir = current_dir.parent / 'data' / 'codeparrot-clean-parsed-starencoder-classes-padded'
    pad_and_save_dataset(input_dir, output_dir)

View File

@ -1,12 +1,11 @@
import wandb
import json
import logging
import zipfile
from pathlib import Path
from datasets import load_from_disk, DatasetDict
from datasets import load_from_disk, DatasetDict, load_dataset
from transformers import (
RobertaConfig,
AutoConfig,
RobertaForMaskedLM,
AutoTokenizer,
TrainingArguments,
Trainer,
@ -44,41 +43,31 @@ def main():
# Setup paths
current_dir = Path(__file__).parent
config = load_config(current_dir / 'config.json')
data_dir = Path(config['data_dir'])
output_dir = Path(config['output_dir'])
# Set seed
set_seed(config['seed'])
# Initialize W&B
wandb.init(project='codeparrot-starencoder-no-comments', config=config, name=config['run_name'])
# Initialize W&B and save files
wandb.init(project=config['project'], config=config, name=config['run_name'])
for file in [__file__, 'config.json', 'tree_starencoder.py']:
if config['extra_embeddings'] or file != 'tree_starencoder.py':
wandb.save(current_dir / file)
# Upload the training files to W&B
wandb.save(__file__)
wandb.save(Path(__file__).parent / 'config.json')
if config['extra_embeddings']:
wandb.save(current_dir / 'tree_starencoder.py')
if 'CodeSearchNet' in config['data_dir']:
dataset = DatasetDict({
'train': load_from_disk(data_dir / 'train'),
'valid': load_from_disk(data_dir / 'valid'),
'test': load_from_disk(data_dir / 'test')
})
else:
dataset = load_from_disk(data_dir)
if config['num_samples'] > 0:
dataset = dataset.select(range(config['num_samples']))
train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
test_valid = train_testvalid['test'].train_test_split(
test_size=config['valid_size'] / (config['test_size'] + config['valid_size']),
seed=config['seed']
)
dataset = DatasetDict({
'train': train_testvalid['train'],
'test': test_valid['test'],
'valid': test_valid['train'],
})
# Simplified dataset splitting
dataset = load_dataset(config['dataset'], split='train')
if config['num_samples'] > 0:
dataset = dataset.select(range(config['num_samples']))
train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
test_valid = train_testvalid['test'].train_test_split(
test_size=config['valid_size'] / (config['test_size'] + config['valid_size']),
seed=config['seed']
)
dataset = DatasetDict({
'train': train_testvalid['train'],
'test': test_valid['test'],
'valid': test_valid['train'],
})
# Continue with the rest of processing
@ -91,15 +80,10 @@ def main():
dataset = dataset.remove_columns(columns_to_remove)
logger.info(f'Loaded dataset:\n{dataset}')
# Initialize model from scratch
# Simplify tokenizer setup
tokenizer = AutoTokenizer.from_pretrained('bigcode/starencoder')
if tokenizer.mask_token is None:
tokenizer.add_special_tokens({'mask_token': '<mask>'})
tokenizer.mask_token = '<mask>'
logger.info("Added '<mask>' as the mask token.")
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
logger.info("Set padding token to be the same as the EOS token.")
tokenizer.add_special_tokens({'mask_token': '<mask>'}) if tokenizer.mask_token is None else None
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
model_config = AutoConfig.from_pretrained('bigcode/starencoder')
if config['extra_embeddings']:
@ -123,11 +107,12 @@ def main():
save_steps=config['eval_every'],
eval_strategy='steps',
save_strategy='steps',
save_total_limit=5,
load_best_model_at_end=True,
report_to='wandb',
run_name=config['run_name'],
seed=config['seed'],
fp16=config['fp16'],
bf16=config['bf16'],
dataloader_num_workers=8,
gradient_checkpointing=True,
metric_for_best_model='eval_loss',
@ -162,6 +147,12 @@ def main():
trainer.save_model(output_dir / 'final-model')
tokenizer.save_pretrained(output_dir / 'final-model')
# Zip and upload the final model to W&B
with zipfile.ZipFile(output_dir / 'final-model.zip', 'w') as zipf:
for file in (output_dir / 'final-model').glob('**/*'):
zipf.write(file, arcname=file.name)
wandb.save(output_dir / 'final-model.zip')
logger.info('Training completed!')
if __name__ == '__main__':