diff --git a/code/src/config.json b/code/src/config.json
index f2bea33..dc4706c 100644
--- a/code/src/config.json
+++ b/code/src/config.json
@@ -1,20 +1,21 @@
 {
-    "extra_embeddings": true,
+    "project": "test",
     "run_name": "no-sinusoidal",
     "data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/",
     "output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
+    "extra_embeddings": true,
     "seed": 420,
     "mlm_probability": 0.15,
     "batch_size": 32,
-    "epochs": 3,
+    "epochs": 1,
     "eval_every": 10000,
     "learning_rate": 5e-4,
     "weight_decay": 0.1,
     "max_grad_norm": 1.0,
     "warmup_steps": 1000,
-    "fp16": true,
-    "logging_steps": 100,
+    "bf16": true,
+    "logging_steps": 500,
     "valid_size": 0.05,
     "test_size": 0.05,
-    "num_samples": -1
+    "num_samples": 1000
 }
\ No newline at end of file
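These keys are consumed through plain config[...] lookups in training.py and eval_model.py; eval_model.py goes through a load_config helper. Below is a minimal sketch of such a loader, assuming it does nothing beyond json.load — the helper body is an illustration, not the repository's actual implementation:

import json
from pathlib import Path

def load_config(path: Path) -> dict:
    """Assumed behaviour: read a JSON config file into a plain dict."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

config = load_config(Path("code/src/config.json"))
print(config["bf16"], config["epochs"], config["num_samples"])  # True 1 1000 after this change
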
diff --git a/code/src/encode_classes.py b/code/src/encode_classes.py
deleted file mode 100644
index f267ae0..0000000
--- a/code/src/encode_classes.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import json
-import logging
-import multiprocessing
-from pathlib import Path
-from datasets import load_from_disk
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
-logger = logging.getLogger(__name__)
-
-def load_node_types_from_json(json_path: Path):
-    """
-    Load node types from the Tree-sitter grammar's `node_types.json` and include UNK as the 0 index.
-
-    Args:
-        json_path (Path): Path to the `node_types.json` file.
-
-    Returns:
-        dict: A mapping from node type strings to unique integer IDs.
-    """
-    if not json_path.exists():
-        raise FileNotFoundError(f"{json_path} not found.")
-
-    logger.info(f"Loading node types from {json_path}...")
-    with open(json_path, "r", encoding="utf-8") as f:
-        node_types_data = json.load(f)
-
-    # Extract all unique "type" entries
-    node_types = set()
-
-    def extract_types(data):
-        if isinstance(data, list):
-            for item in data:
-                extract_types(item)
-        elif isinstance(data, dict):
-            if "type" in data and isinstance(data["type"], str):
-                node_types.add(data["type"])
-            for key, value in data.items():
-                extract_types(value)
-
-    extract_types(node_types_data)
-
-    # Create mapping and add 'UNK' at index 0
-    node_type2id = {"<UNK>": 0}
-    for i, node_type in enumerate(sorted(node_types), start=1):
-        node_type2id[node_type] = i
-
-    logger.info(f"Loaded {len(node_type2id)} node types, including UNK.")
-    return node_type2id
-
-def encode_node_types(examples, node_type2id):
-    """
-    Batched function to replace node type strings with their integer IDs using a preloaded mapping.
-    """
-    encoded_node_types = []
-    for node_list in examples["node_types"]:
-        try:
-            encoded_node_list = [node_type2id[nt] if nt is not None and nt != 'ERROR' else node_type2id['<UNK>'] for nt in node_list]
-            encoded_node_types.append(encoded_node_list)
-        except KeyError as e:
-            raise KeyError(f"Unknown node type encountered: {e}")
-    examples["node_types_encoded"] = encoded_node_types
-    return examples
-
-def main():
-    """
-    Main script to load, process, and save a dataset with node types encoded as integers.
-    """
-    # ------------------------------------------------------------------------------
-    # 1. Setup paths & load dataset
-    # ------------------------------------------------------------------------------
-    current_dir = Path(__file__).parent
-    input_dir = current_dir.parent / "data" / "codeparrot-clean-parsed-starencoder-classes-padded"
-    output_dir = current_dir.parent / "data" / "codeparrot-clean-parsed-starencoder-classes-encoded"
-    node_types_path = current_dir / "node_types.json"
-
-    output_dir.mkdir(parents=True, exist_ok=True)
-    logger.info(f"Loading dataset from {input_dir}...")
-    dataset = load_from_disk(str(input_dir))
-    logger.info("Dataset loaded.")
-
-    # Determine number of processes to use
-    num_proc = min(multiprocessing.cpu_count() - 1, 32)
-    logger.info(f"Using {num_proc} processes.")
-
-    # ------------------------------------------------------------------------------
-    # 2. Load node types from JSON
-    # ------------------------------------------------------------------------------
-    node_type2id = load_node_types_from_json(node_types_path)
-    logger.info(f"Loaded {len(node_type2id)} node types.")
-    # Save node_type2id to disk
-    with open(output_dir / "node_type2id.json", "w") as f:
-        json.dump(node_type2id, f)
-
-    # ------------------------------------------------------------------------------
-    # 3. Convert node types in the dataset to integer IDs
-    # ------------------------------------------------------------------------------
-    logger.info("Converting node type strings to integer IDs...")
-
-    dataset = dataset.map(
-        lambda examples: encode_node_types(examples, node_type2id),
-        batched=True,
-        num_proc=num_proc,
-        desc="Encoding node types to integer IDs",
-    )
-
-    # ------------------------------------------------------------------------------
-    # 4. Save the modified dataset to disk
-    # ------------------------------------------------------------------------------
-    logger.info(f"Saving updated dataset to {output_dir}...")
-    dataset.save_to_disk(str(output_dir))
-    logger.info("Dataset saved successfully.")
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
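For a quick sense of what the deleted script did: node type strings become integer IDs, with <UNK> reserved at index 0 and both ERROR nodes and missing types folded into it. A small self-contained illustration (the mapping below is hypothetical, not the real node_types.json contents):

# Hypothetical mapping in the shape produced by load_node_types_from_json
node_type2id = {"<UNK>": 0, "call": 1, "identifier": 2, "module": 3}

def encode(node_list):
    # Mirrors encode_node_types: ERROR and None both map to the <UNK> id
    return [node_type2id[nt] if nt is not None and nt != "ERROR" else node_type2id["<UNK>"]
            for nt in node_list]

print(encode(["module", "call", "ERROR", None]))  # [3, 1, 0, 0]
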
diff --git a/code/src/eval_model.py b/code/src/eval_model.py
index dbdc822..4e92a51 100644
--- a/code/src/eval_model.py
+++ b/code/src/eval_model.py
@@ -120,7 +120,7 @@ def main():
     # Setup paths
     current_dir = Path(__file__).parent
     config = load_config(current_dir / 'eval_config.json')
-    model_dir = Path(config['model_dir']) / 'final-model'
+    model_dir = Path(config['model_dir'])
     data_dir = Path(config['data_dir'])
     results_dir = Path(config['model_dir']) / 'evaluation_results'
     results_dir.mkdir(exist_ok=True)
@@ -133,7 +133,7 @@ def main():
     model_config.max_position_embeddings = 1024

     if config['extra_embeddings']:
-        model = TreeStarEncoderForPreTraining(config=model_config, log=False)
+        model = TreeStarEncoderForPreTraining(config=model_config)
     else:
         model = AutoModelForMaskedLM.from_config(model_config)

diff --git a/code/src/pad_dataset.py b/code/src/pad_dataset.py
deleted file mode 100644
index 4074f1e..0000000
--- a/code/src/pad_dataset.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import logging
-from pathlib import Path
-from datasets import load_from_disk
-from transformers import AutoTokenizer
-import multiprocessing
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S'
-)
-logger = logging.getLogger(__name__)
-
-def pad_and_save_dataset(input_dir, output_dir, tokenizer_name='bigcode/starencoder', max_length=512):
-    # Load the processed dataset
-    logger.info(f"Loading processed dataset from {input_dir}...")
-    dataset = load_from_disk(input_dir)
-    logger.info(f"Loaded dataset with {len(dataset)} examples")
-
-    # Initialize tokenizer
-    logger.info("Initializing tokenizer...")
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    tokenizer.pad_token = tokenizer.eos_token
-    logger.info("Loaded StarEncoder tokenizer")
-
-    # Define number of processes
-    num_proc = min(multiprocessing.cpu_count() - 1, 32)
-    logger.info(f"Using {num_proc} processes")
-
-    # Define a function to pad the sequences
-    def pad_sequences(batch):
-        # Convert input_ids back to text if necessary
-        texts = tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True)
-
-        # Use the tokenizer's __call__ method for padding
-        padded_inputs = tokenizer(
-            texts,
-            padding='max_length',
-            max_length=max_length,
-            return_tensors='pt',
-            truncation=True
-        )
-
-        # Pad other fields with default values
-        padded_depths = [seq + [-1] * (max_length - len(seq)) for seq in batch['depths']]
-        padded_sibling_idxs = [seq + [-1] * (max_length - len(seq)) for seq in batch['sibling_idxs']]
-        padded_node_types = [seq + [None] * (max_length - len(seq)) for seq in batch['node_types']]
-        padded_node_texts = [seq + [''] * (max_length - len(seq)) for seq in batch['node_texts']]
-
-        return {
-            'input_ids': padded_inputs['input_ids'].tolist(),
-            'attention_mask': padded_inputs['attention_mask'].tolist(),
-            'depths': padded_depths,
-            'sibling_idxs': padded_sibling_idxs,
-            'node_types': padded_node_types,
-            'node_texts': padded_node_texts
-        }
-
-    # Apply padding
-    logger.info("Applying padding to dataset...")
-    padded_dataset = dataset.map(
-        pad_sequences,
-        batched=True,
-        desc="Padding dataset",
-        num_proc=num_proc
-    )
-
-    # Save the padded dataset
-    logger.info(f"Saving padded dataset to {output_dir}...")
-    padded_dataset.save_to_disk(output_dir)
-    logger.info(f"Saved padded dataset to {output_dir}")
-
-if __name__ == "__main__":
-    current_dir = Path(__file__).parent
-    input_dir = current_dir.parent / 'data' / 'codeparrot-clean-parsed-starencoder-classes'
-    output_dir = current_dir.parent / 'data' / 'codeparrot-clean-parsed-starencoder-classes-padded'
-
-    pad_and_save_dataset(input_dir, output_dir)
\ No newline at end of file
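The deleted padding script right-pads every tree-structure field to max_length with sentinel values (-1 for depths and sibling_idxs, None for node_types, '' for node_texts) while the tokenizer re-pads input_ids and attention_mask. A compact sketch of that sentinel scheme, with illustrative values:

MAX_LENGTH = 8  # the real script used 512

def pad_tree_features(depths, sibling_idxs, max_length=MAX_LENGTH):
    # Right-pad with -1 so padded positions stay distinguishable from real tree positions
    pad = max_length - len(depths)
    return depths + [-1] * pad, sibling_idxs + [-1] * pad

depths, siblings = pad_tree_features([0, 1, 1, 2], [0, 0, 1, 0])
print(depths)    # [0, 1, 1, 2, -1, -1, -1, -1]
print(siblings)  # [0, 0, 1, 0, -1, -1, -1, -1]
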
diff --git a/code/src/training.py b/code/src/training.py
index e985b04..2cf41d5 100644
--- a/code/src/training.py
+++ b/code/src/training.py
@@ -1,12 +1,11 @@
 import wandb
 import json
 import logging
+import zipfile
 from pathlib import Path
 from datasets import load_from_disk, DatasetDict
 from transformers import (
-    RobertaConfig,
     AutoConfig,
-    RobertaForMaskedLM,
     AutoTokenizer,
     TrainingArguments,
     Trainer,
@@ -50,35 +49,26 @@ def main():
     # Set seed
     set_seed(config['seed'])

-    # Initialize W&B
-    wandb.init(project='codeparrot-starencoder-no-comments', config=config, name=config['run_name'])
-
-    # Upload the training files to W&B
-    wandb.save(__file__)
-    wandb.save(Path(__file__).parent / 'config.json')
-    if config['extra_embeddings']:
-        wandb.save(current_dir / 'tree_starencoder.py')
-
-    if 'CodeSearchNet' in config['data_dir']:
-        dataset = DatasetDict({
-            'train': load_from_disk(data_dir / 'train'),
-            'valid': load_from_disk(data_dir / 'valid'),
-            'test': load_from_disk(data_dir / 'test')
-        })
-    else:
-        dataset = load_from_disk(data_dir)
-        if config['num_samples'] > 0:
-            dataset = dataset.select(range(config['num_samples']))
-        train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
-        test_valid = train_testvalid['test'].train_test_split(
-            test_size=config['valid_size'] / (config['test_size'] + config['valid_size']),
-            seed=config['seed']
-        )
-        dataset = DatasetDict({
-            'train': train_testvalid['train'],
-            'test': test_valid['test'],
-            'valid': test_valid['train'],
-        })
+    # Initialize W&B and save files
+    wandb.init(project=config['project'], config=config, name=config['run_name'])
+    for file in [__file__, 'config.json', 'tree_starencoder.py']:
+        if config['extra_embeddings'] or file != 'tree_starencoder.py':
+            wandb.save(current_dir / file)
+
+    # Simplified dataset splitting
+    dataset = load_from_disk(data_dir)
+    if config['num_samples'] > 0:
+        dataset = dataset.select(range(config['num_samples']))
+    train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
+    test_valid = train_testvalid['test'].train_test_split(
+        test_size=config['valid_size'] / (config['test_size'] + config['valid_size']),
+        seed=config['seed']
+    )
+    dataset = DatasetDict({
+        'train': train_testvalid['train'],
+        'test': test_valid['test'],
+        'valid': test_valid['train'],
+    })

     # Continue with the rest of processing

@@ -91,15 +81,10 @@ def main():
     dataset = dataset.remove_columns(columns_to_remove)
     logger.info(f'Loaded dataset:\n{dataset}')

-    # Initialize model from scratch
+    # Simplify tokenizer setup
     tokenizer = AutoTokenizer.from_pretrained('bigcode/starencoder')
-    if tokenizer.mask_token is None:
-        tokenizer.add_special_tokens({'mask_token': '<mask>'})
-        tokenizer.mask_token = '<mask>'
-        logger.info("Added '<mask>' as the mask token.")
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-        logger.info("Set padding token to be the same as the EOS token.")
+    tokenizer.add_special_tokens({'mask_token': '<mask>'}) if tokenizer.mask_token is None else None
+    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

     model_config = AutoConfig.from_pretrained('bigcode/starencoder')
     if config['extra_embeddings']:
@@ -123,11 +108,12 @@ def main():
         save_steps=config['eval_every'],
         eval_strategy='steps',
         save_strategy='steps',
+        save_total_limit=5,
         load_best_model_at_end=True,
         report_to='wandb',
         run_name=config['run_name'],
         seed=config['seed'],
-        fp16=config['fp16'],
+        bf16=config['bf16'],
         dataloader_num_workers=8,
         gradient_checkpointing=True,
         metric_for_best_model='eval_loss',
@@ -161,7 +147,13 @@ def main():
     logger.info('Saving final model...')
     trainer.save_model(output_dir / 'final-model')
     tokenizer.save_pretrained(output_dir / 'final-model')
-
+
+    # Zip and upload the final model to W&B
+    with zipfile.ZipFile(output_dir / 'final-model.zip', 'w') as zipf:
+        for file in (output_dir / 'final-model').glob('**/*'):
+            zipf.write(file, arcname=file.name)
+    wandb.save(output_dir / 'final-model.zip')
+
     logger.info('Training completed!')

 if __name__ == '__main__':
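For reference, the simplified split in training.py above first holds out test_size + valid_size of the data and then splits that pool again; with the 0.05/0.05 values from config.json this yields 5% test and 5% validation. A quick arithmetic check (a sketch, not repository code):

test_size, valid_size = 0.05, 0.05              # values from config.json

holdout = test_size + valid_size                 # 0.10 of the full dataset is held out
nested = valid_size / (test_size + valid_size)   # 0.5 of the hold-out pool

print(holdout * nested)        # 0.05 -> one half of the pool
print(holdout * (1 - nested))  # 0.05 -> the other half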