Compare commits

...

4 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Patryk Bartkowiak | 3b93a7cc8a | prepared for run by prof Filip Gralinski (done) | 2025-01-07 15:59:30 +00:00 |
| Patryk Bartkowiak | 76a89dc236 | 3 pochs | 2025-01-07 12:58:20 +00:00 |
| Patryk Bartkowiak | eed2096400 | prepared for run by prof Filip Gralinski | 2025-01-07 11:45:55 +00:00 |
| Patryk Bartkowiak | f0679ab861 | on this commit i continued to train original starencoder model | 2025-01-04 21:02:30 +00:00 |
4 changed files with 60 additions and 58 deletions

View File

@@ -20,9 +20,11 @@ pdm install
 ```
 ### 4. Run training code
 ```bash
-pdm run_training
+pdm train
 ```
-or
+
+## Required secrets
 ```
-pdm run src/train_codebert_mlm.py
+export HF_TOKEN=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
+export WANDB_API_KEY=313671f10f2a389b3171b32da8d4abdad91aaa7c
 ```
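The new "Required secrets" section relies on `HF_TOKEN` and `WANDB_API_KEY` being exported as environment variables, which `huggingface_hub` and `wandb` read automatically. A minimal, hypothetical pre-flight check (not part of this repo) that the secrets are set before running `pdm train`:

```python
# Hypothetical pre-flight check (not part of this repo): confirm the secrets
# from the README are present in the environment before launching training.
import os
import sys

REQUIRED_SECRETS = ["HF_TOKEN", "WANDB_API_KEY"]

missing = [name for name in REQUIRED_SECRETS if not os.environ.get(name)]
if missing:
    sys.exit(f"Missing required environment variables: {', '.join(missing)}")

# huggingface_hub and wandb pick these tokens up from the environment,
# so no explicit login call is needed here.
print("All required secrets are set.")
```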

View File

@@ -1,8 +1,9 @@
 {
     "extra_embeddings": true,
-    "run_name": "no-sinusoidal",
+    "run_name": "tree-continued",
     "data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/",
-    "output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
+    "output_dir": "./outputs/no-comments-starencoder-tree-2",
+    "checkpoint": null,
     "seed": 420,
     "mlm_probability": 0.15,
     "batch_size": 32,

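The added `"checkpoint"` key defaults to `null`; when pointed at a checkpoint directory it triggers the `model.safetensors` loading added to the training script below. A small sketch, assuming the script reads this file with `json.load`, of how the JSON `null` maps to Python `None`:

```python
# Sketch (assumed usage): json.load turns the JSON null into Python None,
# so checkpoint loading is skipped unless a directory is configured.
import json
from pathlib import Path

config = json.loads(Path("config.json").read_text())

if config.get("checkpoint") is not None:
    print(f"Resuming from {Path(config['checkpoint']) / 'model.safetensors'}")
else:
    print("Training from scratch (checkpoint is null).")
```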
View File

@ -1,23 +1,22 @@
import wandb import wandb
import json import json
import torch
import random
import logging import logging
import numpy as np
from pathlib import Path from pathlib import Path
from datasets import load_from_disk, DatasetDict from safetensors.torch import load_file
from datasets import load_from_disk, DatasetDict, load_dataset
from transformers import ( from transformers import (
RobertaConfig,
AutoConfig, AutoConfig,
RobertaForMaskedLM,
AutoTokenizer, AutoTokenizer,
TrainingArguments, TrainingArguments,
Trainer, Trainer,
DataCollatorForLanguageModeling, DataCollatorForLanguageModeling,
AutoModelForMaskedLM AutoModelForMaskedLM
) )
import random
import numpy as np
import torch
from tree_codebert import TreeCodeBERTForPreTraining
from tree_starencoder import TreeStarEncoderForPreTraining from tree_starencoder import TreeStarEncoderForPreTraining
logging.basicConfig( logging.basicConfig(
@@ -51,43 +50,34 @@ def main():
     set_seed(config['seed'])

     # Initialize W&B
-    wandb.init(project='codeparrot-starencoder-no-comments', config=config, name=config['run_name'])
+    wandb.init(project='gralinski', config=config, name=config['run_name'])

     # Upload the training files to W&B
     wandb.save(__file__)
-    wandb.save(Path(__file__).parent / 'config.json')
+    wandb.save(current_dir / 'config.json')
     if config['extra_embeddings']:
         wandb.save(current_dir / 'tree_starencoder.py')

-    if 'CodeSearchNet' in config['data_dir']:
-        dataset = DatasetDict({
-            'train': load_from_disk(data_dir / 'train'),
-            'valid': load_from_disk(data_dir / 'valid'),
-            'test': load_from_disk(data_dir / 'test')
-        })
-    else:
-        dataset = load_from_disk(data_dir)
-        if config['num_samples'] > 0:
-            dataset = dataset.select(range(config['num_samples']))
-        train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
-        test_valid = train_testvalid['test'].train_test_split(
-            test_size=config['valid_size'] / (config['test_size'] + config['valid_size']),
-            seed=config['seed']
-        )
-        dataset = DatasetDict({
-            'train': train_testvalid['train'],
-            'test': test_valid['test'],
-            'valid': test_valid['train'],
-        })
+    dataset = load_dataset("patrykbart/codeparrot-clean-no-comments-starencoder-small", split='train', num_proc=16, cache_dir=data_dir.parent)
+    if config['num_samples'] > 0:
+        dataset = dataset.select(range(config['num_samples']))
+    train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
+    test_valid = train_testvalid['test'].train_test_split(
+        test_size=config['valid_size'] / (config['test_size'] + config['valid_size']),
+        seed=config['seed']
+    )
+    dataset = DatasetDict({
+        'train': train_testvalid['train'],
+        'test': test_valid['test'],
+        'valid': test_valid['train'],
+    })

     # Continue with the rest of processing
     columns_to_remove = dataset['train'].column_names
-    columns_to_remove.remove('input_ids')
-    columns_to_remove.remove('attention_mask')
+    columns_to_remove = [col for col in columns_to_remove if col not in ['input_ids', 'attention_mask']]
     if config['extra_embeddings']:
-        columns_to_remove.remove('depths')
-        columns_to_remove.remove('sibling_idxs')
+        columns_to_remove = [col for col in columns_to_remove if col not in ['depths', 'sibling_idxs']]
     dataset = dataset.remove_columns(columns_to_remove)

     logger.info(f'Loaded dataset:\n{dataset}')
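The replacement loads a single `train` split from the Hub and derives train/test/valid itself with two chained `train_test_split` calls: the first holds out `test_size + valid_size`, the second divides that holdout proportionally. A self-contained illustration with toy data and assumed fractions (`test_size=0.1`, `valid_size=0.1`), not the repo's actual config values:

```python
# Toy illustration of the two-step split above; the fractions are assumptions.
from datasets import Dataset, DatasetDict

dataset = Dataset.from_dict({"input_ids": [[i] for i in range(1000)]})
test_size, valid_size, seed = 0.1, 0.1, 420

# Step 1: hold out test + valid together (20% of the rows).
train_testvalid = dataset.train_test_split(test_size=test_size + valid_size, seed=seed)
# Step 2: split the 20% holdout in half -> 10% test, 10% valid.
test_valid = train_testvalid["test"].train_test_split(
    test_size=valid_size / (test_size + valid_size),
    seed=seed,
)
splits = DatasetDict({
    "train": train_testvalid["train"],  # 800 rows
    "test": test_valid["test"],         # 100 rows
    "valid": test_valid["train"],       # 100 rows
})
print({name: len(split) for name, split in splits.items()})
```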
@@ -102,12 +92,20 @@ def main():
         logger.info("Set padding token to be the same as the EOS token.")

     model_config = AutoConfig.from_pretrained('bigcode/starencoder')
-    if config['extra_embeddings']:
-        model = TreeStarEncoderForPreTraining(model_config)
-    else:
-        model = AutoModelForMaskedLM.from_config(model_config)
+    model = TreeStarEncoderForPreTraining(model_config) if config['extra_embeddings'] else AutoModelForMaskedLM.from_config(model_config)
     logger.info(f'Loaded model: {model.__class__.__name__}')

+    # Load checkpoint if provided
+    if config['checkpoint'] is not None:
+        checkpoint_path = Path(config['checkpoint']) / 'model.safetensors'
+        logger.info(f'Loading checkpoint from {checkpoint_path}')
+        state_dict = load_file(checkpoint_path)
+        model.load_state_dict(state_dict, strict=False)
+        model.tie_weights()
+        config['warmup_steps'] = 0
+        config['learning_rate'] = 4.8701e-7
+        logger.info('Checkpoint loaded successfully.')
+
     # Setup training arguments
     training_args = TrainingArguments(
         output_dir=str(output_dir),
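The new block resumes from a safetensors checkpoint: `load_file` reads the raw tensors, `strict=False` tolerates keys that were deduplicated at save time (e.g. tied output embeddings), `tie_weights()` restores that sharing, and warmup/learning rate are pinned, presumably to continue where the earlier run's schedule left off. A standalone sketch of the same load-and-continue pattern, using a small BERT-style config rather than `bigcode/starencoder`:

```python
# Standalone sketch of the resume pattern; the tiny BertConfig is an
# assumption used only to keep the example lightweight.
from pathlib import Path
from safetensors.torch import load_file
from transformers import AutoModelForMaskedLM, BertConfig

config = BertConfig(hidden_size=64, num_hidden_layers=2, num_attention_heads=2, intermediate_size=128)
model = AutoModelForMaskedLM.from_config(config)

# Pretend a previous run saved its weights. save_pretrained drops tied
# duplicates from the safetensors file, which is why strict=False and
# tie_weights() are needed when loading it back.
ckpt_dir = Path("outputs/previous-run")
model.save_pretrained(ckpt_dir, safe_serialization=True)

state_dict = load_file(str(ckpt_dir / "model.safetensors"))
missing, unexpected = model.load_state_dict(state_dict, strict=False)
model.tie_weights()
print(f"missing keys: {missing}, unexpected keys: {unexpected}")
```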

View File

@@ -13,12 +13,12 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
         super().__init__(config)
         self.config = config

-        # self.fusion_layer = nn.Sequential(
-        #     nn.Linear(config.hidden_size * 4, config.hidden_size),
-        #     nn.GELU(),
-        #     nn.Dropout(config.hidden_dropout_prob),
-        #     nn.LayerNorm(config.hidden_size)
-        # )
+        self.fusion_layer = nn.Sequential(
+            nn.Linear(config.hidden_size * 3, config.hidden_size),
+            nn.GELU(),
+            nn.Dropout(config.hidden_dropout_prob),
+            nn.LayerNorm(config.hidden_size)
+        )

         # Override config to set max_seq_length
         config.max_position_embeddings = max_seq_length
@@ -31,13 +31,13 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
         self.seq_pos_embeddings = nn.Embedding(max_seq_length, config.hidden_size)

-        # # Initialize sequential position embeddings with sinusoidal pattern
-        # position = torch.arange(max_seq_length).unsqueeze(1)
-        # div_term = torch.exp(torch.arange(0, config.hidden_size, 2) * (-math.log(10000.0) / config.hidden_size))
-        # pe = torch.zeros(max_seq_length, config.hidden_size)
-        # pe[:, 0::2] = torch.sin(position * div_term)
-        # pe[:, 1::2] = torch.cos(position * div_term)
-        # self.seq_pos_embeddings.weight.data.copy_(pe)
+        # Initialize sequential position embeddings with sinusoidal pattern
+        position = torch.arange(max_seq_length).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, config.hidden_size, 2) * (-math.log(10000.0) / config.hidden_size))
+        pe = torch.zeros(max_seq_length, config.hidden_size)
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        self.seq_pos_embeddings.weight.data.copy_(pe)

         # New node type embeddings
         self.node_type_embeddings = nn.Embedding(217, config.hidden_size)
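The re-enabled block initializes `seq_pos_embeddings` with the standard sinusoidal pattern from "Attention Is All You Need": even dimensions get sin(pos / 10000^(2i/d)), odd dimensions the matching cosine. A minimal standalone check, with assumed small dimensions, that the `div_term` formulation matches that closed form:

```python
# Minimal check of the sinusoidal initialization above; the small
# max_seq_length/hidden_size values are assumptions for illustration.
import math
import torch

max_seq_length, hidden_size = 16, 8

position = torch.arange(max_seq_length).unsqueeze(1)  # (16, 1)
div_term = torch.exp(torch.arange(0, hidden_size, 2) * (-math.log(10000.0) / hidden_size))
pe = torch.zeros(max_seq_length, hidden_size)
pe[:, 0::2] = torch.sin(position * div_term)  # even dims: sin(pos / 10000^(2i/d))
pe[:, 1::2] = torch.cos(position * div_term)  # odd dims:  cos(pos / 10000^(2i/d))

# Cross-check one entry against the closed form.
pos, i = 5, 1  # embedding dimension 2*i = 2
expected = math.sin(pos / (10000 ** (2 * i / hidden_size)))
assert torch.isclose(pe[pos, 2 * i], torch.tensor(expected), atol=1e-6)
print(pe.shape)  # torch.Size([16, 8])
```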
@@ -72,10 +72,11 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
         # node_type_embeddings = self.node_type_embeddings(node_types)
         # combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings, node_type_embeddings], dim=-1)
-        # combined_embeddings = self.fusion_layer(combined)
+        combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings], dim=-1)
+        combined_embeddings = self.fusion_layer(combined)

         # Add the embeddings instead of concatenating
-        combined_embeddings = token_embeddings + tree_embeddings + seq_embeddings
+        # combined_embeddings = token_embeddings + tree_embeddings + seq_embeddings
         combined_embeddings = self.norm(combined_embeddings)
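With this revision the token, tree (AST), and sequential position embeddings are concatenated along the hidden dimension and projected back to `hidden_size` by the re-enabled `fusion_layer`, replacing the previous additive combination. A sketch of just that fusion step with dummy tensors and assumed shapes, not the full `TreeStarEncoderForPreTraining` forward pass:

```python
# Dummy-tensor sketch of the concat-then-fuse step; shapes are assumptions.
import torch
import torch.nn as nn

batch_size, seq_len, hidden_size, dropout = 2, 16, 64, 0.1

fusion_layer = nn.Sequential(
    nn.Linear(hidden_size * 3, hidden_size),  # project the 3*H concat back to H
    nn.GELU(),
    nn.Dropout(dropout),
    nn.LayerNorm(hidden_size),
)

token_embeddings = torch.randn(batch_size, seq_len, hidden_size)
tree_embeddings = torch.randn(batch_size, seq_len, hidden_size)  # from AST depths / sibling indices
seq_embeddings = torch.randn(batch_size, seq_len, hidden_size)   # sinusoidal positions

# Concatenate along the hidden dimension, then fuse; the commented-out
# additive variant (token + tree + seq) is what the previous revision used.
combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings], dim=-1)  # (2, 16, 192)
combined_embeddings = fusion_layer(combined)                                       # (2, 16, 64)
print(combined_embeddings.shape)
```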