new changes

Patryk Bartkowiak 2025-01-02 15:09:08 +00:00
parent c80c591e7c
commit 3b7bc5d6d2
3 changed files with 30 additions and 29 deletions

View File

@@ -1,13 +1,13 @@
 {
     "extra_embeddings": true,
-    "run_name": "tree-seq-non-sinusoidal",
-    "data_dir": "./data/codeparrot-clean-parsed-starencoder-classes-encoded/",
-    "output_dir": "./outputs/tree-seq-non-sinusoidal",
+    "run_name": "no-sinusoidal",
+    "data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/",
+    "output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
     "seed": 420,
     "mlm_probability": 0.15,
     "batch_size": 32,
-    "epochs": 1,
-    "eval_every": 5000,
+    "epochs": 3,
+    "eval_every": 10000,
     "learning_rate": 5e-4,
     "weight_decay": 0.1,
     "max_grad_norm": 1.0,

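Read against the old values, the config now trains for 3 epochs instead of 1, evaluates every 10000 steps instead of 5000, and points at the no-comments dataset. A rough, hedged sketch of what that implies for run length; num_examples is a placeholder, since the dataset size is not part of this commit:

num_examples = 1_000_000                         # placeholder, not from the commit
batch_size, epochs, eval_every = 32, 3, 10_000   # values from the config above

steps_per_epoch = num_examples // batch_size
total_steps = steps_per_epoch * epochs
print(f"{total_steps} optimizer steps, ~{total_steps // eval_every} eval passes")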
View File

@@ -41,16 +41,17 @@ def load_config(config_path: Path) -> dict:
         return json.load(f)

 def main():
-    set_seed(config['seed'])
     # Setup paths
     current_dir = Path(__file__).parent
     config = load_config(current_dir / 'config.json')
     data_dir = Path(config['data_dir'])
     output_dir = Path(config['output_dir'])

+    # Set seed
+    set_seed(config['seed'])
+
     # Initialize W&B
-    wandb.init(project='codeparrot-starencoder', config=config, name=config['run_name'])
+    wandb.init(project='codeparrot-starencoder-no-comments', config=config, name=config['run_name'])

     # Upload the training files to W&B
     wandb.save(__file__)
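Taken together, this hunk moves set_seed() below the config load (in the old ordering, config is referenced inside main() before it is assigned, unless it was defined globally elsewhere) and renames the W&B project. A minimal sketch of the resulting prologue of main(); set_seed is assumed here to be transformers.set_seed, which may differ from the script's actual import:

import json
from pathlib import Path

import wandb
from transformers import set_seed

def load_config(config_path: Path) -> dict:
    with open(config_path) as f:
        return json.load(f)

def main():
    # Setup paths
    current_dir = Path(__file__).parent
    config = load_config(current_dir / 'config.json')
    data_dir = Path(config['data_dir'])
    output_dir = Path(config['output_dir'])

    # Set seed only once the config (and its 'seed' key) exists
    set_seed(config['seed'])

    # Initialize W&B under the renamed project
    wandb.init(project='codeparrot-starencoder-no-comments', config=config,
               name=config['run_name'])
    wandb.save(__file__)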
@@ -87,7 +88,6 @@ def main():
     if config['extra_embeddings']:
         columns_to_remove.remove('depths')
         columns_to_remove.remove('sibling_idxs')
-        columns_to_remove.remove('node_types_encoded')
     dataset = dataset.remove_columns(columns_to_remove)
     logger.info(f'Loaded dataset:\n{dataset}')

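Because the model below no longer takes node type IDs, 'node_types_encoded' is no longer exempted from removal and is dropped along with the other unused columns. A hedged sketch of the surrounding logic after this commit; the initial columns_to_remove list is an assumption, the diff only shows the conditional part:

from datasets import Dataset

def prune_columns(dataset: Dataset, extra_embeddings: bool) -> Dataset:
    # Assumed starting list; only the conditional removals appear in the diff.
    columns_to_remove = ['depths', 'sibling_idxs', 'node_types_encoded']
    if extra_embeddings:
        # Keep the tree features the model still consumes ...
        columns_to_remove.remove('depths')
        columns_to_remove.remove('sibling_idxs')
        # ... while 'node_types_encoded' stays listed and is dropped.
    present = [c for c in columns_to_remove if c in dataset.column_names]
    return dataset.remove_columns(present)

Usage, assuming the dataset and config from the script above: dataset = prune_columns(dataset, config['extra_embeddings'])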
View File

@@ -9,17 +9,16 @@ from transformers import AutoConfig, BertForMaskedLM
 from tree_codebert import TreePositionalEmbedding

 class TreeStarEncoderForPreTraining(BertForMaskedLM):
-    def __init__(self, config: AutoConfig, max_depth: int = 32, max_seq_length: int = 512, log: bool = True):
+    def __init__(self, config: AutoConfig, max_depth: int = 32, max_seq_length: int = 512):
         super().__init__(config)
         self.config = config
-        self.log = log
-        self.fusion_layer = nn.Sequential(
-            nn.Linear(config.hidden_size * 4, config.hidden_size),
-            nn.GELU(),
-            nn.Dropout(config.hidden_dropout_prob),
-            nn.LayerNorm(config.hidden_size)
-        )
+        # self.fusion_layer = nn.Sequential(
+        #     nn.Linear(config.hidden_size * 4, config.hidden_size),
+        #     nn.GELU(),
+        #     nn.Dropout(config.hidden_dropout_prob),
+        #     nn.LayerNorm(config.hidden_size)
+        # )

         # Override config to set max_seq_length
         config.max_position_embeddings = max_seq_length
@@ -32,13 +31,13 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
         self.seq_pos_embeddings = nn.Embedding(max_seq_length, config.hidden_size)

-        # Initialize sequential position embeddings with sinusoidal pattern
-        position = torch.arange(max_seq_length).unsqueeze(1)
-        div_term = torch.exp(torch.arange(0, config.hidden_size, 2) * (-math.log(10000.0) / config.hidden_size))
-        pe = torch.zeros(max_seq_length, config.hidden_size)
-        pe[:, 0::2] = torch.sin(position * div_term)
-        pe[:, 1::2] = torch.cos(position * div_term)
-        self.seq_pos_embeddings.weight.data.copy_(pe)
+        # # Initialize sequential position embeddings with sinusoidal pattern
+        # position = torch.arange(max_seq_length).unsqueeze(1)
+        # div_term = torch.exp(torch.arange(0, config.hidden_size, 2) * (-math.log(10000.0) / config.hidden_size))
+        # pe = torch.zeros(max_seq_length, config.hidden_size)
+        # pe[:, 0::2] = torch.sin(position * div_term)
+        # pe[:, 1::2] = torch.cos(position * div_term)
+        # self.seq_pos_embeddings.weight.data.copy_(pe)

         # New node type embeddings
         self.node_type_embeddings = nn.Embedding(217, config.hidden_size)
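With the copy_() block commented out, seq_pos_embeddings keeps its default random initialization and the sequential positions are learned from scratch, which is what the "no-sinusoidal" run name refers to. For comparison, a standalone sketch of the fixed table the removed lines used to build (standard Transformer sinusoidal encoding; hidden_size is assumed here):

import math
import torch
import torch.nn as nn

max_seq_length, hidden_size = 512, 768          # hidden_size assumed; it comes from the model config

seq_pos_embeddings = nn.Embedding(max_seq_length, hidden_size)  # now left at its default init

# The fixed sinusoidal table the old code copied into the embedding weights.
position = torch.arange(max_seq_length).unsqueeze(1)
div_term = torch.exp(torch.arange(0, hidden_size, 2) * (-math.log(10000.0) / hidden_size))
pe = torch.zeros(max_seq_length, hidden_size)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
# Before this commit: seq_pos_embeddings.weight.data.copy_(pe); after it, the weights stay learned.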
@@ -51,7 +50,6 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
         labels: Optional[torch.Tensor] = None,
         depths: Optional[torch.Tensor] = None,
         sibling_idxs: Optional[torch.Tensor] = None,
-        node_types: Optional[torch.Tensor] = None,
         output_attentions: bool = False,
         **kwargs
     ) -> Dict[str, torch.Tensor]:
@@ -70,11 +68,14 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
         else:
             tree_embeddings = torch.zeros_like(token_embeddings)

-        # Get node type embeddings
-        node_type_embeddings = self.node_type_embeddings(node_types)
+        # # Get node type embeddings
+        # node_type_embeddings = self.node_type_embeddings(node_types)

-        combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings, node_type_embeddings], dim=-1)
-        combined_embeddings = self.fusion_layer(combined)
+        # combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings, node_type_embeddings], dim=-1)
+        # combined_embeddings = self.fusion_layer(combined)
+        # Add the embeddings instead of concatenating
+        combined_embeddings = token_embeddings + tree_embeddings + seq_embeddings

         combined_embeddings = self.norm(combined_embeddings)
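The net change in this file: node type embeddings and the 4x-wide fusion MLP are gone, and the token, tree-positional, and sequential-positional embeddings are combined by element-wise addition followed by normalization, BERT-style. A minimal sketch contrasting the two compositions on toy tensors; hidden_size, the dropout probability, and the use of LayerNorm for self.norm are assumptions:

import torch
import torch.nn as nn

batch, seq_len, hidden_size = 2, 16, 768    # toy shapes; hidden_size assumed
token_emb = torch.randn(batch, seq_len, hidden_size)
tree_emb = torch.randn(batch, seq_len, hidden_size)
seq_emb = torch.randn(batch, seq_len, hidden_size)
node_type_emb = torch.randn(batch, seq_len, hidden_size)

# Before: concatenate four streams and project back down through the fusion MLP.
fusion_layer = nn.Sequential(
    nn.Linear(hidden_size * 4, hidden_size),
    nn.GELU(),
    nn.Dropout(0.1),                        # stand-in for config.hidden_dropout_prob
    nn.LayerNorm(hidden_size),
)
fused = fusion_layer(torch.cat([token_emb, tree_emb, seq_emb, node_type_emb], dim=-1))

# After: drop node types, sum the remaining streams, then normalize (self.norm in the model).
norm = nn.LayerNorm(hidden_size)
summed = norm(token_emb + tree_emb + seq_emb)

print(fused.shape, summed.shape)            # both: torch.Size([2, 16, 768])

Summation keeps the hidden size unchanged and removes the extra fusion parameters, at the cost of no longer letting the model weight the streams through a learned projection.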