diff --git a/code/src/config.json b/code/src/config.json index 0c3d5d9..f2bea33 100644 --- a/code/src/config.json +++ b/code/src/config.json @@ -1,13 +1,13 @@ { "extra_embeddings": true, - "run_name": "tree-seq-non-sinusoidal", - "data_dir": "./data/codeparrot-clean-parsed-starencoder-classes-encoded/", - "output_dir": "./outputs/tree-seq-non-sinusoidal", + "run_name": "no-sinusoidal", + "data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/", + "output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal", "seed": 420, "mlm_probability": 0.15, "batch_size": 32, - "epochs": 1, - "eval_every": 5000, + "epochs": 3, + "eval_every": 10000, "learning_rate": 5e-4, "weight_decay": 0.1, "max_grad_norm": 1.0, diff --git a/code/src/training.py b/code/src/training.py index fe10f3b..e985b04 100644 --- a/code/src/training.py +++ b/code/src/training.py @@ -41,16 +41,17 @@ def load_config(config_path: Path) -> dict: return json.load(f) def main(): - set_seed(config['seed']) - # Setup paths current_dir = Path(__file__).parent config = load_config(current_dir / 'config.json') data_dir = Path(config['data_dir']) output_dir = Path(config['output_dir']) + # Set seed + set_seed(config['seed']) + # Initialize W&B - wandb.init(project='codeparrot-starencoder', config=config, name=config['run_name']) + wandb.init(project='codeparrot-starencoder-no-comments', config=config, name=config['run_name']) # Upload the training files to W&B wandb.save(__file__) @@ -87,7 +88,6 @@ def main(): if config['extra_embeddings']: columns_to_remove.remove('depths') columns_to_remove.remove('sibling_idxs') - columns_to_remove.remove('node_types_encoded') dataset = dataset.remove_columns(columns_to_remove) logger.info(f'Loaded dataset:\n{dataset}') diff --git a/code/src/tree_starencoder.py b/code/src/tree_starencoder.py index 0c075a8..7271b42 100644 --- a/code/src/tree_starencoder.py +++ b/code/src/tree_starencoder.py @@ -9,17 +9,16 @@ from transformers import AutoConfig, BertForMaskedLM from tree_codebert import TreePositionalEmbedding class TreeStarEncoderForPreTraining(BertForMaskedLM): - def __init__(self, config: AutoConfig, max_depth: int = 32, max_seq_length: int = 512, log: bool = True): + def __init__(self, config: AutoConfig, max_depth: int = 32, max_seq_length: int = 512): super().__init__(config) self.config = config - self.log = log - self.fusion_layer = nn.Sequential( - nn.Linear(config.hidden_size * 4, config.hidden_size), - nn.GELU(), - nn.Dropout(config.hidden_dropout_prob), - nn.LayerNorm(config.hidden_size) - ) + # self.fusion_layer = nn.Sequential( + # nn.Linear(config.hidden_size * 4, config.hidden_size), + # nn.GELU(), + # nn.Dropout(config.hidden_dropout_prob), + # nn.LayerNorm(config.hidden_size) + # ) # Override config to set max_seq_length config.max_position_embeddings = max_seq_length @@ -32,13 +31,13 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM): self.seq_pos_embeddings = nn.Embedding(max_seq_length, config.hidden_size) - # Initialize sequential position embeddings with sinusoidal pattern - position = torch.arange(max_seq_length).unsqueeze(1) - div_term = torch.exp(torch.arange(0, config.hidden_size, 2) * (-math.log(10000.0) / config.hidden_size)) - pe = torch.zeros(max_seq_length, config.hidden_size) - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - self.seq_pos_embeddings.weight.data.copy_(pe) + # # Initialize sequential position embeddings with sinusoidal pattern + # position = torch.arange(max_seq_length).unsqueeze(1) + # div_term = torch.exp(torch.arange(0, config.hidden_size, 2) * (-math.log(10000.0) / config.hidden_size)) + # pe = torch.zeros(max_seq_length, config.hidden_size) + # pe[:, 0::2] = torch.sin(position * div_term) + # pe[:, 1::2] = torch.cos(position * div_term) + # self.seq_pos_embeddings.weight.data.copy_(pe) # New node type embeddings self.node_type_embeddings = nn.Embedding(217, config.hidden_size) @@ -51,7 +50,6 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM): labels: Optional[torch.Tensor] = None, depths: Optional[torch.Tensor] = None, sibling_idxs: Optional[torch.Tensor] = None, - node_types: Optional[torch.Tensor] = None, output_attentions: bool = False, **kwargs ) -> Dict[str, torch.Tensor]: @@ -70,11 +68,14 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM): else: tree_embeddings = torch.zeros_like(token_embeddings) - # Get node type embeddings - node_type_embeddings = self.node_type_embeddings(node_types) + # # Get node type embeddings + # node_type_embeddings = self.node_type_embeddings(node_types) - combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings, node_type_embeddings], dim=-1) - combined_embeddings = self.fusion_layer(combined) + # combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings, node_type_embeddings], dim=-1) + # combined_embeddings = self.fusion_layer(combined) + + # Add the embeddings instead of concatenating + combined_embeddings = token_embeddings + tree_embeddings + seq_embeddings combined_embeddings = self.norm(combined_embeddings)