Compare commits
4 Commits
runpod-exp
...
master
Author | SHA1 | Date | |
---|---|---|---|
|
3b93a7cc8a | ||
|
76a89dc236 | ||
|
eed2096400 | ||
|
f0679ab861 |
@ -20,9 +20,11 @@ pdm install
|
|||||||
```
|
```
|
||||||
### 4. Run training code
|
### 4. Run training code
|
||||||
```bash
|
```bash
|
||||||
pdm run_training
|
pdm train
|
||||||
```
|
```
|
||||||
or
|
|
||||||
|
## Required secrets
|
||||||
```
|
```
|
||||||
pdm run src/train_codebert_mlm.py
|
export HF_TOKEN="<your-huggingface-access-token>"
|
||||||
|
export WANDB_API_KEY="<your-wandb-api-key>"
|
||||||
```
|
```
|
@ -1,8 +1,9 @@
|
|||||||
{
|
{
|
||||||
"extra_embeddings": true,
|
"extra_embeddings": true,
|
||||||
"run_name": "no-sinusoidal",
|
"run_name": "tree-continued",
|
||||||
"data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/",
|
"data_dir": "./data/codeparrot-clean-parsed-starencoder-no-comments/",
|
||||||
"output_dir": "./outputs/long-no-comments-starencoder-no-sinusoidal",
|
"output_dir": "./outputs/no-comments-starencoder-tree-2",
|
||||||
|
"checkpoint": null,
|
||||||
"seed": 420,
|
"seed": 420,
|
||||||
"mlm_probability": 0.15,
|
"mlm_probability": 0.15,
|
||||||
"batch_size": 32,
|
"batch_size": 32,
|
||||||
|
@ -1,23 +1,22 @@
|
|||||||
import wandb
|
import wandb
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import torch
|
||||||
|
import random
|
||||||
import logging
|
import logging
|
||||||
|
import numpy as np
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datasets import load_from_disk, DatasetDict
|
from safetensors.torch import load_file
|
||||||
|
from datasets import load_from_disk, DatasetDict, load_dataset
|
||||||
from transformers import (
|
from transformers import (
|
||||||
RobertaConfig,
|
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
RobertaForMaskedLM,
|
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
TrainingArguments,
|
TrainingArguments,
|
||||||
Trainer,
|
Trainer,
|
||||||
DataCollatorForLanguageModeling,
|
DataCollatorForLanguageModeling,
|
||||||
AutoModelForMaskedLM
|
AutoModelForMaskedLM
|
||||||
)
|
)
|
||||||
import random
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from tree_codebert import TreeCodeBERTForPreTraining
|
|
||||||
from tree_starencoder import TreeStarEncoderForPreTraining
|
from tree_starencoder import TreeStarEncoderForPreTraining
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@ -51,43 +50,34 @@ def main():
|
|||||||
set_seed(config['seed'])
|
set_seed(config['seed'])
|
||||||
|
|
||||||
# Initialize W&B
|
# Initialize W&B
|
||||||
wandb.init(project='codeparrot-starencoder-no-comments', config=config, name=config['run_name'])
|
wandb.init(project='gralinski', config=config, name=config['run_name'])
|
||||||
|
|
||||||
# Upload the training files to W&B
|
# Upload the training files to W&B
|
||||||
wandb.save(__file__)
|
wandb.save(__file__)
|
||||||
wandb.save(Path(__file__).parent / 'config.json')
|
wandb.save(current_dir / 'config.json')
|
||||||
if config['extra_embeddings']:
|
if config['extra_embeddings']:
|
||||||
wandb.save(current_dir / 'tree_starencoder.py')
|
wandb.save(current_dir / 'tree_starencoder.py')
|
||||||
|
|
||||||
if 'CodeSearchNet' in config['data_dir']:
|
dataset = load_dataset("patrykbart/codeparrot-clean-no-comments-starencoder-small", split='train', num_proc=16, cache_dir=data_dir.parent)
|
||||||
dataset = DatasetDict({
|
if config['num_samples'] > 0:
|
||||||
'train': load_from_disk(data_dir / 'train'),
|
dataset = dataset.select(range(config['num_samples']))
|
||||||
'valid': load_from_disk(data_dir / 'valid'),
|
train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
|
||||||
'test': load_from_disk(data_dir / 'test')
|
test_valid = train_testvalid['test'].train_test_split(
|
||||||
})
|
test_size=config['valid_size'] / (config['test_size'] + config['valid_size']),
|
||||||
else:
|
seed=config['seed']
|
||||||
dataset = load_from_disk(data_dir)
|
)
|
||||||
if config['num_samples'] > 0:
|
dataset = DatasetDict({
|
||||||
dataset = dataset.select(range(config['num_samples']))
|
'train': train_testvalid['train'],
|
||||||
train_testvalid = dataset.train_test_split(test_size=config['test_size'] + config['valid_size'])
|
'test': test_valid['test'],
|
||||||
test_valid = train_testvalid['test'].train_test_split(
|
'valid': test_valid['train'],
|
||||||
test_size=config['valid_size'] / (config['test_size'] + config['valid_size']),
|
})
|
||||||
seed=config['seed']
|
|
||||||
)
|
|
||||||
dataset = DatasetDict({
|
|
||||||
'train': train_testvalid['train'],
|
|
||||||
'test': test_valid['test'],
|
|
||||||
'valid': test_valid['train'],
|
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
# Continue with the rest of processing
|
# Continue with the rest of processing
|
||||||
columns_to_remove = dataset['train'].column_names
|
columns_to_remove = dataset['train'].column_names
|
||||||
columns_to_remove.remove('input_ids')
|
columns_to_remove = [col for col in columns_to_remove if col not in ['input_ids', 'attention_mask']]
|
||||||
columns_to_remove.remove('attention_mask')
|
|
||||||
if config['extra_embeddings']:
|
if config['extra_embeddings']:
|
||||||
columns_to_remove.remove('depths')
|
columns_to_remove = [col for col in columns_to_remove if col not in ['depths', 'sibling_idxs']]
|
||||||
columns_to_remove.remove('sibling_idxs')
|
|
||||||
dataset = dataset.remove_columns(columns_to_remove)
|
dataset = dataset.remove_columns(columns_to_remove)
|
||||||
logger.info(f'Loaded dataset:\n{dataset}')
|
logger.info(f'Loaded dataset:\n{dataset}')
|
||||||
|
|
||||||
@ -102,12 +92,20 @@ def main():
|
|||||||
logger.info("Set padding token to be the same as the EOS token.")
|
logger.info("Set padding token to be the same as the EOS token.")
|
||||||
|
|
||||||
model_config = AutoConfig.from_pretrained('bigcode/starencoder')
|
model_config = AutoConfig.from_pretrained('bigcode/starencoder')
|
||||||
if config['extra_embeddings']:
|
model = TreeStarEncoderForPreTraining(model_config) if config['extra_embeddings'] else AutoModelForMaskedLM.from_config(model_config)
|
||||||
model = TreeStarEncoderForPreTraining(model_config)
|
|
||||||
else:
|
|
||||||
model = AutoModelForMaskedLM.from_config(model_config)
|
|
||||||
logger.info(f'Loaded model: {model.__class__.__name__}')
|
logger.info(f'Loaded model: {model.__class__.__name__}')
|
||||||
|
|
||||||
|
# Load checkpoint if provided
|
||||||
|
if config['checkpoint'] is not None:
|
||||||
|
checkpoint_path = Path(config['checkpoint']) / 'model.safetensors'
|
||||||
|
logger.info(f'Loading checkpoint from {checkpoint_path}')
|
||||||
|
state_dict = load_file(checkpoint_path)
|
||||||
|
model.load_state_dict(state_dict, strict=False)
|
||||||
|
model.tie_weights()
|
||||||
|
config['warmup_steps'] = 0
|
||||||
|
config['learning_rate'] = 4.8701e-7
|
||||||
|
logger.info('Checkpoint loaded successfully.')
|
||||||
|
|
||||||
# Setup training arguments
|
# Setup training arguments
|
||||||
training_args = TrainingArguments(
|
training_args = TrainingArguments(
|
||||||
output_dir=str(output_dir),
|
output_dir=str(output_dir),
|
||||||
|
@ -13,12 +13,12 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
|
|||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
# self.fusion_layer = nn.Sequential(
|
self.fusion_layer = nn.Sequential(
|
||||||
# nn.Linear(config.hidden_size * 4, config.hidden_size),
|
nn.Linear(config.hidden_size * 3, config.hidden_size),
|
||||||
# nn.GELU(),
|
nn.GELU(),
|
||||||
# nn.Dropout(config.hidden_dropout_prob),
|
nn.Dropout(config.hidden_dropout_prob),
|
||||||
# nn.LayerNorm(config.hidden_size)
|
nn.LayerNorm(config.hidden_size)
|
||||||
# )
|
)
|
||||||
|
|
||||||
# Override config to set max_seq_length
|
# Override config to set max_seq_length
|
||||||
config.max_position_embeddings = max_seq_length
|
config.max_position_embeddings = max_seq_length
|
||||||
@ -31,13 +31,13 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
|
|||||||
|
|
||||||
self.seq_pos_embeddings = nn.Embedding(max_seq_length, config.hidden_size)
|
self.seq_pos_embeddings = nn.Embedding(max_seq_length, config.hidden_size)
|
||||||
|
|
||||||
# # Initialize sequential position embeddings with sinusoidal pattern
|
# Initialize sequential position embeddings with sinusoidal pattern
|
||||||
# position = torch.arange(max_seq_length).unsqueeze(1)
|
position = torch.arange(max_seq_length).unsqueeze(1)
|
||||||
# div_term = torch.exp(torch.arange(0, config.hidden_size, 2) * (-math.log(10000.0) / config.hidden_size))
|
div_term = torch.exp(torch.arange(0, config.hidden_size, 2) * (-math.log(10000.0) / config.hidden_size))
|
||||||
# pe = torch.zeros(max_seq_length, config.hidden_size)
|
pe = torch.zeros(max_seq_length, config.hidden_size)
|
||||||
# pe[:, 0::2] = torch.sin(position * div_term)
|
pe[:, 0::2] = torch.sin(position * div_term)
|
||||||
# pe[:, 1::2] = torch.cos(position * div_term)
|
pe[:, 1::2] = torch.cos(position * div_term)
|
||||||
# self.seq_pos_embeddings.weight.data.copy_(pe)
|
self.seq_pos_embeddings.weight.data.copy_(pe)
|
||||||
|
|
||||||
# New node type embeddings
|
# New node type embeddings
|
||||||
self.node_type_embeddings = nn.Embedding(217, config.hidden_size)
|
self.node_type_embeddings = nn.Embedding(217, config.hidden_size)
|
||||||
@ -72,10 +72,11 @@ class TreeStarEncoderForPreTraining(BertForMaskedLM):
|
|||||||
# node_type_embeddings = self.node_type_embeddings(node_types)
|
# node_type_embeddings = self.node_type_embeddings(node_types)
|
||||||
|
|
||||||
# combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings, node_type_embeddings], dim=-1)
|
# combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings, node_type_embeddings], dim=-1)
|
||||||
# combined_embeddings = self.fusion_layer(combined)
|
combined = torch.cat([token_embeddings, tree_embeddings, seq_embeddings], dim=-1)
|
||||||
|
combined_embeddings = self.fusion_layer(combined)
|
||||||
|
|
||||||
# Add the embeddings instead of concatenating
|
# Add the embeddings instead of concatenating
|
||||||
combined_embeddings = token_embeddings + tree_embeddings + seq_embeddings
|
# combined_embeddings = token_embeddings + tree_embeddings + seq_embeddings
|
||||||
|
|
||||||
combined_embeddings = self.norm(combined_embeddings)
|
combined_embeddings = self.norm(combined_embeddings)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user