redesigned code, added functions and granularity

This commit is contained in:
Patryk Bartkowiak 2024-10-21 20:05:56 +00:00
parent 3e2f9c7711
commit 240c16b495
10 changed files with 237 additions and 793 deletions

4
code/.gitignore vendored
View File

@ -160,3 +160,7 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear # and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
# Weights & Biases
wandb/
outputs/

2
code/data/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*
!.gitignore

2
code/models/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*
!.gitignore

View File

@ -5,7 +5,7 @@
groups = ["default"] groups = ["default"]
strategy = ["inherit_metadata"] strategy = ["inherit_metadata"]
lock_version = "4.5.0" lock_version = "4.5.0"
content_hash = "sha256:ac6621f3bd9193d786ab94f80f8b1711100fe418959f2e131ae03afeab616788" content_hash = "sha256:bf0a0ea826769cf12a84888d394edd8c3c5599c4d369b8b19b75c2fa5e16f5f0"
[[metadata.targets]] [[metadata.targets]]
requires_python = "==3.11.*" requires_python = "==3.11.*"

View File

@ -6,13 +6,13 @@ authors = [
{name = "Patryk Bartkowiak", email = "patbar15@st.amu.edu.pl"}, {name = "Patryk Bartkowiak", email = "patbar15@st.amu.edu.pl"},
] ]
dependencies = [ dependencies = [
"wandb>=0.18.5", "wandb==0.18.5",
"torch>=2.5.0", "torch==2.5.0",
"tqdm>=4.66.5", "tqdm==4.66.5",
"tree-sitter>=0.23.1", "tree-sitter==0.23.1",
"transformers>=4.45.2", "transformers==4.45.2",
"datasets>=3.0.1", "datasets==3.0.1",
"huggingface-hub>=0.26.0", "huggingface-hub==0.26.0",
] ]
requires-python = "==3.11.*" requires-python = "==3.11.*"
readme = "README.md" readme = "README.md"

11
code/src/config.json Normal file
View File

@ -0,0 +1,11 @@
{
"seed": 42,
"mlm_probability": 0.15,
"batch": 32,
"epochs": 1,
"eval_every": 10000,
"learning_rate": 5e-4,
"weight_decay": 0.01,
"max_grad_norm": 1.0,
"warmup_steps": 10000
}

File diff suppressed because one or more lines are too long

View File

@ -1,58 +0,0 @@
from datasets import load_dataset, disable_caching
from transformers import RobertaTokenizer
disable_caching()
def visible_print(text):
print('\n\n')
print('=' * 100)
print(text)
print('=' * 100)
print('\n\n')
if __name__ == '__main__':
# Load the dataset
train_data = load_dataset('/work/s452638/datasets/the-stack-python', split='train')
valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')['train']
test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')['train']
visible_print('Loaded data')
# Rename the columns
train_data = train_data.rename_column('content', 'code')
# Remove all the columns except the code
train_columns = train_data.column_names
valid_columns = valid_data.column_names
test_columns = test_data.column_names
train_columns.remove('code')
valid_columns.remove('code')
test_columns.remove('code')
train_data = train_data.remove_columns(train_columns)
valid_data = valid_data.remove_columns(valid_columns)
test_data = test_data.remove_columns(test_columns)
visible_print('Removed unnecessary columns')
# Tokenize the data
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
def tokenize_function(examples):
return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
train_data = train_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Train] Running tokenizer', num_proc=8)
valid_data = valid_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Valid] Running tokenizer', num_proc=8)
test_data = test_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Test] Running tokenizer', num_proc=8)
visible_print('Tokenized data')
# Save the tokenized data
train_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/train')
valid_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/valid')
test_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/test')
visible_print('Saved tokenized data')

File diff suppressed because one or more lines are too long

View File

@ -1,254 +1,245 @@
import wandb import wandb
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
import os import os
import json
import random import random
import datetime import datetime
import logging
from pathlib import Path
from typing import Dict, Any, Tuple, List
import numpy as np import numpy as np
from datasets import load_dataset, load_from_disk, disable_caching, DatasetDict import torch
from tree_sitter import Language, Parser from torch import Tensor
from transformers import RobertaForMaskedLM, RobertaConfig, RobertaTokenizer, DataCollatorForLanguageModeling from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, disable_caching, DatasetDict
from huggingface_hub import list_repo_files, hf_hub_download
from transformers import (
RobertaForMaskedLM,
RobertaConfig,
RobertaTokenizer,
DataCollatorForLanguageModeling,
get_linear_schedule_with_warmup,
PreTrainedTokenizer,
PreTrainedModel
)
from tqdm import tqdm from tqdm import tqdm
from utils import remove_docstrings_and_comments_from_code # Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Disable caching for datasets class OnTheFlyTokenizationDataset(Dataset):
disable_caching() def __init__(self, dataset: Dataset, tokenizer: PreTrainedTokenizer, max_length: int):
self.dataset = dataset
self.tokenizer = tokenizer
self.max_length = max_length
############################### CONFIG ############################### def __len__(self) -> int:
dataset_name = 'the-stack-tokenized' # 'the-stack' or 'code-search-net' or 'the-stack-tokenized return len(self.dataset)
remove_comments = False
######################################################################
# Initialize Weights & Biases and output directory def __getitem__(self, idx: int) -> Dict[str, Tensor]:
curr_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') content: str = self.dataset[idx]['content']
wandb.init(project='codebert-training', name=curr_time) tokenized = self.tokenizer(
output_dir = f'/home/s452638/magisterka/output/{curr_time}/' content,
truncation=True,
padding='max_length',
max_length=self.max_length,
return_tensors='pt'
)
return {
'input_ids': tokenized['input_ids'].squeeze(0),
'attention_mask': tokenized['attention_mask'].squeeze(0),
'labels': tokenized['input_ids'].squeeze(0)
}
# Save this file to Weights & Biases def set_seed(seed: int) -> None:
wandb.save('train_codebert_mlm.py')
# Create the output directory if it does not exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Set the seed for reproducibility
SEED = 42
def set_seed(seed):
random.seed(seed) random.seed(seed)
np.random.seed(seed) np.random.seed(seed)
torch.manual_seed(seed) torch.manual_seed(seed)
if torch.cuda.is_available(): if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed) torch.cuda.manual_seed_all(seed)
set_seed(SEED) def setup_wandb(config: Dict[str, Any]) -> None:
curr_time: str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
wandb.init(project='codebert-training', name=curr_time, config=config)
wandb.save('train_codebert_mlm.py')
# Set the device for PyTorch (use GPU if available, otherwise CPU) def setup_directories(current_dir: Path) -> Path:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') curr_time: str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
torch.set_default_device(device) output_dir: Path = current_dir.parent.parent / 'outputs' / curr_time
print('*' * 10, 'Device', '*' * 10) output_dir.mkdir(parents=True, exist_ok=True)
print(f'Using device: {device}') return output_dir
if device.type == 'cuda':
print(f'Device name: {torch.cuda.get_device_name()}')
# Load the dataset def load_config(config_file: Path) -> Dict[str, Any]:
if dataset_name == 'the-stack-tokenized': with open(config_file, 'r') as f:
train_data = load_from_disk('/work/s452638/datasets/the-stack-python-tokenized/train') return json.load(f)
valid_data = load_from_disk('/work/s452638/datasets/the-stack-python-tokenized/valid')
test_data = load_from_disk('/work/s452638/datasets/the-stack-python-tokenized/test')
else:
if dataset_name == 'the-stack':
train_data = load_dataset("/work/s452638/datasets/the-stack-python", split="train")
train_data = train_data.rename_column_('content', 'code')
elif dataset_name == 'code-search-net':
train_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/train.jsonl')['train']
valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')['valid'] def setup_device() -> torch.device:
test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')['test'] device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.set_default_device(device)
logger.info(f'Using device: {device}')
if device.type == 'cuda':
logger.info(f'Device name: {torch.cuda.get_device_name()}')
torch.set_float32_matmul_precision('high')
return device
dataset = DatasetDict({'train': train_data, 'valid': valid_data, 'test': test_data}) def download_dataset(dataset_dir: Path) -> None:
print('\n\n', '*' * 10, 'Dataset', '*' * 10) if not dataset_dir.exists():
print(dataset) logger.info("Downloading the dataset...")
dataset_dir.mkdir(parents=True, exist_ok=True)
files_list: List[str] = list_repo_files(repo_id='bigcode/the-stack-dedup', repo_type='dataset')
files_to_download: List[str] = [file for file in files_list if file.startswith('data/python/')]
for file_name in files_to_download:
hf_hub_download(repo_id='bigcode/the-stack-dedup', repo_type='dataset', filename=file_name, local_dir=dataset_dir)
logger.info("Dataset downloaded successfully.")
if remove_comments: def load_and_prepare_dataset(dataset_dir: Path, seed: int) -> DatasetDict:
# Build the language library if not already built dataset: DatasetDict = load_dataset(str(dataset_dir), split='train')
Language.build_library('/home/s452638/magisterka/build/my-languages.so', ['/home/s452638/magisterka/vendor/tree-sitter-python']) dataset = dataset.train_test_split(test_size=0.01, seed=seed)
logger.info(f'Dataset loaded: {dataset}')
return dataset
# Load the language def create_dataloaders(
PYTHON_LANGUAGE = Language('/home/s452638/magisterka/build/my-languages.so', 'python') dataset: DatasetDict,
tokenizer: PreTrainedTokenizer,
config: Dict[str, Any],
device: torch.device
) -> Tuple[DataLoader, DataLoader]:
dataset['train'] = OnTheFlyTokenizationDataset(dataset['train'], tokenizer, max_length=512)
dataset['test'] = OnTheFlyTokenizationDataset(dataset['test'], tokenizer, max_length=512)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=config['mlm_probability'])
train_dataloader = DataLoader(
dataset['train'],
batch_size=config['batch'],
shuffle=False,
collate_fn=data_collator,
generator=torch.Generator(device=device)
)
valid_dataloader = DataLoader(
dataset['test'],
batch_size=config['batch'],
shuffle=False,
collate_fn=data_collator,
generator=torch.Generator(device=device)
)
return train_dataloader, valid_dataloader
# Initialize the parser def setup_model_and_optimizer(
parser = Parser() config: Dict[str, Any],
parser.set_language(PYTHON_LANGUAGE) current_dir: Path
) -> Tuple[PreTrainedModel, AdamW]:
os.environ['HF_HOME'] = str(current_dir.parent / 'models')
model_config = RobertaConfig.from_pretrained('roberta-base')
model: PreTrainedModel = RobertaForMaskedLM(model_config)
model = torch.compile(model)
wandb.watch(model)
logger.info(f'Model config: {model_config}')
wandb.config.update({'model_config': model_config.to_dict()})
optimizer: AdamW = AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
return model, optimizer
# Remove docstrings and comments from the code def train_and_evaluate(
dataset = dataset.map(lambda x: {'code': remove_docstrings_and_comments_from_code(x['code'], parser)}, batched=False, desc='Removing docstrings and comments') model: PreTrainedModel,
train_dataloader: DataLoader,
# Load the tokenizer valid_dataloader: DataLoader,
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True) optimizer: AdamW,
print('\n\n', '*' * 10, 'Tokenizer', '*' * 10) scheduler: Any,
print(tokenizer) config: Dict[str, Any],
output_dir: Path
if dataset_name != 'the-stack-tokenized': ) -> None:
# Tokenize the dataset num_training_steps: int = config['epochs'] * len(train_dataloader)
def tokenize_function(examples): best_valid_loss: float = float('inf')
return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
with tqdm(total=num_training_steps, desc='Training') as pbar:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['code'], desc='Running tokenizer') for epoch_idx in range(config['epochs']):
print('\n\n', '*' * 10, 'Tokenized dataset', '*' * 10) model.train()
print(tokenized_datasets) for train_idx, train_batch in enumerate(train_dataloader):
else:
tokenized_datasets = dataset
# Set data collator for MLM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
# Create DataLoaders
batch_size = 64
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, shuffle=False, collate_fn=data_collator, generator=torch.Generator(device=device))
valid_dataloader = DataLoader(tokenized_datasets['valid'], batch_size=batch_size, shuffle=False, collate_fn=data_collator, generator=torch.Generator(device=device))
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch_size, shuffle=False, collate_fn=data_collator, generator=torch.Generator(device=device))
# Initialize a model with random weights based on the configuration for RoBERTa (CodeBERT is based on RoBERTa)
config = RobertaConfig.from_pretrained('roberta-base')
model = RobertaForMaskedLM(config)
model = torch.compile(model)
wandb.watch(model)
print('\n\n', '*' * 10, 'Model', '*' * 10)
print(config)
# Log the model configuration to wandb
wandb.config.update({'model_config': config.to_dict()})
# Set the optimizer and scaler
learning_rate = 5e-4
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scaler = torch.amp.GradScaler()
# Training settings
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
eval_every = 10_000
# Log training settings to wandb
wandb.config.update({
'training_settings': {
'num_epochs': num_epochs,
'num_training_steps': num_training_steps,
'eval_every': eval_every,
'batch_size': batch_size,
'learning_rate': learning_rate,
}
})
# Initialize variables to track validation loss, accuracy, and best model path
valid_acc = 0.0
valid_loss = 0.0
best_valid_loss = float('inf')
# Train the model
print('\n\n', '*' * 10, 'Training', '*' * 10)
model.train()
with tqdm(total=num_training_steps, desc='Training') as pbar:
for epoch_idx in range(num_epochs):
for train_idx, train_batch in enumerate(train_dataloader):
# Forward pass with mixed precision
with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
outputs = model(**train_batch) outputs = model(**train_batch)
train_loss: Tensor = outputs.loss
train_loss = outputs.loss train_loss.backward()
scaler.scale(train_loss).backward()
norm: Tensor = torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])
# Gradient clipping to prevent exploding gradients
norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step()
scheduler.step()
scaler.step(optimizer) optimizer.zero_grad()
scaler.update()
optimizer.zero_grad() pbar.update(1)
pbar.set_postfix({'train_loss': train_loss.item()})
pbar.update(1)
pbar.set_postfix({'norm': norm.item(), 'train_loss': train_loss.item(), 'valid_loss': valid_loss, 'valid_acc': valid_acc})
# Log metrics to Weights & Biases
wandb.log({
'step': train_idx + len(train_dataloader) * epoch_idx,
'train_loss': train_loss.item(),
'gradient_norm': norm.item(),
'learning_rate': optimizer.param_groups[0]['lr'],
})
# Evaluate the model
if train_idx != 0 and train_idx % eval_every == 0:
model.eval()
valid_loss = 0.0
valid_acc = 0.0
with tqdm(total=len(valid_dataloader), desc='Validation') as pbar_valid:
with torch.no_grad():
for valid_idx, valid_batch in enumerate(valid_dataloader):
# Forward pass with mixed precision for validation
with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
outputs = model(**valid_batch)
# Accumulate validation loss and accuracy
valid_loss += outputs.loss.item()
valid_acc += outputs.logits.argmax(dim=-1).eq(valid_batch['labels']).sum().item()
pbar_valid.update(1)
# Compute average validation loss and accuracy
valid_loss /= len(valid_dataloader)
valid_acc /= len(valid_dataloader.dataset)
model.train()
# Log validation metrics to Weights & Biases
wandb.log({ wandb.log({
'valid_loss': valid_loss,
'valid_acc': valid_acc,
'step': train_idx + len(train_dataloader) * epoch_idx, 'step': train_idx + len(train_dataloader) * epoch_idx,
'train_loss': train_loss.item(),
'gradient_norm': norm.item(),
'learning_rate': scheduler.get_last_lr()[0],
}) })
if train_idx != 0 and train_idx % config['eval_every'] == 0:
valid_loss, valid_acc = evaluate(model, valid_dataloader)
pbar.set_postfix({'train_loss': train_loss.item(), 'valid_loss': valid_loss, 'valid_acc': valid_acc})
wandb.log({
'valid_loss': valid_loss,
'valid_acc': valid_acc,
'step': train_idx + len(train_dataloader) * epoch_idx,
})
if valid_loss < best_valid_loss:
best_valid_loss = valid_loss
torch.save(model.state_dict(), output_dir / 'best_model.pt')
logger.info(f'Best validation loss: {best_valid_loss}')
# Update best model if current validation loss is lower def evaluate(model: PreTrainedModel, dataloader: DataLoader) -> Tuple[float, float]:
if valid_loss < best_valid_loss: model.eval()
best_valid_loss = valid_loss total_loss: float = 0.0
torch.save(model.state_dict(), output_dir + f'best_model.pt') total_acc: float = 0.0
print('\n\n', '*' * 10, 'Training results', '*' * 10)
print(f'Best validation loss: {best_valid_loss}')
# Load the best model and evaluate on the test set
print('\n\n', '*' * 10, 'Testing', '*' * 10)
model.load_state_dict(torch.load(output_dir + f'best_model.pt', weights_only=True, map_location=device))
model.eval()
test_loss = 0.0
test_acc = 0.0
with tqdm(total=len(test_dataloader), desc='Testing') as pbar_test:
with torch.no_grad(): with torch.no_grad():
for test_idx, test_batch in enumerate(test_dataloader): for batch in tqdm(dataloader, desc='Validation'):
outputs = model(**batch)
total_loss += outputs.loss.item()
total_acc += outputs.logits.argmax(dim=-1).eq(batch['labels']).sum().item()
avg_loss: float = total_loss / len(dataloader)
avg_acc: float = total_acc / len(dataloader.dataset)
return avg_loss, avg_acc
# Forward pass with mixed precision for testing def main() -> None:
with torch.autocast(device_type=device.type, dtype=torch.bfloat16): disable_caching()
outputs = model(**test_batch)
current_dir: Path = Path(__file__).parent
output_dir: Path = setup_directories(current_dir)
config: Dict[str, Any] = load_config(current_dir / 'config.json')
# Accumulate test loss and accuracy setup_wandb(config)
test_loss += outputs.loss.item() set_seed(config['seed'])
test_acc += outputs.logits.argmax(dim=-1).eq(test_batch['labels']).sum().item() device: torch.device = setup_device()
pbar_test.update(1)
dataset_dir: Path = current_dir.parent / 'data' / 'the-stack-python'
download_dataset(dataset_dir)
dataset: DatasetDict = load_and_prepare_dataset(dataset_dir, config['seed'])
tokenizer: PreTrainedTokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
logger.info(f'Tokenizer loaded: {tokenizer}')
train_dataloader, valid_dataloader = create_dataloaders(dataset, tokenizer, config, device)
model, optimizer = setup_model_and_optimizer(config, current_dir)
num_training_steps: int = config['epochs'] * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=config['warmup_steps'],
num_training_steps=num_training_steps
)
train_and_evaluate(model, train_dataloader, valid_dataloader, optimizer, scheduler, config, output_dir)
# Compute average test loss and accuracy if __name__ == "__main__":
test_loss /= len(test_dataloader) main()
test_acc /= len(test_dataloader.dataset)
# Log test metrics to Weights & Biases
wandb.log({
'test_loss': test_loss,
'test_acc': test_acc,
})
print('\n\n', '*' * 10, 'Test results', '*' * 10)
print(f'Test loss: {test_loss}')
print(f'Test accuracy: {test_acc}')