redesigned code, added functions and granularity

Patryk Bartkowiak 2024-10-21 20:05:56 +00:00
parent 3e2f9c7711
commit 240c16b495
10 changed files with 237 additions and 793 deletions

code/.gitignore

@@ -160,3 +160,7 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+# Weights & Biases
+wandb/
+outputs/

code/data/.gitignore (new file)

@@ -0,0 +1,2 @@
+*
+!.gitignore

code/models/.gitignore (new file)

@@ -0,0 +1,2 @@
+*
+!.gitignore

code/pdm.lock

@@ -5,7 +5,7 @@
 groups = ["default"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
-content_hash = "sha256:ac6621f3bd9193d786ab94f80f8b1711100fe418959f2e131ae03afeab616788"
+content_hash = "sha256:bf0a0ea826769cf12a84888d394edd8c3c5599c4d369b8b19b75c2fa5e16f5f0"
 
 [[metadata.targets]]
 requires_python = "==3.11.*"

code/pyproject.toml

@@ -6,13 +6,13 @@ authors = [
     {name = "Patryk Bartkowiak", email = "patbar15@st.amu.edu.pl"},
 ]
 dependencies = [
-    "wandb>=0.18.5",
-    "torch>=2.5.0",
-    "tqdm>=4.66.5",
-    "tree-sitter>=0.23.1",
-    "transformers>=4.45.2",
-    "datasets>=3.0.1",
-    "huggingface-hub>=0.26.0",
+    "wandb==0.18.5",
+    "torch==2.5.0",
+    "tqdm==4.66.5",
+    "tree-sitter==0.23.1",
+    "transformers==4.45.2",
+    "datasets==3.0.1",
+    "huggingface-hub==0.26.0",
 ]
 requires-python = "==3.11.*"
 readme = "README.md"

code/src/config.json (new file)

@@ -0,0 +1,11 @@
+{
+    "seed": 42,
+    "mlm_probability": 0.15,
+    "batch": 32,
+    "epochs": 1,
+    "eval_every": 10000,
+    "learning_rate": 5e-4,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "warmup_steps": 10000
+}
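
For context, the sketch below shows how these hyperparameters fit together: learning_rate and weight_decay configure AdamW, while warmup_steps and epochs (times the number of batches) define the linear warmup schedule. It is illustrative only and not part of the commit; the stand-in model and the assumed steps_per_epoch are placeholders for RobertaForMaskedLM and len(train_dataloader) in the actual training script.

# Illustrative only: wire config.json into an AdamW optimizer and a linear
# warmup schedule, mirroring what the training script does.
import json
from pathlib import Path

import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

config = json.loads(Path('code/src/config.json').read_text())

model = torch.nn.Linear(8, 8)          # placeholder for RobertaForMaskedLM
optimizer = AdamW(model.parameters(),
                  lr=config['learning_rate'],
                  weight_decay=config['weight_decay'])

steps_per_epoch = 1000                 # placeholder for len(train_dataloader)
num_training_steps = config['epochs'] * steps_per_epoch
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config['warmup_steps'],
    num_training_steps=num_training_steps,
)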

File diff suppressed because one or more lines are too long

(deleted file: offline dataset tokenization script, 58 lines removed)

from datasets import load_dataset, disable_caching
from transformers import RobertaTokenizer

disable_caching()


def visible_print(text):
    print('\n\n')
    print('=' * 100)
    print(text)
    print('=' * 100)
    print('\n\n')


if __name__ == '__main__':
    # Load the dataset
    train_data = load_dataset('/work/s452638/datasets/the-stack-python', split='train')
    valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')['train']
    test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')['train']
    visible_print('Loaded data')

    # Rename the columns
    train_data = train_data.rename_column('content', 'code')

    # Remove all the columns except the code
    train_columns = train_data.column_names
    valid_columns = valid_data.column_names
    test_columns = test_data.column_names

    train_columns.remove('code')
    valid_columns.remove('code')
    test_columns.remove('code')

    train_data = train_data.remove_columns(train_columns)
    valid_data = valid_data.remove_columns(valid_columns)
    test_data = test_data.remove_columns(test_columns)
    visible_print('Removed unnecessary columns')

    # Tokenize the data
    tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)

    def tokenize_function(examples):
        return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')

    train_data = train_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Train] Running tokenizer', num_proc=8)
    valid_data = valid_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Valid] Running tokenizer', num_proc=8)
    test_data = test_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Test] Running tokenizer', num_proc=8)
    visible_print('Tokenized data')

    # Save the tokenized data
    train_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/train')
    valid_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/valid')
    test_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/test')
    visible_print('Saved tokenized data')

File diff suppressed because one or more lines are too long

code/src/train_codebert_mlm.py (rewritten in this commit: the previous 254-line script was replaced by the 245-line version shown below)

import wandb
import os
import json
import random
import datetime
import logging
from pathlib import Path
from typing import Dict, Any, Tuple, List

import numpy as np
import torch
from torch import Tensor
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, disable_caching, DatasetDict
from huggingface_hub import list_repo_files, hf_hub_download
from transformers import (
    RobertaForMaskedLM,
    RobertaConfig,
    RobertaTokenizer,
    DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup,
    PreTrainedTokenizer,
    PreTrainedModel
)
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class OnTheFlyTokenizationDataset(Dataset):
    def __init__(self, dataset: Dataset, tokenizer: PreTrainedTokenizer, max_length: int):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.dataset)

    def __getitem__(self, idx: int) -> Dict[str, Tensor]:
        content: str = self.dataset[idx]['content']
        tokenized = self.tokenizer(
            content,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': tokenized['input_ids'].squeeze(0),
            'attention_mask': tokenized['attention_mask'].squeeze(0),
            'labels': tokenized['input_ids'].squeeze(0)
        }


def set_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def setup_wandb(config: Dict[str, Any]) -> None:
    curr_time: str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    wandb.init(project='codebert-training', name=curr_time, config=config)
    wandb.save('train_codebert_mlm.py')


def setup_directories(current_dir: Path) -> Path:
    curr_time: str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    output_dir: Path = current_dir.parent.parent / 'outputs' / curr_time
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir


def load_config(config_file: Path) -> Dict[str, Any]:
    with open(config_file, 'r') as f:
        return json.load(f)


def setup_device() -> torch.device:
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.set_default_device(device)
    logger.info(f'Using device: {device}')
    if device.type == 'cuda':
        logger.info(f'Device name: {torch.cuda.get_device_name()}')
        torch.set_float32_matmul_precision('high')
    return device


def download_dataset(dataset_dir: Path) -> None:
    if not dataset_dir.exists():
        logger.info("Downloading the dataset...")
        dataset_dir.mkdir(parents=True, exist_ok=True)
        files_list: List[str] = list_repo_files(repo_id='bigcode/the-stack-dedup', repo_type='dataset')
        files_to_download: List[str] = [file for file in files_list if file.startswith('data/python/')]
        for file_name in files_to_download:
            hf_hub_download(repo_id='bigcode/the-stack-dedup', repo_type='dataset', filename=file_name, local_dir=dataset_dir)
        logger.info("Dataset downloaded successfully.")


def load_and_prepare_dataset(dataset_dir: Path, seed: int) -> DatasetDict:
    dataset: DatasetDict = load_dataset(str(dataset_dir), split='train')
    dataset = dataset.train_test_split(test_size=0.01, seed=seed)
    logger.info(f'Dataset loaded: {dataset}')
    return dataset


def create_dataloaders(
    dataset: DatasetDict,
    tokenizer: PreTrainedTokenizer,
    config: Dict[str, Any],
    device: torch.device
) -> Tuple[DataLoader, DataLoader]:
    dataset['train'] = OnTheFlyTokenizationDataset(dataset['train'], tokenizer, max_length=512)
    dataset['test'] = OnTheFlyTokenizationDataset(dataset['test'], tokenizer, max_length=512)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=config['mlm_probability'])

    train_dataloader = DataLoader(
        dataset['train'],
        batch_size=config['batch'],
        shuffle=False,
        collate_fn=data_collator,
        generator=torch.Generator(device=device)
    )
    valid_dataloader = DataLoader(
        dataset['test'],
        batch_size=config['batch'],
        shuffle=False,
        collate_fn=data_collator,
        generator=torch.Generator(device=device)
    )
    return train_dataloader, valid_dataloader


def setup_model_and_optimizer(
    config: Dict[str, Any],
    current_dir: Path
) -> Tuple[PreTrainedModel, AdamW]:
    os.environ['HF_HOME'] = str(current_dir.parent / 'models')
    model_config = RobertaConfig.from_pretrained('roberta-base')
    model: PreTrainedModel = RobertaForMaskedLM(model_config)
    model = torch.compile(model)
    wandb.watch(model)
    logger.info(f'Model config: {model_config}')
    wandb.config.update({'model_config': model_config.to_dict()})

    optimizer: AdamW = AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    return model, optimizer


def train_and_evaluate(
    model: PreTrainedModel,
    train_dataloader: DataLoader,
    valid_dataloader: DataLoader,
    optimizer: AdamW,
    scheduler: Any,
    config: Dict[str, Any],
    output_dir: Path
) -> None:
    num_training_steps: int = config['epochs'] * len(train_dataloader)
    best_valid_loss: float = float('inf')

    with tqdm(total=num_training_steps, desc='Training') as pbar:
        for epoch_idx in range(config['epochs']):
            model.train()
            for train_idx, train_batch in enumerate(train_dataloader):
                outputs = model(**train_batch)
                train_loss: Tensor = outputs.loss
                train_loss.backward()

                norm: Tensor = torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])

                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                pbar.update(1)
                pbar.set_postfix({'train_loss': train_loss.item()})

                wandb.log({
                    'step': train_idx + len(train_dataloader) * epoch_idx,
                    'train_loss': train_loss.item(),
                    'gradient_norm': norm.item(),
                    'learning_rate': scheduler.get_last_lr()[0],
                })

                if train_idx != 0 and train_idx % config['eval_every'] == 0:
                    valid_loss, valid_acc = evaluate(model, valid_dataloader)
                    pbar.set_postfix({'train_loss': train_loss.item(), 'valid_loss': valid_loss, 'valid_acc': valid_acc})
                    wandb.log({
                        'valid_loss': valid_loss,
                        'valid_acc': valid_acc,
                        'step': train_idx + len(train_dataloader) * epoch_idx,
                    })

                    if valid_loss < best_valid_loss:
                        best_valid_loss = valid_loss
                        torch.save(model.state_dict(), output_dir / 'best_model.pt')

    logger.info(f'Best validation loss: {best_valid_loss}')


def evaluate(model: PreTrainedModel, dataloader: DataLoader) -> Tuple[float, float]:
    model.eval()
    total_loss: float = 0.0
    total_acc: float = 0.0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Validation'):
            outputs = model(**batch)
            total_loss += outputs.loss.item()
            total_acc += outputs.logits.argmax(dim=-1).eq(batch['labels']).sum().item()
    avg_loss: float = total_loss / len(dataloader)
    avg_acc: float = total_acc / len(dataloader.dataset)
    return avg_loss, avg_acc


def main() -> None:
    disable_caching()

    current_dir: Path = Path(__file__).parent
    output_dir: Path = setup_directories(current_dir)
    config: Dict[str, Any] = load_config(current_dir / 'config.json')
    setup_wandb(config)
    set_seed(config['seed'])
    device: torch.device = setup_device()

    dataset_dir: Path = current_dir.parent / 'data' / 'the-stack-python'
    download_dataset(dataset_dir)
    dataset: DatasetDict = load_and_prepare_dataset(dataset_dir, config['seed'])

    tokenizer: PreTrainedTokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
    logger.info(f'Tokenizer loaded: {tokenizer}')

    train_dataloader, valid_dataloader = create_dataloaders(dataset, tokenizer, config, device)
    model, optimizer = setup_model_and_optimizer(config, current_dir)

    num_training_steps: int = config['epochs'] * len(train_dataloader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=num_training_steps
    )

    train_and_evaluate(model, train_dataloader, valid_dataloader, optimizer, scheduler, config, output_dir)


if __name__ == "__main__":
    main()
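
The core change in this file is that tokenization now happens on the fly inside OnTheFlyTokenizationDataset and masking is delegated to DataCollatorForLanguageModeling, instead of loading a pre-tokenized copy of the dataset from disk. A minimal standalone sketch of that pipeline follows; it is illustrative only and not part of the commit (the code snippets are made up and max_length is shortened), but the tokenizer name and mlm_probability match the script and config.json.

# Illustrative only: tokenize two small code snippets on the fly and let the
# MLM collator pad the batch, mask ~15% of tokens, and build the labels
# (-100 on unmasked positions).
from transformers import DataCollatorForLanguageModeling, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

snippets = ["def add(a, b):\n    return a + b", "print('hello world')"]
features = [tokenizer(s, truncation=True, max_length=32) for s in snippets]

batch = collator(features)
n_masked = (batch['input_ids'] == tokenizer.mask_token_id).sum().item()
print(batch['input_ids'].shape, f'{n_masked} tokens masked')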