redesigned code, added functions and granularity
This commit is contained in:
parent
3e2f9c7711
commit
240c16b495
4
code/.gitignore
vendored
4
code/.gitignore
vendored
@ -160,3 +160,7 @@ cython_debug/
|
|||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
# Weights & Biases
|
||||||
|
wandb/
|
||||||
|
outputs/
|
||||||
|
2
code/data/.gitignore
vendored
Normal file
2
code/data/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
*
|
||||||
|
!.gitignore
|
2
code/models/.gitignore
vendored
Normal file
2
code/models/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
*
|
||||||
|
!.gitignore
|
@ -5,7 +5,7 @@
|
|||||||
groups = ["default"]
|
groups = ["default"]
|
||||||
strategy = ["inherit_metadata"]
|
strategy = ["inherit_metadata"]
|
||||||
lock_version = "4.5.0"
|
lock_version = "4.5.0"
|
||||||
content_hash = "sha256:ac6621f3bd9193d786ab94f80f8b1711100fe418959f2e131ae03afeab616788"
|
content_hash = "sha256:bf0a0ea826769cf12a84888d394edd8c3c5599c4d369b8b19b75c2fa5e16f5f0"
|
||||||
|
|
||||||
[[metadata.targets]]
|
[[metadata.targets]]
|
||||||
requires_python = "==3.11.*"
|
requires_python = "==3.11.*"
|
||||||
|
@ -6,13 +6,13 @@ authors = [
|
|||||||
{name = "Patryk Bartkowiak", email = "patbar15@st.amu.edu.pl"},
|
{name = "Patryk Bartkowiak", email = "patbar15@st.amu.edu.pl"},
|
||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"wandb>=0.18.5",
|
"wandb==0.18.5",
|
||||||
"torch>=2.5.0",
|
"torch==2.5.0",
|
||||||
"tqdm>=4.66.5",
|
"tqdm==4.66.5",
|
||||||
"tree-sitter>=0.23.1",
|
"tree-sitter==0.23.1",
|
||||||
"transformers>=4.45.2",
|
"transformers==4.45.2",
|
||||||
"datasets>=3.0.1",
|
"datasets==3.0.1",
|
||||||
"huggingface-hub>=0.26.0",
|
"huggingface-hub==0.26.0",
|
||||||
]
|
]
|
||||||
requires-python = "==3.11.*"
|
requires-python = "==3.11.*"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
11
code/src/config.json
Normal file
11
code/src/config.json
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
{
|
||||||
|
"seed": 42,
|
||||||
|
"mlm_probability": 0.15,
|
||||||
|
"batch": 32,
|
||||||
|
"epochs": 1,
|
||||||
|
"eval_every": 10000,
|
||||||
|
"learning_rate": 5e-4,
|
||||||
|
"weight_decay": 0.01,
|
||||||
|
"max_grad_norm": 1.0,
|
||||||
|
"warmup_steps": 10000
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
@ -1,58 +0,0 @@
|
|||||||
from datasets import load_dataset, disable_caching
|
|
||||||
from transformers import RobertaTokenizer
|
|
||||||
|
|
||||||
disable_caching()
|
|
||||||
|
|
||||||
|
|
||||||
def visible_print(text):
|
|
||||||
print('\n\n')
|
|
||||||
print('=' * 100)
|
|
||||||
print(text)
|
|
||||||
print('=' * 100)
|
|
||||||
print('\n\n')
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
# Load the dataset
|
|
||||||
train_data = load_dataset('/work/s452638/datasets/the-stack-python', split='train')
|
|
||||||
valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')['train']
|
|
||||||
test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')['train']
|
|
||||||
|
|
||||||
visible_print('Loaded data')
|
|
||||||
|
|
||||||
# Rename the columns
|
|
||||||
train_data = train_data.rename_column('content', 'code')
|
|
||||||
|
|
||||||
# Remove all the columns except the code
|
|
||||||
train_columns = train_data.column_names
|
|
||||||
valid_columns = valid_data.column_names
|
|
||||||
test_columns = test_data.column_names
|
|
||||||
|
|
||||||
train_columns.remove('code')
|
|
||||||
valid_columns.remove('code')
|
|
||||||
test_columns.remove('code')
|
|
||||||
|
|
||||||
train_data = train_data.remove_columns(train_columns)
|
|
||||||
valid_data = valid_data.remove_columns(valid_columns)
|
|
||||||
test_data = test_data.remove_columns(test_columns)
|
|
||||||
|
|
||||||
visible_print('Removed unnecessary columns')
|
|
||||||
|
|
||||||
# Tokenize the data
|
|
||||||
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
|
|
||||||
|
|
||||||
def tokenize_function(examples):
|
|
||||||
return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
|
|
||||||
|
|
||||||
train_data = train_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Train] Running tokenizer', num_proc=8)
|
|
||||||
valid_data = valid_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Valid] Running tokenizer', num_proc=8)
|
|
||||||
test_data = test_data.map(tokenize_function, batched=True, remove_columns=['code'], desc='[Test] Running tokenizer', num_proc=8)
|
|
||||||
|
|
||||||
visible_print('Tokenized data')
|
|
||||||
|
|
||||||
# Save the tokenized data
|
|
||||||
train_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/train')
|
|
||||||
valid_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/valid')
|
|
||||||
test_data.save_to_disk('/work/s452638/datasets/the-stack-python-tokenized/test')
|
|
||||||
|
|
||||||
visible_print('Saved tokenized data')
|
|
File diff suppressed because one or more lines are too long
@ -1,254 +1,245 @@
|
|||||||
import wandb
|
import wandb
|
||||||
|
|
||||||
import torch
|
|
||||||
from torch.optim import AdamW
|
|
||||||
from torch.utils.data import DataLoader
|
|
||||||
import os
|
import os
|
||||||
|
import json
|
||||||
import random
|
import random
|
||||||
import datetime
|
import datetime
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Tuple, List
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from datasets import load_dataset, load_from_disk, disable_caching, DatasetDict
|
import torch
|
||||||
from tree_sitter import Language, Parser
|
from torch import Tensor
|
||||||
from transformers import RobertaForMaskedLM, RobertaConfig, RobertaTokenizer, DataCollatorForLanguageModeling
|
from torch.optim import AdamW
|
||||||
|
from torch.utils.data import DataLoader, Dataset
|
||||||
|
from datasets import load_dataset, disable_caching, DatasetDict
|
||||||
|
from huggingface_hub import list_repo_files, hf_hub_download
|
||||||
|
from transformers import (
|
||||||
|
RobertaForMaskedLM,
|
||||||
|
RobertaConfig,
|
||||||
|
RobertaTokenizer,
|
||||||
|
DataCollatorForLanguageModeling,
|
||||||
|
get_linear_schedule_with_warmup,
|
||||||
|
PreTrainedTokenizer,
|
||||||
|
PreTrainedModel
|
||||||
|
)
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from utils import remove_docstrings_and_comments_from_code
|
# Configure logging
|
||||||
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Disable caching for datasets
|
class OnTheFlyTokenizationDataset(Dataset):
|
||||||
disable_caching()
|
def __init__(self, dataset: Dataset, tokenizer: PreTrainedTokenizer, max_length: int):
|
||||||
|
self.dataset = dataset
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
self.max_length = max_length
|
||||||
|
|
||||||
############################### CONFIG ###############################
|
def __len__(self) -> int:
|
||||||
dataset_name = 'the-stack-tokenized' # 'the-stack' or 'code-search-net' or 'the-stack-tokenized
|
return len(self.dataset)
|
||||||
remove_comments = False
|
|
||||||
######################################################################
|
|
||||||
|
|
||||||
# Initialize Weights & Biases and output directory
|
def __getitem__(self, idx: int) -> Dict[str, Tensor]:
|
||||||
curr_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
|
content: str = self.dataset[idx]['content']
|
||||||
wandb.init(project='codebert-training', name=curr_time)
|
tokenized = self.tokenizer(
|
||||||
output_dir = f'/home/s452638/magisterka/output/{curr_time}/'
|
content,
|
||||||
|
truncation=True,
|
||||||
|
padding='max_length',
|
||||||
|
max_length=self.max_length,
|
||||||
|
return_tensors='pt'
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
'input_ids': tokenized['input_ids'].squeeze(0),
|
||||||
|
'attention_mask': tokenized['attention_mask'].squeeze(0),
|
||||||
|
'labels': tokenized['input_ids'].squeeze(0)
|
||||||
|
}
|
||||||
|
|
||||||
# Save this file to Weights & Biases
|
def set_seed(seed: int) -> None:
|
||||||
wandb.save('train_codebert_mlm.py')
|
|
||||||
|
|
||||||
# Create the output directory if it does not exist
|
|
||||||
if not os.path.exists(output_dir):
|
|
||||||
os.makedirs(output_dir)
|
|
||||||
|
|
||||||
# Set the seed for reproducibility
|
|
||||||
SEED = 42
|
|
||||||
def set_seed(seed):
|
|
||||||
random.seed(seed)
|
random.seed(seed)
|
||||||
np.random.seed(seed)
|
np.random.seed(seed)
|
||||||
torch.manual_seed(seed)
|
torch.manual_seed(seed)
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.manual_seed_all(seed)
|
torch.cuda.manual_seed_all(seed)
|
||||||
|
|
||||||
set_seed(SEED)
|
def setup_wandb(config: Dict[str, Any]) -> None:
|
||||||
|
curr_time: str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
|
||||||
|
wandb.init(project='codebert-training', name=curr_time, config=config)
|
||||||
|
wandb.save('train_codebert_mlm.py')
|
||||||
|
|
||||||
# Set the device for PyTorch (use GPU if available, otherwise CPU)
|
def setup_directories(current_dir: Path) -> Path:
|
||||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
curr_time: str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
|
||||||
|
output_dir: Path = current_dir.parent.parent / 'outputs' / curr_time
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
return output_dir
|
||||||
|
|
||||||
|
def load_config(config_file: Path) -> Dict[str, Any]:
|
||||||
|
with open(config_file, 'r') as f:
|
||||||
|
return json.load(f)
|
||||||
|
|
||||||
|
def setup_device() -> torch.device:
|
||||||
|
device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||||
torch.set_default_device(device)
|
torch.set_default_device(device)
|
||||||
print('*' * 10, 'Device', '*' * 10)
|
logger.info(f'Using device: {device}')
|
||||||
print(f'Using device: {device}')
|
|
||||||
if device.type == 'cuda':
|
if device.type == 'cuda':
|
||||||
print(f'Device name: {torch.cuda.get_device_name()}')
|
logger.info(f'Device name: {torch.cuda.get_device_name()}')
|
||||||
|
torch.set_float32_matmul_precision('high')
|
||||||
|
return device
|
||||||
|
|
||||||
# Load the dataset
|
def download_dataset(dataset_dir: Path) -> None:
|
||||||
if dataset_name == 'the-stack-tokenized':
|
if not dataset_dir.exists():
|
||||||
train_data = load_from_disk('/work/s452638/datasets/the-stack-python-tokenized/train')
|
logger.info("Downloading the dataset...")
|
||||||
valid_data = load_from_disk('/work/s452638/datasets/the-stack-python-tokenized/valid')
|
dataset_dir.mkdir(parents=True, exist_ok=True)
|
||||||
test_data = load_from_disk('/work/s452638/datasets/the-stack-python-tokenized/test')
|
files_list: List[str] = list_repo_files(repo_id='bigcode/the-stack-dedup', repo_type='dataset')
|
||||||
else:
|
files_to_download: List[str] = [file for file in files_list if file.startswith('data/python/')]
|
||||||
if dataset_name == 'the-stack':
|
for file_name in files_to_download:
|
||||||
train_data = load_dataset("/work/s452638/datasets/the-stack-python", split="train")
|
hf_hub_download(repo_id='bigcode/the-stack-dedup', repo_type='dataset', filename=file_name, local_dir=dataset_dir)
|
||||||
train_data = train_data.rename_column_('content', 'code')
|
logger.info("Dataset downloaded successfully.")
|
||||||
elif dataset_name == 'code-search-net':
|
|
||||||
train_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/train.jsonl')['train']
|
|
||||||
|
|
||||||
valid_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/valid.jsonl')['valid']
|
def load_and_prepare_dataset(dataset_dir: Path, seed: int) -> DatasetDict:
|
||||||
test_data = load_dataset('json', data_files='/work/s452638/datasets/CodeSearchNet/python/test.jsonl')['test']
|
dataset: DatasetDict = load_dataset(str(dataset_dir), split='train')
|
||||||
|
dataset = dataset.train_test_split(test_size=0.01, seed=seed)
|
||||||
|
logger.info(f'Dataset loaded: {dataset}')
|
||||||
|
return dataset
|
||||||
|
|
||||||
dataset = DatasetDict({'train': train_data, 'valid': valid_data, 'test': test_data})
|
def create_dataloaders(
|
||||||
print('\n\n', '*' * 10, 'Dataset', '*' * 10)
|
dataset: DatasetDict,
|
||||||
print(dataset)
|
tokenizer: PreTrainedTokenizer,
|
||||||
|
config: Dict[str, Any],
|
||||||
|
device: torch.device
|
||||||
|
) -> Tuple[DataLoader, DataLoader]:
|
||||||
|
dataset['train'] = OnTheFlyTokenizationDataset(dataset['train'], tokenizer, max_length=512)
|
||||||
|
dataset['test'] = OnTheFlyTokenizationDataset(dataset['test'], tokenizer, max_length=512)
|
||||||
|
|
||||||
if remove_comments:
|
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=config['mlm_probability'])
|
||||||
# Build the language library if not already built
|
|
||||||
Language.build_library('/home/s452638/magisterka/build/my-languages.so', ['/home/s452638/magisterka/vendor/tree-sitter-python'])
|
|
||||||
|
|
||||||
# Load the language
|
train_dataloader = DataLoader(
|
||||||
PYTHON_LANGUAGE = Language('/home/s452638/magisterka/build/my-languages.so', 'python')
|
dataset['train'],
|
||||||
|
batch_size=config['batch'],
|
||||||
|
shuffle=False,
|
||||||
|
collate_fn=data_collator,
|
||||||
|
generator=torch.Generator(device=device)
|
||||||
|
)
|
||||||
|
valid_dataloader = DataLoader(
|
||||||
|
dataset['test'],
|
||||||
|
batch_size=config['batch'],
|
||||||
|
shuffle=False,
|
||||||
|
collate_fn=data_collator,
|
||||||
|
generator=torch.Generator(device=device)
|
||||||
|
)
|
||||||
|
return train_dataloader, valid_dataloader
|
||||||
|
|
||||||
# Initialize the parser
|
def setup_model_and_optimizer(
|
||||||
parser = Parser()
|
config: Dict[str, Any],
|
||||||
parser.set_language(PYTHON_LANGUAGE)
|
current_dir: Path
|
||||||
|
) -> Tuple[PreTrainedModel, AdamW]:
|
||||||
# Remove docstrings and comments from the code
|
os.environ['HF_HOME'] = str(current_dir.parent / 'models')
|
||||||
dataset = dataset.map(lambda x: {'code': remove_docstrings_and_comments_from_code(x['code'], parser)}, batched=False, desc='Removing docstrings and comments')
|
model_config = RobertaConfig.from_pretrained('roberta-base')
|
||||||
|
model: PreTrainedModel = RobertaForMaskedLM(model_config)
|
||||||
# Load the tokenizer
|
|
||||||
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
|
|
||||||
print('\n\n', '*' * 10, 'Tokenizer', '*' * 10)
|
|
||||||
print(tokenizer)
|
|
||||||
|
|
||||||
if dataset_name != 'the-stack-tokenized':
|
|
||||||
# Tokenize the dataset
|
|
||||||
def tokenize_function(examples):
|
|
||||||
return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
|
|
||||||
|
|
||||||
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['code'], desc='Running tokenizer')
|
|
||||||
print('\n\n', '*' * 10, 'Tokenized dataset', '*' * 10)
|
|
||||||
print(tokenized_datasets)
|
|
||||||
else:
|
|
||||||
tokenized_datasets = dataset
|
|
||||||
|
|
||||||
# Set data collator for MLM
|
|
||||||
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
|
|
||||||
|
|
||||||
# Create DataLoaders
|
|
||||||
batch_size = 64
|
|
||||||
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, shuffle=False, collate_fn=data_collator, generator=torch.Generator(device=device))
|
|
||||||
valid_dataloader = DataLoader(tokenized_datasets['valid'], batch_size=batch_size, shuffle=False, collate_fn=data_collator, generator=torch.Generator(device=device))
|
|
||||||
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch_size, shuffle=False, collate_fn=data_collator, generator=torch.Generator(device=device))
|
|
||||||
|
|
||||||
# Initialize a model with random weights based on the configuration for RoBERTa (CodeBERT is based on RoBERTa)
|
|
||||||
config = RobertaConfig.from_pretrained('roberta-base')
|
|
||||||
model = RobertaForMaskedLM(config)
|
|
||||||
model = torch.compile(model)
|
model = torch.compile(model)
|
||||||
wandb.watch(model)
|
wandb.watch(model)
|
||||||
print('\n\n', '*' * 10, 'Model', '*' * 10)
|
logger.info(f'Model config: {model_config}')
|
||||||
print(config)
|
wandb.config.update({'model_config': model_config.to_dict()})
|
||||||
|
|
||||||
# Log the model configuration to wandb
|
optimizer: AdamW = AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
|
||||||
wandb.config.update({'model_config': config.to_dict()})
|
return model, optimizer
|
||||||
|
|
||||||
# Set the optimizer and scaler
|
def train_and_evaluate(
|
||||||
learning_rate = 5e-4
|
model: PreTrainedModel,
|
||||||
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
|
train_dataloader: DataLoader,
|
||||||
scaler = torch.amp.GradScaler()
|
valid_dataloader: DataLoader,
|
||||||
|
optimizer: AdamW,
|
||||||
|
scheduler: Any,
|
||||||
|
config: Dict[str, Any],
|
||||||
|
output_dir: Path
|
||||||
|
) -> None:
|
||||||
|
num_training_steps: int = config['epochs'] * len(train_dataloader)
|
||||||
|
best_valid_loss: float = float('inf')
|
||||||
|
|
||||||
# Training settings
|
|
||||||
num_epochs = 1
|
|
||||||
num_training_steps = num_epochs * len(train_dataloader)
|
|
||||||
eval_every = 10_000
|
|
||||||
|
|
||||||
# Log training settings to wandb
|
|
||||||
wandb.config.update({
|
|
||||||
'training_settings': {
|
|
||||||
'num_epochs': num_epochs,
|
|
||||||
'num_training_steps': num_training_steps,
|
|
||||||
'eval_every': eval_every,
|
|
||||||
'batch_size': batch_size,
|
|
||||||
'learning_rate': learning_rate,
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
# Initialize variables to track validation loss, accuracy, and best model path
|
|
||||||
valid_acc = 0.0
|
|
||||||
valid_loss = 0.0
|
|
||||||
best_valid_loss = float('inf')
|
|
||||||
|
|
||||||
# Train the model
|
|
||||||
print('\n\n', '*' * 10, 'Training', '*' * 10)
|
|
||||||
model.train()
|
|
||||||
with tqdm(total=num_training_steps, desc='Training') as pbar:
|
with tqdm(total=num_training_steps, desc='Training') as pbar:
|
||||||
for epoch_idx in range(num_epochs):
|
for epoch_idx in range(config['epochs']):
|
||||||
|
model.train()
|
||||||
for train_idx, train_batch in enumerate(train_dataloader):
|
for train_idx, train_batch in enumerate(train_dataloader):
|
||||||
|
|
||||||
# Forward pass with mixed precision
|
|
||||||
with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
|
|
||||||
outputs = model(**train_batch)
|
outputs = model(**train_batch)
|
||||||
|
train_loss: Tensor = outputs.loss
|
||||||
|
train_loss.backward()
|
||||||
|
|
||||||
train_loss = outputs.loss
|
norm: Tensor = torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])
|
||||||
scaler.scale(train_loss).backward()
|
|
||||||
|
|
||||||
# Gradient clipping to prevent exploding gradients
|
optimizer.step()
|
||||||
norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
scheduler.step()
|
||||||
|
|
||||||
scaler.step(optimizer)
|
|
||||||
scaler.update()
|
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
|
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
pbar.set_postfix({'norm': norm.item(), 'train_loss': train_loss.item(), 'valid_loss': valid_loss, 'valid_acc': valid_acc})
|
pbar.set_postfix({'train_loss': train_loss.item()})
|
||||||
|
|
||||||
# Log metrics to Weights & Biases
|
|
||||||
wandb.log({
|
wandb.log({
|
||||||
'step': train_idx + len(train_dataloader) * epoch_idx,
|
'step': train_idx + len(train_dataloader) * epoch_idx,
|
||||||
'train_loss': train_loss.item(),
|
'train_loss': train_loss.item(),
|
||||||
'gradient_norm': norm.item(),
|
'gradient_norm': norm.item(),
|
||||||
'learning_rate': optimizer.param_groups[0]['lr'],
|
'learning_rate': scheduler.get_last_lr()[0],
|
||||||
})
|
})
|
||||||
|
|
||||||
# Evaluate the model
|
if train_idx != 0 and train_idx % config['eval_every'] == 0:
|
||||||
if train_idx != 0 and train_idx % eval_every == 0:
|
valid_loss, valid_acc = evaluate(model, valid_dataloader)
|
||||||
model.eval()
|
pbar.set_postfix({'train_loss': train_loss.item(), 'valid_loss': valid_loss, 'valid_acc': valid_acc})
|
||||||
valid_loss = 0.0
|
|
||||||
valid_acc = 0.0
|
|
||||||
|
|
||||||
with tqdm(total=len(valid_dataloader), desc='Validation') as pbar_valid:
|
|
||||||
with torch.no_grad():
|
|
||||||
for valid_idx, valid_batch in enumerate(valid_dataloader):
|
|
||||||
|
|
||||||
# Forward pass with mixed precision for validation
|
|
||||||
with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
|
|
||||||
outputs = model(**valid_batch)
|
|
||||||
|
|
||||||
# Accumulate validation loss and accuracy
|
|
||||||
valid_loss += outputs.loss.item()
|
|
||||||
valid_acc += outputs.logits.argmax(dim=-1).eq(valid_batch['labels']).sum().item()
|
|
||||||
pbar_valid.update(1)
|
|
||||||
|
|
||||||
# Compute average validation loss and accuracy
|
|
||||||
valid_loss /= len(valid_dataloader)
|
|
||||||
valid_acc /= len(valid_dataloader.dataset)
|
|
||||||
model.train()
|
|
||||||
|
|
||||||
# Log validation metrics to Weights & Biases
|
|
||||||
wandb.log({
|
wandb.log({
|
||||||
'valid_loss': valid_loss,
|
'valid_loss': valid_loss,
|
||||||
'valid_acc': valid_acc,
|
'valid_acc': valid_acc,
|
||||||
'step': train_idx + len(train_dataloader) * epoch_idx,
|
'step': train_idx + len(train_dataloader) * epoch_idx,
|
||||||
})
|
})
|
||||||
|
|
||||||
# Update best model if current validation loss is lower
|
|
||||||
if valid_loss < best_valid_loss:
|
if valid_loss < best_valid_loss:
|
||||||
best_valid_loss = valid_loss
|
best_valid_loss = valid_loss
|
||||||
torch.save(model.state_dict(), output_dir + f'best_model.pt')
|
torch.save(model.state_dict(), output_dir / 'best_model.pt')
|
||||||
|
|
||||||
print('\n\n', '*' * 10, 'Training results', '*' * 10)
|
logger.info(f'Best validation loss: {best_valid_loss}')
|
||||||
print(f'Best validation loss: {best_valid_loss}')
|
|
||||||
|
|
||||||
# Load the best model and evaluate on the test set
|
def evaluate(model: PreTrainedModel, dataloader: DataLoader) -> Tuple[float, float]:
|
||||||
print('\n\n', '*' * 10, 'Testing', '*' * 10)
|
|
||||||
model.load_state_dict(torch.load(output_dir + f'best_model.pt', weights_only=True, map_location=device))
|
|
||||||
model.eval()
|
model.eval()
|
||||||
test_loss = 0.0
|
total_loss: float = 0.0
|
||||||
test_acc = 0.0
|
total_acc: float = 0.0
|
||||||
|
|
||||||
with tqdm(total=len(test_dataloader), desc='Testing') as pbar_test:
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
for test_idx, test_batch in enumerate(test_dataloader):
|
for batch in tqdm(dataloader, desc='Validation'):
|
||||||
|
outputs = model(**batch)
|
||||||
|
total_loss += outputs.loss.item()
|
||||||
|
total_acc += outputs.logits.argmax(dim=-1).eq(batch['labels']).sum().item()
|
||||||
|
|
||||||
# Forward pass with mixed precision for testing
|
avg_loss: float = total_loss / len(dataloader)
|
||||||
with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
|
avg_acc: float = total_acc / len(dataloader.dataset)
|
||||||
outputs = model(**test_batch)
|
return avg_loss, avg_acc
|
||||||
|
|
||||||
# Accumulate test loss and accuracy
|
def main() -> None:
|
||||||
test_loss += outputs.loss.item()
|
disable_caching()
|
||||||
test_acc += outputs.logits.argmax(dim=-1).eq(test_batch['labels']).sum().item()
|
|
||||||
pbar_test.update(1)
|
|
||||||
|
|
||||||
# Compute average test loss and accuracy
|
current_dir: Path = Path(__file__).parent
|
||||||
test_loss /= len(test_dataloader)
|
output_dir: Path = setup_directories(current_dir)
|
||||||
test_acc /= len(test_dataloader.dataset)
|
config: Dict[str, Any] = load_config(current_dir / 'config.json')
|
||||||
|
|
||||||
# Log test metrics to Weights & Biases
|
setup_wandb(config)
|
||||||
wandb.log({
|
set_seed(config['seed'])
|
||||||
'test_loss': test_loss,
|
device: torch.device = setup_device()
|
||||||
'test_acc': test_acc,
|
|
||||||
})
|
|
||||||
|
|
||||||
print('\n\n', '*' * 10, 'Test results', '*' * 10)
|
dataset_dir: Path = current_dir.parent / 'data' / 'the-stack-python'
|
||||||
print(f'Test loss: {test_loss}')
|
download_dataset(dataset_dir)
|
||||||
print(f'Test accuracy: {test_acc}')
|
dataset: DatasetDict = load_and_prepare_dataset(dataset_dir, config['seed'])
|
||||||
|
|
||||||
|
tokenizer: PreTrainedTokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', clean_up_tokenization_spaces=True)
|
||||||
|
logger.info(f'Tokenizer loaded: {tokenizer}')
|
||||||
|
|
||||||
|
train_dataloader, valid_dataloader = create_dataloaders(dataset, tokenizer, config, device)
|
||||||
|
|
||||||
|
model, optimizer = setup_model_and_optimizer(config, current_dir)
|
||||||
|
|
||||||
|
num_training_steps: int = config['epochs'] * len(train_dataloader)
|
||||||
|
scheduler = get_linear_schedule_with_warmup(
|
||||||
|
optimizer,
|
||||||
|
num_warmup_steps=config['warmup_steps'],
|
||||||
|
num_training_steps=num_training_steps
|
||||||
|
)
|
||||||
|
|
||||||
|
train_and_evaluate(model, train_dataloader, valid_dataloader, optimizer, scheduler, config, output_dir)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
Loading…
Reference in New Issue
Block a user