import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, AutoModelForSequenceClassification
from sklearn.utils.class_weight import compute_class_weight
from model import BERT_Arch

train_input_path = "train/in.tsv"
train_target_path = "train/expected.tsv"

train_input = pd.read_csv(train_input_path, sep="\t")
train_input.columns = ["text", "d"]
train_target = pd.read_csv(train_target_path, sep="\t")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda")

# inspect the distribution of sequence lengths (uncomment to plot)
# seq_len = [len(i.split()) for i in train_input["text"]]
# pd.Series(seq_len).hist(bins=30)
# plt.show()

bert = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

# tokenize and encode the training texts (the tokenizer expects a list of strings)
tokens_train = tokenizer.batch_encode_plus(
    train_input["text"].tolist(),
    max_length=25,
    padding='max_length',
    truncation=True
)

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_target.to_numpy())

# define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataloader for the train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# freeze the pre-trained BERT parameters; only the BERT_Arch head is trained
for param in bert.parameters():
    param.requires_grad = False

model = BERT_Arch(bert)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

# compute class weights to compensate for label imbalance
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_target.to_numpy()),
                                     y=train_target['1'])
weights = torch.tensor(class_weights, dtype=torch.float)
weights = weights.to(device)

# define the loss function (the model is expected to output log-probabilities)
cross_entropy = nn.NLLLoss(weight=weights)

# number of training epochs
epochs = 10


def train():
    model.train()
    total_loss = 0
    # empty list to save model predictions
    total_preds = []
    # iterate over batches
    for step, batch in enumerate(train_dataloader):
        # progress update after every 50 batches
        if step % 50 == 0 and step != 0:
            print('  Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))
        # push the batch to the GPU
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch
        # clear previously calculated gradients
        model.zero_grad()
        # get model predictions for the current batch
        preds = model(sent_id, mask)
        # flatten the (batch_size, 1) label column into a 1-D tensor for NLLLoss
        labels = labels[:, 0]
        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)
        # add on to the total loss
        total_loss = total_loss + loss.item()
        # backward pass to calculate the gradients
        loss.backward()
        # clip gradients to 1.0 to help prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update parameters
        optimizer.step()
        # model predictions are stored on the GPU, so push them to the CPU
        preds = preds.detach().cpu().numpy()
        # append the model predictions
        total_preds.append(preds)
    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
    # predictions are collected per batch; concatenate them into a single
    # (number of samples, number of classes) array
    total_preds = np.concatenate(total_preds, axis=0)
    # return the loss and predictions
    return avg_loss, total_preds


def evaluate():
    print("\nEvaluating...")
    # deactivate dropout layers
    model.eval()
    total_loss = 0
    # empty list to save the model predictions
    total_preds = []
    # note: no separate validation split is loaded, so evaluation
    # runs over the training DataLoader
    for step, batch in enumerate(train_dataloader):
        # progress update every 50 batches
        if step % 50 == 0 and step != 0:
            print('  Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))
        # push the batch to the GPU
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch
        # deactivate autograd
        with torch.no_grad():
            # model predictions
            preds = model(sent_id, mask)
            # flatten the (batch_size, 1) label column into a 1-D tensor for NLLLoss
            labels = labels[:, 0]
            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)
    # compute the validation loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
    # reshape the predictions into (number of samples, number of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    return avg_loss, total_preds


# set initial best loss to infinity
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses = []
valid_losses = []

print("Started training!")

# for each epoch
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    # train model
    train_loss, _ = train()
    # evaluate model
    valid_loss, _ = evaluate()
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

print("Finished !!!")
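
# ---------------------------------------------------------------------------
# Inference sketch (an addition, not part of the original training loop):
# shows how the best checkpoint saved as 'saved_weights.pt' could be reloaded
# to label a held-out split.  The path "dev-0/in.tsv" and the two-column
# layout are assumptions mirroring the training data; the model head is
# assumed to return log-probabilities, consistent with nn.NLLLoss used above.
# ---------------------------------------------------------------------------
import os

dev_input_path = "dev-0/in.tsv"  # hypothetical path; adjust to your data layout
if os.path.exists(dev_input_path):
    dev_input = pd.read_csv(dev_input_path, sep="\t")
    dev_input.columns = ["text", "d"]  # assumed to mirror the training columns
    tokens_dev = tokenizer.batch_encode_plus(
        dev_input["text"].tolist(),
        max_length=25,
        padding='max_length',
        truncation=True
    )
    dev_seq = torch.tensor(tokens_dev['input_ids']).to(device)
    dev_mask = torch.tensor(tokens_dev['attention_mask']).to(device)

    # reload the best checkpoint saved during training
    model.load_state_dict(torch.load('saved_weights.pt'))
    model.eval()
    with torch.no_grad():
        log_probs = model(dev_seq, dev_mask)  # assumed log-softmax output
        dev_preds = torch.argmax(log_probs, dim=1).cpu().numpy()
    print(dev_preds[:10])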