import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, AdamW, AutoModelForSequenceClassification

from model import BERT_Arch

train_input_path = "train/in.tsv"
train_target_path = "train/expected.tsv"

# load the training data; the TSV files are assumed to carry no header row,
# so read them with header=None and name the columns explicitly
train_input = pd.read_csv(train_input_path, sep="\t", header=None, names=["text", "d"])
train_target = pd.read_csv(train_target_path, sep="\t", header=None, names=["label"])

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# inspect the distribution of sequence lengths in the training texts
# seq_len = [len(i.split()) for i in train_input["text"]]
# pd.Series(seq_len).hist(bins=30)
# plt.show()
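
# the hard-coded max_length=25 used below is presumably derived from this
# length distribution (an inference, not stated in the original script)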

# load the pre-trained BERT model
bert = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

# tokenize and encode the training texts; batch_encode_plus expects a plain
# list of strings, so convert the pandas column with .tolist()
tokens_train = tokenizer.batch_encode_plus(
    train_input["text"].tolist(),
    max_length=25,
    padding='max_length',
    truncation=True
)
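
# tokens_train is a dict: tokens_train['input_ids'] holds the padded token-id
# sequences and tokens_train['attention_mask'] marks real tokens (1) vs padding (0)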

# convert the encoded sequences, attention masks and integer labels to tensors
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_target["label"].to_numpy())

# define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# DataLoader for the train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
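
# each batch yielded by train_dataloader is a [input_ids, attention_mask, labels]
# triple with shapes (batch_size, 25), (batch_size, 25) and (batch_size,)
# (the final batch may be smaller)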

# freeze all parameters of the pre-trained model; only the layers
# that BERT_Arch adds on top of it are trained
for param in bert.parameters():
    param.requires_grad = False

model = BERT_Arch(bert)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

# compute balanced class weights to counter label imbalance
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_target["label"]),
                                     y=train_target["label"])
weights = torch.tensor(class_weights, dtype=torch.float)
weights = weights.to(device)

# define the loss function
cross_entropy = nn.NLLLoss(weight=weights)
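
# note: BERT_Arch (imported from model.py, which is not shown here) is expected
# to take (sent_id, mask), add trainable layers on top of the frozen model, and
# return log-probabilities, since nn.NLLLoss expects log-softmax outputs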

# number of training epochs
epochs = 10


def train():
    # put the model in training mode (enables dropout, etc.)
    model.train()

    total_loss, total_accuracy = 0, 0

    # empty list to save model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches
        if step % 50 == 0 and not step == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to the device
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch

        # clear previously calculated gradients
        model.zero_grad()

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradients to 1.0; this helps prevent the exploding-gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on the GPU, so push them to the CPU
        preds = preds.detach().cpu().numpy()

        # append the model predictions
        total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # predictions are of shape (no. of batches, batch size, no. of classes);
    # reshape them to (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    # return the loss and the predictions
    return avg_loss, total_preds


def evaluate():
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches; NOTE: no separate validation split is defined,
    # so evaluation runs over the training DataLoader as well
    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches
        if step % 50 == 0 and not step == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to the device
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # reshape the predictions to (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds


# set the initial loss to infinity
best_valid_loss = float('inf')

# empty lists to store the training and validation loss of each epoch
train_losses = []
valid_losses = []

print("Started training!")

# for each epoch
for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # train the model
    train_loss, _ = train()

    # evaluate the model
    valid_loss, _ = evaluate()

    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # append the training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

print("Finished !!!")
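
# closing sketch (not part of the original script): plot the stored per-epoch
# losses with the matplotlib import above and reload the best checkpoint
plt.plot(train_losses, label='train loss')
plt.plot(valid_losses, label='validation loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

# reload the weights of the best-performing epoch for later inference
model.load_state_dict(torch.load('saved_weights.pt'))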