From 023a4e43619f67cc9c0d1b4436f80bf3cea7075c Mon Sep 17 00:00:00 2001
From: wangobango
Date: Sun, 20 Jun 2021 22:05:07 +0200
Subject: [PATCH] progress

---
 generate.py |  56 ++++++++++
 main.py     | 314 +++++++++++++++++++++++++++++++----------------------
 model.py    |  44 ++++++++
 3 files changed, 287 insertions(+), 127 deletions(-)
 create mode 100644 generate.py
 create mode 100644 model.py

diff --git a/generate.py b/generate.py
new file mode 100644
index 0000000..405912d
--- /dev/null
+++ b/generate.py
@@ -0,0 +1,56 @@
+import pandas as pd
+from transformers import BertTokenizer, AdamW, AutoModelForSequenceClassification
+import torch
+import matplotlib.pyplot as plt
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler
+import torch.nn as nn
+from sklearn.utils.class_weight import compute_class_weight
+import numpy as np
+from model import BERT_Arch
+from sklearn.metrics import classification_report
+from sklearn.metrics import accuracy_score
+
+path = 'saved_weights.pt'
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+device = torch.device("cuda")
+
+bert = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
+
+model = BERT_Arch(bert)
+model.load_state_dict(torch.load(path))
+model.to(device)
+
+test_data = pd.read_csv("dev-0/in.tsv", sep="\t")
+test_data.columns = ["text", "d"]
+
+test_target = pd.read_csv("dev-0/expected.tsv", sep="\t")
+
+tokens_test = tokenizer.batch_encode_plus(
+    test_data["text"].tolist(),
+    max_length = 25,
+    padding='max_length',
+    truncation=True
+)
+
+test_seq = torch.tensor(tokens_test['input_ids'])
+test_mask = torch.tensor(tokens_test['attention_mask'])
+
+#define a batch size
+batch_size = 32
+
+# wrap tensors
+test_data = TensorDataset(test_seq, test_mask)
+
+# sampler for the test data
+test_sampler = RandomSampler(test_data)
+
+# dataloader for the test set
+test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
+
+with torch.no_grad():
+    preds = model(test_seq.to(device), test_mask.to(device))
+    preds = preds.detach().cpu().numpy()
+    preds = np.argmax(preds, axis = 1)
+
+print(classification_report(test_target['0'], preds))
+print(accuracy_score(test_target['0'], preds))
\ No newline at end of file

diff --git a/main.py b/main.py
index c49bcd9..655375b 100644
--- a/main.py
+++ b/main.py
@@ -1,155 +1,215 @@
 import pandas as pd
-from transformers import BertTokenizer, BertForSequenceClassification
+from transformers import BertTokenizer, AdamW, AutoModelForSequenceClassification
 import torch
-# from torchtext.data import BucketIterator, Iterator
+import matplotlib.pyplot as plt
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler
+import torch.nn as nn
+from sklearn.utils.class_weight import compute_class_weight
+import numpy as np
+from model import BERT_Arch
-train_input_path = "dev-0/in.tsv"
-train_target_path = "dev-0/expected.tsv"
+train_input_path = "train/in.tsv"
+train_target_path = "train/expected.tsv"
-train_input = pd.read_csv(train_input_path, sep="\t")[:100]
+train_input = pd.read_csv(train_input_path, sep="\t")
 train_input.columns=["text", "d"]
-train_target = pd.read_csv(train_target_path, sep="\t")[:100]
+train_target = pd.read_csv(train_target_path, sep="\t")
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 device = torch.device("cuda")
-MAX_SEQ_LEN = 128
-PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
-UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
-# label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
-# text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=True, include_lengths=False, batch_first=True,
-#                    fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
+# seq_len = [len(i.split()) for i in train_input["text"]]
-# fields = [('label', label_field), ('text', text_field),]
+# pd.Series(seq_len).hist(bins = 30)
+# plt.show()
-# valid_iter = BucketIterator(train_input["text"], batch_size=16, sort_key=lambda x: len(x.text),
-#                             device=device, train=True, sort=True, sort_within_batch=True)
+bert = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
-class BERT(torch.nn.Module):
-    def __init__(self):
-        super(BERT, self).__init__()
+tokens_train = tokenizer.batch_encode_plus(
+    train_input["text"].tolist(),
+    max_length = 25,
+    padding='max_length',
+    truncation=True
+)
-        options_name = "bert-base-uncased"
-        self.encoder = BertForSequenceClassification.from_pretrained(options_name)
+train_seq = torch.tensor(tokens_train['input_ids'])
+train_mask = torch.tensor(tokens_train['attention_mask'])
+train_y = torch.tensor(train_target.to_numpy())
-    def forward(self, text, label):
-        loss, text_fea = self.encoder(text, labels=label)[:2]
+#define a batch size
+batch_size = 32
-        return loss, text_fea
+# wrap tensors
+train_data = TensorDataset(train_seq, train_mask, train_y)
-def save_checkpoint(save_path, model, valid_loss):
+# sampler for sampling the data during training
+train_sampler = RandomSampler(train_data)
-    if save_path == None:
-        return
+# dataLoader for train set
+train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
+
+for param in bert.parameters():
+    param.requires_grad = False
+
+model = BERT_Arch(bert)
+model = model.to(device)
+# model.cuda(0)
+
+optimizer = AdamW(model.parameters(), lr = 1e-5)
+
+class_weights = compute_class_weight('balanced', classes=np.unique(train_target.to_numpy()), y=train_target['1'])
+weights= torch.tensor(class_weights,dtype=torch.float)
+weights = weights.to(device)
+
+# define the loss function
+cross_entropy = nn.NLLLoss(weight=weights)
+
+# number of training epochs
+epochs = 10
+
+def train():
+
+    model.train()
+
+    total_loss, total_accuracy = 0, 0
+
+    # empty list to save model predictions
+    total_preds=[]
+
+    # iterate over batches
+    for step,batch in enumerate(train_dataloader):
-    state_dict = {'model_state_dict': model.state_dict(),
-                  'valid_loss': valid_loss}
+        # progress update after every 50 batches.
+        if step % 50 == 0 and not step == 0:
+            print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))
+
+        # push the batch to gpu
+        batch = [r.to(device) for r in batch]
+
+        sent_id, mask, labels = batch
+
+        # clear previously calculated gradients
+        model.zero_grad()
+
+        # get model predictions for the current batch
+        preds = model(sent_id, mask)
+
+        # compute the loss between actual and predicted values
+        labels = torch.tensor([x[0] for x in labels]).to(device)
+        loss = cross_entropy(preds, labels)
+
+        # add on to the total loss
+        total_loss = total_loss + loss.item()
+
+        # backward pass to calculate the gradients
+        loss.backward()
+
+        # clip the gradients to 1.0. It helps in preventing the exploding gradient problem
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+
+        # update parameters
+        optimizer.step()
+
+        # model predictions are stored on GPU. So, push it to CPU
+        preds=preds.detach().cpu().numpy()
+
+        # append the model predictions
+        total_preds.append(preds)
+
+    # compute the training loss of the epoch
+    avg_loss = total_loss / len(train_dataloader)
+
+    # predictions are in the form of (no. of batches, size of batch, no. of classes).
+    # reshape the predictions in form of (number of samples, no. of classes)
+    total_preds = np.concatenate(total_preds, axis=0)
+
+    #returns the loss and predictions
+    return avg_loss, total_preds
+
+def evaluate():
+
+    print("\nEvaluating...")
+
+    # deactivate dropout layers
+    model.eval()
+
+    total_loss, total_accuracy = 0, 0
+
+    # empty list to save the model predictions
+    total_preds = []
+
+    # iterate over batches
+    for step,batch in enumerate(train_dataloader):
-    torch.save(state_dict, save_path)
-    print(f'Model saved to ==> {save_path}')
+        # Progress update every 50 batches.
+        if step % 50 == 0 and not step == 0:
+
+            # Calculate elapsed time in minutes.
+
+            # Report progress.
+            print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))
-def load_checkpoint(load_path, model):
+        # push the batch to gpu
+        batch = [t.to(device) for t in batch]
+
+        sent_id, mask, labels = batch
+
+        # deactivate autograd
+        with torch.no_grad():
+
+            # model predictions
+            preds = model(sent_id, mask)
+
+            # compute the validation loss between actual and predicted values
+            labels = torch.tensor([x[0] for x in labels]).to(device)
+            loss = cross_entropy(preds,labels)
+
+            total_loss = total_loss + loss.item()
+
+            preds = preds.detach().cpu().numpy()
+
+            total_preds.append(preds)
+
+    # compute the validation loss of the epoch
+    avg_loss = total_loss / len(train_dataloader)
+
+    # reshape the predictions in form of (number of samples, no. of classes)
+    total_preds = np.concatenate(total_preds, axis=0)
+
+    return avg_loss, total_preds
+
+# avg_loss, total_preds = train()
+# set initial loss to infinite
+best_valid_loss = float('inf')
+
+# empty lists to store training and validation loss of each epoch
+train_losses=[]
+valid_losses=[]
+
+print("Started training!")
+#for each epoch
+for epoch in range(epochs):
+
+    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
-    if load_path==None:
-        return
+    #train model
+    train_loss, _ = train()
-    state_dict = torch.load(load_path, map_location=device)
-    print(f'Model loaded from <== {load_path}')
+    #evaluate model
+    valid_loss, _ = evaluate()
-    model.load_state_dict(state_dict['model_state_dict'])
-    return state_dict['valid_loss']
-
-
-def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
-
-    if save_path == None:
-        return
+    #save the best model
+    if valid_loss < best_valid_loss:
+        best_valid_loss = valid_loss
+        torch.save(model.state_dict(), 'saved_weights.pt')
-    state_dict = {'train_loss_list': train_loss_list,
-                  'valid_loss_list': valid_loss_list,
-                  'global_steps_list': global_steps_list}
+    # append training and validation loss
+    train_losses.append(train_loss)
+    valid_losses.append(valid_loss)
-    torch.save(state_dict, save_path)
-    print(f'Model saved to ==> {save_path}')
+    print(f'\nTraining Loss: {train_loss:.3f}')
+    print(f'Validation Loss: {valid_loss:.3f}')
-
-def load_metrics(load_path):
-
-    if load_path==None:
-        return
-
-    state_dict = torch.load(load_path, map_location=device)
-    print(f'Model loaded from <== {load_path}')
-
-    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']
-
-def train(model,
-          optimizer,
-          criterion = torch.nn.BCELoss(),
-          train_data = train_input['text'],
-          train_target = train_target,
-          num_epochs = 5,
-          eval_every = len(train_input) // 2,
-          file_path = "./",
-          best_valid_loss = float("Inf")):
-
-    # initialize running values
-    running_loss = 0.0
-    valid_running_loss = 0.0
-    global_step = 0
-    train_loss_list = []
-    valid_loss_list = []
-    global_steps_list = []
-
-    # training loop
-    model.train()
-    for epoch in range(num_epochs):
-        for text, label in zip(train_data, train_target):
-            output = model(text, label)
-            loss, _ = output
-
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-
-            # update running values
-            running_loss += loss.item()
-            global_step += 1
-
-            # evaluation step
-            if global_step % eval_every == 0:
-                model.eval()
-
-                # evaluation
-                average_train_loss = running_loss / eval_every
-                average_valid_loss = valid_running_loss / len(train_data)
-                train_loss_list.append(average_train_loss)
-                valid_loss_list.append(average_valid_loss)
-                global_steps_list.append(global_step)
-
-                # resetting running values
-                running_loss = 0.0
-                valid_running_loss = 0.0
-                model.train()
-
-                # print progress
-                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
-                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_data),
-                              average_train_loss, average_valid_loss))
-
-                # checkpoint
-                if best_valid_loss > average_valid_loss:
-                    best_valid_loss = average_valid_loss
-                    save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
-                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
-
-    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
-    print('Finished Training!')
-
-model = BERT().to(device)
-model.cuda()
-optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
-
-train(model=model, optimizer=optimizer)
\ No newline at end of file
+print("Finished !!!")
\ No newline at end of file

diff --git a/model.py b/model.py
new file mode 100644
index 0000000..9d218ce
--- /dev/null
+++ b/model.py
@@ -0,0 +1,44 @@
+import torch.nn as nn
+
+class BERT_Arch(nn.Module):
+
+    def __init__(self, bert):
+
+        super(BERT_Arch, self).__init__()
+
+        self.bert = bert
+
+        # dropout layer
+        self.dropout = nn.Dropout(0.1)
+
+        # relu activation function
+        self.relu = nn.ReLU()
+
+        # dense layer 1
+        self.fc1 = nn.Linear(2,512)
+
+        # dense layer 2 (Output layer)
+        self.fc2 = nn.Linear(512,2)
+
+        #softmax activation function
+        self.softmax = nn.LogSoftmax(dim=1)
+
+    #define the forward pass
+    def forward(self, sent_id, mask):
+
+        #pass the inputs to the model
+        sentence_classifier_output = self.bert(sent_id, attention_mask=mask)
+        x = sentence_classifier_output.logits.float()
+        x = self.fc1(x)
+
+        x = self.relu(x)
+
+        x = self.dropout(x)
+
+        # output layer
+        x = self.fc2(x)
+
+        # apply softmax activation
+        x = self.softmax(x)
+
+        return x
\ No newline at end of file
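
Appended for reference, not part of the commit above: a minimal sketch of batched inference with the test DataLoader that generate.py builds but never actually uses (generate.py pushes the whole dev-0 set through the model in one forward pass). It assumes the same model (BERT_Arch with saved_weights.pt loaded), test_seq, test_mask and device that generate.py already defines; the helper name predict_in_batches and the switch from RandomSampler to SequentialSampler, which keeps predictions aligned with dev-0/expected.tsv, are illustrative choices rather than anything the patch itself contains.

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def predict_in_batches(model, test_seq, test_mask, device, batch_size=32):
    # wrap the already-tokenised test tensors, exactly as generate.py does
    dataset = TensorDataset(test_seq, test_mask)
    # sequential order so prediction i corresponds to row i of expected.tsv
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    model.eval()  # disable dropout during inference
    all_log_probs = []
    with torch.no_grad():
        for sent_id, mask in loader:
            # BERT_Arch returns log-softmax scores over the two classes
            out = model(sent_id.to(device), mask.to(device))
            all_log_probs.append(out.detach().cpu().numpy())

    # (num_samples, num_classes) -> predicted class index per sample
    return np.argmax(np.concatenate(all_log_probs, axis=0), axis=1)

# Used in place of the single full-batch forward pass in generate.py, this would be:
# preds = predict_in_batches(model, test_seq, test_mask, device)
# print(classification_report(test_target['0'], preds))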