FLAN-T5

Colab link (better-looking output):

https://colab.research.google.com/drive/1bVujvgH49tyY83eqZoWYcDBZ2JurmLTL?usp=sharing

links

model: https://huggingface.co/google/flan-t5-base

!pip install transformers datasets torch sentencepiece

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score

def load_and_process_dataset(seed=None):
    """Load SST-2 and rearrange its splits.

    The original ``test`` split is discarded and the original ``validation``
    split takes its place as ``test``. A new ``validation`` split of 1600
    rows is then carved out of ``train``.

    Args:
        seed: Optional seed forwarded to ``train_test_split`` so the
            train/validation partition can be reproduced. ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        The dataset dictionary with ``train``, ``test`` and ``validation``
        splits and without the ``idx`` column.
    """
    dataset = load_dataset("sst2")
    # remove_columns returns a new object instead of modifying in place, so
    # the result has to be kept; otherwise 'idx' silently stays in every split.
    dataset = dataset.remove_columns('idx')
    del dataset['test']
    dataset['test'] = dataset['validation']
    del dataset['validation']
    split_dataset = dataset['train'].train_test_split(test_size=1600, seed=seed)
    dataset['train'] = split_dataset['train']
    dataset['validation'] = split_dataset['test']
    return dataset

# Build the rearranged SST-2 splits once; later cells read the 'test' split from it.
dataset = load_and_process_dataset()
# Bare expression: a notebook cell displays the value of its last expression.
dataset

WARNING:datasets.builder:Found cached dataset sst2 (/root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)

  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 65749
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1600
    })
})

# Tokenizer for the google/flan-t5-base checkpoint; create_predictions uses it
# to encode prompts and decode generated ids.
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)"spiece.model";:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

def transform_dataset(dataset):
    """Convert the ``test`` split into a list of plain dicts with text labels.

    Newline characters are stripped from each sentence. A label of ``0``
    becomes ``"negative"``; any other label becomes ``"positive"``.

    Args:
        dataset: Mapping whose ``'test'`` entry iterates over rows that have
            ``'sentence'`` and ``'label'`` keys.

    Returns:
        A list of ``{'sentence': str, 'label': str}`` dicts, one per row.
    """
    return [
        {
            'sentence': example['sentence'].replace("\n", ""),
            'label': "negative" if example['label'] == 0 else "positive",
        }
        for example in dataset['test']
    ]

def print_with_index(i):
    """Print the label stored at position ``i`` of the global ``new_dataset``, followed by ``i``."""
    entry = new_dataset[i]
    print(entry['label'], i)

def create_predictions(test_data):
    """Generate a label for every row of ``test_data`` using the few-shot prompt.

    Relies on the notebook-level ``few_shot_example``, ``tokenizer``,
    ``model`` and ``device`` objects.

    Args:
        test_data: Iterable of rows with ``'sentence'`` and ``'label'`` keys.

    Returns:
        A ``(predictions, expected)`` tuple of equally long lists: the decoded
        model outputs and the corresponding ``'label'`` values.
    """
    predictions = []
    expected = []

    # Iterate over the argument rather than the global dataset_copy, so the
    # function evaluates whatever the caller actually passes in.
    for row in tqdm(test_data):
        input_text = few_shot_example + " " + row['sentence'] + "\nlabel: "
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

        # NOTE(review): do_sample=True makes the output stochastic, so the
        # measured accuracy can vary between runs.
        generated_ids = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=200)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

        predictions.append(generated_text)
        expected.append(row['label'])
    return predictions, expected

new_dataset = transform_dataset(dataset)

# Spot-check the labels at a handful of positions to pick few-shot examples from.
for position in (0, 46, 123, 300, 400, 500, 702, 802, 553, 655, 455, 258, 158, 752, 853, 855):
    print_with_index(position)

positive 0
negative 46
positive 123
positive 300
positive 400
negative 500
negative 702
negative 802
positive 553
positive 655
positive 455
negative 258
negative 158
negative 752
positive 853
positive 855

# Positions in new_dataset whose rows serve as the few-shot demonstrations.
indexes = [0,553,655,455,258]

def create_dataset(indexes):
  few_shot_learning_examples = ''
  for i in indexes:
    few_shot_learning_examples = few_shot_learning_examples + "sentence: " + dataset_copy[i]['sentence'] + "\n" +'label: ' + dataset_copy[i]['label'] + '\n'
  for i in indexes:
    dataset_copy.pop(i)
  few_shot_learning_examples = few_shot_learning_examples + "sentence:"
  return few_shot_learning_examples

# Shallow copy: create_dataset pops rows from dataset_copy, and new_dataset
# must keep its original length.
dataset_copy = new_dataset.copy()
# Builds the prompt prefix and, as a side effect, removes rows from dataset_copy.
few_shot_example = create_dataset(indexes)
# Sizes of the reduced pool and of the untouched list, for comparison.
print(len(dataset_copy),len(new_dataset))
print(few_shot_example)

867 872
sentence: it 's a charming and often affecting journey . 
label: positive
sentence: so unassuming and pure of heart , you ca n't help but warmly extend your arms and yell ` safe ! ' 
label: positive
sentence: birthday girl is an amusing joy ride , with some surprisingly violent moments . 
label: positive
sentence: ( chaiken 's ) talent lies in an evocative , accurate observation of a distinctive milieu and in the lively , convincing dialogue she creates for her characters . 
label: positive
sentence: `` the time machine '' is a movie that has no interest in itself . 
label: negative
sentence:

# Choose the compute device: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4

# Load the pretrained FLAN-T5 base checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base')
# Move the model to the device selected earlier. An unconditional .cuda()
# raises on machines without a GPU, even though a CPU fallback was chosen.
model.to(device)

# Run the few-shot prompt over the remaining rows; pred is (predictions, expected).
pred = create_predictions(dataset_copy)

100%|██████████| 867/867 [00:48<00:00, 18.06it/s]

# Split the result pair into generated outputs and reference labels.
predictions, expected = pred

# Fraction of generated strings that exactly match the reference label.
accuracy_score(y_true=expected, y_pred=predictions)

0.903114186851211

52 KiB Raw Blame History

FLAN-T5

Colab link better looking output:

links

52 KiB

Raw Blame History