52 KiB
52 KiB
FLAN-T5
Colab link (better-looking output):
https://colab.research.google.com/drive/1bVujvgH49tyY83eqZoWYcDBZ2JurmLTL?usp=sharing
links
!pip install transformers datasets torch sentencepiece
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score
def load_and_process_dataset(validation_size=1600, seed=None):
    """Load SST-2 and rearrange it into train / validation / test splits.

    The downloaded ``validation`` split becomes the new ``test`` split and
    the downloaded ``test`` split is discarded (presumably because it cannot
    be used for scoring -- TODO confirm). A new ``validation`` split is then
    carved out of ``train``.

    Args:
        validation_size: Forwarded to ``train_test_split`` as ``test_size``;
            controls how much of ``train`` moves into the new validation
            split. Defaults to 1600, the value previously hard-coded.
        seed: Optional seed forwarded to ``train_test_split`` so the split
            can be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns:
        The dataset dictionary with ``train``, ``test`` and ``validation``
        splits, without the ``idx`` column.
    """
    dataset = load_dataset("sst2")
    # remove_columns returns a new object instead of editing in place, so the
    # result has to be re-bound or the 'idx' column silently survives.
    dataset = dataset.remove_columns('idx')
    # Overwrite the existing 'test' entry with the former validation split.
    dataset['test'] = dataset.pop('validation')
    split_dataset = dataset['train'].train_test_split(
        test_size=validation_size, seed=seed
    )
    dataset['train'] = split_dataset['train']
    dataset['validation'] = split_dataset['test']
    return dataset
# Build the train / test / validation splits once; the bare expression on the
# next line makes the notebook display the resulting structure.
dataset = load_and_process_dataset()
dataset
WARNING:datasets.builder:Found cached dataset sst2 (/root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)
0%| | 0/3 [00:00<?, ?it/s]
DatasetDict({ train: Dataset({ features: ['idx', 'sentence', 'label'], num_rows: 65749 }) test: Dataset({ features: ['idx', 'sentence', 'label'], num_rows: 872 }) validation: Dataset({ features: ['idx', 'sentence', 'label'], num_rows: 1600 }) })
# Tokenizer for the google/flan-t5-base checkpoint; used further down to
# encode the prompts and to decode the generated ids back into text.
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
Downloading (…)okenizer_config.json: 0%| | 0.00/2.54k [00:00<?, ?B/s]
Downloading (…)"spiece.model";: 0%| | 0.00/792k [00:00<?, ?B/s]
Downloading (…)/main/tokenizer.json: 0%| | 0.00/2.42M [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 0%| | 0.00/2.20k [00:00<?, ?B/s]
def transform_dataset(dataset, split='test'):
    """Convert one split into a list of plain dicts with string labels.

    Args:
        dataset: Mapping from split name to an iterable of rows. Each row
            must provide a ``'sentence'`` string and a ``'label'`` value.
        split: Name of the split to convert. Defaults to ``'test'``, the
            split this function previously always used.

    Returns:
        A list of ``{'sentence': str, 'label': str}`` dicts in the original
        row order. Newline characters are removed from the sentence; a label
        of ``0`` becomes ``"negative"`` and any other label ``"positive"``.
    """
    return [
        {
            'sentence': row['sentence'].replace("\n", ""),
            'label': "negative" if row['label'] == 0 else "positive",
        }
        for row in dataset[split]
    ]
def print_with_index(i):
    """Print the label of row ``i`` of the module-level ``new_dataset``, then ``i``."""
    row = new_dataset[i]
    print(row['label'], i)
def create_predictions(test_data):
    """Generate a label for every row of ``test_data`` with the few-shot prompt.

    Relies on the module-level ``few_shot_example``, ``tokenizer``, ``model``
    and ``device``.

    Args:
        test_data: Iterable of rows, each providing ``'sentence'`` and
            ``'label'``.

    Returns:
        A ``(predictions, expected)`` tuple of two parallel lists: the decoded
        model output for each row and the row's ``'label'`` value.
    """
    predictions = []
    expected = []
    # Iterate over the argument; the previous version ignored it and always
    # read the global dataset_copy.
    for row in tqdm(test_data):
        # The few-shot prefix ends with "sentence:", so the row's sentence
        # continues it and the model is asked to complete the label.
        input_text = few_shot_example + " " + row['sentence'] + "\nlabel: "
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
        # NOTE(review): do_sample=True makes the output vary between runs;
        # consider greedy decoding if reproducible accuracy is wanted.
        generated_ids = model.generate(
            input_ids, do_sample=True, temperature=0.9, max_length=200
        )
        generated_text = tokenizer.decode(
            generated_ids[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        predictions.append(generated_text)
        expected.append(row['label'])
    return predictions, expected
# Convert the test split to string-labelled rows, then print the label of a
# hand-picked set of rows so few-shot examples can be chosen from them.
new_dataset = transform_dataset(dataset)
for row_index in (
    0, 46, 123, 300, 400, 500, 702, 802,
    553, 655, 455, 258, 158, 752, 853, 855,
):
    print_with_index(row_index)
positive 0 negative 46 positive 123 positive 300 positive 400 negative 500 negative 702 negative 802 positive 553 positive 655 positive 455 negative 258 negative 158 negative 752 positive 853 positive 855
# Positions in new_dataset of the rows used as few-shot demonstrations
# (passed to create_dataset below).
indexes = [0,553,655,455,258]
def create_dataset(indexes, examples=None):
    """Build the few-shot prompt prefix and remove the rows it uses.

    Args:
        indexes: Positions of the rows to use as demonstrations, in the order
            they should appear in the prompt.
        examples: Mutable list of ``{'sentence', 'label'}`` rows to draw from
            and remove the demonstrations from. Defaults to the module-level
            ``dataset_copy``, which is what this function previously always
            used.

    Returns:
        A string made of one ``"sentence: ...\\nlabel: ...\\n"`` pair per
        index, followed by a trailing ``"sentence:"`` for the caller to
        complete.

    Raises:
        IndexError: If an index is out of range for ``examples``.
    """
    if examples is None:
        examples = dataset_copy

    prompt_parts = [
        "sentence: " + examples[i]['sentence'] + "\n"
        + 'label: ' + examples[i]['label'] + '\n'
        for i in indexes
    ]

    # Popping one index at a time shifts every later position, which removed
    # the wrong rows and left demonstrations in the evaluation data. Resolve
    # all positions first, then drop them in a single in-place pass.
    size = len(examples)
    to_drop = {i if i >= 0 else i + size for i in indexes}
    examples[:] = [
        row for position, row in enumerate(examples) if position not in to_drop
    ]

    return "".join(prompt_parts) + "sentence:"
# Work on a shallow copy so that rows removed by create_dataset disappear from
# dataset_copy only; new_dataset keeps every row.
dataset_copy = new_dataset.copy()
# Builds the prompt prefix and, as a side effect, pops rows from dataset_copy.
few_shot_example = create_dataset(indexes)
# Compare the list sizes after removal, then show the prompt prefix.
print(len(dataset_copy),len(new_dataset))
print(few_shot_example)
867 872 sentence: it 's a charming and often affecting journey . label: positive sentence: so unassuming and pure of heart , you ca n't help but warmly extend your arms and yell ` safe ! ' label: positive sentence: birthday girl is an amusing joy ride , with some surprisingly violent moments . label: positive sentence: ( chaiken 's ) talent lies in an evocative , accurate observation of a distinctive milieu and in the lively , convincing dialogue she creates for her characters . label: positive sentence: `` the time machine '' is a movie that has no interest in itself . label: negative sentence:
# Choose the torch device used for inputs: CUDA when available, otherwise CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
There are 1 GPU(s) available. We will use the GPU: Tesla T4
# Load the google/flan-t5-base sequence-to-sequence checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base')
# Move the model to the device selected earlier. The previous model.cuda()
# raised on machines without a GPU even though a CPU fallback is selected, and
# the inputs in create_predictions are already sent to `device`.
model.to(device)
# Run few-shot inference over the evaluation rows; pred is the
# (predictions, expected) tuple returned by create_predictions.
pred = create_predictions(dataset_copy)
100%|██████████| 867/867 [00:48<00:00, 18.06it/s]
# Unpack the (predictions, expected) pair and report the share of generated
# strings that equal the expected label string.
predictions, expected = pred
accuracy_score(y_true=expected, y_pred=predictions)
0.903114186851211