uczenie_glebokie_projekt/flan-t5.ipynb
!pip install transformers datasets torch sentencepiece
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score
def load_and_process_dataset():
    dataset = load_dataset("sst2")
    # remove_columns returns a new DatasetDict; it does not modify in place
    dataset = dataset.remove_columns('idx')
    # The original SST-2 test split is unlabeled, so use the validation
    # split as the test set and carve a new validation set out of train.
    del dataset['test']
    dataset['test'] = dataset['validation']
    del dataset['validation']
    split_dataset = dataset['train'].train_test_split(test_size=1600)
    dataset['train'] = split_dataset['train']
    dataset['validation'] = split_dataset['test']
    return dataset
dataset = load_and_process_dataset()
dataset
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 65749
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 872
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1600
    })
})
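Before picking few-shot examples it is worth confirming that both labels are represented in every split. A minimal sanity check (a sketch added here; the exact counts depend on the random train/validation split above):

from collections import Counter

for split in ['train', 'validation', 'test']:
    print(split, Counter(dataset[split]['label']))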
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
def transform_dataset(dataset):
  # Turn the test split into a list of dicts with string labels,
  # which is the format the few-shot prompt below expects.
  new_dataset = []

  for row in dataset['test']:
    text = row['sentence'].replace("\n", "")  # drop stray newlines
    new_row = {'sentence': text}
    new_row['label'] = "negative" if row['label'] == 0 else "positive"
    new_dataset.append(new_row)
  return new_dataset

def print_with_index(i):
  # Inspect the string label at a given index (used below to hand-pick
  # the few-shot examples).
  print(new_dataset[i]['label'], i)

def create_predictions(test_data):
  predictions = []
  expected = []

  # Prepend the few-shot prompt to every test sentence and let the model
  # generate the label as free text.
  for row in tqdm(test_data):
    input_text = few_shot_example + " " + row['sentence'] + "\nlabel: "
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    generated_ids = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=200)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    predictions.append(generated_text)
    expected.append(row['label'])
  return predictions, expected
new_dataset = transform_dataset(dataset)
print_with_index(0)
print_with_index(46)
print_with_index(123)
print_with_index(300)
print_with_index(400)
print_with_index(500)
print_with_index(702)
print_with_index(802)
print_with_index(553)
print_with_index(655)
print_with_index(455)
print_with_index(258)
print_with_index(158)
print_with_index(752)
print_with_index(853)
print_with_index(855)
positive 0
negative 46
positive 123
positive 300
positive 400
negative 500
negative 702
negative 802
positive 553
positive 655
positive 455
negative 258
negative 158
negative 752
positive 853
positive 855
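The indices above were inspected by hand to collect a mix of positive and negative examples. A hypothetical way to draw a balanced set automatically (a sketch, not part of the original notebook; sample_balanced_indices and k_per_label are illustrative names):

import random

def sample_balanced_indices(data, k_per_label=2, seed=42):
  # Group row indices by their string label, then draw k from each group.
  rng = random.Random(seed)
  by_label = {}
  for i, row in enumerate(data):
    by_label.setdefault(row['label'], []).append(i)
  return [i for idxs in by_label.values() for i in rng.sample(idxs, k_per_label)]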
indexes = [0,553,655,455,258]
def create_dataset(indexes):
  few_shot_learning_examples = ''
  for i in indexes:
    few_shot_learning_examples += "sentence: " + dataset_copy[i]['sentence'] + "\n" + 'label: ' + dataset_copy[i]['label'] + '\n'
  # Remove the chosen examples from the evaluation copy; pop in descending
  # index order so earlier removals do not shift the remaining positions.
  for i in sorted(indexes, reverse=True):
    dataset_copy.pop(i)
  few_shot_learning_examples = few_shot_learning_examples + "sentence:"
  return few_shot_learning_examples
dataset_copy = new_dataset.copy()
few_shot_example = create_dataset(indexes)
print(len(dataset_copy),len(new_dataset))
print(few_shot_example)
867 872
sentence: it 's a charming and often affecting journey . 
label: positive
sentence: so unassuming and pure of heart , you ca n't help but warmly extend your arms and yell ` safe ! ' 
label: positive
sentence: birthday girl is an amusing joy ride , with some surprisingly violent moments . 
label: positive
sentence: ( chaiken 's ) talent lies in an evocative , accurate observation of a distinctive milieu and in the lively , convincing dialogue she creates for her characters . 
label: positive
sentence: `` the time machine '' is a movie that has no interest in itself . 
label: negative
sentence:
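Concatenated with a test sentence, the complete model input for one row looks like this (the same string that create_predictions builds internally):

example_input = few_shot_example + " " + dataset_copy[0]['sentence'] + "\nlabel: "
print(example_input)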
if torch.cuda.is_available():
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
There are 1 GPU(s) available.
We will use the GPU: Tesla T4
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base')
model.to(device)  # unlike model.cuda(), this also works when only a CPU is available
pred = create_predictions(dataset_copy)
100%|██████████| 867/867 [00:48<00:00, 18.06it/s]
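Sampling with temperature=0.9 makes the generations non-deterministic, so the accuracy below can vary between runs. For a two-class task, greedy decoding with a small generation budget is a common deterministic alternative; a sketch of the swapped generate call inside create_predictions:

# Deterministic variant: greedy decoding, capped at a few new tokens,
# since the expected answer is just "positive" or "negative".
generated_ids = model.generate(input_ids, do_sample=False, max_new_tokens=5)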
predictions, expected = pred
accuracy_score(expected, predictions)
0.903114186851211
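Note that accuracy_score compares the raw generated strings against the gold labels, so any generation that is not exactly "positive" or "negative" counts as an error. A minimal sketch of normalizing predictions before scoring (normalize_prediction is a hypothetical helper, not in the original notebook):

def normalize_prediction(text):
  # Lowercase and strip; map anything unexpected to a sentinel so it
  # still scores as wrong instead of accidentally matching a label.
  text = text.strip().lower()
  return text if text in ("positive", "negative") else "unknown"

cleaned = [normalize_prediction(p) for p in predictions]
accuracy_score(expected, cleaned)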