In [1]:
!pip install transformers datasets torch sentencepiece




In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import pipeline
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [3]:
def load_and_process_dataset():
    dataset = load_dataset("sst2")
    dataset.remove_columns('idx')
    del dataset['test']
    dataset['test'] = dataset['validation']
    del dataset['validation']
    split_dataset = dataset['train'].train_test_split(test_size=1600)
    dataset['train'] = split_dataset['train']
    dataset['validation'] = split_dataset['test']
    return dataset

In [4]:
dataset = load_and_process_dataset()
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 65749
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1600
    })
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')

In [6]:
def transform_dataset(dataset):
    new_dataset = []
    for row in dataset['test']:
        text = row['sentence'].replace("\n", "")
        new_row = {'sentence': text, 'label': "negative" if row['label'] == 0 else "positive"}
        new_dataset.append(new_row)
    return new_dataset

new_dataset = transform_dataset(dataset)
dataset_copy = new_dataset.copy()

model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

sentiment_classifier = pipeline("sentiment-analysis")

def create_predictions(test_data):
    predictions = []
    expected = []

    for row in tqdm(test_data):
        input_text = row['sentence']
        result = sentiment_classifier(input_text)
        label = result[0]['label'].lower()

        # Zamiana etykiet na format używany w zbiorze danych
        if label == 'positive':
            label = 'positive'
        else:
            label = 'negative'

        predictions.append(label)
        expected.append(row['label'])

    return predictions, expected


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [7]:
pred = create_predictions(dataset_copy)
predictions = pred[0]
expected = pred[1]


100%|██████████| 872/872 [01:08<00:00, 12.65it/s]


In [8]:
accuracy = accuracy_score(expected, [p if p in ["positive", "negative"] else "negative" for p in predictions])
print("Accuracy:", accuracy)


Accuracy: 0.9105504587155964


In [14]:
example_sentences = [
    "This movie was an amazing journey.",
    "I really did not like the new web design.",
    "The team did a great job with this project.",
    "I am not happy with the service.",
    "This is the best book I have ever read!",
]

for sentence in example_sentences:
    result = sentiment_classifier(sentence)
    print(f"Sentence: '{sentence}'\nSentiment: {result[0]['label']}, Score: {result[0]['score']:.2f}\n")


Sentence: 'This movie was an amazing journey.'
Sentiment: POSITIVE, Score: 1.00

Sentence: 'I really did not like the new web design.'
Sentiment: NEGATIVE, Score: 1.00

Sentence: 'The team did a great job with this project.'
Sentiment: POSITIVE, Score: 1.00

Sentence: 'I am not happy with the service.'
Sentiment: NEGATIVE, Score: 1.00

Sentence: 'This is the best book I have ever read!'
Sentiment: POSITIVE, Score: 1.00



In [13]:
example_sentences = [
"The cat is sitting on the mat.",
"There are clouds in the sky.",
"The book is on the table.",
"A car is parked outside.",
"The door is closed.",
]

for sentence in example_sentences:
    result = sentiment_classifier(sentence)
    print(f"Sentence: '{sentence}'\nSentiment: {result[0]['label']}, Score: {result[0]['score']:.2f}\n")

Sentence: 'The cat is sitting on the mat.'
Sentiment: NEGATIVE, Score: 0.98

Sentence: 'There are clouds in the sky.'
Sentiment: POSITIVE, Score: 1.00

Sentence: 'The book is on the table.'
Sentiment: POSITIVE, Score: 0.99

Sentence: 'A car is parked outside.'
Sentiment: POSITIVE, Score: 0.90

Sentence: 'The door is closed.'
Sentiment: NEGATIVE, Score: 0.98

