# Setup

## Requirements

In [67]:
!pip install torch
!pip install datasets
!pip install transformers
!pip install scikit-learn
!pip install evaluate
!pip install accelerate
!pip install sentencepiece
!pip install protobuf
!pip install sacrebleu
!pip install py7zr


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Imports

In [68]:
import os
import json
import torch
from google.colab import drive
from pathlib import Path
from typing import Dict, List
from datasets import load_dataset
from transformers import T5Tokenizer

## Loading data

In [69]:
loaded_data = load_dataset('emotion')
!mkdir -v -p data
train_path = Path('data/train.json')
valid_path = Path('data/valid.json')
test_path = Path('data/test.json')
data_train, data_valid, data_test = [], [], []



  0%|          | 0/3 [00:00<?, ?it/s]

In [70]:
for source_data, dataset, max_size in [
  (loaded_data['train'], data_train, None),
  (loaded_data['validation'], data_valid, None),
  (loaded_data['test'], data_test, None),
]:
  for i, data in enumerate(source_data):
    if max_size is not None and i >= max_size:
      break
    data_line = {
      'label': int(data['label']),
      'text': data['text'],
    }
    dataset.append(data_line)

print(f'Train: {len(data_train):6d}')
print(f'Valid: {len(data_valid):6d}')
print(f'Test: {len(data_test):6d}')

Train:  16000
Valid:   2000
Test:   2000


In [71]:
MAP_LABEL_TRANSLATION = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise',
}

In [72]:
def save_as_translations(original_save_path: Path, data_to_save: List[Dict]) -> None:
    file_name = 's2s-' + original_save_path.name
    file_path = original_save_path.parent / file_name

    print(f'Saving into: {file_path}')
    with open(file_path, 'wt') as f_write:
        for data_line in data_to_save:
            label = data_line['label']
            new_label = MAP_LABEL_TRANSLATION[label]
            data_line['label'] = new_label
            data_line_str = json.dumps(data_line)
            f_write.write(f'{data_line_str}\n')

In [73]:
for file_path, data_to_save in [(train_path, data_train), (valid_path, data_valid), (test_path, data_test)]:
  print(f'Saving into: {file_path}')
  with open(file_path, 'wt') as f_write:
    for data_line in data_to_save:
      data_line_str = json.dumps(data_line)
      f_write.write(f'{data_line_str}\n')
  
  save_as_translations(file_path, data_to_save)

Saving into: data/train.json
Saving into: data/s2s-train.json
Saving into: data/valid.json
Saving into: data/s2s-valid.json
Saving into: data/test.json
Saving into: data/s2s-test.json


In [74]:
!head data/train.json

{"label": 0, "text": "i didnt feel humiliated"}
{"label": 0, "text": "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake"}
{"label": 3, "text": "im grabbing a minute to post i feel greedy wrong"}
{"label": 2, "text": "i am ever feeling nostalgic about the fireplace i will know that it is still on the property"}
{"label": 3, "text": "i am feeling grouchy"}
{"label": 0, "text": "ive been feeling a little burdened lately wasnt sure why that was"}
{"label": 5, "text": "ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny"}
{"label": 4, "text": "i feel as confused about life as a teenager or as jaded as a year old man"}
{"label": 1, "text": "i have been with petronas for years i feel that petronas has performed well and made a huge profit"}
{"label": 2, "text": "i feel romantic too"}


In [75]:
!head data/s2s-train.json

{"label": "sadness", "text": "i didnt feel humiliated"}
{"label": "sadness", "text": "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake"}
{"label": "anger", "text": "im grabbing a minute to post i feel greedy wrong"}
{"label": "love", "text": "i am ever feeling nostalgic about the fireplace i will know that it is still on the property"}
{"label": "anger", "text": "i am feeling grouchy"}
{"label": "sadness", "text": "ive been feeling a little burdened lately wasnt sure why that was"}
{"label": "surprise", "text": "ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny"}
{"label": "fear", "text": "i feel as confused about life as a teenager or as jaded as a year old man"}
{"label": "joy", "text": "i have been with petronas for years i feel that petronas has performed well and made a huge profit"}
{"label": "love", "text": "i feel romantic too"}


In [76]:
# create tiny datasets for debugging purposes
for file_name in ["s2s-train", "s2s-valid", "s2s-test"]:
  print(f"=== {file_name} ===")
  all_text = Path(f"data/{file_name}.json").read_text().split('\n')
  text = all_text[:250] + all_text[-250:]
  Path(f"data/{file_name}-500.json").write_text("\n".join(text))

=== s2s-train ===
=== s2s-valid ===
=== s2s-test ===


In [77]:
!wc -l data/*

    499 data/s2s-test-500.json
   2000 data/s2s-test.json
    499 data/s2s-train-500.json
  16000 data/s2s-train.json
    499 data/s2s-valid-500.json
   2000 data/s2s-valid.json
   2000 data/test.json
  16000 data/train.json
   2000 data/valid.json
  41497 total


# Zero Shot

In [78]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import json
import time

In [96]:
!nvidia-smi

Mon Feb 13 23:18:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    31W /  70W |   7320MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [89]:
if torch.cuda.is_available():
    device = 0
else:
    device = -1

In [91]:
def get_pipeline(pipeline_type: str, model_name: str, torch_dtype: torch.dtype="auto"):
    class_type = AutoModelForSeq2SeqLM
    model = class_type.from_pretrained(model_name, low_cpu_mem_usage=True, torch_dtype=torch.float32)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return pipeline(pipeline_type, model=model, tokenizer=tokenizer, device=device)

In [92]:
lm_pipeline = get_pipeline('text2text-generation', 'google/flan-t5-large')

In [97]:
def generate_prompt(text):
    labels = "possible labels: sadness, joy, love, anger, surprise, fear"
    prompt = labels + '\n' + f'text: {text}' + '\n' + 'label: '
    return prompt

In [98]:
def predict(text):
  return lm_pipeline(generate_prompt(text), do_sample=False)[0]['generated_text']

In [99]:
with open('data/s2s-test.json') as f:
    time_start = time.time()
    total = 0
    correct = 0
    lines = f.readlines()
    test_cases_amount = len(lines)
    for line in lines:
        item = json.loads(line)
        text = item['text']
        label = item['label']
        total += 1
        if total % 50 == 0:
            print(f'{total}/{test_cases_amount}')
        if predict(text) == label:
          correct += 1
    time_end = time.time()
    print(f'Minutes elapsed: {(time_end - time_start) / 60}')
    print(f'Accuracy: {correct/total}')

50/2000
100/2000
150/2000
200/2000
250/2000
300/2000
350/2000
400/2000
450/2000
500/2000
550/2000
600/2000
650/2000
700/2000
750/2000
800/2000
850/2000
900/2000
950/2000
1000/2000
1050/2000
1100/2000
1150/2000
1200/2000
1250/2000
1300/2000
1350/2000
1400/2000
1450/2000
1500/2000
1550/2000
1600/2000
1650/2000
1700/2000
1750/2000
1800/2000
1850/2000
1900/2000
1950/2000
2000/2000
Minutes elapsed: 3.088933833440145
Accuracy: 0.6505
